31 #include "../../asmlib.hpp"
32 #include "../../utils.hpp"
36 void a64_smallK_hybrid_s8s32_dot_8x4_a55(
const int8_t *A,
int lda,
const int8_t *B, int32_t *C,
int ldc,
int M,
int N,
int K,
const int32_t *,
Activation,
bool) {
37 const long loops_count =
iceildiv(
N, (
int)4) - 1;
38 const long ldab = lda *
sizeof(int8_t);
39 const long ldcb = ldc *
sizeof(int32_t);
40 const long odds_count =
K % 4;
43 for (
int y0=0; y0<
M; y0+=8) {
44 long loops = loops_count;
45 long oob_rows = std::max(8 - (
M-y0), 0);
46 long odds = odds_count;
47 const int8_t *b_ptr0 =
B;
48 const int8_t *a_ptr0 =
A + (y0 * lda);
50 int32_t *c_ptr0 = C + (y0 * ldc);
69 "temploadreg0 .req X14\n"
70 "temploadreg1 .req X15\n"
71 "temploadreg2 .req X16\n"
72 "temploadreg3 .req X17\n"
73 "add a_ptr1, %[a_ptr0], %[lda]\n"
74 "add c_ptr1, %[c_ptr0], %[ldc]\n"
75 "add a_ptr2, a_ptr1, %[lda]\n"
76 "add c_ptr2, c_ptr1, %[ldc]\n"
77 "add a_ptr3, a_ptr2, %[lda]\n"
78 "add c_ptr3, c_ptr2, %[ldc]\n"
79 "add a_ptr4, a_ptr3, %[lda]\n"
80 "add c_ptr4, c_ptr3, %[ldc]\n"
81 "add a_ptr5, a_ptr4, %[lda]\n"
82 "add c_ptr5, c_ptr4, %[ldc]\n"
83 "add a_ptr6, a_ptr5, %[lda]\n"
84 "add c_ptr6, c_ptr5, %[ldc]\n"
85 "add a_ptr7, a_ptr6, %[lda]\n"
86 "add c_ptr7, c_ptr6, %[ldc]\n"
87 "cbz %[oob_rows], 1f\n"
88 "subs %[oob_rows], %[oob_rows], #0x1\n"
89 "add c_ptr7, %[c_ptr0], #0x0\n"
90 "add a_ptr7, %[a_ptr0], #0x0\n"
92 "subs %[oob_rows], %[oob_rows], #0x1\n"
93 "add c_ptr6, %[c_ptr0], #0x0\n"
94 "add a_ptr6, %[a_ptr0], #0x0\n"
96 "subs %[oob_rows], %[oob_rows], #0x1\n"
97 "add c_ptr5, %[c_ptr0], #0x0\n"
98 "add a_ptr5, %[a_ptr0], #0x0\n"
100 "subs %[oob_rows], %[oob_rows], #0x1\n"
101 "add c_ptr4, %[c_ptr0], #0x0\n"
102 "add a_ptr4, %[a_ptr0], #0x0\n"
104 "subs %[oob_rows], %[oob_rows], #0x1\n"
105 "add c_ptr3, %[c_ptr0], #0x0\n"
106 "add a_ptr3, %[a_ptr0], #0x0\n"
108 "subs %[oob_rows], %[oob_rows], #0x1\n"
109 "add c_ptr2, %[c_ptr0], #0x0\n"
110 "add a_ptr2, %[a_ptr0], #0x0\n"
112 "subs %[oob_rows], %[oob_rows], #0x1\n"
113 "add c_ptr1, %[c_ptr0], #0x0\n"
114 "add a_ptr1, %[a_ptr0], #0x0\n"
117 "ldr s0, [%[a_ptr0]]\n"
127 "subs %[odds], %[odds], #0x1\n"
129 "ldr b0, [%[a_ptr0]]\n"
139 "ldr h0, [%[a_ptr0]], #0x2\n"
140 "ldr h1, [a_ptr1], #0x2\n"
141 "ldr h2, [a_ptr2], #0x2\n"
142 "ldr h3, [a_ptr3], #0x2\n"
143 "ldr h4, [a_ptr4], #0x2\n"
144 "ldr h5, [a_ptr5], #0x2\n"
145 "ldr h6, [a_ptr6], #0x2\n"
146 "ldr h7, [a_ptr7], #0x2\n"
147 "subs %[odds], %[odds], #0x1\n"
151 "ld1 {v0.b}[2], [%[a_ptr0]]\n"
152 "ld1 {v1.b}[2], [a_ptr1]\n"
153 "ld1 {v2.b}[2], [a_ptr2]\n"
154 "ld1 {v3.b}[2], [a_ptr3]\n"
155 "ld1 {v4.b}[2], [a_ptr4]\n"
156 "ld1 {v5.b}[2], [a_ptr5]\n"
157 "ld1 {v6.b}[2], [a_ptr6]\n"
158 "ld1 {v7.b}[2], [a_ptr7]\n"
160 "ldr q16, [%[b_ptr0]]\n"
161 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
162 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
163 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
164 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
165 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
166 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
167 "add %[b_ptr0], %[b_ptr0], #0x10\n"
170 "subs %[loops], %[loops], #0x1\n"
178 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
179 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
180 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
181 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
182 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
183 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
184 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
185 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
188 "str q24, [%[c_ptr0]]\n"
189 "subs %[loops], %[loops], #0x1\n"
191 "ldr d16, [%[b_ptr0]]\n"
192 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
193 "add %[c_ptr0], %[c_ptr0], #0x10\n"
194 "str q25, [c_ptr1]\n"
195 "add c_ptr1, c_ptr1, #0x10\n"
197 "ins v16.d[1], temploadreg0\n"
198 "add %[b_ptr0], %[b_ptr0], #0x10\n"
199 "str q26, [c_ptr2]\n"
200 "add c_ptr2, c_ptr2, #0x10\n"
202 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
203 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
204 "str q27, [c_ptr3]\n"
206 "add c_ptr3, c_ptr3, #0x10\n"
207 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
208 "str q28, [c_ptr4]\n"
210 "add c_ptr4, c_ptr4, #0x10\n"
211 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
212 "str q29, [c_ptr5]\n"
214 "add c_ptr5, c_ptr5, #0x10\n"
215 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
216 "str q30, [c_ptr6]\n"
218 "add c_ptr6, c_ptr6, #0x10\n"
219 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
220 "str q31, [c_ptr7]\n"
222 "add c_ptr7, c_ptr7, #0x10\n"
223 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
224 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
225 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
226 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
227 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
228 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
229 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
230 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
231 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
232 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
235 "str q24, [%[c_ptr0]]\n"
236 "add %[c_ptr0], %[c_ptr0], #0x10\n"
238 "ldr q16, [%[b_ptr0]]\n"
239 "add %[b_ptr0], %[b_ptr0], #0x10\n"
240 "str q25, [c_ptr1]\n"
241 "add c_ptr1, c_ptr1, #0x10\n"
243 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
244 "str q26, [c_ptr2]\n"
246 "add c_ptr2, c_ptr2, #0x10\n"
247 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
248 "str q27, [c_ptr3]\n"
250 "add c_ptr3, c_ptr3, #0x10\n"
251 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
252 "str q28, [c_ptr4]\n"
254 "add c_ptr4, c_ptr4, #0x10\n"
255 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
256 "str q29, [c_ptr5]\n"
258 "add c_ptr5, c_ptr5, #0x10\n"
259 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
260 "str q30, [c_ptr6]\n"
262 "add c_ptr6, c_ptr6, #0x10\n"
263 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
264 "str q31, [c_ptr7]\n"
266 "add c_ptr7, c_ptr7, #0x10\n"
267 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
268 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
279 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
280 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
281 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
282 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
283 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
284 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
285 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
286 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
288 "str q24, [%[c_ptr0]]\n"
289 "add %[c_ptr0], %[c_ptr0], #0x10\n"
290 "str q25, [c_ptr1]\n"
291 "str q26, [c_ptr2]\n"
292 "str q27, [c_ptr3]\n"
293 "str q28, [c_ptr4]\n"
294 "str q29, [c_ptr5]\n"
295 "str q30, [c_ptr6]\n"
296 "str q31, [c_ptr7]\n"
311 ".unreq temploadreg0\n"
312 ".unreq temploadreg1\n"
313 ".unreq temploadreg2\n"
314 ".unreq temploadreg3\n"
315 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
316 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
317 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
336 "temploadreg0 .req X14\n"
337 "temploadreg1 .req X15\n"
338 "temploadreg2 .req X16\n"
339 "temploadreg3 .req X17\n"
340 "add a_ptr1, %[a_ptr0], %[lda]\n"
341 "add c_ptr1, %[c_ptr0], %[ldc]\n"
342 "add a_ptr2, a_ptr1, %[lda]\n"
343 "add c_ptr2, c_ptr1, %[ldc]\n"
344 "add a_ptr3, a_ptr2, %[lda]\n"
345 "add c_ptr3, c_ptr2, %[ldc]\n"
346 "add a_ptr4, a_ptr3, %[lda]\n"
347 "add c_ptr4, c_ptr3, %[ldc]\n"
348 "add a_ptr5, a_ptr4, %[lda]\n"
349 "add c_ptr5, c_ptr4, %[ldc]\n"
350 "add a_ptr6, a_ptr5, %[lda]\n"
351 "add c_ptr6, c_ptr5, %[ldc]\n"
352 "add a_ptr7, a_ptr6, %[lda]\n"
353 "add c_ptr7, c_ptr6, %[ldc]\n"
354 "cbz %[oob_rows], 1f\n"
355 "subs %[oob_rows], %[oob_rows], #0x1\n"
356 "add c_ptr7, %[c_ptr0], #0x0\n"
357 "add a_ptr7, %[a_ptr0], #0x0\n"
359 "subs %[oob_rows], %[oob_rows], #0x1\n"
360 "add c_ptr6, %[c_ptr0], #0x0\n"
361 "add a_ptr6, %[a_ptr0], #0x0\n"
363 "subs %[oob_rows], %[oob_rows], #0x1\n"
364 "add c_ptr5, %[c_ptr0], #0x0\n"
365 "add a_ptr5, %[a_ptr0], #0x0\n"
367 "subs %[oob_rows], %[oob_rows], #0x1\n"
368 "add c_ptr4, %[c_ptr0], #0x0\n"
369 "add a_ptr4, %[a_ptr0], #0x0\n"
371 "subs %[oob_rows], %[oob_rows], #0x1\n"
372 "add c_ptr3, %[c_ptr0], #0x0\n"
373 "add a_ptr3, %[a_ptr0], #0x0\n"
375 "subs %[oob_rows], %[oob_rows], #0x1\n"
376 "add c_ptr2, %[c_ptr0], #0x0\n"
377 "add a_ptr2, %[a_ptr0], #0x0\n"
379 "subs %[oob_rows], %[oob_rows], #0x1\n"
380 "add c_ptr1, %[c_ptr0], #0x0\n"
381 "add a_ptr1, %[a_ptr0], #0x0\n"
384 "ldr d0, [%[a_ptr0]]\n"
394 "ldr s0, [%[a_ptr0]], #0x4\n"
395 "ldr s1, [a_ptr1], #0x4\n"
396 "ldr s2, [a_ptr2], #0x4\n"
397 "ldr s3, [a_ptr3], #0x4\n"
398 "ldr s4, [a_ptr4], #0x4\n"
399 "ldr s5, [a_ptr5], #0x4\n"
400 "ldr s6, [a_ptr6], #0x4\n"
401 "ldr s7, [a_ptr7], #0x4\n"
402 "subs %[odds], %[odds], #0x1\n"
404 "ld1 {v0.b}[4], [%[a_ptr0]]\n"
405 "ld1 {v1.b}[4], [a_ptr1]\n"
406 "ld1 {v2.b}[4], [a_ptr2]\n"
407 "ld1 {v3.b}[4], [a_ptr3]\n"
408 "ld1 {v4.b}[4], [a_ptr4]\n"
409 "ld1 {v5.b}[4], [a_ptr5]\n"
410 "ld1 {v6.b}[4], [a_ptr6]\n"
411 "ld1 {v7.b}[4], [a_ptr7]\n"
414 "ld1 {v0.h}[2], [%[a_ptr0]], #2\n"
415 "ld1 {v1.h}[2], [a_ptr1], #2\n"
416 "ld1 {v2.h}[2], [a_ptr2], #2\n"
417 "ld1 {v3.h}[2], [a_ptr3], #2\n"
418 "ld1 {v4.h}[2], [a_ptr4], #2\n"
419 "ld1 {v5.h}[2], [a_ptr5], #2\n"
420 "ld1 {v6.h}[2], [a_ptr6], #2\n"
421 "ld1 {v7.h}[2], [a_ptr7], #2\n"
422 "subs %[odds], %[odds], #0x1\n"
426 "ld1 {v0.b}[6], [%[a_ptr0]]\n"
427 "ld1 {v1.b}[6], [a_ptr1]\n"
428 "ld1 {v2.b}[6], [a_ptr2]\n"
429 "ld1 {v3.b}[6], [a_ptr3]\n"
430 "ld1 {v4.b}[6], [a_ptr4]\n"
431 "ld1 {v5.b}[6], [a_ptr5]\n"
432 "ld1 {v6.b}[6], [a_ptr6]\n"
433 "ld1 {v7.b}[6], [a_ptr7]\n"
435 "ldr q16, [%[b_ptr0]]\n"
436 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
437 "ldr q17, [%[b_ptr0], #0x10]\n"
438 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
439 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
440 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
441 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
442 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
443 "add %[b_ptr0], %[b_ptr0], #0x20\n"
446 "subs %[loops], %[loops], #0x1\n"
454 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
455 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
456 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
457 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
458 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
459 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
460 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
461 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
462 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
463 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
464 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
465 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
466 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
467 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
468 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
469 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
472 "str q24, [%[c_ptr0]]\n"
473 "subs %[loops], %[loops], #0x1\n"
475 "ldr d16, [%[b_ptr0]]\n"
476 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
477 "add %[c_ptr0], %[c_ptr0], #0x10\n"
478 "str q25, [c_ptr1]\n"
479 "add c_ptr1, c_ptr1, #0x10\n"
481 "ldr d17, [%[b_ptr0], #0x10]\n"
482 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
483 "add %[b_ptr0], %[b_ptr0], #0x20\n"
484 "str q26, [c_ptr2]\n"
485 "add c_ptr2, c_ptr2, #0x10\n"
487 "ins v16.d[1], temploadreg0\n"
488 "ins v17.d[1], temploadreg1\n"
489 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
490 "str q27, [c_ptr3]\n"
491 "add c_ptr3, c_ptr3, #0x10\n"
493 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
494 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
495 "str q28, [c_ptr4]\n"
497 "add c_ptr4, c_ptr4, #0x10\n"
498 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
499 "str q29, [c_ptr5]\n"
501 "add c_ptr5, c_ptr5, #0x10\n"
502 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
503 "str q30, [c_ptr6]\n"
505 "add c_ptr6, c_ptr6, #0x10\n"
506 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
507 "str q31, [c_ptr7]\n"
509 "add c_ptr7, c_ptr7, #0x10\n"
510 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
511 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
512 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
513 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
514 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
515 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
516 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
517 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
518 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
519 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
520 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
521 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
522 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
523 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
524 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
525 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
526 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
527 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
530 "str q24, [%[c_ptr0]]\n"
531 "add %[c_ptr0], %[c_ptr0], #0x10\n"
533 "ldr q16, [%[b_ptr0]]\n"
534 "ldr q17, [%[b_ptr0], #0x10]\n"
535 "add %[b_ptr0], %[b_ptr0], #0x20\n"
536 "str q25, [c_ptr1]\n"
537 "add c_ptr1, c_ptr1, #0x10\n"
539 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
540 "str q26, [c_ptr2]\n"
542 "add c_ptr2, c_ptr2, #0x10\n"
543 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
544 "str q27, [c_ptr3]\n"
546 "add c_ptr3, c_ptr3, #0x10\n"
547 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
548 "str q28, [c_ptr4]\n"
550 "add c_ptr4, c_ptr4, #0x10\n"
551 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
552 "str q29, [c_ptr5]\n"
554 "add c_ptr5, c_ptr5, #0x10\n"
555 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
556 "str q30, [c_ptr6]\n"
558 "add c_ptr6, c_ptr6, #0x10\n"
559 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
560 "str q31, [c_ptr7]\n"
562 "add c_ptr7, c_ptr7, #0x10\n"
563 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
564 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
565 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
566 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
567 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
568 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
569 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
570 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
571 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
572 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
583 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
584 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
585 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
586 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
587 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
588 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
589 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
590 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
591 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
592 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
593 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
594 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
595 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
596 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
597 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
598 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
600 "str q24, [%[c_ptr0]]\n"
601 "add %[c_ptr0], %[c_ptr0], #0x10\n"
602 "str q25, [c_ptr1]\n"
603 "str q26, [c_ptr2]\n"
604 "str q27, [c_ptr3]\n"
605 "str q28, [c_ptr4]\n"
606 "str q29, [c_ptr5]\n"
607 "str q30, [c_ptr6]\n"
608 "str q31, [c_ptr7]\n"
623 ".unreq temploadreg0\n"
624 ".unreq temploadreg1\n"
625 ".unreq temploadreg2\n"
626 ".unreq temploadreg3\n"
627 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
628 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
629 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
648 "temploadreg0 .req X14\n"
649 "temploadreg1 .req X15\n"
650 "temploadreg2 .req X16\n"
651 "temploadreg3 .req X17\n"
652 "add a_ptr1, %[a_ptr0], %[lda]\n"
653 "add c_ptr1, %[c_ptr0], %[ldc]\n"
654 "add a_ptr2, a_ptr1, %[lda]\n"
655 "add c_ptr2, c_ptr1, %[ldc]\n"
656 "add a_ptr3, a_ptr2, %[lda]\n"
657 "add c_ptr3, c_ptr2, %[ldc]\n"
658 "add a_ptr4, a_ptr3, %[lda]\n"
659 "add c_ptr4, c_ptr3, %[ldc]\n"
660 "add a_ptr5, a_ptr4, %[lda]\n"
661 "add c_ptr5, c_ptr4, %[ldc]\n"
662 "add a_ptr6, a_ptr5, %[lda]\n"
663 "add c_ptr6, c_ptr5, %[ldc]\n"
664 "add a_ptr7, a_ptr6, %[lda]\n"
665 "add c_ptr7, c_ptr6, %[ldc]\n"
666 "cbz %[oob_rows], 1f\n"
667 "subs %[oob_rows], %[oob_rows], #0x1\n"
668 "add c_ptr7, %[c_ptr0], #0x0\n"
669 "add a_ptr7, %[a_ptr0], #0x0\n"
671 "subs %[oob_rows], %[oob_rows], #0x1\n"
672 "add c_ptr6, %[c_ptr0], #0x0\n"
673 "add a_ptr6, %[a_ptr0], #0x0\n"
675 "subs %[oob_rows], %[oob_rows], #0x1\n"
676 "add c_ptr5, %[c_ptr0], #0x0\n"
677 "add a_ptr5, %[a_ptr0], #0x0\n"
679 "subs %[oob_rows], %[oob_rows], #0x1\n"
680 "add c_ptr4, %[c_ptr0], #0x0\n"
681 "add a_ptr4, %[a_ptr0], #0x0\n"
683 "subs %[oob_rows], %[oob_rows], #0x1\n"
684 "add c_ptr3, %[c_ptr0], #0x0\n"
685 "add a_ptr3, %[a_ptr0], #0x0\n"
687 "subs %[oob_rows], %[oob_rows], #0x1\n"
688 "add c_ptr2, %[c_ptr0], #0x0\n"
689 "add a_ptr2, %[a_ptr0], #0x0\n"
691 "subs %[oob_rows], %[oob_rows], #0x1\n"
692 "add c_ptr1, %[c_ptr0], #0x0\n"
693 "add a_ptr1, %[a_ptr0], #0x0\n"
695 "ldr d0, [%[a_ptr0]], #0x8\n"
696 "ldr d1, [a_ptr1], #0x8\n"
697 "ldr d2, [a_ptr2], #0x8\n"
698 "ldr d3, [a_ptr3], #0x8\n"
699 "ldr d4, [a_ptr4], #0x8\n"
700 "ldr d5, [a_ptr5], #0x8\n"
701 "ldr d6, [a_ptr6], #0x8\n"
702 "ldr d7, [a_ptr7], #0x8\n"
704 "ld1 {v0.s}[2], [%[a_ptr0]]\n"
705 "ld1 {v1.s}[2], [a_ptr1]\n"
706 "ld1 {v2.s}[2], [a_ptr2]\n"
707 "ld1 {v3.s}[2], [a_ptr3]\n"
708 "ld1 {v4.s}[2], [a_ptr4]\n"
709 "ld1 {v5.s}[2], [a_ptr5]\n"
710 "ld1 {v6.s}[2], [a_ptr6]\n"
711 "ld1 {v7.s}[2], [a_ptr7]\n"
714 "subs %[odds], %[odds], #0x1\n"
716 "ld1 {v0.b}[8], [%[a_ptr0]]\n"
717 "ld1 {v1.b}[8], [a_ptr1]\n"
718 "ld1 {v2.b}[8], [a_ptr2]\n"
719 "ld1 {v3.b}[8], [a_ptr3]\n"
720 "ld1 {v4.b}[8], [a_ptr4]\n"
721 "ld1 {v5.b}[8], [a_ptr5]\n"
722 "ld1 {v6.b}[8], [a_ptr6]\n"
723 "ld1 {v7.b}[8], [a_ptr7]\n"
726 "ld1 {v0.h}[4], [%[a_ptr0]], #2\n"
727 "ld1 {v1.h}[4], [a_ptr1], #2\n"
728 "ld1 {v2.h}[4], [a_ptr2], #2\n"
729 "ld1 {v3.h}[4], [a_ptr3], #2\n"
730 "ld1 {v4.h}[4], [a_ptr4], #2\n"
731 "ld1 {v5.h}[4], [a_ptr5], #2\n"
732 "ld1 {v6.h}[4], [a_ptr6], #2\n"
733 "ld1 {v7.h}[4], [a_ptr7], #2\n"
734 "subs %[odds], %[odds], #0x1\n"
738 "ld1 {v0.b}[10], [%[a_ptr0]]\n"
739 "ld1 {v1.b}[10], [a_ptr1]\n"
740 "ld1 {v2.b}[10], [a_ptr2]\n"
741 "ld1 {v3.b}[10], [a_ptr3]\n"
742 "ld1 {v4.b}[10], [a_ptr4]\n"
743 "ld1 {v5.b}[10], [a_ptr5]\n"
744 "ld1 {v6.b}[10], [a_ptr6]\n"
745 "ld1 {v7.b}[10], [a_ptr7]\n"
747 "ldr q16, [%[b_ptr0]]\n"
748 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
749 "ldr q17, [%[b_ptr0], #0x10]\n"
750 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
751 "ldr q18, [%[b_ptr0], #0x20]\n"
752 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
753 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
754 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
755 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
756 "add %[b_ptr0], %[b_ptr0], #0x30\n"
759 "subs %[loops], %[loops], #0x1\n"
767 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
768 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
769 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
770 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
771 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
772 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
773 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
774 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
775 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
776 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
777 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
778 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
779 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
780 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
781 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
782 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
783 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
784 ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
785 ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
786 ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
787 ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
788 ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
789 ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
790 ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
793 "str q24, [%[c_ptr0]]\n"
794 "subs %[loops], %[loops], #0x1\n"
796 "ldr d16, [%[b_ptr0]]\n"
797 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
798 "add %[c_ptr0], %[c_ptr0], #0x10\n"
799 "str q25, [c_ptr1]\n"
800 "add c_ptr1, c_ptr1, #0x10\n"
802 "ldr d17, [%[b_ptr0], #0x10]\n"
803 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
804 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
805 "str q26, [c_ptr2]\n"
806 "add c_ptr2, c_ptr2, #0x10\n"
808 "ldr d18, [%[b_ptr0], #0x20]\n"
809 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
810 "add %[b_ptr0], %[b_ptr0], #0x30\n"
811 "str q27, [c_ptr3]\n"
812 "add c_ptr3, c_ptr3, #0x10\n"
814 "ins v16.d[1], temploadreg0\n"
815 "ins v17.d[1], temploadreg1\n"
816 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
817 "str q28, [c_ptr4]\n"
818 "add c_ptr4, c_ptr4, #0x10\n"
820 "ins v18.d[1], temploadreg2\n"
821 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
822 "str q29, [c_ptr5]\n"
824 "add c_ptr5, c_ptr5, #0x10\n"
825 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
826 "str q30, [c_ptr6]\n"
828 "add c_ptr6, c_ptr6, #0x10\n"
829 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
830 "str q31, [c_ptr7]\n"
832 "add c_ptr7, c_ptr7, #0x10\n"
833 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
834 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
835 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
836 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
837 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
838 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
839 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
840 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
841 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
842 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
843 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
844 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
845 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
846 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
847 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
848 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
849 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
850 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
851 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
852 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
853 ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
854 ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
855 ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
856 ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
857 ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
858 ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
859 ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
862 "str q24, [%[c_ptr0]]\n"
863 "add %[c_ptr0], %[c_ptr0], #0x10\n"
865 "ldr q16, [%[b_ptr0]]\n"
866 "ldr q17, [%[b_ptr0], #0x10]\n"
867 "str q25, [c_ptr1]\n"
868 "add c_ptr1, c_ptr1, #0x10\n"
870 "ldr q18, [%[b_ptr0], #0x20]\n"
871 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
872 "str q26, [c_ptr2]\n"
874 "add c_ptr2, c_ptr2, #0x10\n"
875 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
876 "str q27, [c_ptr3]\n"
878 "add c_ptr3, c_ptr3, #0x10\n"
879 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
880 "str q28, [c_ptr4]\n"
882 "add c_ptr4, c_ptr4, #0x10\n"
883 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
884 "str q29, [c_ptr5]\n"
886 "add c_ptr5, c_ptr5, #0x10\n"
887 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
888 "str q30, [c_ptr6]\n"
890 "add c_ptr6, c_ptr6, #0x10\n"
891 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
892 "str q31, [c_ptr7]\n"
894 "add c_ptr7, c_ptr7, #0x10\n"
895 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
896 "add %[b_ptr0], %[b_ptr0], #0x30\n"
897 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
898 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
899 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
900 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
901 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
902 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
903 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
904 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
905 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
906 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
907 ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
908 ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
909 ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
910 ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
911 ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
912 ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
913 ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
924 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
925 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
926 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
927 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
928 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
929 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
930 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
931 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
932 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
933 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
934 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
935 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
936 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
937 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
938 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
939 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
940 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
941 ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
942 ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
943 ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
944 ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
945 ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
946 ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
947 ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
949 "str q24, [%[c_ptr0]]\n"
950 "add %[c_ptr0], %[c_ptr0], #0x10\n"
951 "str q25, [c_ptr1]\n"
952 "str q26, [c_ptr2]\n"
953 "str q27, [c_ptr3]\n"
954 "str q28, [c_ptr4]\n"
955 "str q29, [c_ptr5]\n"
956 "str q30, [c_ptr6]\n"
957 "str q31, [c_ptr7]\n"
972 ".unreq temploadreg0\n"
973 ".unreq temploadreg1\n"
974 ".unreq temploadreg2\n"
975 ".unreq temploadreg3\n"
976 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
977 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
978 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
997 "temploadreg0 .req X14\n"
998 "temploadreg1 .req X15\n"
999 "temploadreg2 .req X16\n"
1000 "temploadreg3 .req X17\n"
1001 "add a_ptr1, %[a_ptr0], %[lda]\n"
1002 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1003 "add a_ptr2, a_ptr1, %[lda]\n"
1004 "add c_ptr2, c_ptr1, %[ldc]\n"
1005 "add a_ptr3, a_ptr2, %[lda]\n"
1006 "add c_ptr3, c_ptr2, %[ldc]\n"
1007 "add a_ptr4, a_ptr3, %[lda]\n"
1008 "add c_ptr4, c_ptr3, %[ldc]\n"
1009 "add a_ptr5, a_ptr4, %[lda]\n"
1010 "add c_ptr5, c_ptr4, %[ldc]\n"
1011 "add a_ptr6, a_ptr5, %[lda]\n"
1012 "add c_ptr6, c_ptr5, %[ldc]\n"
1013 "add a_ptr7, a_ptr6, %[lda]\n"
1014 "add c_ptr7, c_ptr6, %[ldc]\n"
1015 "cbz %[oob_rows], 1f\n"
1016 "subs %[oob_rows], %[oob_rows], #0x1\n"
1017 "add c_ptr7, %[c_ptr0], #0x0\n"
1018 "add a_ptr7, %[a_ptr0], #0x0\n"
1020 "subs %[oob_rows], %[oob_rows], #0x1\n"
1021 "add c_ptr6, %[c_ptr0], #0x0\n"
1022 "add a_ptr6, %[a_ptr0], #0x0\n"
1024 "subs %[oob_rows], %[oob_rows], #0x1\n"
1025 "add c_ptr5, %[c_ptr0], #0x0\n"
1026 "add a_ptr5, %[a_ptr0], #0x0\n"
1028 "subs %[oob_rows], %[oob_rows], #0x1\n"
1029 "add c_ptr4, %[c_ptr0], #0x0\n"
1030 "add a_ptr4, %[a_ptr0], #0x0\n"
1032 "subs %[oob_rows], %[oob_rows], #0x1\n"
1033 "add c_ptr3, %[c_ptr0], #0x0\n"
1034 "add a_ptr3, %[a_ptr0], #0x0\n"
1036 "subs %[oob_rows], %[oob_rows], #0x1\n"
1037 "add c_ptr2, %[c_ptr0], #0x0\n"
1038 "add a_ptr2, %[a_ptr0], #0x0\n"
1040 "subs %[oob_rows], %[oob_rows], #0x1\n"
1041 "add c_ptr1, %[c_ptr0], #0x0\n"
1042 "add a_ptr1, %[a_ptr0], #0x0\n"
1044 "cbnz %[odds], 2f\n"
1045 "ldr q0, [%[a_ptr0]]\n"
1046 "ldr q1, [a_ptr1]\n"
1047 "ldr q2, [a_ptr2]\n"
1048 "ldr q3, [a_ptr3]\n"
1049 "ldr q4, [a_ptr4]\n"
1050 "ldr q5, [a_ptr5]\n"
1051 "ldr q6, [a_ptr6]\n"
1052 "ldr q7, [a_ptr7]\n"
1055 "ldr d0, [%[a_ptr0]], #0x8\n"
1056 "ldr d1, [a_ptr1], #0x8\n"
1057 "ldr d2, [a_ptr2], #0x8\n"
1058 "ldr d3, [a_ptr3], #0x8\n"
1059 "ldr d4, [a_ptr4], #0x8\n"
1060 "ldr d5, [a_ptr5], #0x8\n"
1061 "ldr d6, [a_ptr6], #0x8\n"
1062 "ldr d7, [a_ptr7], #0x8\n"
1063 "ld1 {v0.s}[2], [%[a_ptr0]], #4\n"
1064 "ld1 {v1.s}[2], [a_ptr1], #4\n"
1065 "ld1 {v2.s}[2], [a_ptr2], #4\n"
1066 "ld1 {v3.s}[2], [a_ptr3], #4\n"
1067 "ld1 {v4.s}[2], [a_ptr4], #4\n"
1068 "ld1 {v5.s}[2], [a_ptr5], #4\n"
1069 "ld1 {v6.s}[2], [a_ptr6], #4\n"
1070 "ld1 {v7.s}[2], [a_ptr7], #4\n"
1071 "subs %[odds], %[odds], #0x1\n"
1073 "ld1 {v0.b}[12], [%[a_ptr0]]\n"
1074 "ld1 {v1.b}[12], [a_ptr1]\n"
1075 "ld1 {v2.b}[12], [a_ptr2]\n"
1076 "ld1 {v3.b}[12], [a_ptr3]\n"
1077 "ld1 {v4.b}[12], [a_ptr4]\n"
1078 "ld1 {v5.b}[12], [a_ptr5]\n"
1079 "ld1 {v6.b}[12], [a_ptr6]\n"
1080 "ld1 {v7.b}[12], [a_ptr7]\n"
1083 "ld1 {v0.h}[6], [%[a_ptr0]], #2\n"
1084 "ld1 {v1.h}[6], [a_ptr1], #2\n"
1085 "ld1 {v2.h}[6], [a_ptr2], #2\n"
1086 "ld1 {v3.h}[6], [a_ptr3], #2\n"
1087 "ld1 {v4.h}[6], [a_ptr4], #2\n"
1088 "ld1 {v5.h}[6], [a_ptr5], #2\n"
1089 "ld1 {v6.h}[6], [a_ptr6], #2\n"
1090 "ld1 {v7.h}[6], [a_ptr7], #2\n"
1091 "subs %[odds], %[odds], #0x1\n"
1095 "ld1 {v0.b}[14], [%[a_ptr0]]\n"
1096 "ld1 {v1.b}[14], [a_ptr1]\n"
1097 "ld1 {v2.b}[14], [a_ptr2]\n"
1098 "ld1 {v3.b}[14], [a_ptr3]\n"
1099 "ld1 {v4.b}[14], [a_ptr4]\n"
1100 "ld1 {v5.b}[14], [a_ptr5]\n"
1101 "ld1 {v6.b}[14], [a_ptr6]\n"
1102 "ld1 {v7.b}[14], [a_ptr7]\n"
1104 "ldr q16, [%[b_ptr0]]\n"
1105 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1106 "ldr q17, [%[b_ptr0], #0x10]\n"
1107 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1108 "ldr q18, [%[b_ptr0], #0x20]\n"
1109 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1110 "ldr q19, [%[b_ptr0], #0x30]\n"
1111 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1112 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1113 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1114 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1115 "cbz %[loops], 6f\n"
1117 "subs %[loops], %[loops], #0x1\n"
1125 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1126 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
1127 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
1128 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
1129 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
1130 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
1131 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
1132 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
1133 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1134 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
1135 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
1136 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
1137 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
1138 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
1139 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
1140 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
1141 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1142 ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
1143 ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
1144 ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
1145 ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
1146 ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
1147 ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
1148 ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
1149 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1150 ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
1151 ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
1152 ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
1153 ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
1154 ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
1155 ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
1156 ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
1159 "str q24, [%[c_ptr0]]\n"
1160 "subs %[loops], %[loops], #0x1\n"
1162 "ldr d16, [%[b_ptr0]]\n"
1163 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
1164 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1165 "str q25, [c_ptr1]\n"
1166 "add c_ptr1, c_ptr1, #0x10\n"
1168 "ldr d17, [%[b_ptr0], #0x10]\n"
1169 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
1170 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1171 "str q26, [c_ptr2]\n"
1172 "add c_ptr2, c_ptr2, #0x10\n"
1174 "ldr d18, [%[b_ptr0], #0x20]\n"
1175 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
1176 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1177 "str q27, [c_ptr3]\n"
1178 "add c_ptr3, c_ptr3, #0x10\n"
1180 "ldr d19, [%[b_ptr0], #0x30]\n"
1181 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
1182 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1183 "str q28, [c_ptr4]\n"
1184 "add c_ptr4, c_ptr4, #0x10\n"
1186 "ins v16.d[1], temploadreg0\n"
1187 "ins v17.d[1], temploadreg1\n"
1188 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1189 "str q29, [c_ptr5]\n"
1190 "add c_ptr5, c_ptr5, #0x10\n"
1192 "ins v18.d[1], temploadreg2\n"
1193 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1194 "str q30, [c_ptr6]\n"
1196 "ins v19.d[1], temploadreg3\n"
1197 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
1198 "add c_ptr6, c_ptr6, #0x10\n"
1199 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
1200 "str q31, [c_ptr7]\n"
1202 "add c_ptr7, c_ptr7, #0x10\n"
1203 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
1204 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1205 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
1206 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1207 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
1208 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1209 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
1210 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1211 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
1212 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1213 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1214 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
1215 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
1216 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
1217 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
1218 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
1219 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
1220 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
1221 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1222 ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
1223 ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
1224 ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
1225 ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
1226 ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
1227 ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
1228 ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
1229 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1230 ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
1231 ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
1232 ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
1233 ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
1234 ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
1235 ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
1236 ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
1239 "str q24, [%[c_ptr0]]\n"
1240 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1242 "ldr q16, [%[b_ptr0]]\n"
1243 "ldr q17, [%[b_ptr0], #0x10]\n"
1244 "str q25, [c_ptr1]\n"
1245 "add c_ptr1, c_ptr1, #0x10\n"
1247 "ldr q18, [%[b_ptr0], #0x20]\n"
1248 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1249 "str q26, [c_ptr2]\n"
1251 "ldr q19, [%[b_ptr0], #0x30]\n"
1252 "add c_ptr2, c_ptr2, #0x10\n"
1253 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
1254 "str q27, [c_ptr3]\n"
1256 "add c_ptr3, c_ptr3, #0x10\n"
1257 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
1258 "str q28, [c_ptr4]\n"
1260 "add c_ptr4, c_ptr4, #0x10\n"
1261 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
1262 "str q29, [c_ptr5]\n"
1264 "add c_ptr5, c_ptr5, #0x10\n"
1265 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
1266 "str q30, [c_ptr6]\n"
1268 "add c_ptr6, c_ptr6, #0x10\n"
1269 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
1270 "str q31, [c_ptr7]\n"
1272 "add c_ptr7, c_ptr7, #0x10\n"
1273 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
1274 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1275 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
1276 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1277 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
1278 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
1279 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
1280 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
1281 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
1282 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
1283 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
1284 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1285 ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
1286 ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
1287 ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
1288 ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
1289 ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
1290 ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
1291 ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
1292 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1293 ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
1294 ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
1295 ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
1296 ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
1297 ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
1298 ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
1299 ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
1310 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1311 ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
1312 ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
1313 ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
1314 ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
1315 ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
1316 ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
1317 ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
1318 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1319 ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
1320 ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
1321 ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
1322 ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
1323 ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
1324 ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
1325 ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
1326 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1327 ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
1328 ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
1329 ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
1330 ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
1331 ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
1332 ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
1333 ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
1334 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1335 ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
1336 ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
1337 ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
1338 ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
1339 ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
1340 ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
1341 ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
1343 "str q24, [%[c_ptr0]]\n"
1344 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1345 "str q25, [c_ptr1]\n"
1346 "str q26, [c_ptr2]\n"
1347 "str q27, [c_ptr3]\n"
1348 "str q28, [c_ptr4]\n"
1349 "str q29, [c_ptr5]\n"
1350 "str q30, [c_ptr6]\n"
1351 "str q31, [c_ptr7]\n"
1366 ".unreq temploadreg0\n"
1367 ".unreq temploadreg1\n"
1368 ".unreq temploadreg2\n"
1369 ".unreq temploadreg3\n"
1370 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
1371 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
1372 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1391 "temploadreg0 .req X14\n"
1392 "temploadreg1 .req X15\n"
1393 "temploadreg2 .req X16\n"
1394 "temploadreg3 .req X17\n"
1395 "add a_ptr1, %[a_ptr0], %[lda]\n"
1396 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1397 "add a_ptr2, a_ptr1, %[lda]\n"
1398 "add c_ptr2, c_ptr1, %[ldc]\n"
1399 "add a_ptr3, a_ptr2, %[lda]\n"
1400 "add c_ptr3, c_ptr2, %[ldc]\n"
1401 "add a_ptr4, a_ptr3, %[lda]\n"
1402 "add c_ptr4, c_ptr3, %[ldc]\n"
1403 "add a_ptr5, a_ptr4, %[lda]\n"
1404 "add c_ptr5, c_ptr4, %[ldc]\n"
1405 "add a_ptr6, a_ptr5, %[lda]\n"
1406 "add c_ptr6, c_ptr5, %[ldc]\n"
1407 "add a_ptr7, a_ptr6, %[lda]\n"
1408 "add c_ptr7, c_ptr6, %[ldc]\n"
1409 "cbz %[oob_rows], 1f\n"
1410 "subs %[oob_rows], %[oob_rows], #0x1\n"
1411 "add c_ptr7, %[c_ptr0], #0x0\n"
1412 "add a_ptr7, %[a_ptr0], #0x0\n"
1414 "subs %[oob_rows], %[oob_rows], #0x1\n"
1415 "add c_ptr6, %[c_ptr0], #0x0\n"
1416 "add a_ptr6, %[a_ptr0], #0x0\n"
1418 "subs %[oob_rows], %[oob_rows], #0x1\n"
1419 "add c_ptr5, %[c_ptr0], #0x0\n"
1420 "add a_ptr5, %[a_ptr0], #0x0\n"
1422 "subs %[oob_rows], %[oob_rows], #0x1\n"
1423 "add c_ptr4, %[c_ptr0], #0x0\n"
1424 "add a_ptr4, %[a_ptr0], #0x0\n"
1426 "subs %[oob_rows], %[oob_rows], #0x1\n"
1427 "add c_ptr3, %[c_ptr0], #0x0\n"
1428 "add a_ptr3, %[a_ptr0], #0x0\n"
1430 "subs %[oob_rows], %[oob_rows], #0x1\n"
1431 "add c_ptr2, %[c_ptr0], #0x0\n"
1432 "add a_ptr2, %[a_ptr0], #0x0\n"
1434 "subs %[oob_rows], %[oob_rows], #0x1\n"
1435 "add c_ptr1, %[c_ptr0], #0x0\n"
1436 "add a_ptr1, %[a_ptr0], #0x0\n"
1438 "cbnz %[odds], 2f\n"
1439 "ldr q0, [%[a_ptr0]], #0x10\n"
1440 "ldr q2, [a_ptr1], #0x10\n"
1441 "ldr q4, [a_ptr2], #0x10\n"
1442 "ldr q6, [a_ptr3], #0x10\n"
1443 "ldr s1, [%[a_ptr0]]\n"
1444 "ldr q8, [a_ptr4], #0x10\n"
1445 "ldr s3, [a_ptr1]\n"
1446 "ldr q10, [a_ptr5], #0x10\n"
1447 "ldr s5, [a_ptr2]\n"
1448 "ldr q12, [a_ptr6], #0x10\n"
1449 "ldr s7, [a_ptr3]\n"
1450 "ldr q14, [a_ptr7], #0x10\n"
1451 "ldr s9, [a_ptr4]\n"
1452 "ldr s11, [a_ptr5]\n"
1453 "ldr s13, [a_ptr6]\n"
1454 "ldr s15, [a_ptr7]\n"
1457 "ldr q0, [%[a_ptr0]], #0x10\n"
1458 "subs %[odds], %[odds], #0x1\n"
1459 "ldr q2, [a_ptr1], #0x10\n"
1460 "ldr q4, [a_ptr2], #0x10\n"
1461 "ldr q6, [a_ptr3], #0x10\n"
1462 "ldr q8, [a_ptr4], #0x10\n"
1463 "ldr q10, [a_ptr5], #0x10\n"
1464 "ldr q12, [a_ptr6], #0x10\n"
1465 "ldr q14, [a_ptr7], #0x10\n"
1467 "ldr b1, [%[a_ptr0]]\n"
1468 "ldr b3, [a_ptr1]\n"
1469 "ldr b5, [a_ptr2]\n"
1470 "ldr b7, [a_ptr3]\n"
1471 "ldr b9, [a_ptr4]\n"
1472 "ldr b11, [a_ptr5]\n"
1473 "ldr b13, [a_ptr6]\n"
1474 "ldr b15, [a_ptr7]\n"
1477 "ldr h1, [%[a_ptr0]], #0x2\n"
1478 "ldr h3, [a_ptr1], #0x2\n"
1479 "ldr h5, [a_ptr2], #0x2\n"
1480 "ldr h7, [a_ptr3], #0x2\n"
1481 "ldr h9, [a_ptr4], #0x2\n"
1482 "ldr h11, [a_ptr5], #0x2\n"
1483 "ldr h13, [a_ptr6], #0x2\n"
1484 "ldr h15, [a_ptr7], #0x2\n"
1485 "subs %[odds], %[odds], #0x1\n"
1489 "ld1 {v1.b}[2], [%[a_ptr0]]\n"
1490 "ld1 {v3.b}[2], [a_ptr1]\n"
1491 "ld1 {v5.b}[2], [a_ptr2]\n"
1492 "ld1 {v7.b}[2], [a_ptr3]\n"
1493 "ld1 {v9.b}[2], [a_ptr4]\n"
1494 "ld1 {v11.b}[2], [a_ptr5]\n"
1495 "ld1 {v13.b}[2], [a_ptr6]\n"
1496 "ld1 {v15.b}[2], [a_ptr7]\n"
1498 "ldr q16, [%[b_ptr0]]\n"
1499 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1500 "ldr q17, [%[b_ptr0], #0x10]\n"
1501 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1502 "ldr q18, [%[b_ptr0], #0x20]\n"
1503 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1504 "ldr q19, [%[b_ptr0], #0x30]\n"
1505 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1506 "ldr q20, [%[b_ptr0], #0x40]\n"
1507 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1508 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1509 "add %[b_ptr0], %[b_ptr0], #0x50\n"
1510 "cbz %[loops], 6f\n"
1512 "subs %[loops], %[loops], #0x1\n"
1520 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1521 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1522 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1523 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1524 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1525 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1526 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1527 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1528 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1529 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1530 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1531 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1532 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1533 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1534 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1535 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1536 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1537 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1538 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1539 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1540 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1541 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1542 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1543 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1544 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1545 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1546 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1547 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1548 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1549 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1550 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1551 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1552 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1553 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1554 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1555 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1556 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1557 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1558 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1559 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
1562 "str q24, [%[c_ptr0]]\n"
1563 "subs %[loops], %[loops], #0x1\n"
1565 "ldr d16, [%[b_ptr0]]\n"
1566 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
1567 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1568 "str q25, [c_ptr1]\n"
1569 "add c_ptr1, c_ptr1, #0x10\n"
1571 "ldr d17, [%[b_ptr0], #0x10]\n"
1572 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
1573 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1574 "str q26, [c_ptr2]\n"
1575 "add c_ptr2, c_ptr2, #0x10\n"
1577 "ldr d18, [%[b_ptr0], #0x20]\n"
1578 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
1579 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1580 "str q27, [c_ptr3]\n"
1581 "add c_ptr3, c_ptr3, #0x10\n"
1583 "ldr d19, [%[b_ptr0], #0x30]\n"
1584 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
1585 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1586 "str q28, [c_ptr4]\n"
1587 "add c_ptr4, c_ptr4, #0x10\n"
1589 "ldr d20, [%[b_ptr0], #0x40]\n"
1590 "ins v16.d[1], temploadreg0\n"
1591 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1592 "str q29, [c_ptr5]\n"
1593 "add c_ptr5, c_ptr5, #0x10\n"
1595 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
1596 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1597 "str q30, [c_ptr6]\n"
1599 "ins v17.d[1], temploadreg1\n"
1600 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1601 "ins v18.d[1], temploadreg2\n"
1602 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1603 "str q31, [c_ptr7]\n"
1605 "ins v19.d[1], temploadreg3\n"
1606 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1607 "ins v20.d[1], temploadreg0\n"
1608 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1609 "add c_ptr6, c_ptr6, #0x10\n"
1610 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1611 "add c_ptr7, c_ptr7, #0x10\n"
1612 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1613 "add %[b_ptr0], %[b_ptr0], #0x50\n"
1614 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1615 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1616 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1617 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1618 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1619 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1620 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1621 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1622 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1623 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1624 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1625 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1626 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1627 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1628 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1629 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1630 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1631 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1632 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1633 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1634 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1635 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1636 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1637 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1638 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1639 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1640 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1641 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1642 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1643 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1644 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1645 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1646 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1647 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1648 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1649 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1650 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
1653 "str q24, [%[c_ptr0]]\n"
1654 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1656 "ldr q16, [%[b_ptr0]]\n"
1657 "ldr q17, [%[b_ptr0], #0x10]\n"
1658 "str q25, [c_ptr1]\n"
1659 "add c_ptr1, c_ptr1, #0x10\n"
1661 "ldr q18, [%[b_ptr0], #0x20]\n"
1662 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1663 "str q26, [c_ptr2]\n"
1665 "ldr q19, [%[b_ptr0], #0x30]\n"
1666 "ldr q20, [%[b_ptr0], #0x40]\n"
1667 "add c_ptr2, c_ptr2, #0x10\n"
1668 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1669 "str q27, [c_ptr3]\n"
1671 "add c_ptr3, c_ptr3, #0x10\n"
1672 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1673 "str q28, [c_ptr4]\n"
1675 "add c_ptr4, c_ptr4, #0x10\n"
1676 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1677 "str q29, [c_ptr5]\n"
1679 "add c_ptr5, c_ptr5, #0x10\n"
1680 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1681 "str q30, [c_ptr6]\n"
1683 "add c_ptr6, c_ptr6, #0x10\n"
1684 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1685 "str q31, [c_ptr7]\n"
1687 "add c_ptr7, c_ptr7, #0x10\n"
1688 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1689 "add %[b_ptr0], %[b_ptr0], #0x50\n"
1690 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1691 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1692 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1693 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1694 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1695 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1696 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1697 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1698 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1699 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1700 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1701 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1702 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1703 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1704 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1705 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1706 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1707 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1708 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1709 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1710 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1711 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1712 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1713 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1714 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1715 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1716 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1717 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1718 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1719 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1720 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1721 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1722 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
1733 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1734 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1735 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1736 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1737 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1738 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1739 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1740 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1741 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1742 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1743 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1744 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1745 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1746 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1747 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1748 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1749 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1750 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1751 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1752 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1753 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1754 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1755 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1756 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1757 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1758 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1759 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1760 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1761 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1762 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1763 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1764 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1765 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1766 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1767 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1768 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1769 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1770 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1771 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1772 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
1774 "str q24, [%[c_ptr0]]\n"
1775 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1776 "str q25, [c_ptr1]\n"
1777 "str q26, [c_ptr2]\n"
1778 "str q27, [c_ptr3]\n"
1779 "str q28, [c_ptr4]\n"
1780 "str q29, [c_ptr5]\n"
1781 "str q30, [c_ptr6]\n"
1782 "str q31, [c_ptr7]\n"
1797 ".unreq temploadreg0\n"
1798 ".unreq temploadreg1\n"
1799 ".unreq temploadreg2\n"
1800 ".unreq temploadreg3\n"
1801 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
1802 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
1803 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1822 "temploadreg0 .req X14\n"
1823 "temploadreg1 .req X15\n"
1824 "temploadreg2 .req X16\n"
1825 "temploadreg3 .req X17\n"
1826 "add a_ptr1, %[a_ptr0], %[lda]\n"
1827 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1828 "add a_ptr2, a_ptr1, %[lda]\n"
1829 "add c_ptr2, c_ptr1, %[ldc]\n"
1830 "add a_ptr3, a_ptr2, %[lda]\n"
1831 "add c_ptr3, c_ptr2, %[ldc]\n"
1832 "add a_ptr4, a_ptr3, %[lda]\n"
1833 "add c_ptr4, c_ptr3, %[ldc]\n"
1834 "add a_ptr5, a_ptr4, %[lda]\n"
1835 "add c_ptr5, c_ptr4, %[ldc]\n"
1836 "add a_ptr6, a_ptr5, %[lda]\n"
1837 "add c_ptr6, c_ptr5, %[ldc]\n"
1838 "add a_ptr7, a_ptr6, %[lda]\n"
1839 "add c_ptr7, c_ptr6, %[ldc]\n"
1840 "cbz %[oob_rows], 1f\n"
1841 "subs %[oob_rows], %[oob_rows], #0x1\n"
1842 "add c_ptr7, %[c_ptr0], #0x0\n"
1843 "add a_ptr7, %[a_ptr0], #0x0\n"
1845 "subs %[oob_rows], %[oob_rows], #0x1\n"
1846 "add c_ptr6, %[c_ptr0], #0x0\n"
1847 "add a_ptr6, %[a_ptr0], #0x0\n"
1849 "subs %[oob_rows], %[oob_rows], #0x1\n"
1850 "add c_ptr5, %[c_ptr0], #0x0\n"
1851 "add a_ptr5, %[a_ptr0], #0x0\n"
1853 "subs %[oob_rows], %[oob_rows], #0x1\n"
1854 "add c_ptr4, %[c_ptr0], #0x0\n"
1855 "add a_ptr4, %[a_ptr0], #0x0\n"
1857 "subs %[oob_rows], %[oob_rows], #0x1\n"
1858 "add c_ptr3, %[c_ptr0], #0x0\n"
1859 "add a_ptr3, %[a_ptr0], #0x0\n"
1861 "subs %[oob_rows], %[oob_rows], #0x1\n"
1862 "add c_ptr2, %[c_ptr0], #0x0\n"
1863 "add a_ptr2, %[a_ptr0], #0x0\n"
1865 "subs %[oob_rows], %[oob_rows], #0x1\n"
1866 "add c_ptr1, %[c_ptr0], #0x0\n"
1867 "add a_ptr1, %[a_ptr0], #0x0\n"
1869 "cbnz %[odds], 2f\n"
1870 "ldr q0, [%[a_ptr0]], #0x10\n"
1871 "ldr q2, [a_ptr1], #0x10\n"
1872 "ldr q4, [a_ptr2], #0x10\n"
1873 "ldr q6, [a_ptr3], #0x10\n"
1874 "ldr d1, [%[a_ptr0]]\n"
1875 "ldr q8, [a_ptr4], #0x10\n"
1876 "ldr d3, [a_ptr1]\n"
1877 "ldr q10, [a_ptr5], #0x10\n"
1878 "ldr d5, [a_ptr2]\n"
1879 "ldr q12, [a_ptr6], #0x10\n"
1880 "ldr d7, [a_ptr3]\n"
1881 "ldr q14, [a_ptr7], #0x10\n"
1882 "ldr d9, [a_ptr4]\n"
1883 "ldr d11, [a_ptr5]\n"
1884 "ldr d13, [a_ptr6]\n"
1885 "ldr d15, [a_ptr7]\n"
1888 "ldr q0, [%[a_ptr0]], #0x10\n"
1889 "subs %[odds], %[odds], #0x1\n"
1890 "ldr q2, [a_ptr1], #0x10\n"
1891 "ldr q4, [a_ptr2], #0x10\n"
1892 "ldr s1, [%[a_ptr0]], #0x4\n"
1893 "ldr q6, [a_ptr3], #0x10\n"
1894 "ldr s3, [a_ptr1], #0x4\n"
1895 "ldr q8, [a_ptr4], #0x10\n"
1896 "ldr s5, [a_ptr2], #0x4\n"
1897 "ldr q10, [a_ptr5], #0x10\n"
1898 "ldr s7, [a_ptr3], #0x4\n"
1899 "ldr q12, [a_ptr6], #0x10\n"
1900 "ldr s9, [a_ptr4], #0x4\n"
1901 "ldr q14, [a_ptr7], #0x10\n"
1902 "ldr s11, [a_ptr5], #0x4\n"
1903 "ldr s13, [a_ptr6], #0x4\n"
1904 "ldr s15, [a_ptr7], #0x4\n"
1906 "ld1 {v1.b}[4], [%[a_ptr0]]\n"
1907 "ld1 {v3.b}[4], [a_ptr1]\n"
1908 "ld1 {v5.b}[4], [a_ptr2]\n"
1909 "ld1 {v7.b}[4], [a_ptr3]\n"
1910 "ld1 {v9.b}[4], [a_ptr4]\n"
1911 "ld1 {v11.b}[4], [a_ptr5]\n"
1912 "ld1 {v13.b}[4], [a_ptr6]\n"
1913 "ld1 {v15.b}[4], [a_ptr7]\n"
1916 "ld1 {v1.h}[2], [%[a_ptr0]], #2\n"
1917 "ld1 {v3.h}[2], [a_ptr1], #2\n"
1918 "ld1 {v5.h}[2], [a_ptr2], #2\n"
1919 "ld1 {v7.h}[2], [a_ptr3], #2\n"
1920 "ld1 {v9.h}[2], [a_ptr4], #2\n"
1921 "ld1 {v11.h}[2], [a_ptr5], #2\n"
1922 "ld1 {v13.h}[2], [a_ptr6], #2\n"
1923 "ld1 {v15.h}[2], [a_ptr7], #2\n"
1924 "subs %[odds], %[odds], #0x1\n"
1928 "ld1 {v1.b}[6], [%[a_ptr0]]\n"
1929 "ld1 {v3.b}[6], [a_ptr1]\n"
1930 "ld1 {v5.b}[6], [a_ptr2]\n"
1931 "ld1 {v7.b}[6], [a_ptr3]\n"
1932 "ld1 {v9.b}[6], [a_ptr4]\n"
1933 "ld1 {v11.b}[6], [a_ptr5]\n"
1934 "ld1 {v13.b}[6], [a_ptr6]\n"
1935 "ld1 {v15.b}[6], [a_ptr7]\n"
1937 "ldr q16, [%[b_ptr0]]\n"
1938 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1939 "ldr q17, [%[b_ptr0], #0x10]\n"
1940 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1941 "ldr q18, [%[b_ptr0], #0x20]\n"
1942 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1943 "ldr q19, [%[b_ptr0], #0x30]\n"
1944 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1945 "ldr q20, [%[b_ptr0], #0x40]\n"
1946 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1947 "ldr q21, [%[b_ptr0], #0x50]\n"
1948 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1949 "add %[b_ptr0], %[b_ptr0], #0x60\n"
1950 "cbz %[loops], 6f\n"
1952 "subs %[loops], %[loops], #0x1\n"
1960 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1961 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1962 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1963 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1964 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1965 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1966 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1967 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1968 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1969 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1970 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1971 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1972 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1973 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1974 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1975 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1976 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1977 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1978 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1979 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1980 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1981 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1982 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1983 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1984 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1985 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1986 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1987 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1988 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1989 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1990 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1991 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1992 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1993 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1994 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1995 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1996 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1997 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1998 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1999 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2000 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2001 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2002 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2003 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2004 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2005 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2006 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2007 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2010 "str q24, [%[c_ptr0]]\n"
2011 "subs %[loops], %[loops], #0x1\n"
2013 "ldr d16, [%[b_ptr0]]\n"
2014 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
2015 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2016 "str q25, [c_ptr1]\n"
2017 "add c_ptr1, c_ptr1, #0x10\n"
2019 "ldr d17, [%[b_ptr0], #0x10]\n"
2020 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
2021 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2022 "str q26, [c_ptr2]\n"
2023 "add c_ptr2, c_ptr2, #0x10\n"
2025 "ldr d18, [%[b_ptr0], #0x20]\n"
2026 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
2027 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2028 "str q27, [c_ptr3]\n"
2029 "add c_ptr3, c_ptr3, #0x10\n"
2031 "ldr d19, [%[b_ptr0], #0x30]\n"
2032 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
2033 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2034 "str q28, [c_ptr4]\n"
2035 "add c_ptr4, c_ptr4, #0x10\n"
2037 "ldr d20, [%[b_ptr0], #0x40]\n"
2038 "ins v16.d[1], temploadreg0\n"
2039 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2040 "str q29, [c_ptr5]\n"
2041 "add c_ptr5, c_ptr5, #0x10\n"
2043 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
2044 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2045 "str q30, [c_ptr6]\n"
2047 "ldr d21, [%[b_ptr0], #0x50]\n"
2048 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2049 "ins v17.d[1], temploadreg1\n"
2050 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2051 "str q31, [c_ptr7]\n"
2053 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
2054 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2055 "ins v18.d[1], temploadreg2\n"
2056 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2057 "ins v19.d[1], temploadreg3\n"
2058 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2059 "ins v20.d[1], temploadreg0\n"
2060 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2061 "ins v21.d[1], temploadreg1\n"
2062 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2063 "add c_ptr6, c_ptr6, #0x10\n"
2064 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2065 "add c_ptr7, c_ptr7, #0x10\n"
2066 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2067 "add %[b_ptr0], %[b_ptr0], #0x60\n"
2068 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2069 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2070 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2071 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2072 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2073 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
2074 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2075 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
2076 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2077 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2078 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2079 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2080 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2081 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2082 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2083 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2084 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2085 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2086 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2087 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2088 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2089 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2090 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2091 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2092 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2093 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2094 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2095 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2096 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2097 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2098 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2099 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2100 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2101 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2102 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2103 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2104 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2105 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2106 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2107 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2108 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2109 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2112 "str q24, [%[c_ptr0]]\n"
2113 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2115 "ldr q16, [%[b_ptr0]]\n"
2116 "ldr q17, [%[b_ptr0], #0x10]\n"
2117 "str q25, [c_ptr1]\n"
2118 "add c_ptr1, c_ptr1, #0x10\n"
2120 "ldr q18, [%[b_ptr0], #0x20]\n"
2121 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2122 "str q26, [c_ptr2]\n"
2124 "ldr q19, [%[b_ptr0], #0x30]\n"
2125 "ldr q20, [%[b_ptr0], #0x40]\n"
2126 "add c_ptr2, c_ptr2, #0x10\n"
2127 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2128 "str q27, [c_ptr3]\n"
2130 "ldr q21, [%[b_ptr0], #0x50]\n"
2131 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2132 "add c_ptr3, c_ptr3, #0x10\n"
2133 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2134 "str q28, [c_ptr4]\n"
2136 "add c_ptr4, c_ptr4, #0x10\n"
2137 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2138 "str q29, [c_ptr5]\n"
2140 "add c_ptr5, c_ptr5, #0x10\n"
2141 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2142 "str q30, [c_ptr6]\n"
2144 "add c_ptr6, c_ptr6, #0x10\n"
2145 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2146 "str q31, [c_ptr7]\n"
2148 "add c_ptr7, c_ptr7, #0x10\n"
2149 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2150 "add %[b_ptr0], %[b_ptr0], #0x60\n"
2151 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2152 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2153 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2154 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2155 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2156 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2157 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2158 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2159 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2160 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2161 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2162 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2163 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2164 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2165 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2166 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2167 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2168 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2169 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2170 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2171 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2172 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2173 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2174 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2175 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2176 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2177 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2178 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2179 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2180 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2181 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2182 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2183 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2184 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2185 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2186 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2187 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2188 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2189 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2190 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2201 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2202 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2203 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2204 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2205 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2206 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2207 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2208 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2209 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2210 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2211 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2212 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2213 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2214 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2215 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2216 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2217 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2218 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2219 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2220 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2221 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2222 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2223 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2224 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2225 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2226 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2227 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2228 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2229 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2230 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2231 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2232 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2233 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2234 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2235 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2236 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2237 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2238 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2239 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2240 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2241 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2242 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2243 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2244 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2245 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2246 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2247 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2248 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2250 "str q24, [%[c_ptr0]]\n"
2251 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2252 "str q25, [c_ptr1]\n"
2253 "str q26, [c_ptr2]\n"
2254 "str q27, [c_ptr3]\n"
2255 "str q28, [c_ptr4]\n"
2256 "str q29, [c_ptr5]\n"
2257 "str q30, [c_ptr6]\n"
2258 "str q31, [c_ptr7]\n"
2273 ".unreq temploadreg0\n"
2274 ".unreq temploadreg1\n"
2275 ".unreq temploadreg2\n"
2276 ".unreq temploadreg3\n"
2277 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
2278 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
2279 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2298 "temploadreg0 .req X14\n"
2299 "temploadreg1 .req X15\n"
2300 "temploadreg2 .req X16\n"
2301 "temploadreg3 .req X17\n"
2302 "add a_ptr1, %[a_ptr0], %[lda]\n"
2303 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2304 "add a_ptr2, a_ptr1, %[lda]\n"
2305 "add c_ptr2, c_ptr1, %[ldc]\n"
2306 "add a_ptr3, a_ptr2, %[lda]\n"
2307 "add c_ptr3, c_ptr2, %[ldc]\n"
2308 "add a_ptr4, a_ptr3, %[lda]\n"
2309 "add c_ptr4, c_ptr3, %[ldc]\n"
2310 "add a_ptr5, a_ptr4, %[lda]\n"
2311 "add c_ptr5, c_ptr4, %[ldc]\n"
2312 "add a_ptr6, a_ptr5, %[lda]\n"
2313 "add c_ptr6, c_ptr5, %[ldc]\n"
2314 "add a_ptr7, a_ptr6, %[lda]\n"
2315 "add c_ptr7, c_ptr6, %[ldc]\n"
2316 "cbz %[oob_rows], 1f\n"
2317 "subs %[oob_rows], %[oob_rows], #0x1\n"
2318 "add c_ptr7, %[c_ptr0], #0x0\n"
2319 "add a_ptr7, %[a_ptr0], #0x0\n"
2321 "subs %[oob_rows], %[oob_rows], #0x1\n"
2322 "add c_ptr6, %[c_ptr0], #0x0\n"
2323 "add a_ptr6, %[a_ptr0], #0x0\n"
2325 "subs %[oob_rows], %[oob_rows], #0x1\n"
2326 "add c_ptr5, %[c_ptr0], #0x0\n"
2327 "add a_ptr5, %[a_ptr0], #0x0\n"
2329 "subs %[oob_rows], %[oob_rows], #0x1\n"
2330 "add c_ptr4, %[c_ptr0], #0x0\n"
2331 "add a_ptr4, %[a_ptr0], #0x0\n"
2333 "subs %[oob_rows], %[oob_rows], #0x1\n"
2334 "add c_ptr3, %[c_ptr0], #0x0\n"
2335 "add a_ptr3, %[a_ptr0], #0x0\n"
2337 "subs %[oob_rows], %[oob_rows], #0x1\n"
2338 "add c_ptr2, %[c_ptr0], #0x0\n"
2339 "add a_ptr2, %[a_ptr0], #0x0\n"
2341 "subs %[oob_rows], %[oob_rows], #0x1\n"
2342 "add c_ptr1, %[c_ptr0], #0x0\n"
2343 "add a_ptr1, %[a_ptr0], #0x0\n"
2345 "ldr q0, [%[a_ptr0]], #0x10\n"
2346 "ldr q2, [a_ptr1], #0x10\n"
2347 "ldr q4, [a_ptr2], #0x10\n"
2348 "ldr q6, [a_ptr3], #0x10\n"
2349 "ldr d1, [%[a_ptr0]], #0x8\n"
2350 "ldr q8, [a_ptr4], #0x10\n"
2351 "ldr d3, [a_ptr1], #0x8\n"
2352 "ldr q10, [a_ptr5], #0x10\n"
2353 "ldr d5, [a_ptr2], #0x8\n"
2354 "ldr q12, [a_ptr6], #0x10\n"
2355 "ldr d7, [a_ptr3], #0x8\n"
2356 "ldr q14, [a_ptr7], #0x10\n"
2357 "ldr d9, [a_ptr4], #0x8\n"
2358 "ldr d11, [a_ptr5], #0x8\n"
2359 "ldr d13, [a_ptr6], #0x8\n"
2360 "ldr d15, [a_ptr7], #0x8\n"
2361 "cbnz %[odds], 2f\n"
2362 "ld1 {v1.s}[2], [%[a_ptr0]]\n"
2363 "ld1 {v3.s}[2], [a_ptr1]\n"
2364 "ld1 {v5.s}[2], [a_ptr2]\n"
2365 "ld1 {v7.s}[2], [a_ptr3]\n"
2366 "ld1 {v9.s}[2], [a_ptr4]\n"
2367 "ld1 {v11.s}[2], [a_ptr5]\n"
2368 "ld1 {v13.s}[2], [a_ptr6]\n"
2369 "ld1 {v15.s}[2], [a_ptr7]\n"
2372 "subs %[odds], %[odds], #0x1\n"
2374 "ld1 {v1.b}[8], [%[a_ptr0]]\n"
2375 "ld1 {v3.b}[8], [a_ptr1]\n"
2376 "ld1 {v5.b}[8], [a_ptr2]\n"
2377 "ld1 {v7.b}[8], [a_ptr3]\n"
2378 "ld1 {v9.b}[8], [a_ptr4]\n"
2379 "ld1 {v11.b}[8], [a_ptr5]\n"
2380 "ld1 {v13.b}[8], [a_ptr6]\n"
2381 "ld1 {v15.b}[8], [a_ptr7]\n"
2384 "ld1 {v1.h}[4], [%[a_ptr0]], #2\n"
2385 "ld1 {v3.h}[4], [a_ptr1], #2\n"
2386 "ld1 {v5.h}[4], [a_ptr2], #2\n"
2387 "ld1 {v7.h}[4], [a_ptr3], #2\n"
2388 "ld1 {v9.h}[4], [a_ptr4], #2\n"
2389 "ld1 {v11.h}[4], [a_ptr5], #2\n"
2390 "ld1 {v13.h}[4], [a_ptr6], #2\n"
2391 "ld1 {v15.h}[4], [a_ptr7], #2\n"
2392 "subs %[odds], %[odds], #0x1\n"
2396 "ld1 {v1.b}[10], [%[a_ptr0]]\n"
2397 "ld1 {v3.b}[10], [a_ptr1]\n"
2398 "ld1 {v5.b}[10], [a_ptr2]\n"
2399 "ld1 {v7.b}[10], [a_ptr3]\n"
2400 "ld1 {v9.b}[10], [a_ptr4]\n"
2401 "ld1 {v11.b}[10], [a_ptr5]\n"
2402 "ld1 {v13.b}[10], [a_ptr6]\n"
2403 "ld1 {v15.b}[10], [a_ptr7]\n"
2405 "ldr q16, [%[b_ptr0]]\n"
2406 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
2407 "ldr q17, [%[b_ptr0], #0x10]\n"
2408 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
2409 "ldr q18, [%[b_ptr0], #0x20]\n"
2410 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
2411 "ldr q19, [%[b_ptr0], #0x30]\n"
2412 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
2413 "ldr q20, [%[b_ptr0], #0x40]\n"
2414 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
2415 "ldr q21, [%[b_ptr0], #0x50]\n"
2416 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
2417 "ldr q22, [%[b_ptr0], #0x60]\n"
2418 "add %[b_ptr0], %[b_ptr0], #0x70\n"
2419 "cbz %[loops], 6f\n"
2421 "subs %[loops], %[loops], #0x1\n"
2429 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2430 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2431 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2432 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2433 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2434 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2435 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2436 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2437 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2438 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2439 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2440 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2441 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2442 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2443 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2444 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2445 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2446 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2447 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2448 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2449 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2450 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2451 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2452 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2453 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2454 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2455 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2456 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2457 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2458 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2459 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2460 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2461 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2462 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2463 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2464 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2465 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2466 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2467 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2468 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2469 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2470 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2471 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2472 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2473 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2474 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2475 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2476 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2477 ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
2478 ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
2479 ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
2480 ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
2481 ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
2482 ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
2483 ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
2484 ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
2487 "str q24, [%[c_ptr0]]\n"
2488 "subs %[loops], %[loops], #0x1\n"
2490 "ldr d16, [%[b_ptr0]]\n"
2491 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
2492 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2493 "str q25, [c_ptr1]\n"
2494 "add c_ptr1, c_ptr1, #0x10\n"
2496 "ldr d17, [%[b_ptr0], #0x10]\n"
2497 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
2498 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2499 "str q26, [c_ptr2]\n"
2500 "add c_ptr2, c_ptr2, #0x10\n"
2502 "ldr d18, [%[b_ptr0], #0x20]\n"
2503 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
2504 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2505 "str q27, [c_ptr3]\n"
2506 "add c_ptr3, c_ptr3, #0x10\n"
2508 "ldr d19, [%[b_ptr0], #0x30]\n"
2509 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
2510 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2511 "str q28, [c_ptr4]\n"
2512 "add c_ptr4, c_ptr4, #0x10\n"
2514 "ldr d20, [%[b_ptr0], #0x40]\n"
2515 "ins v16.d[1], temploadreg0\n"
2516 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2517 "str q29, [c_ptr5]\n"
2518 "add c_ptr5, c_ptr5, #0x10\n"
2520 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
2521 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2522 "str q30, [c_ptr6]\n"
2524 "ldr d21, [%[b_ptr0], #0x50]\n"
2525 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2526 "ins v17.d[1], temploadreg1\n"
2527 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2528 "str q31, [c_ptr7]\n"
2530 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
2531 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2532 "ldr d22, [%[b_ptr0], #0x60]\n"
2533 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2534 "ins v18.d[1], temploadreg2\n"
2535 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2536 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
2537 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2538 "ins v19.d[1], temploadreg3\n"
2539 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2540 "ins v20.d[1], temploadreg0\n"
2541 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2542 "ins v21.d[1], temploadreg1\n"
2543 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2544 "ins v22.d[1], temploadreg2\n"
2545 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2546 "add c_ptr6, c_ptr6, #0x10\n"
2547 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2548 "add c_ptr7, c_ptr7, #0x10\n"
2549 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2550 "add %[b_ptr0], %[b_ptr0], #0x70\n"
2551 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2552 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2553 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2554 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2555 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2556 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
2557 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2558 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
2559 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2560 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2561 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2562 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2563 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2564 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2565 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2566 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2567 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2568 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2569 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2570 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2571 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2572 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2573 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2574 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2575 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2576 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2577 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2578 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2579 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2580 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2581 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2582 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2583 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2584 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2585 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2586 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2587 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2588 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2589 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2590 ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
2591 ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
2592 ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
2593 ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
2594 ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
2595 ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
2596 ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
2597 ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
2600 "str q24, [%[c_ptr0]]\n"
2601 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2603 "ldr q16, [%[b_ptr0]]\n"
2604 "ldr q17, [%[b_ptr0], #0x10]\n"
2605 "str q25, [c_ptr1]\n"
2606 "add c_ptr1, c_ptr1, #0x10\n"
2608 "ldr q18, [%[b_ptr0], #0x20]\n"
2609 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2610 "str q26, [c_ptr2]\n"
2612 "ldr q19, [%[b_ptr0], #0x30]\n"
2613 "ldr q20, [%[b_ptr0], #0x40]\n"
2614 "add c_ptr2, c_ptr2, #0x10\n"
2615 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2616 "str q27, [c_ptr3]\n"
2618 "ldr q21, [%[b_ptr0], #0x50]\n"
2619 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2620 "ldr q22, [%[b_ptr0], #0x60]\n"
2621 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2622 "str q28, [c_ptr4]\n"
2624 "add c_ptr3, c_ptr3, #0x10\n"
2625 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2626 "str q29, [c_ptr5]\n"
2628 "add c_ptr4, c_ptr4, #0x10\n"
2629 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2630 "str q30, [c_ptr6]\n"
2632 "add c_ptr5, c_ptr5, #0x10\n"
2633 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2634 "str q31, [c_ptr7]\n"
2636 "add c_ptr6, c_ptr6, #0x10\n"
2637 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2638 "add c_ptr7, c_ptr7, #0x10\n"
2639 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2640 "add %[b_ptr0], %[b_ptr0], #0x70\n"
2641 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2642 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2643 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2644 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2645 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2646 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2647 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2648 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2649 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2650 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2651 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2652 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2653 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2654 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2655 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2656 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2657 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2658 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2659 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2660 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2661 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2662 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2663 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2664 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2665 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2666 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2667 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2668 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2669 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2670 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2671 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2672 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2673 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2674 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2675 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2676 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2677 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2678 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2679 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2680 ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
2681 ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
2682 ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
2683 ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
2684 ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
2685 ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
2686 ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
2687 ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
2698 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2699 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2700 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2701 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2702 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2703 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2704 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2705 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2706 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2707 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2708 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2709 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2710 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2711 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2712 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2713 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2714 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2715 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2716 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2717 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2718 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2719 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2720 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2721 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2722 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2723 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2724 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2725 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2726 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2727 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2728 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2729 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2730 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2731 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2732 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2733 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2734 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2735 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2736 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2737 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2738 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2739 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2740 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2741 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2742 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2743 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2744 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2745 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2746 ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
2747 ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
2748 ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
2749 ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
2750 ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
2751 ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
2752 ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
2753 ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
2755 "str q24, [%[c_ptr0]]\n"
2756 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2757 "str q25, [c_ptr1]\n"
2758 "str q26, [c_ptr2]\n"
2759 "str q27, [c_ptr3]\n"
2760 "str q28, [c_ptr4]\n"
2761 "str q29, [c_ptr5]\n"
2762 "str q30, [c_ptr6]\n"
2763 "str q31, [c_ptr7]\n"
2778 ".unreq temploadreg0\n"
2779 ".unreq temploadreg1\n"
2780 ".unreq temploadreg2\n"
2781 ".unreq temploadreg3\n"
2782 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
2783 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
2784 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2804 "temploadreg0 .req X14\n"
2805 "temploadreg1 .req X15\n"
2806 "temploadreg2 .req X16\n"
2807 "temploadreg3 .req X17\n"
2808 "add a_ptr1, %[a_ptr0], %[lda]\n"
2809 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2810 "add a_ptr2, a_ptr1, %[lda]\n"
2811 "add c_ptr2, c_ptr1, %[ldc]\n"
2812 "add a_ptr3, a_ptr2, %[lda]\n"
2813 "add c_ptr3, c_ptr2, %[ldc]\n"
2814 "add a_ptr4, a_ptr3, %[lda]\n"
2815 "add c_ptr4, c_ptr3, %[ldc]\n"
2816 "add a_ptr5, a_ptr4, %[lda]\n"
2817 "add c_ptr5, c_ptr4, %[ldc]\n"
2818 "add a_ptr6, a_ptr5, %[lda]\n"
2819 "add c_ptr6, c_ptr5, %[ldc]\n"
2820 "add a_ptr7, a_ptr6, %[lda]\n"
2821 "add c_ptr7, c_ptr6, %[ldc]\n"
2822 "cbz %[oob_rows], 1f\n"
2823 "subs %[oob_rows], %[oob_rows], #0x1\n"
2824 "add c_ptr7, %[c_ptr0], #0x0\n"
2825 "add a_ptr7, %[a_ptr0], #0x0\n"
2827 "subs %[oob_rows], %[oob_rows], #0x1\n"
2828 "add c_ptr6, %[c_ptr0], #0x0\n"
2829 "add a_ptr6, %[a_ptr0], #0x0\n"
2831 "subs %[oob_rows], %[oob_rows], #0x1\n"
2832 "add c_ptr5, %[c_ptr0], #0x0\n"
2833 "add a_ptr5, %[a_ptr0], #0x0\n"
2835 "subs %[oob_rows], %[oob_rows], #0x1\n"
2836 "add c_ptr4, %[c_ptr0], #0x0\n"
2837 "add a_ptr4, %[a_ptr0], #0x0\n"
2839 "subs %[oob_rows], %[oob_rows], #0x1\n"
2840 "add c_ptr3, %[c_ptr0], #0x0\n"
2841 "add a_ptr3, %[a_ptr0], #0x0\n"
2843 "subs %[oob_rows], %[oob_rows], #0x1\n"
2844 "add c_ptr2, %[c_ptr0], #0x0\n"
2845 "add a_ptr2, %[a_ptr0], #0x0\n"
2847 "subs %[oob_rows], %[oob_rows], #0x1\n"
2848 "add c_ptr1, %[c_ptr0], #0x0\n"
2849 "add a_ptr1, %[a_ptr0], #0x0\n"
2851 "cbnz %[odds], 2f\n"
2852 "ldr q0, [%[a_ptr0]], #0x10\n"
2853 "ldr q2, [a_ptr1], #0x10\n"
2854 "ldr q4, [a_ptr2], #0x10\n"
2855 "ldr q6, [a_ptr3], #0x10\n"
2856 "ldr q8, [a_ptr4], #0x10\n"
2857 "ldr q10, [a_ptr5], #0x10\n"
2858 "ldr q12, [a_ptr6], #0x10\n"
2859 "ldr q14, [a_ptr7], #0x10\n"
2860 "ldr q1, [%[a_ptr0]]\n"
2861 "ldr q3, [a_ptr1]\n"
2862 "ldr q5, [a_ptr2]\n"
2863 "ldr q7, [a_ptr3]\n"
2864 "ldr q9, [a_ptr4]\n"
2865 "ldr q11, [a_ptr5]\n"
2866 "ldr q13, [a_ptr6]\n"
2867 "ldr q15, [a_ptr7]\n"
2870 "ldr q0, [%[a_ptr0]], #0x10\n"
2871 "subs %[odds], %[odds], #0x1\n"
2872 "ldr q2, [a_ptr1], #0x10\n"
2873 "ldr q4, [a_ptr2], #0x10\n"
2874 "ldr d1, [%[a_ptr0]], #0x8\n"
2875 "ldr q6, [a_ptr3], #0x10\n"
2876 "ldr d3, [a_ptr1], #0x8\n"
2877 "ldr q8, [a_ptr4], #0x10\n"
2878 "ldr d5, [a_ptr2], #0x8\n"
2879 "ldr q10, [a_ptr5], #0x10\n"
2880 "ldr d7, [a_ptr3], #0x8\n"
2881 "ldr q12, [a_ptr6], #0x10\n"
2882 "ldr d9, [a_ptr4], #0x8\n"
2883 "ldr q14, [a_ptr7], #0x10\n"
2884 "ldr d11, [a_ptr5], #0x8\n"
2885 "ldr d13, [a_ptr6], #0x8\n"
2886 "ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
2887 "ldr d15, [a_ptr7], #0x8\n"
2888 "ld1 {v3.s}[2], [a_ptr1], #4\n"
2889 "ld1 {v5.s}[2], [a_ptr2], #4\n"
2890 "ld1 {v7.s}[2], [a_ptr3], #4\n"
2891 "ld1 {v9.s}[2], [a_ptr4], #4\n"
2892 "ld1 {v11.s}[2], [a_ptr5], #4\n"
2893 "ld1 {v13.s}[2], [a_ptr6], #4\n"
2894 "ld1 {v15.s}[2], [a_ptr7], #4\n"
2896 "ld1 {v1.b}[12], [%[a_ptr0]]\n"
2897 "ld1 {v3.b}[12], [a_ptr1]\n"
2898 "ld1 {v5.b}[12], [a_ptr2]\n"
2899 "ld1 {v7.b}[12], [a_ptr3]\n"
2900 "ld1 {v9.b}[12], [a_ptr4]\n"
2901 "ld1 {v11.b}[12], [a_ptr5]\n"
2902 "ld1 {v13.b}[12], [a_ptr6]\n"
2903 "ld1 {v15.b}[12], [a_ptr7]\n"
2906 "ld1 {v1.h}[6], [%[a_ptr0]], #2\n"
2907 "ld1 {v3.h}[6], [a_ptr1], #2\n"
2908 "ld1 {v5.h}[6], [a_ptr2], #2\n"
2909 "ld1 {v7.h}[6], [a_ptr3], #2\n"
2910 "ld1 {v9.h}[6], [a_ptr4], #2\n"
2911 "ld1 {v11.h}[6], [a_ptr5], #2\n"
2912 "ld1 {v13.h}[6], [a_ptr6], #2\n"
2913 "ld1 {v15.h}[6], [a_ptr7], #2\n"
2914 "subs %[odds], %[odds], #0x1\n"
2918 "ld1 {v1.b}[14], [%[a_ptr0]]\n"
2919 "ld1 {v3.b}[14], [a_ptr1]\n"
2920 "ld1 {v5.b}[14], [a_ptr2]\n"
2921 "ld1 {v7.b}[14], [a_ptr3]\n"
2922 "ld1 {v9.b}[14], [a_ptr4]\n"
2923 "ld1 {v11.b}[14], [a_ptr5]\n"
2924 "ld1 {v13.b}[14], [a_ptr6]\n"
2925 "ld1 {v15.b}[14], [a_ptr7]\n"
2927 "ldr q16, [%[b_ptr0]]\n"
2928 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
2929 "ldr q17, [%[b_ptr0], #0x10]\n"
2930 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
2931 "ldr q18, [%[b_ptr0], #0x20]\n"
2932 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
2933 "ldr q19, [%[b_ptr0], #0x30]\n"
2934 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
2935 "ldr q20, [%[b_ptr0], #0x40]\n"
2936 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
2937 "ldr q21, [%[b_ptr0], #0x50]\n"
2938 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
2939 "ldr q22, [%[b_ptr0], #0x60]\n"
2940 "ldr q23, [%[b_ptr0], #0x70]\n"
2941 "add %[b_ptr0], %[b_ptr0], #0x80\n"
2942 "cbz %[loops], 6f\n"
2944 "subs %[loops], %[loops], #0x1\n"
2952 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2953 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2954 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2955 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2956 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2957 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2958 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2959 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2960 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2961 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2962 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2963 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2964 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2965 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2966 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2967 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2968 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2969 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2970 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2971 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2972 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2973 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2974 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2975 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2976 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2977 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2978 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2979 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2980 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2981 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2982 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2983 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2984 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2985 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2986 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2987 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2988 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2989 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2990 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2991 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2992 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2993 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2994 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2995 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2996 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2997 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2998 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2999 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
3000 ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
3001 ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
3002 ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
3003 ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
3004 ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
3005 ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
3006 ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
3007 ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
3008 ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
3009 ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
3010 ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
3011 ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
3012 ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
3013 ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
3014 ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
3015 ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
3018 "str q24, [%[c_ptr0]]\n"
3019 "subs %[loops], %[loops], #0x1\n"
3021 "ldr d16, [%[b_ptr0]]\n"
3022 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3023 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3024 "str q25, [c_ptr1]\n"
3025 "add c_ptr1, c_ptr1, #0x10\n"
3027 "ldr d17, [%[b_ptr0], #0x10]\n"
3028 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3029 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
3030 "str q26, [c_ptr2]\n"
3031 "add c_ptr2, c_ptr2, #0x10\n"
3033 "ldr d18, [%[b_ptr0], #0x20]\n"
3034 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
3035 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
3036 "str q27, [c_ptr3]\n"
3037 "add c_ptr3, c_ptr3, #0x10\n"
3039 "ldr d19, [%[b_ptr0], #0x30]\n"
3040 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
3041 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
3042 "str q28, [c_ptr4]\n"
3043 "add c_ptr4, c_ptr4, #0x10\n"
3045 "ldr d20, [%[b_ptr0], #0x40]\n"
3046 "ins v16.d[1], temploadreg0\n"
3047 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
3048 "str q29, [c_ptr5]\n"
3049 "add c_ptr5, c_ptr5, #0x10\n"
3051 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
3052 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
3053 "str q30, [c_ptr6]\n"
3055 "ldr d21, [%[b_ptr0], #0x50]\n"
3056 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
3057 "ins v17.d[1], temploadreg1\n"
3058 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
3059 "str q31, [c_ptr7]\n"
3061 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
3062 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
3063 "ldr d22, [%[b_ptr0], #0x60]\n"
3064 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
3065 "ins v18.d[1], temploadreg2\n"
3066 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
3067 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
3068 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
3069 "ldr d23, [%[b_ptr0], #0x70]\n"
3070 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
3071 "ins v19.d[1], temploadreg3\n"
3072 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
3073 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
3074 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
3075 "ins v20.d[1], temploadreg0\n"
3076 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
3077 "ins v21.d[1], temploadreg1\n"
3078 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
3079 "ins v22.d[1], temploadreg2\n"
3080 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
3081 "ins v23.d[1], temploadreg3\n"
3082 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
3083 "add c_ptr6, c_ptr6, #0x10\n"
3084 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
3085 "add c_ptr7, c_ptr7, #0x10\n"
3086 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
3087 "add %[b_ptr0], %[b_ptr0], #0x80\n"
3088 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
3089 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
3090 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
3091 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
3092 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
3093 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
3094 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
3095 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
3096 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
3097 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
3098 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
3099 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
3100 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
3101 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
3102 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
3103 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
3104 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
3105 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
3106 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
3107 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
3108 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
3109 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
3110 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
3111 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
3112 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
3113 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
3114 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
3115 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
3116 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
3117 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
3118 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
3119 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
3120 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
3121 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
3122 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
3123 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
3124 ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
3125 ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
3126 ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
3127 ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
3128 ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
3129 ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
3130 ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
3131 ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
3132 ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
3133 ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
3134 ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
3135 ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
3136 ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
3137 ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
3138 ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
3139 ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
3142 "str q24, [%[c_ptr0]]\n"
3143 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3145 "ldr q16, [%[b_ptr0]]\n"
3146 "ldr q17, [%[b_ptr0], #0x10]\n"
3147 "str q25, [c_ptr1]\n"
3148 "add c_ptr1, c_ptr1, #0x10\n"
3150 "ldr q18, [%[b_ptr0], #0x20]\n"
3151 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
3152 "str q26, [c_ptr2]\n"
3154 "ldr q19, [%[b_ptr0], #0x30]\n"
3155 "ldr q20, [%[b_ptr0], #0x40]\n"
3156 "add c_ptr2, c_ptr2, #0x10\n"
3157 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
3158 "str q27, [c_ptr3]\n"
3160 "ldr q21, [%[b_ptr0], #0x50]\n"
3161 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
3162 "ldr q22, [%[b_ptr0], #0x60]\n"
3163 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
3164 "str q28, [c_ptr4]\n"
3166 "ldr q23, [%[b_ptr0], #0x70]\n"
3167 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
3168 "add c_ptr3, c_ptr3, #0x10\n"
3169 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
3170 "str q29, [c_ptr5]\n"
3172 "add c_ptr4, c_ptr4, #0x10\n"
3173 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
3174 "str q30, [c_ptr6]\n"
3176 "add c_ptr5, c_ptr5, #0x10\n"
3177 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
3178 "str q31, [c_ptr7]\n"
3180 "add c_ptr6, c_ptr6, #0x10\n"
3181 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
3182 "add c_ptr7, c_ptr7, #0x10\n"
3183 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
3184 "add %[b_ptr0], %[b_ptr0], #0x80\n"
3185 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
3186 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
3187 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
3188 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
3189 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
3190 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
3191 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
3192 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
3193 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
3194 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
3195 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
3196 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
3197 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
3198 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
3199 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
3200 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
3201 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
3202 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
3203 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
3204 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
3205 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
3206 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
3207 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
3208 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
3209 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
3210 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
3211 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
3212 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
3213 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
3214 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
3215 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
3216 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
3217 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
3218 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
3219 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
3220 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
3221 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
3222 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
3223 ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
3224 ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
3225 ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
3226 ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
3227 ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
3228 ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
3229 ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
3230 ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
3231 ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
3232 ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
3233 ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
3234 ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
3235 ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
3236 ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
3237 ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
3238 ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
3249 ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
3250 ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
3251 ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
3252 ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
3253 ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
3254 ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
3255 ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
3256 ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
3257 ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
3258 ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
3259 ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
3260 ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
3261 ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
3262 ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
3263 ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
3264 ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
3265 ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
3266 ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
3267 ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
3268 ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
3269 ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
3270 ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
3271 ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
3272 ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
3273 ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
3274 ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
3275 ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
3276 ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
3277 ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
3278 ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
3279 ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
3280 ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
3281 ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
3282 ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
3283 ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
3284 ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
3285 ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
3286 ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
3287 ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
3288 ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
3289 ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
3290 ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
3291 ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
3292 ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
3293 ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
3294 ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
3295 ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
3296 ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
3297 ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
3298 ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
3299 ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
3300 ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
3301 ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
3302 ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
3303 ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
3304 ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
3305 ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
3306 ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
3307 ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
3308 ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
3309 ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
3310 ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
3311 ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
3312 ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
3314 "str q24, [%[c_ptr0]]\n"
3315 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3316 "str q25, [c_ptr1]\n"
3317 "str q26, [c_ptr2]\n"
3318 "str q27, [c_ptr3]\n"
3319 "str q28, [c_ptr4]\n"
3320 "str q29, [c_ptr5]\n"
3321 "str q30, [c_ptr6]\n"
3322 "str q31, [c_ptr7]\n"
3337 ".unreq temploadreg0\n"
3338 ".unreq temploadreg1\n"
3339 ".unreq temploadreg2\n"
3340 ".unreq temploadreg3\n"
3341 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
3342 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
3343 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
3352 #endif // __aarch64__