27 #include "../../utils.hpp"
33 void a64_hybrid_u8u32_dot_6x16 (
34 unsigned int num_strings,
const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
35 size_t M,
size_t N,
const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
40 unsigned int num_strings = {};
41 const unsigned int *string_lengths = {};
43 const uint8_t *B_ptr = {};
44 size_t output_offset = {};
45 size_t input_initial_col = {};
46 size_t input_offset = {};
49 unsigned long flags=0;
53 if (output_arg.is_indirect) {
54 output_ptr=(
void *)(output_arg.indirect.ptr);
55 ka.output_offset=output_arg.indirect.offset;
58 output_ptr=(
void *)(output_arg.direct.base);
59 ka.output_offset=output_arg.direct.stride;
62 if (A_arg.is_indirect) {
63 input_ptr=(
void *)(A_arg.indirect.ptr);
64 ka.input_offset=A_arg.indirect.start_row;
65 ka.input_initial_col=A_arg.indirect.start_col;
68 assert(num_strings==1);
69 input_ptr=(
void *)(A_arg.direct.base);
70 ka.input_offset=A_arg.direct.stride;
75 ka.num_strings = num_strings;
76 ka.string_lengths = string_lengths;
89 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
90 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
91 "mov x9, %x[output_ptr]\n"
93 "tbz %x[flags], #0, 12f\n"
97 "ld1 { v8.4s }, [x9], #0x10\n"
98 "ld1 { v9.4s }, [x9], #0x10\n"
100 "ld1 { v10.4s }, [x9], #0x10\n"
102 "ldr d11, [x9], #0x8\n"
105 "ld1 { v11.s }[2], [x9]\n"
110 "ldr s11, [x9, #0x0]\n"
114 "ldr d10, [x9], #0x8\n"
117 "ld1 { v10.s }[2], [x9]\n"
122 "ldr s10, [x9, #0x0]\n"
126 "ld1 { v8.4s }, [x9], #0x10\n"
128 "ldr d9, [x9], #0x8\n"
131 "ld1 { v9.s }[2], [x9]\n"
136 "ldr s9, [x9, #0x0]\n"
140 "ldr d8, [x9], #0x8\n"
143 "ld1 { v8.s }[2], [x9]\n"
146 "ldr s8, [x9, #0x0]\n"
152 "ldr q8, [x9, #0x0]\n"
153 "ldr q9, [x9, #0x10]\n"
154 "ldr q10, [x9, #0x20]\n"
155 "ldr q11, [x9, #0x30]\n"
160 "movi v10.4s, #0x0\n"
161 "movi v11.4s, #0x0\n"
165 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
166 "ldr w27, [x20, x28, LSL #0x2]\n"
167 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
168 "tbz %x[flags], #3, 15f\n"
169 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
170 "add x20, x20, x21, LSL #3\n"
171 "ldr x26, [x20, #0x0]\n"
173 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
174 "add x26, x26, x20\n"
177 "mov x26, %x[input_ptr]\n"
181 "ldr q0, [x26, #0x0]\n"
182 "ldr q6, [x10, #0x0]\n"
184 "ldr q7, [x10, #0x10]\n"
187 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
188 "ldr q17, [x10, #0x20]\n"
189 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
190 "ldr q16, [x10, #0x30]\n"
191 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
192 "ldr q17, [x10, #0x40]\n"
193 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
194 "ldr q16, [x10, #0x50]\n"
195 ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
196 "ldr q17, [x10, #0x60]\n"
197 ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
198 "ldr q16, [x10, #0x70]\n"
199 ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
200 "ldr q17, [x10, #0x80]\n"
201 ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
202 "ldr q16, [x10, #0x90]\n"
203 ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
204 "ldr q17, [x10, #0xa0]\n"
205 ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
206 "ldr q16, [x10, #0xb0]\n"
207 ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
208 "ldr q17, [x10, #0xc0]\n"
209 ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
210 "ldr q16, [x10, #0xd0]\n"
211 ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
212 "ldr q17, [x10, #0xe0]\n"
213 ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
214 "ldr q16, [x10, #0xf0]\n"
215 "sub x27, x27, #0x10\n"
216 "add x26, x26, #0x10\n"
217 ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
218 ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
219 "ldr q0, [x26, #0x0]\n"
221 "add x10, x10, #0x100\n"
222 "ldr q6, [x10, #0x0]\n"
223 "ldr q7, [x10, #0x10]\n"
224 "prfm pldl1keep, [x26, #0x80]\n"
227 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
228 "ldr q17, [x10, #0x20]\n"
229 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
230 "ldr q16, [x10, #0x30]\n"
231 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
232 "ldr q17, [x10, #0x40]\n"
233 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
234 "ldr q16, [x10, #0x50]\n"
235 ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
236 "ldr q17, [x10, #0x60]\n"
237 ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
238 "ldr q16, [x10, #0x70]\n"
239 ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
240 "ldr q17, [x10, #0x80]\n"
241 ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
242 "ldr q16, [x10, #0x90]\n"
243 ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
244 "ldr q17, [x10, #0xa0]\n"
245 ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
246 "ldr q16, [x10, #0xb0]\n"
247 ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
248 "ldr q17, [x10, #0xc0]\n"
249 ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
250 "ldr q16, [x10, #0xd0]\n"
251 ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
252 "ldr q17, [x10, #0xe0]\n"
253 ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
254 "ldr q16, [x10, #0xf0]\n"
255 "add x26, x26, #0x10\n"
256 "sub x27, x27, #0x10\n"
257 ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
258 ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
259 "prfm pldl1keep, [x26, #0x80]\n"
260 "add x10, x10, #0x100\n"
266 "ldr s18, [x26], #0x4\n"
267 "ldr q16, [x10, #0x0]\n"
268 ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n"
269 "sub x27, x27, #0x4\n"
270 "ldr q16, [x10, #0x10]\n"
271 "ldr q17, [x10, #0x20]\n"
272 ".inst 0x6f92e209 // udot v9.4s, v16.16b, v18.4b[0]\n"
274 "ldr q16, [x10, #0x30]\n"
275 ".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n"
276 ".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n"
277 "add x10, x10, #0x40\n"
282 "ldr h0, [x26], #0x2\n"
284 "ld1 { v0.b }[2], [x26]\n"
287 "ldr b0, [x26, #0x0]\n"
289 "ldr q17, [x10, #0x0]\n"
290 "ldr q16, [x10, #0x10]\n"
291 ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
292 ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
293 "ldr q17, [x10, #0x20]\n"
294 "ldr q16, [x10, #0x30]\n"
295 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
296 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
297 "add x10, x10, #0x40\n"
299 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
300 "add x28, x28, #0x1\n"
304 "prfm pstl1keep, [x9, #0x0]\n"
307 "st1 { v8.4s }, [x9], #0x10\n"
308 "st1 { v9.4s }, [x9], #0x10\n"
310 "st1 { v10.4s }, [x9], #0x10\n"
312 "str d11, [x9], #0x8\n"
314 "st1 { v11.s }[2], [x9]\n"
318 "str s11, [x9, #0x0]\n"
322 "str d10, [x9], #0x8\n"
324 "st1 { v10.s }[2], [x9]\n"
328 "str s10, [x9, #0x0]\n"
332 "st1 { v8.4s }, [x9], #0x10\n"
334 "str d9, [x9], #0x8\n"
336 "st1 { v9.s }[2], [x9]\n"
340 "str s9, [x9, #0x0]\n"
344 "str d8, [x9], #0x8\n"
346 "st1 { v8.s }[2], [x9]\n"
349 "str s8, [x9, #0x0]\n"
353 "str q8, [x9, #0x0]\n"
354 "str q9, [x9, #0x10]\n"
355 "str q10, [x9, #0x20]\n"
356 "str q11, [x9, #0x30]\n"
357 "add x9, x9, #0x40\n"
359 "subs x11, x11, #0x10\n"
363 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
364 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
365 "mov x9, %x[output_ptr]\n"
367 "tbz %x[flags], #0, 46f\n"
368 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
370 "add x24, x9, x20, LSL #2\n"
373 "ld1 { v8.4s }, [x9], #0x10\n"
374 "ld1 { v12.4s }, [x24], #0x10\n"
375 "ld1 { v9.4s }, [x9], #0x10\n"
376 "ld1 { v13.4s }, [x24], #0x10\n"
378 "ld1 { v10.4s }, [x9], #0x10\n"
379 "ld1 { v14.4s }, [x24], #0x10\n"
381 "ldr d11, [x9], #0x8\n"
382 "ldr d15, [x24], #0x8\n"
385 "ld1 { v11.s }[2], [x9]\n"
386 "ld1 { v15.s }[2], [x24]\n"
391 "ldr s11, [x9, #0x0]\n"
392 "ldr s15, [x24, #0x0]\n"
396 "ldr d10, [x9], #0x8\n"
397 "ldr d14, [x24], #0x8\n"
400 "ld1 { v10.s }[2], [x9]\n"
401 "ld1 { v14.s }[2], [x24]\n"
406 "ldr s10, [x9, #0x0]\n"
407 "ldr s14, [x24, #0x0]\n"
411 "ld1 { v8.4s }, [x9], #0x10\n"
412 "ld1 { v12.4s }, [x24], #0x10\n"
414 "ldr d9, [x9], #0x8\n"
415 "ldr d13, [x24], #0x8\n"
418 "ld1 { v9.s }[2], [x9]\n"
419 "ld1 { v13.s }[2], [x24]\n"
424 "ldr s9, [x9, #0x0]\n"
425 "ldr s13, [x24, #0x0]\n"
429 "ldr d8, [x9], #0x8\n"
430 "ldr d12, [x24], #0x8\n"
433 "ld1 { v8.s }[2], [x9]\n"
434 "ld1 { v12.s }[2], [x24]\n"
437 "ldr s8, [x9, #0x0]\n"
438 "ldr s12, [x24, #0x0]\n"
444 "ldr q8, [x9, #0x0]\n"
445 "ldr q9, [x9, #0x10]\n"
446 "ldr q10, [x9, #0x20]\n"
447 "ldr q11, [x9, #0x30]\n"
448 "ldr q12, [x24, #0x0]\n"
449 "ldr q13, [x24, #0x10]\n"
450 "ldr q14, [x24, #0x20]\n"
451 "ldr q15, [x24, #0x30]\n"
456 "movi v10.4s, #0x0\n"
457 "movi v11.4s, #0x0\n"
458 "movi v12.4s, #0x0\n"
459 "movi v13.4s, #0x0\n"
460 "movi v14.4s, #0x0\n"
461 "movi v15.4s, #0x0\n"
465 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
466 "ldr w27, [x20, x28, LSL #0x2]\n"
467 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
468 "tbz %x[flags], #3, 49f\n"
469 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
470 "add x20, x20, x21, LSL #3\n"
471 "ldr x26, [x20, #0x0]\n"
472 "ldr x25, [x20, #0x8]\n"
474 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
475 "add x26, x26, x20\n"
476 "add x25, x25, x20\n"
479 "mov x26, %x[input_ptr]\n"
480 "add x25, x26, x21\n"
484 "ldr q0, [x26, #0x0]\n"
485 "ldr q1, [x25, #0x0]\n"
487 "ldr q6, [x10, #0x0]\n"
488 "ldr q7, [x10, #0x10]\n"
491 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
492 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
493 "ldr q17, [x10, #0x20]\n"
494 "sub x27, x27, #0x10\n"
495 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
496 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
497 "ldr q16, [x10, #0x30]\n"
498 "add x26, x26, #0x10\n"
499 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
500 ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
501 "ldr q17, [x10, #0x40]\n"
502 "add x25, x25, #0x10\n"
503 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
504 ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
505 "ldr q16, [x10, #0x50]\n"
507 ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
508 ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
509 "ldr q17, [x10, #0x60]\n"
510 "prfm pldl1keep, [x26, #0x80]\n"
511 ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
512 ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
513 "ldr q16, [x10, #0x70]\n"
514 "prfm pldl1keep, [x25, #0x80]\n"
515 ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
516 ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
517 "ldr q17, [x10, #0x80]\n"
518 ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
519 ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
520 "ldr q16, [x10, #0x90]\n"
521 ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
522 ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
523 "ldr q17, [x10, #0xa0]\n"
524 ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
525 ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
526 "ldr q16, [x10, #0xb0]\n"
527 ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
528 ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
529 "ldr q17, [x10, #0xc0]\n"
530 ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
531 ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
532 "ldr q16, [x10, #0xd0]\n"
533 ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
534 ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
535 "ldr q17, [x10, #0xe0]\n"
536 ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
537 ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
538 "ldr q16, [x10, #0xf0]\n"
539 "add x10, x10, #0x100\n"
540 ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
541 ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
542 "ldr q6, [x10, #0x0]\n"
543 ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
544 "ldr q0, [x26, #0x0]\n"
545 ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
546 "ldr q1, [x25, #0x0]\n"
547 "ldr q7, [x10, #0x10]\n"
550 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
551 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
552 "ldr q17, [x10, #0x20]\n"
553 "add x26, x26, #0x10\n"
554 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
555 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
556 "ldr q16, [x10, #0x30]\n"
557 "add x25, x25, #0x10\n"
558 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
559 ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
560 "ldr q17, [x10, #0x40]\n"
561 "sub x27, x27, #0x10\n"
562 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
563 ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
564 "ldr q16, [x10, #0x50]\n"
565 "prfm pldl1keep, [x26, #0x80]\n"
566 ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
567 ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
568 "ldr q17, [x10, #0x60]\n"
569 "prfm pldl1keep, [x25, #0x80]\n"
570 ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
571 ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
572 "ldr q16, [x10, #0x70]\n"
573 ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
574 ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
575 "ldr q17, [x10, #0x80]\n"
576 ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
577 ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
578 "ldr q16, [x10, #0x90]\n"
579 ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
580 ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
581 "ldr q17, [x10, #0xa0]\n"
582 ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
583 ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
584 "ldr q16, [x10, #0xb0]\n"
585 ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
586 ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
587 "ldr q17, [x10, #0xc0]\n"
588 ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
589 ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
590 "ldr q16, [x10, #0xd0]\n"
591 ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
592 ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
593 "ldr q17, [x10, #0xe0]\n"
594 ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
595 ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
596 "ldr q16, [x10, #0xf0]\n"
597 "add x10, x10, #0x100\n"
598 ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
599 ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
600 ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
601 ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
607 "ldr s19, [x26], #0x4\n"
608 "ldr s18, [x25], #0x4\n"
609 "sub x27, x27, #0x4\n"
611 "ldr q17, [x10, #0x0]\n"
612 "ldr q16, [x10, #0x10]\n"
613 ".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n"
614 ".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n"
615 "ldr q17, [x10, #0x20]\n"
616 ".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n"
617 ".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n"
618 "ldr q16, [x10, #0x30]\n"
619 ".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n"
620 ".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n"
621 "add x10, x10, #0x40\n"
622 ".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n"
623 ".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n"
628 "ldr h0, [x26], #0x2\n"
629 "ldr h1, [x25], #0x2\n"
631 "ld1 { v0.b }[2], [x26]\n"
632 "ld1 { v1.b }[2], [x25]\n"
635 "ldr b0, [x26, #0x0]\n"
636 "ldr b1, [x25, #0x0]\n"
638 "ldr q17, [x10, #0x0]\n"
639 "ldr q16, [x10, #0x10]\n"
640 ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
641 ".inst 0x6f81e22c // udot v12.4s, v17.16b, v1.4b[0]\n"
642 "ldr q17, [x10, #0x20]\n"
643 ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
644 ".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n"
645 "ldr q16, [x10, #0x30]\n"
646 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
647 ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
648 "add x10, x10, #0x40\n"
649 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
650 ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
652 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
653 "add x28, x28, #0x1\n"
656 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
657 "add x24, x9, x20, LSL #2\n"
659 "prfm pstl1keep, [x9, #0x0]\n"
660 "prfm pstl1keep, [x24, #0x0]\n"
663 "st1 { v8.4s }, [x9], #0x10\n"
664 "st1 { v9.4s }, [x9], #0x10\n"
665 "st1 { v12.4s }, [x24], #0x10\n"
666 "st1 { v13.4s }, [x24], #0x10\n"
668 "st1 { v10.4s }, [x9], #0x10\n"
669 "st1 { v14.4s }, [x24], #0x10\n"
671 "str d11, [x9], #0x8\n"
672 "str d15, [x24], #0x8\n"
674 "st1 { v11.s }[2], [x9]\n"
675 "st1 { v15.s }[2], [x24]\n"
679 "str s11, [x9, #0x0]\n"
680 "str s15, [x24, #0x0]\n"
684 "str d10, [x9], #0x8\n"
685 "str d14, [x24], #0x8\n"
687 "st1 { v10.s }[2], [x9]\n"
688 "st1 { v14.s }[2], [x24]\n"
692 "str s10, [x9, #0x0]\n"
693 "str s14, [x24, #0x0]\n"
697 "st1 { v8.4s }, [x9], #0x10\n"
698 "st1 { v12.4s }, [x24], #0x10\n"
700 "str d9, [x9], #0x8\n"
701 "str d13, [x24], #0x8\n"
703 "st1 { v9.s }[2], [x9]\n"
704 "st1 { v13.s }[2], [x24]\n"
708 "str s9, [x9, #0x0]\n"
709 "str s13, [x24, #0x0]\n"
713 "str d8, [x9], #0x8\n"
714 "str d12, [x24], #0x8\n"
716 "st1 { v8.s }[2], [x9]\n"
717 "st1 { v12.s }[2], [x24]\n"
720 "str s8, [x9, #0x0]\n"
721 "str s12, [x24, #0x0]\n"
725 "str q8, [x9, #0x0]\n"
726 "str q9, [x9, #0x10]\n"
727 "str q10, [x9, #0x20]\n"
728 "str q11, [x9, #0x30]\n"
729 "add x9, x9, #0x40\n"
730 "str q12, [x24, #0x0]\n"
731 "str q13, [x24, #0x10]\n"
732 "str q14, [x24, #0x20]\n"
733 "str q15, [x24, #0x30]\n"
735 "subs x11, x11, #0x10\n"
739 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
740 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
741 "mov x9, %x[output_ptr]\n"
743 "tbz %x[flags], #0, 80f\n"
744 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
745 "add x24, x9, x20, LSL #2\n"
747 "add x23, x24, x20, LSL #2\n"
750 "ld1 { v8.4s }, [x9], #0x10\n"
751 "ld1 { v12.4s }, [x24], #0x10\n"
752 "ld1 { v16.4s }, [x23], #0x10\n"
753 "ld1 { v9.4s }, [x9], #0x10\n"
754 "ld1 { v13.4s }, [x24], #0x10\n"
755 "ld1 { v17.4s }, [x23], #0x10\n"
757 "ld1 { v10.4s }, [x9], #0x10\n"
758 "ld1 { v14.4s }, [x24], #0x10\n"
759 "ld1 { v18.4s }, [x23], #0x10\n"
761 "ldr d11, [x9], #0x8\n"
762 "ldr d15, [x24], #0x8\n"
764 "ldr d19, [x23], #0x8\n"
766 "ld1 { v11.s }[2], [x9]\n"
767 "ld1 { v15.s }[2], [x24]\n"
768 "ld1 { v19.s }[2], [x23]\n"
773 "ldr s11, [x9, #0x0]\n"
774 "ldr s15, [x24, #0x0]\n"
775 "ldr s19, [x23, #0x0]\n"
779 "ldr d10, [x9], #0x8\n"
780 "ldr d14, [x24], #0x8\n"
782 "ldr d18, [x23], #0x8\n"
784 "ld1 { v10.s }[2], [x9]\n"
785 "ld1 { v14.s }[2], [x24]\n"
786 "ld1 { v18.s }[2], [x23]\n"
791 "ldr s10, [x9, #0x0]\n"
792 "ldr s14, [x24, #0x0]\n"
793 "ldr s18, [x23, #0x0]\n"
797 "ld1 { v8.4s }, [x9], #0x10\n"
798 "ld1 { v12.4s }, [x24], #0x10\n"
799 "ld1 { v16.4s }, [x23], #0x10\n"
801 "ldr d9, [x9], #0x8\n"
802 "ldr d13, [x24], #0x8\n"
804 "ldr d17, [x23], #0x8\n"
806 "ld1 { v9.s }[2], [x9]\n"
807 "ld1 { v13.s }[2], [x24]\n"
808 "ld1 { v17.s }[2], [x23]\n"
813 "ldr s9, [x9, #0x0]\n"
814 "ldr s13, [x24, #0x0]\n"
815 "ldr s17, [x23, #0x0]\n"
819 "ldr d8, [x9], #0x8\n"
820 "ldr d12, [x24], #0x8\n"
822 "ldr d16, [x23], #0x8\n"
824 "ld1 { v8.s }[2], [x9]\n"
825 "ld1 { v12.s }[2], [x24]\n"
826 "ld1 { v16.s }[2], [x23]\n"
829 "ldr s8, [x9, #0x0]\n"
830 "ldr s12, [x24, #0x0]\n"
832 "ldr s16, [x23, #0x0]\n"
837 "ldr q8, [x9, #0x0]\n"
838 "ldr q9, [x9, #0x10]\n"
839 "ldr q10, [x9, #0x20]\n"
840 "ldr q11, [x9, #0x30]\n"
841 "ldr q12, [x24, #0x0]\n"
842 "ldr q13, [x24, #0x10]\n"
843 "ldr q14, [x24, #0x20]\n"
844 "ldr q15, [x24, #0x30]\n"
845 "ldr q16, [x23, #0x0]\n"
846 "ldr q17, [x23, #0x10]\n"
847 "ldr q18, [x23, #0x20]\n"
848 "ldr q19, [x23, #0x30]\n"
853 "movi v10.4s, #0x0\n"
854 "movi v11.4s, #0x0\n"
855 "movi v12.4s, #0x0\n"
856 "movi v13.4s, #0x0\n"
857 "movi v14.4s, #0x0\n"
858 "movi v15.4s, #0x0\n"
859 "movi v16.4s, #0x0\n"
860 "movi v17.4s, #0x0\n"
861 "movi v18.4s, #0x0\n"
862 "movi v19.4s, #0x0\n"
866 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
867 "ldr w27, [x20, x28, LSL #0x2]\n"
868 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
869 "tbz %x[flags], #3, 83f\n"
870 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
871 "add x20, x20, x21, LSL #3\n"
872 "ldr x26, [x20, #0x0]\n"
873 "ldr x25, [x20, #0x8]\n"
874 "ldr x24, [x20, #0x10]\n"
876 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
877 "add x26, x26, x20\n"
878 "add x25, x25, x20\n"
879 "add x24, x24, x20\n"
882 "mov x26, %x[input_ptr]\n"
883 "add x25, x26, x21\n"
884 "add x24, x25, x21\n"
888 "ldr q0, [x26, #0x0]\n"
889 "ldr q1, [x25, #0x0]\n"
891 "ldr q2, [x24, #0x0]\n"
892 "ldr q6, [x10, #0x0]\n"
893 "ldr q7, [x10, #0x10]\n"
896 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
897 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
898 "sub x27, x27, #0x10\n"
899 "add x26, x26, #0x10\n"
900 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
901 "ldr q21, [x10, #0x20]\n"
902 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
903 "add x25, x25, #0x10\n"
904 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
905 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
906 "ldr q20, [x10, #0x30]\n"
907 "add x24, x24, #0x10\n"
908 ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
909 ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
911 "prfm pldl1keep, [x26, #0x80]\n"
912 ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
913 "ldr q21, [x10, #0x40]\n"
914 ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
915 "prfm pldl1keep, [x25, #0x80]\n"
916 ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
917 ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
918 "ldr q20, [x10, #0x50]\n"
919 "prfm pldl1keep, [x24, #0x80]\n"
920 ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
921 ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
922 ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
923 "ldr q21, [x10, #0x60]\n"
924 ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
925 ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
926 ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
927 "ldr q20, [x10, #0x70]\n"
928 ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
929 ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
930 ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
931 "ldr q21, [x10, #0x80]\n"
932 ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
933 ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
934 ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
935 "ldr q20, [x10, #0x90]\n"
936 ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
937 ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
938 ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
939 "ldr q21, [x10, #0xa0]\n"
940 ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
941 ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
942 ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
943 "ldr q20, [x10, #0xb0]\n"
944 ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
945 ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
946 ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
947 "ldr q21, [x10, #0xc0]\n"
948 ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
949 ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
950 ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
951 "ldr q20, [x10, #0xd0]\n"
952 ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
953 ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
954 ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
955 "ldr q21, [x10, #0xe0]\n"
956 ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
957 ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
958 ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
959 "ldr q20, [x10, #0xf0]\n"
960 "add x10, x10, #0x100\n"
961 ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
962 ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
963 ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
964 "ldr q6, [x10, #0x0]\n"
965 ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
966 "ldr q0, [x26, #0x0]\n"
967 ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
968 "ldr q1, [x25, #0x0]\n"
969 ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
970 "ldr q2, [x24, #0x0]\n"
971 "ldr q7, [x10, #0x10]\n"
974 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
975 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
976 "add x26, x26, #0x10\n"
977 "add x25, x25, #0x10\n"
978 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
979 "ldr q21, [x10, #0x20]\n"
980 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
981 "add x24, x24, #0x10\n"
982 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
983 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
984 "ldr q20, [x10, #0x30]\n"
985 "sub x27, x27, #0x10\n"
986 ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
987 ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
988 "prfm pldl1keep, [x26, #0x80]\n"
989 "prfm pldl1keep, [x25, #0x80]\n"
990 ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
991 "ldr q21, [x10, #0x40]\n"
992 ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
993 "prfm pldl1keep, [x24, #0x80]\n"
994 ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
995 ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
996 "ldr q20, [x10, #0x50]\n"
997 ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
998 ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
999 ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
1000 "ldr q21, [x10, #0x60]\n"
1001 ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
1002 ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
1003 ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
1004 "ldr q20, [x10, #0x70]\n"
1005 ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
1006 ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
1007 ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
1008 "ldr q21, [x10, #0x80]\n"
1009 ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
1010 ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
1011 ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
1012 "ldr q20, [x10, #0x90]\n"
1013 ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
1014 ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
1015 ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
1016 "ldr q21, [x10, #0xa0]\n"
1017 ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
1018 ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
1019 ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
1020 "ldr q20, [x10, #0xb0]\n"
1021 ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
1022 ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
1023 ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
1024 "ldr q21, [x10, #0xc0]\n"
1025 ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
1026 ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
1027 ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
1028 "ldr q20, [x10, #0xd0]\n"
1029 ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
1030 ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
1031 ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
1032 "ldr q21, [x10, #0xe0]\n"
1033 ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
1034 ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
1035 ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
1036 "ldr q20, [x10, #0xf0]\n"
1037 "add x10, x10, #0x100\n"
1038 ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
1039 ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
1040 ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
1041 ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
1042 ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
1043 ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
1049 "ldr s24, [x26], #0x4\n"
1050 "ldr s23, [x25], #0x4\n"
1051 "sub x27, x27, #0x4\n"
1053 "ldr s22, [x24], #0x4\n"
1054 "ldr q21, [x10, #0x0]\n"
1055 ".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n"
1056 ".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n"
1057 "ldr q20, [x10, #0x10]\n"
1058 ".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n"
1059 "ldr q21, [x10, #0x20]\n"
1060 ".inst 0x6f98e289 // udot v9.4s, v20.16b, v24.4b[0]\n"
1061 ".inst 0x6f97e28d // udot v13.4s, v20.16b, v23.4b[0]\n"
1062 ".inst 0x6f96e291 // udot v17.4s, v20.16b, v22.4b[0]\n"
1063 "ldr q20, [x10, #0x30]\n"
1064 "add x10, x10, #0x40\n"
1065 ".inst 0x6f98e2aa // udot v10.4s, v21.16b, v24.4b[0]\n"
1066 ".inst 0x6f97e2ae // udot v14.4s, v21.16b, v23.4b[0]\n"
1067 ".inst 0x6f96e2b2 // udot v18.4s, v21.16b, v22.4b[0]\n"
1068 ".inst 0x6f98e28b // udot v11.4s, v20.16b, v24.4b[0]\n"
1069 ".inst 0x6f97e28f // udot v15.4s, v20.16b, v23.4b[0]\n"
1070 ".inst 0x6f96e293 // udot v19.4s, v20.16b, v22.4b[0]\n"
1074 "tbz x27, #1, 90f\n"
1075 "ldr h0, [x26], #0x2\n"
1076 "ldr h1, [x25], #0x2\n"
1077 "ldr h2, [x24], #0x2\n"
1078 "tbz x27, #0, 91f\n"
1079 "ld1 { v0.b }[2], [x26]\n"
1080 "ld1 { v1.b }[2], [x25]\n"
1081 "ld1 { v2.b }[2], [x24]\n"
1084 "ldr b0, [x26, #0x0]\n"
1085 "ldr b1, [x25, #0x0]\n"
1086 "ldr b2, [x24, #0x0]\n"
1088 "ldr q21, [x10, #0x0]\n"
1089 "ldr q20, [x10, #0x10]\n"
1090 ".inst 0x6f80e2a8 // udot v8.4s, v21.16b, v0.4b[0]\n"
1091 ".inst 0x6f81e2ac // udot v12.4s, v21.16b, v1.4b[0]\n"
1092 ".inst 0x6f82e2b0 // udot v16.4s, v21.16b, v2.4b[0]\n"
1093 "ldr q21, [x10, #0x20]\n"
1094 ".inst 0x6f80e289 // udot v9.4s, v20.16b, v0.4b[0]\n"
1095 ".inst 0x6f81e28d // udot v13.4s, v20.16b, v1.4b[0]\n"
1096 ".inst 0x6f82e291 // udot v17.4s, v20.16b, v2.4b[0]\n"
1097 "ldr q20, [x10, #0x30]\n"
1098 "add x10, x10, #0x40\n"
1099 ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
1100 ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
1101 ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
1102 ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
1103 ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
1104 ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
1106 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
1107 "add x28, x28, #0x1\n"
1110 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1111 "add x24, x9, x20, LSL #2\n"
1112 "add x23, x24, x20, LSL #2\n"
1113 "prfm pstl1keep, [x9, #0x0]\n"
1115 "prfm pstl1keep, [x24, #0x0]\n"
1116 "prfm pstl1keep, [x23, #0x0]\n"
1118 "tbz x11, #3, 96f\n"
1119 "st1 { v8.4s }, [x9], #0x10\n"
1120 "st1 { v9.4s }, [x9], #0x10\n"
1121 "st1 { v12.4s }, [x24], #0x10\n"
1122 "st1 { v13.4s }, [x24], #0x10\n"
1123 "st1 { v16.4s }, [x23], #0x10\n"
1124 "st1 { v17.4s }, [x23], #0x10\n"
1125 "tbz x11, #2, 94f\n"
1126 "st1 { v10.4s }, [x9], #0x10\n"
1127 "st1 { v14.4s }, [x24], #0x10\n"
1128 "st1 { v18.4s }, [x23], #0x10\n"
1129 "tbz x11, #1, 93f\n"
1130 "str d11, [x9], #0x8\n"
1131 "str d15, [x24], #0x8\n"
1132 "str d19, [x23], #0x8\n"
1133 "tbz x11, #0, 100f\n"
1134 "st1 { v11.s }[2], [x9]\n"
1135 "st1 { v15.s }[2], [x24]\n"
1136 "st1 { v19.s }[2], [x23]\n"
1139 "tbz x11, #0, 100f\n"
1140 "str s11, [x9, #0x0]\n"
1141 "str s15, [x24, #0x0]\n"
1142 "str s19, [x23, #0x0]\n"
1145 "tbz x11, #1, 95f\n"
1146 "str d10, [x9], #0x8\n"
1147 "str d14, [x24], #0x8\n"
1148 "str d18, [x23], #0x8\n"
1149 "tbz x11, #0, 100f\n"
1150 "st1 { v10.s }[2], [x9]\n"
1151 "st1 { v14.s }[2], [x24]\n"
1152 "st1 { v18.s }[2], [x23]\n"
1155 "tbz x11, #0, 100f\n"
1156 "str s10, [x9, #0x0]\n"
1157 "str s14, [x24, #0x0]\n"
1158 "str s18, [x23, #0x0]\n"
1161 "tbz x11, #2, 98f\n"
1162 "st1 { v8.4s }, [x9], #0x10\n"
1163 "st1 { v12.4s }, [x24], #0x10\n"
1164 "st1 { v16.4s }, [x23], #0x10\n"
1165 "tbz x11, #1, 97f\n"
1166 "str d9, [x9], #0x8\n"
1167 "str d13, [x24], #0x8\n"
1168 "str d17, [x23], #0x8\n"
1169 "tbz x11, #0, 100f\n"
1170 "st1 { v9.s }[2], [x9]\n"
1171 "st1 { v13.s }[2], [x24]\n"
1172 "st1 { v17.s }[2], [x23]\n"
1175 "tbz x11, #0, 100f\n"
1176 "str s9, [x9, #0x0]\n"
1177 "str s13, [x24, #0x0]\n"
1178 "str s17, [x23, #0x0]\n"
1181 "tbz x11, #1, 99f\n"
1182 "str d8, [x9], #0x8\n"
1183 "str d12, [x24], #0x8\n"
1184 "str d16, [x23], #0x8\n"
1185 "tbz x11, #0, 100f\n"
1186 "st1 { v8.s }[2], [x9]\n"
1187 "st1 { v12.s }[2], [x24]\n"
1188 "st1 { v16.s }[2], [x23]\n"
1191 "str s8, [x9, #0x0]\n"
1192 "str s12, [x24, #0x0]\n"
1193 "str s16, [x23, #0x0]\n"
1197 "str q8, [x9, #0x0]\n"
1198 "str q9, [x9, #0x10]\n"
1199 "str q10, [x9, #0x20]\n"
1200 "str q11, [x9, #0x30]\n"
1201 "add x9, x9, #0x40\n"
1202 "str q12, [x24, #0x0]\n"
1203 "str q13, [x24, #0x10]\n"
1204 "str q14, [x24, #0x20]\n"
1205 "str q15, [x24, #0x30]\n"
1206 "str q16, [x23, #0x0]\n"
1207 "str q17, [x23, #0x10]\n"
1208 "str q18, [x23, #0x20]\n"
1209 "str q19, [x23, #0x30]\n"
1211 "subs x11, x11, #0x10\n"
1215 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
1216 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1217 "mov x9, %x[output_ptr]\n"
1219 "tbz %x[flags], #0, 114f\n"
1220 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1221 "add x24, x9, x20, LSL #2\n"
1222 "add x23, x24, x20, LSL #2\n"
1224 "add x22, x23, x20, LSL #2\n"
1226 "tbz x11, #3, 108f\n"
1227 "ld1 { v8.4s }, [x9], #0x10\n"
1228 "ld1 { v12.4s }, [x24], #0x10\n"
1229 "ld1 { v16.4s }, [x23], #0x10\n"
1230 "ld1 { v20.4s }, [x22], #0x10\n"
1231 "ld1 { v9.4s }, [x9], #0x10\n"
1232 "ld1 { v13.4s }, [x24], #0x10\n"
1233 "ld1 { v17.4s }, [x23], #0x10\n"
1234 "ld1 { v21.4s }, [x22], #0x10\n"
1235 "tbz x11, #2, 106f\n"
1236 "ld1 { v10.4s }, [x9], #0x10\n"
1237 "ld1 { v14.4s }, [x24], #0x10\n"
1238 "ld1 { v18.4s }, [x23], #0x10\n"
1239 "ld1 { v22.4s }, [x22], #0x10\n"
1240 "tbz x11, #1, 105f\n"
1241 "ldr d11, [x9], #0x8\n"
1242 "ldr d15, [x24], #0x8\n"
1244 "ldr d19, [x23], #0x8\n"
1245 "ldr d23, [x22], #0x8\n"
1246 "tbz x11, #0, 112f\n"
1247 "ld1 { v11.s }[2], [x9]\n"
1248 "ld1 { v15.s }[2], [x24]\n"
1249 "ld1 { v19.s }[2], [x23]\n"
1250 "ld1 { v23.s }[2], [x22]\n"
1254 "tbz x11, #0, 112f\n"
1255 "ldr s11, [x9, #0x0]\n"
1256 "ldr s15, [x24, #0x0]\n"
1257 "ldr s19, [x23, #0x0]\n"
1258 "ldr s23, [x22, #0x0]\n"
1261 "tbz x11, #1, 107f\n"
1262 "ldr d10, [x9], #0x8\n"
1263 "ldr d14, [x24], #0x8\n"
1265 "ldr d18, [x23], #0x8\n"
1266 "ldr d22, [x22], #0x8\n"
1267 "tbz x11, #0, 112f\n"
1268 "ld1 { v10.s }[2], [x9]\n"
1269 "ld1 { v14.s }[2], [x24]\n"
1270 "ld1 { v18.s }[2], [x23]\n"
1271 "ld1 { v22.s }[2], [x22]\n"
1275 "tbz x11, #0, 112f\n"
1276 "ldr s10, [x9, #0x0]\n"
1277 "ldr s14, [x24, #0x0]\n"
1278 "ldr s18, [x23, #0x0]\n"
1279 "ldr s22, [x22, #0x0]\n"
1282 "tbz x11, #2, 110f\n"
1283 "ld1 { v8.4s }, [x9], #0x10\n"
1284 "ld1 { v12.4s }, [x24], #0x10\n"
1285 "ld1 { v16.4s }, [x23], #0x10\n"
1286 "ld1 { v20.4s }, [x22], #0x10\n"
1287 "tbz x11, #1, 109f\n"
1288 "ldr d9, [x9], #0x8\n"
1289 "ldr d13, [x24], #0x8\n"
1291 "ldr d17, [x23], #0x8\n"
1292 "ldr d21, [x22], #0x8\n"
1293 "tbz x11, #0, 112f\n"
1294 "ld1 { v9.s }[2], [x9]\n"
1295 "ld1 { v13.s }[2], [x24]\n"
1296 "ld1 { v17.s }[2], [x23]\n"
1297 "ld1 { v21.s }[2], [x22]\n"
1301 "tbz x11, #0, 112f\n"
1302 "ldr s9, [x9, #0x0]\n"
1303 "ldr s13, [x24, #0x0]\n"
1304 "ldr s17, [x23, #0x0]\n"
1305 "ldr s21, [x22, #0x0]\n"
1308 "tbz x11, #1, 111f\n"
1309 "ldr d8, [x9], #0x8\n"
1310 "ldr d12, [x24], #0x8\n"
1312 "ldr d16, [x23], #0x8\n"
1313 "ldr d20, [x22], #0x8\n"
1314 "tbz x11, #0, 112f\n"
1315 "ld1 { v8.s }[2], [x9]\n"
1316 "ld1 { v12.s }[2], [x24]\n"
1317 "ld1 { v16.s }[2], [x23]\n"
1318 "ld1 { v20.s }[2], [x22]\n"
1321 "ldr s8, [x9, #0x0]\n"
1322 "ldr s12, [x24, #0x0]\n"
1324 "ldr s16, [x23, #0x0]\n"
1325 "ldr s20, [x22, #0x0]\n"
1330 "ldr q8, [x9, #0x0]\n"
1331 "ldr q9, [x9, #0x10]\n"
1332 "ldr q10, [x9, #0x20]\n"
1333 "ldr q11, [x9, #0x30]\n"
1334 "ldr q12, [x24, #0x0]\n"
1335 "ldr q13, [x24, #0x10]\n"
1336 "ldr q14, [x24, #0x20]\n"
1337 "ldr q15, [x24, #0x30]\n"
1338 "ldr q16, [x23, #0x0]\n"
1339 "ldr q17, [x23, #0x10]\n"
1340 "ldr q18, [x23, #0x20]\n"
1341 "ldr q19, [x23, #0x30]\n"
1342 "ldr q20, [x22, #0x0]\n"
1343 "ldr q21, [x22, #0x10]\n"
1344 "ldr q22, [x22, #0x20]\n"
1345 "ldr q23, [x22, #0x30]\n"
1348 "movi v8.4s, #0x0\n"
1349 "movi v9.4s, #0x0\n"
1350 "movi v10.4s, #0x0\n"
1351 "movi v11.4s, #0x0\n"
1352 "movi v12.4s, #0x0\n"
1353 "movi v13.4s, #0x0\n"
1354 "movi v14.4s, #0x0\n"
1355 "movi v15.4s, #0x0\n"
1356 "movi v16.4s, #0x0\n"
1357 "movi v17.4s, #0x0\n"
1358 "movi v18.4s, #0x0\n"
1359 "movi v19.4s, #0x0\n"
1360 "movi v20.4s, #0x0\n"
1361 "movi v21.4s, #0x0\n"
1362 "movi v22.4s, #0x0\n"
1363 "movi v23.4s, #0x0\n"
1367 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1368 "ldr w27, [x20, x28, LSL #0x2]\n"
1369 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1370 "tbz %x[flags], #3, 117f\n"
1371 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
1372 "add x20, x20, x21, LSL #3\n"
1373 "ldr x26, [x20, #0x0]\n"
1374 "ldr x25, [x20, #0x8]\n"
1375 "ldr x24, [x20, #0x10]\n"
1376 "ldr x23, [x20, #0x18]\n"
1378 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1379 "add x26, x26, x20\n"
1380 "add x25, x25, x20\n"
1381 "add x24, x24, x20\n"
1382 "add x23, x23, x20\n"
1385 "mov x26, %x[input_ptr]\n"
1386 "add x25, x26, x21\n"
1387 "add x24, x25, x21\n"
1388 "add x23, x24, x21\n"
1392 "ldr q0, [x26, #0x0]\n"
1393 "ldr q1, [x25, #0x0]\n"
1395 "ldr q2, [x24, #0x0]\n"
1396 "ldr q3, [x23, #0x0]\n"
1397 "ldr q6, [x10, #0x0]\n"
1398 "ldr q7, [x10, #0x10]\n"
1401 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
1402 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
1403 "sub x27, x27, #0x10\n"
1404 "add x26, x26, #0x10\n"
1405 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
1406 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
1407 "ldr q25, [x10, #0x20]\n"
1408 "add x25, x25, #0x10\n"
1409 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
1410 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
1411 "add x24, x24, #0x10\n"
1412 "add x23, x23, #0x10\n"
1413 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
1414 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
1415 "ldr q24, [x10, #0x30]\n"
1417 ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
1418 ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
1419 "prfm pldl1keep, [x26, #0x80]\n"
1420 "prfm pldl1keep, [x25, #0x80]\n"
1421 ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
1422 ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
1423 "ldr q25, [x10, #0x40]\n"
1424 "prfm pldl1keep, [x24, #0x80]\n"
1425 ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
1426 ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
1427 "prfm pldl1keep, [x23, #0x80]\n"
1428 ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
1429 ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
1430 "ldr q24, [x10, #0x50]\n"
1431 ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
1432 ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
1433 ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
1434 ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
1435 "ldr q25, [x10, #0x60]\n"
1436 ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
1437 ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
1438 ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
1439 ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
1440 "ldr q24, [x10, #0x70]\n"
1441 ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
1442 ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
1443 ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
1444 ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
1445 "ldr q25, [x10, #0x80]\n"
1446 ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
1447 ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
1448 ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
1449 ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
1450 "ldr q24, [x10, #0x90]\n"
1451 ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
1452 ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
1453 ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
1454 ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
1455 "ldr q25, [x10, #0xa0]\n"
1456 ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
1457 ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
1458 ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
1459 ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
1460 "ldr q24, [x10, #0xb0]\n"
1461 ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
1462 ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
1463 ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
1464 ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
1465 "ldr q25, [x10, #0xc0]\n"
1466 ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
1467 ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
1468 ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
1469 ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
1470 "ldr q24, [x10, #0xd0]\n"
1471 ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
1472 ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
1473 ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
1474 ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
1475 "ldr q25, [x10, #0xe0]\n"
1476 ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
1477 ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
1478 ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
1479 ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
1480 "ldr q24, [x10, #0xf0]\n"
1481 "add x10, x10, #0x100\n"
1482 ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
1483 ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
1484 ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
1485 ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
1486 "ldr q6, [x10, #0x0]\n"
1487 ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
1488 "ldr q0, [x26, #0x0]\n"
1489 ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
1490 "ldr q1, [x25, #0x0]\n"
1491 ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
1492 "ldr q2, [x24, #0x0]\n"
1493 ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
1494 "ldr q3, [x23, #0x0]\n"
1495 "ldr q7, [x10, #0x10]\n"
1498 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
1499 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
1500 "add x26, x26, #0x10\n"
1501 "add x25, x25, #0x10\n"
1502 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
1503 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
1504 "ldr q25, [x10, #0x20]\n"
1505 "add x24, x24, #0x10\n"
1506 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
1507 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
1508 "add x23, x23, #0x10\n"
1509 "sub x27, x27, #0x10\n"
1510 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
1511 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
1512 "ldr q24, [x10, #0x30]\n"
1513 "prfm pldl1keep, [x26, #0x80]\n"
1514 ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
1515 ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
1516 "prfm pldl1keep, [x25, #0x80]\n"
1517 "prfm pldl1keep, [x24, #0x80]\n"
1518 ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
1519 ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
1520 "ldr q25, [x10, #0x40]\n"
1521 "prfm pldl1keep, [x23, #0x80]\n"
1522 ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
1523 ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
1524 ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
1525 ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
1526 "ldr q24, [x10, #0x50]\n"
1527 ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
1528 ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
1529 ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
1530 ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
1531 "ldr q25, [x10, #0x60]\n"
1532 ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
1533 ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
1534 ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
1535 ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
1536 "ldr q24, [x10, #0x70]\n"
1537 ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
1538 ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
1539 ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
1540 ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
1541 "ldr q25, [x10, #0x80]\n"
1542 ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
1543 ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
1544 ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
1545 ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
1546 "ldr q24, [x10, #0x90]\n"
1547 ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
1548 ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
1549 ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
1550 ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
1551 "ldr q25, [x10, #0xa0]\n"
1552 ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
1553 ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
1554 ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
1555 ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
1556 "ldr q24, [x10, #0xb0]\n"
1557 ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
1558 ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
1559 ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
1560 ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
1561 "ldr q25, [x10, #0xc0]\n"
1562 ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
1563 ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
1564 ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
1565 ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
1566 "ldr q24, [x10, #0xd0]\n"
1567 ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
1568 ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
1569 ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
1570 ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
1571 "ldr q25, [x10, #0xe0]\n"
1572 ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
1573 ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
1574 ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
1575 ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
1576 "ldr q24, [x10, #0xf0]\n"
1577 "add x10, x10, #0x100\n"
1578 ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
1579 ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
1580 ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
1581 ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
1582 ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
1583 ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
1584 ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
1585 ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
1591 "ldr s29, [x26], #0x4\n"
1592 "ldr s28, [x25], #0x4\n"
1593 "sub x27, x27, #0x4\n"
1595 "ldr s27, [x24], #0x4\n"
1596 "ldr s26, [x23], #0x4\n"
1597 "ldr q25, [x10, #0x0]\n"
1598 "ldr q24, [x10, #0x10]\n"
1599 ".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n"
1600 ".inst 0x6f9ce32c // udot v12.4s, v25.16b, v28.4b[0]\n"
1601 ".inst 0x6f9be330 // udot v16.4s, v25.16b, v27.4b[0]\n"
1602 ".inst 0x6f9ae334 // udot v20.4s, v25.16b, v26.4b[0]\n"
1603 "ldr q25, [x10, #0x20]\n"
1604 ".inst 0x6f9de309 // udot v9.4s, v24.16b, v29.4b[0]\n"
1605 ".inst 0x6f9ce30d // udot v13.4s, v24.16b, v28.4b[0]\n"
1606 ".inst 0x6f9be311 // udot v17.4s, v24.16b, v27.4b[0]\n"
1607 ".inst 0x6f9ae315 // udot v21.4s, v24.16b, v26.4b[0]\n"
1608 "ldr q24, [x10, #0x30]\n"
1609 "add x10, x10, #0x40\n"
1610 ".inst 0x6f9de32a // udot v10.4s, v25.16b, v29.4b[0]\n"
1611 ".inst 0x6f9ce32e // udot v14.4s, v25.16b, v28.4b[0]\n"
1612 ".inst 0x6f9be332 // udot v18.4s, v25.16b, v27.4b[0]\n"
1613 ".inst 0x6f9ae336 // udot v22.4s, v25.16b, v26.4b[0]\n"
1614 ".inst 0x6f9de30b // udot v11.4s, v24.16b, v29.4b[0]\n"
1615 ".inst 0x6f9ce30f // udot v15.4s, v24.16b, v28.4b[0]\n"
1616 ".inst 0x6f9be313 // udot v19.4s, v24.16b, v27.4b[0]\n"
1617 ".inst 0x6f9ae317 // udot v23.4s, v24.16b, v26.4b[0]\n"
1621 "tbz x27, #1, 124f\n"
1622 "ldr h0, [x26], #0x2\n"
1623 "ldr h1, [x25], #0x2\n"
1624 "ldr h2, [x24], #0x2\n"
1625 "ldr h3, [x23], #0x2\n"
1626 "tbz x27, #0, 125f\n"
1627 "ld1 { v0.b }[2], [x26]\n"
1628 "ld1 { v1.b }[2], [x25]\n"
1629 "ld1 { v2.b }[2], [x24]\n"
1630 "ld1 { v3.b }[2], [x23]\n"
1633 "ldr b0, [x26, #0x0]\n"
1634 "ldr b1, [x25, #0x0]\n"
1635 "ldr b2, [x24, #0x0]\n"
1636 "ldr b3, [x23, #0x0]\n"
1638 "ldr q25, [x10, #0x0]\n"
1639 "ldr q24, [x10, #0x10]\n"
1640 ".inst 0x6f80e328 // udot v8.4s, v25.16b, v0.4b[0]\n"
1641 ".inst 0x6f81e32c // udot v12.4s, v25.16b, v1.4b[0]\n"
1642 ".inst 0x6f82e330 // udot v16.4s, v25.16b, v2.4b[0]\n"
1643 ".inst 0x6f83e334 // udot v20.4s, v25.16b, v3.4b[0]\n"
1644 "ldr q25, [x10, #0x20]\n"
1645 ".inst 0x6f80e309 // udot v9.4s, v24.16b, v0.4b[0]\n"
1646 ".inst 0x6f81e30d // udot v13.4s, v24.16b, v1.4b[0]\n"
1647 ".inst 0x6f82e311 // udot v17.4s, v24.16b, v2.4b[0]\n"
1648 ".inst 0x6f83e315 // udot v21.4s, v24.16b, v3.4b[0]\n"
1649 "ldr q24, [x10, #0x30]\n"
1650 "add x10, x10, #0x40\n"
1651 ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
1652 ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
1653 ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
1654 ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
1655 ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
1656 ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
1657 ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
1658 ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
1660 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
1661 "add x28, x28, #0x1\n"
1664 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1665 "add x24, x9, x20, LSL #2\n"
1666 "add x23, x24, x20, LSL #2\n"
1667 "prfm pstl1keep, [x9, #0x0]\n"
1668 "add x22, x23, x20, LSL #2\n"
1670 "prfm pstl1keep, [x24, #0x0]\n"
1671 "prfm pstl1keep, [x23, #0x0]\n"
1672 "prfm pstl1keep, [x22, #0x0]\n"
1674 "tbz x11, #3, 130f\n"
1675 "st1 { v8.4s }, [x9], #0x10\n"
1676 "st1 { v9.4s }, [x9], #0x10\n"
1677 "st1 { v12.4s }, [x24], #0x10\n"
1678 "st1 { v13.4s }, [x24], #0x10\n"
1679 "st1 { v16.4s }, [x23], #0x10\n"
1680 "st1 { v17.4s }, [x23], #0x10\n"
1681 "st1 { v20.4s }, [x22], #0x10\n"
1682 "st1 { v21.4s }, [x22], #0x10\n"
1683 "tbz x11, #2, 128f\n"
1684 "st1 { v10.4s }, [x9], #0x10\n"
1685 "st1 { v14.4s }, [x24], #0x10\n"
1686 "st1 { v18.4s }, [x23], #0x10\n"
1687 "st1 { v22.4s }, [x22], #0x10\n"
1688 "tbz x11, #1, 127f\n"
1689 "str d11, [x9], #0x8\n"
1690 "str d15, [x24], #0x8\n"
1691 "str d19, [x23], #0x8\n"
1692 "str d23, [x22], #0x8\n"
1693 "tbz x11, #0, 134f\n"
1694 "st1 { v11.s }[2], [x9]\n"
1695 "st1 { v15.s }[2], [x24]\n"
1696 "st1 { v19.s }[2], [x23]\n"
1697 "st1 { v23.s }[2], [x22]\n"
1700 "tbz x11, #0, 134f\n"
1701 "str s11, [x9, #0x0]\n"
1702 "str s15, [x24, #0x0]\n"
1703 "str s19, [x23, #0x0]\n"
1704 "str s23, [x22, #0x0]\n"
1707 "tbz x11, #1, 129f\n"
1708 "str d10, [x9], #0x8\n"
1709 "str d14, [x24], #0x8\n"
1710 "str d18, [x23], #0x8\n"
1711 "str d22, [x22], #0x8\n"
1712 "tbz x11, #0, 134f\n"
1713 "st1 { v10.s }[2], [x9]\n"
1714 "st1 { v14.s }[2], [x24]\n"
1715 "st1 { v18.s }[2], [x23]\n"
1716 "st1 { v22.s }[2], [x22]\n"
1719 "tbz x11, #0, 134f\n"
1720 "str s10, [x9, #0x0]\n"
1721 "str s14, [x24, #0x0]\n"
1722 "str s18, [x23, #0x0]\n"
1723 "str s22, [x22, #0x0]\n"
1726 "tbz x11, #2, 132f\n"
1727 "st1 { v8.4s }, [x9], #0x10\n"
1728 "st1 { v12.4s }, [x24], #0x10\n"
1729 "st1 { v16.4s }, [x23], #0x10\n"
1730 "st1 { v20.4s }, [x22], #0x10\n"
1731 "tbz x11, #1, 131f\n"
1732 "str d9, [x9], #0x8\n"
1733 "str d13, [x24], #0x8\n"
1734 "str d17, [x23], #0x8\n"
1735 "str d21, [x22], #0x8\n"
1736 "tbz x11, #0, 134f\n"
1737 "st1 { v9.s }[2], [x9]\n"
1738 "st1 { v13.s }[2], [x24]\n"
1739 "st1 { v17.s }[2], [x23]\n"
1740 "st1 { v21.s }[2], [x22]\n"
1743 "tbz x11, #0, 134f\n"
1744 "str s9, [x9, #0x0]\n"
1745 "str s13, [x24, #0x0]\n"
1746 "str s17, [x23, #0x0]\n"
1747 "str s21, [x22, #0x0]\n"
1750 "tbz x11, #1, 133f\n"
1751 "str d8, [x9], #0x8\n"
1752 "str d12, [x24], #0x8\n"
1753 "str d16, [x23], #0x8\n"
1754 "str d20, [x22], #0x8\n"
1755 "tbz x11, #0, 134f\n"
1756 "st1 { v8.s }[2], [x9]\n"
1757 "st1 { v12.s }[2], [x24]\n"
1758 "st1 { v16.s }[2], [x23]\n"
1759 "st1 { v20.s }[2], [x22]\n"
1762 "str s8, [x9, #0x0]\n"
1763 "str s12, [x24, #0x0]\n"
1764 "str s16, [x23, #0x0]\n"
1765 "str s20, [x22, #0x0]\n"
1769 "str q8, [x9, #0x0]\n"
1770 "str q9, [x9, #0x10]\n"
1771 "str q10, [x9, #0x20]\n"
1772 "str q11, [x9, #0x30]\n"
1773 "add x9, x9, #0x40\n"
1774 "str q12, [x24, #0x0]\n"
1775 "str q13, [x24, #0x10]\n"
1776 "str q14, [x24, #0x20]\n"
1777 "str q15, [x24, #0x30]\n"
1778 "str q16, [x23, #0x0]\n"
1779 "str q17, [x23, #0x10]\n"
1780 "str q18, [x23, #0x20]\n"
1781 "str q19, [x23, #0x30]\n"
1782 "str q20, [x22, #0x0]\n"
1783 "str q21, [x22, #0x10]\n"
1784 "str q22, [x22, #0x20]\n"
1785 "str q23, [x22, #0x30]\n"
1787 "subs x11, x11, #0x10\n"
1791 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
1792 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1793 "mov x9, %x[output_ptr]\n"
1795 "tbz %x[flags], #0, 148f\n"
1796 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1797 "add x24, x9, x20, LSL #2\n"
1798 "add x23, x24, x20, LSL #2\n"
1799 "add x22, x23, x20, LSL #2\n"
1801 "add x21, x22, x20, LSL #2\n"
1803 "tbz x11, #3, 142f\n"
1804 "ld1 { v8.4s }, [x9], #0x10\n"
1805 "ld1 { v12.4s }, [x24], #0x10\n"
1806 "ld1 { v16.4s }, [x23], #0x10\n"
1807 "ld1 { v20.4s }, [x22], #0x10\n"
1808 "ld1 { v24.4s }, [x21], #0x10\n"
1809 "ld1 { v9.4s }, [x9], #0x10\n"
1810 "ld1 { v13.4s }, [x24], #0x10\n"
1811 "ld1 { v17.4s }, [x23], #0x10\n"
1812 "ld1 { v21.4s }, [x22], #0x10\n"
1813 "ld1 { v25.4s }, [x21], #0x10\n"
1814 "tbz x11, #2, 140f\n"
1815 "ld1 { v10.4s }, [x9], #0x10\n"
1816 "ld1 { v14.4s }, [x24], #0x10\n"
1817 "ld1 { v18.4s }, [x23], #0x10\n"
1818 "ld1 { v22.4s }, [x22], #0x10\n"
1819 "ld1 { v26.4s }, [x21], #0x10\n"
1820 "tbz x11, #1, 139f\n"
1821 "ldr d11, [x9], #0x8\n"
1822 "ldr d15, [x24], #0x8\n"
1824 "ldr d19, [x23], #0x8\n"
1825 "ldr d23, [x22], #0x8\n"
1826 "ldr d27, [x21], #0x8\n"
1827 "tbz x11, #0, 146f\n"
1828 "ld1 { v11.s }[2], [x9]\n"
1829 "ld1 { v15.s }[2], [x24]\n"
1830 "ld1 { v19.s }[2], [x23]\n"
1831 "ld1 { v23.s }[2], [x22]\n"
1832 "ld1 { v27.s }[2], [x21]\n"
1836 "tbz x11, #0, 146f\n"
1837 "ldr s11, [x9, #0x0]\n"
1838 "ldr s15, [x24, #0x0]\n"
1839 "ldr s19, [x23, #0x0]\n"
1840 "ldr s23, [x22, #0x0]\n"
1841 "ldr s27, [x21, #0x0]\n"
1844 "tbz x11, #1, 141f\n"
1845 "ldr d10, [x9], #0x8\n"
1846 "ldr d14, [x24], #0x8\n"
1848 "ldr d18, [x23], #0x8\n"
1849 "ldr d22, [x22], #0x8\n"
1850 "ldr d26, [x21], #0x8\n"
1851 "tbz x11, #0, 146f\n"
1852 "ld1 { v10.s }[2], [x9]\n"
1853 "ld1 { v14.s }[2], [x24]\n"
1854 "ld1 { v18.s }[2], [x23]\n"
1855 "ld1 { v22.s }[2], [x22]\n"
1856 "ld1 { v26.s }[2], [x21]\n"
1860 "tbz x11, #0, 146f\n"
1861 "ldr s10, [x9, #0x0]\n"
1862 "ldr s14, [x24, #0x0]\n"
1863 "ldr s18, [x23, #0x0]\n"
1864 "ldr s22, [x22, #0x0]\n"
1865 "ldr s26, [x21, #0x0]\n"
1868 "tbz x11, #2, 144f\n"
1869 "ld1 { v8.4s }, [x9], #0x10\n"
1870 "ld1 { v12.4s }, [x24], #0x10\n"
1871 "ld1 { v16.4s }, [x23], #0x10\n"
1872 "ld1 { v20.4s }, [x22], #0x10\n"
1873 "ld1 { v24.4s }, [x21], #0x10\n"
1874 "tbz x11, #1, 143f\n"
1875 "ldr d9, [x9], #0x8\n"
1876 "ldr d13, [x24], #0x8\n"
1878 "ldr d17, [x23], #0x8\n"
1879 "ldr d21, [x22], #0x8\n"
1880 "ldr d25, [x21], #0x8\n"
1881 "tbz x11, #0, 146f\n"
1882 "ld1 { v9.s }[2], [x9]\n"
1883 "ld1 { v13.s }[2], [x24]\n"
1884 "ld1 { v17.s }[2], [x23]\n"
1885 "ld1 { v21.s }[2], [x22]\n"
1886 "ld1 { v25.s }[2], [x21]\n"
1890 "tbz x11, #0, 146f\n"
1891 "ldr s9, [x9, #0x0]\n"
1892 "ldr s13, [x24, #0x0]\n"
1893 "ldr s17, [x23, #0x0]\n"
1894 "ldr s21, [x22, #0x0]\n"
1895 "ldr s25, [x21, #0x0]\n"
1898 "tbz x11, #1, 145f\n"
1899 "ldr d8, [x9], #0x8\n"
1900 "ldr d12, [x24], #0x8\n"
1902 "ldr d16, [x23], #0x8\n"
1903 "ldr d20, [x22], #0x8\n"
1904 "ldr d24, [x21], #0x8\n"
1905 "tbz x11, #0, 146f\n"
1906 "ld1 { v8.s }[2], [x9]\n"
1907 "ld1 { v12.s }[2], [x24]\n"
1908 "ld1 { v16.s }[2], [x23]\n"
1909 "ld1 { v20.s }[2], [x22]\n"
1910 "ld1 { v24.s }[2], [x21]\n"
1913 "ldr s8, [x9, #0x0]\n"
1914 "ldr s12, [x24, #0x0]\n"
1916 "ldr s16, [x23, #0x0]\n"
1917 "ldr s20, [x22, #0x0]\n"
1918 "ldr s24, [x21, #0x0]\n"
1923 "ldr q8, [x9, #0x0]\n"
1924 "ldr q9, [x9, #0x10]\n"
1925 "ldr q10, [x9, #0x20]\n"
1926 "ldr q11, [x9, #0x30]\n"
1927 "ldr q12, [x24, #0x0]\n"
1928 "ldr q13, [x24, #0x10]\n"
1929 "ldr q14, [x24, #0x20]\n"
1930 "ldr q15, [x24, #0x30]\n"
1931 "ldr q16, [x23, #0x0]\n"
1932 "ldr q17, [x23, #0x10]\n"
1933 "ldr q18, [x23, #0x20]\n"
1934 "ldr q19, [x23, #0x30]\n"
1935 "ldr q20, [x22, #0x0]\n"
1936 "ldr q21, [x22, #0x10]\n"
1937 "ldr q22, [x22, #0x20]\n"
1938 "ldr q23, [x22, #0x30]\n"
1939 "ldr q24, [x21, #0x0]\n"
1940 "ldr q25, [x21, #0x10]\n"
1941 "ldr q26, [x21, #0x20]\n"
1942 "ldr q27, [x21, #0x30]\n"
1945 "movi v8.4s, #0x0\n"
1946 "movi v9.4s, #0x0\n"
1947 "movi v10.4s, #0x0\n"
1948 "movi v11.4s, #0x0\n"
1949 "movi v12.4s, #0x0\n"
1950 "movi v13.4s, #0x0\n"
1951 "movi v14.4s, #0x0\n"
1952 "movi v15.4s, #0x0\n"
1953 "movi v16.4s, #0x0\n"
1954 "movi v17.4s, #0x0\n"
1955 "movi v18.4s, #0x0\n"
1956 "movi v19.4s, #0x0\n"
1957 "movi v20.4s, #0x0\n"
1958 "movi v21.4s, #0x0\n"
1959 "movi v22.4s, #0x0\n"
1960 "movi v23.4s, #0x0\n"
1961 "movi v24.4s, #0x0\n"
1962 "movi v25.4s, #0x0\n"
1963 "movi v26.4s, #0x0\n"
1964 "movi v27.4s, #0x0\n"
1968 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1969 "ldr w27, [x20, x28, LSL #0x2]\n"
1970 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1971 "tbz %x[flags], #3, 151f\n"
1972 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
1973 "add x20, x20, x21, LSL #3\n"
1974 "ldr x26, [x20, #0x0]\n"
1975 "ldr x25, [x20, #0x8]\n"
1976 "ldr x24, [x20, #0x10]\n"
1977 "ldr x23, [x20, #0x18]\n"
1978 "ldr x22, [x20, #0x20]\n"
1980 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1981 "add x26, x26, x20\n"
1982 "add x25, x25, x20\n"
1983 "add x24, x24, x20\n"
1984 "add x23, x23, x20\n"
1985 "add x22, x22, x20\n"
1988 "mov x26, %x[input_ptr]\n"
1989 "add x25, x26, x21\n"
1990 "add x24, x25, x21\n"
1991 "add x23, x24, x21\n"
1992 "add x22, x23, x21\n"
1996 "ldr q0, [x26, #0x0]\n"
1997 "ldr q1, [x25, #0x0]\n"
1999 "ldr q2, [x24, #0x0]\n"
2000 "ldr q3, [x23, #0x0]\n"
2001 "ldr q4, [x22, #0x0]\n"
2002 "ldr q6, [x10, #0x0]\n"
2003 "ldr q7, [x10, #0x10]\n"
2006 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
2007 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
2008 "sub x27, x27, #0x10\n"
2009 "add x26, x26, #0x10\n"
2010 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
2011 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
2012 "add x25, x25, #0x10\n"
2013 "add x24, x24, #0x10\n"
2014 ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
2015 "ldr q29, [x10, #0x20]\n"
2016 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
2017 "add x23, x23, #0x10\n"
2018 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
2019 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
2020 "add x22, x22, #0x10\n"
2022 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
2023 ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
2024 "ldr q28, [x10, #0x30]\n"
2025 "prfm pldl1keep, [x26, #0x80]\n"
2026 ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
2027 ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
2028 "prfm pldl1keep, [x25, #0x80]\n"
2029 "prfm pldl1keep, [x24, #0x80]\n"
2030 ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
2031 ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
2032 "prfm pldl1keep, [x23, #0x80]\n"
2033 "prfm pldl1keep, [x22, #0x80]\n"
2034 ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
2035 "ldr q29, [x10, #0x40]\n"
2036 ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
2037 ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
2038 ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
2039 ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
2040 ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
2041 "ldr q28, [x10, #0x50]\n"
2042 ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
2043 ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
2044 ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
2045 ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
2046 ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
2047 "ldr q29, [x10, #0x60]\n"
2048 ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
2049 ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
2050 ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
2051 ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
2052 ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
2053 "ldr q28, [x10, #0x70]\n"
2054 ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
2055 ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
2056 ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
2057 ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
2058 ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
2059 "ldr q29, [x10, #0x80]\n"
2060 ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
2061 ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
2062 ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
2063 ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
2064 ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
2065 "ldr q28, [x10, #0x90]\n"
2066 ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
2067 ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
2068 ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
2069 ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
2070 ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
2071 "ldr q29, [x10, #0xa0]\n"
2072 ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
2073 ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
2074 ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
2075 ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
2076 ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
2077 "ldr q28, [x10, #0xb0]\n"
2078 ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
2079 ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
2080 ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
2081 ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
2082 ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
2083 "ldr q29, [x10, #0xc0]\n"
2084 ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
2085 ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
2086 ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
2087 ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
2088 ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
2089 "ldr q28, [x10, #0xd0]\n"
2090 ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
2091 ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
2092 ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
2093 ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
2094 ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
2095 "ldr q29, [x10, #0xe0]\n"
2096 ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
2097 ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
2098 ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
2099 ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
2100 ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
2101 "ldr q28, [x10, #0xf0]\n"
2102 "add x10, x10, #0x100\n"
2103 ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
2104 ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
2105 ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
2106 ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
2107 ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
2108 "ldr q6, [x10, #0x0]\n"
2109 ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
2110 "ldr q0, [x26, #0x0]\n"
2111 ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
2112 "ldr q1, [x25, #0x0]\n"
2113 ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
2114 "ldr q2, [x24, #0x0]\n"
2115 ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
2116 "ldr q3, [x23, #0x0]\n"
2117 ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
2118 "ldr q4, [x22, #0x0]\n"
2119 "ldr q7, [x10, #0x10]\n"
2122 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
2123 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
2124 "add x26, x26, #0x10\n"
2125 "add x25, x25, #0x10\n"
2126 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
2127 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
2128 "add x24, x24, #0x10\n"
2129 "add x23, x23, #0x10\n"
2130 ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
2131 "ldr q29, [x10, #0x20]\n"
2132 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
2133 "add x22, x22, #0x10\n"
2134 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
2135 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
2136 "sub x27, x27, #0x10\n"
2137 "prfm pldl1keep, [x26, #0x80]\n"
2138 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
2139 ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
2140 "ldr q28, [x10, #0x30]\n"
2141 "prfm pldl1keep, [x25, #0x80]\n"
2142 ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
2143 ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
2144 "prfm pldl1keep, [x24, #0x80]\n"
2145 "prfm pldl1keep, [x23, #0x80]\n"
2146 ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
2147 ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
2148 "prfm pldl1keep, [x22, #0x80]\n"
2149 ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
2150 "ldr q29, [x10, #0x40]\n"
2151 ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
2152 ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
2153 ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
2154 ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
2155 ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
2156 "ldr q28, [x10, #0x50]\n"
2157 ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
2158 ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
2159 ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
2160 ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
2161 ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
2162 "ldr q29, [x10, #0x60]\n"
2163 ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
2164 ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
2165 ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
2166 ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
2167 ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
2168 "ldr q28, [x10, #0x70]\n"
2169 ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
2170 ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
2171 ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
2172 ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
2173 ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
2174 "ldr q29, [x10, #0x80]\n"
2175 ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
2176 ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
2177 ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
2178 ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
2179 ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
2180 "ldr q28, [x10, #0x90]\n"
2181 ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
2182 ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
2183 ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
2184 ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
2185 ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
2186 "ldr q29, [x10, #0xa0]\n"
2187 ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
2188 ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
2189 ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
2190 ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
2191 ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
2192 "ldr q28, [x10, #0xb0]\n"
2193 ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
2194 ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
2195 ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
2196 ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
2197 ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
2198 "ldr q29, [x10, #0xc0]\n"
2199 ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
2200 ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
2201 ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
2202 ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
2203 ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
2204 "ldr q28, [x10, #0xd0]\n"
2205 ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
2206 ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
2207 ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
2208 ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
2209 ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
2210 "ldr q29, [x10, #0xe0]\n"
2211 ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
2212 ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
2213 ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
2214 ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
2215 ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
2216 "ldr q28, [x10, #0xf0]\n"
2217 "add x10, x10, #0x100\n"
2218 ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
2219 ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
2220 ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
2221 ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
2222 ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
2223 ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
2224 ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
2225 ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
2226 ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
2227 ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
2233 "ldr s2, [x26], #0x4\n"
2234 "ldr s1, [x25], #0x4\n"
2235 "sub x27, x27, #0x4\n"
2237 "ldr s0, [x24], #0x4\n"
2238 "ldr s31, [x23], #0x4\n"
2239 "ldr s30, [x22], #0x4\n"
2240 "ldr q29, [x10, #0x0]\n"
2241 ".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n"
2242 ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
2243 "ldr q28, [x10, #0x10]\n"
2244 ".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n"
2245 ".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n"
2246 ".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n"
2247 "ldr q29, [x10, #0x20]\n"
2248 ".inst 0x6f82e389 // udot v9.4s, v28.16b, v2.4b[0]\n"
2249 ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
2250 ".inst 0x6f80e391 // udot v17.4s, v28.16b, v0.4b[0]\n"
2251 ".inst 0x6f9fe395 // udot v21.4s, v28.16b, v31.4b[0]\n"
2252 ".inst 0x6f9ee399 // udot v25.4s, v28.16b, v30.4b[0]\n"
2253 "ldr q28, [x10, #0x30]\n"
2254 "add x10, x10, #0x40\n"
2255 ".inst 0x6f82e3aa // udot v10.4s, v29.16b, v2.4b[0]\n"
2256 ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
2257 ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
2258 ".inst 0x6f9fe3b6 // udot v22.4s, v29.16b, v31.4b[0]\n"
2259 ".inst 0x6f9ee3ba // udot v26.4s, v29.16b, v30.4b[0]\n"
2260 ".inst 0x6f82e38b // udot v11.4s, v28.16b, v2.4b[0]\n"
2261 ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
2262 ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
2263 ".inst 0x6f9fe397 // udot v23.4s, v28.16b, v31.4b[0]\n"
2264 ".inst 0x6f9ee39b // udot v27.4s, v28.16b, v30.4b[0]\n"
2268 "tbz x27, #1, 158f\n"
2269 "ldr h0, [x26], #0x2\n"
2270 "ldr h1, [x25], #0x2\n"
2271 "ldr h2, [x24], #0x2\n"
2272 "ldr h3, [x23], #0x2\n"
2273 "ldr h4, [x22], #0x2\n"
2274 "tbz x27, #0, 159f\n"
2275 "ld1 { v0.b }[2], [x26]\n"
2276 "ld1 { v1.b }[2], [x25]\n"
2277 "ld1 { v2.b }[2], [x24]\n"
2278 "ld1 { v3.b }[2], [x23]\n"
2279 "ld1 { v4.b }[2], [x22]\n"
2282 "ldr b0, [x26, #0x0]\n"
2283 "ldr b1, [x25, #0x0]\n"
2284 "ldr b2, [x24, #0x0]\n"
2285 "ldr b3, [x23, #0x0]\n"
2286 "ldr b4, [x22, #0x0]\n"
2288 "ldr q29, [x10, #0x0]\n"
2289 "ldr q28, [x10, #0x10]\n"
2290 ".inst 0x6f80e3a8 // udot v8.4s, v29.16b, v0.4b[0]\n"
2291 ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
2292 ".inst 0x6f82e3b0 // udot v16.4s, v29.16b, v2.4b[0]\n"
2293 ".inst 0x6f83e3b4 // udot v20.4s, v29.16b, v3.4b[0]\n"
2294 ".inst 0x6f84e3b8 // udot v24.4s, v29.16b, v4.4b[0]\n"
2295 "ldr q29, [x10, #0x20]\n"
2296 ".inst 0x6f80e389 // udot v9.4s, v28.16b, v0.4b[0]\n"
2297 ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
2298 ".inst 0x6f82e391 // udot v17.4s, v28.16b, v2.4b[0]\n"
2299 ".inst 0x6f83e395 // udot v21.4s, v28.16b, v3.4b[0]\n"
2300 ".inst 0x6f84e399 // udot v25.4s, v28.16b, v4.4b[0]\n"
2301 "ldr q28, [x10, #0x30]\n"
2302 "add x10, x10, #0x40\n"
2303 ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
2304 ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
2305 ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
2306 ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
2307 ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
2308 ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
2309 ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
2310 ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
2311 ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
2312 ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
2314 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
2315 "add x28, x28, #0x1\n"
2318 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
2319 "add x24, x9, x20, LSL #2\n"
2320 "add x23, x24, x20, LSL #2\n"
2321 "prfm pstl1keep, [x9, #0x0]\n"
2322 "add x22, x23, x20, LSL #2\n"
2323 "add x21, x22, x20, LSL #2\n"
2324 "prfm pstl1keep, [x24, #0x0]\n"
2325 "prfm pstl1keep, [x23, #0x0]\n"
2327 "prfm pstl1keep, [x22, #0x0]\n"
2328 "prfm pstl1keep, [x21, #0x0]\n"
2330 "tbz x11, #3, 164f\n"
2331 "st1 { v8.4s }, [x9], #0x10\n"
2332 "st1 { v9.4s }, [x9], #0x10\n"
2333 "st1 { v12.4s }, [x24], #0x10\n"
2334 "st1 { v13.4s }, [x24], #0x10\n"
2335 "st1 { v16.4s }, [x23], #0x10\n"
2336 "st1 { v17.4s }, [x23], #0x10\n"
2337 "st1 { v20.4s }, [x22], #0x10\n"
2338 "st1 { v21.4s }, [x22], #0x10\n"
2339 "st1 { v24.4s }, [x21], #0x10\n"
2340 "st1 { v25.4s }, [x21], #0x10\n"
2341 "tbz x11, #2, 162f\n"
2342 "st1 { v10.4s }, [x9], #0x10\n"
2343 "st1 { v14.4s }, [x24], #0x10\n"
2344 "st1 { v18.4s }, [x23], #0x10\n"
2345 "st1 { v22.4s }, [x22], #0x10\n"
2346 "st1 { v26.4s }, [x21], #0x10\n"
2347 "tbz x11, #1, 161f\n"
2348 "str d11, [x9], #0x8\n"
2349 "str d15, [x24], #0x8\n"
2350 "str d19, [x23], #0x8\n"
2351 "str d23, [x22], #0x8\n"
2352 "str d27, [x21], #0x8\n"
2353 "tbz x11, #0, 168f\n"
2354 "st1 { v11.s }[2], [x9]\n"
2355 "st1 { v15.s }[2], [x24]\n"
2356 "st1 { v19.s }[2], [x23]\n"
2357 "st1 { v23.s }[2], [x22]\n"
2358 "st1 { v27.s }[2], [x21]\n"
2361 "tbz x11, #0, 168f\n"
2362 "str s11, [x9, #0x0]\n"
2363 "str s15, [x24, #0x0]\n"
2364 "str s19, [x23, #0x0]\n"
2365 "str s23, [x22, #0x0]\n"
2366 "str s27, [x21, #0x0]\n"
2369 "tbz x11, #1, 163f\n"
2370 "str d10, [x9], #0x8\n"
2371 "str d14, [x24], #0x8\n"
2372 "str d18, [x23], #0x8\n"
2373 "str d22, [x22], #0x8\n"
2374 "str d26, [x21], #0x8\n"
2375 "tbz x11, #0, 168f\n"
2376 "st1 { v10.s }[2], [x9]\n"
2377 "st1 { v14.s }[2], [x24]\n"
2378 "st1 { v18.s }[2], [x23]\n"
2379 "st1 { v22.s }[2], [x22]\n"
2380 "st1 { v26.s }[2], [x21]\n"
2383 "tbz x11, #0, 168f\n"
2384 "str s10, [x9, #0x0]\n"
2385 "str s14, [x24, #0x0]\n"
2386 "str s18, [x23, #0x0]\n"
2387 "str s22, [x22, #0x0]\n"
2388 "str s26, [x21, #0x0]\n"
2391 "tbz x11, #2, 166f\n"
2392 "st1 { v8.4s }, [x9], #0x10\n"
2393 "st1 { v12.4s }, [x24], #0x10\n"
2394 "st1 { v16.4s }, [x23], #0x10\n"
2395 "st1 { v20.4s }, [x22], #0x10\n"
2396 "st1 { v24.4s }, [x21], #0x10\n"
2397 "tbz x11, #1, 165f\n"
2398 "str d9, [x9], #0x8\n"
2399 "str d13, [x24], #0x8\n"
2400 "str d17, [x23], #0x8\n"
2401 "str d21, [x22], #0x8\n"
2402 "str d25, [x21], #0x8\n"
2403 "tbz x11, #0, 168f\n"
2404 "st1 { v9.s }[2], [x9]\n"
2405 "st1 { v13.s }[2], [x24]\n"
2406 "st1 { v17.s }[2], [x23]\n"
2407 "st1 { v21.s }[2], [x22]\n"
2408 "st1 { v25.s }[2], [x21]\n"
2411 "tbz x11, #0, 168f\n"
2412 "str s9, [x9, #0x0]\n"
2413 "str s13, [x24, #0x0]\n"
2414 "str s17, [x23, #0x0]\n"
2415 "str s21, [x22, #0x0]\n"
2416 "str s25, [x21, #0x0]\n"
2419 "tbz x11, #1, 167f\n"
2420 "str d8, [x9], #0x8\n"
2421 "str d12, [x24], #0x8\n"
2422 "str d16, [x23], #0x8\n"
2423 "str d20, [x22], #0x8\n"
2424 "str d24, [x21], #0x8\n"
2425 "tbz x11, #0, 168f\n"
2426 "st1 { v8.s }[2], [x9]\n"
2427 "st1 { v12.s }[2], [x24]\n"
2428 "st1 { v16.s }[2], [x23]\n"
2429 "st1 { v20.s }[2], [x22]\n"
2430 "st1 { v24.s }[2], [x21]\n"
2433 "str s8, [x9, #0x0]\n"
2434 "str s12, [x24, #0x0]\n"
2435 "str s16, [x23, #0x0]\n"
2436 "str s20, [x22, #0x0]\n"
2437 "str s24, [x21, #0x0]\n"
2441 "str q8, [x9, #0x0]\n"
2442 "str q9, [x9, #0x10]\n"
2443 "str q10, [x9, #0x20]\n"
2444 "str q11, [x9, #0x30]\n"
2445 "add x9, x9, #0x40\n"
2446 "str q12, [x24, #0x0]\n"
2447 "str q13, [x24, #0x10]\n"
2448 "str q14, [x24, #0x20]\n"
2449 "str q15, [x24, #0x30]\n"
2450 "str q16, [x23, #0x0]\n"
2451 "str q17, [x23, #0x10]\n"
2452 "str q18, [x23, #0x20]\n"
2453 "str q19, [x23, #0x30]\n"
2454 "str q20, [x22, #0x0]\n"
2455 "str q21, [x22, #0x10]\n"
2456 "str q22, [x22, #0x20]\n"
2457 "str q23, [x22, #0x30]\n"
2458 "str q24, [x21, #0x0]\n"
2459 "str q25, [x21, #0x10]\n"
2460 "str q26, [x21, #0x20]\n"
2461 "str q27, [x21, #0x30]\n"
2463 "subs x11, x11, #0x10\n"
2467 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
2469 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
2470 "mov x9, %x[output_ptr]\n"
2471 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
2472 "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
2474 "tbz %x[flags], #0, 182f\n"
2475 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
2476 "add x24, x9, x20, LSL #2\n"
2477 "add x23, x24, x20, LSL #2\n"
2478 "add x22, x23, x20, LSL #2\n"
2479 "add x21, x22, x20, LSL #2\n"
2481 "add x20, x21, x20, LSL #2\n"
2483 "tbz x11, #3, 176f\n"
2484 "ld1 { v8.4s }, [x9], #0x10\n"
2485 "ld1 { v12.4s }, [x24], #0x10\n"
2486 "ld1 { v16.4s }, [x23], #0x10\n"
2487 "ld1 { v20.4s }, [x22], #0x10\n"
2488 "ld1 { v24.4s }, [x21], #0x10\n"
2489 "ld1 { v28.4s }, [x20], #0x10\n"
2490 "ld1 { v9.4s }, [x9], #0x10\n"
2491 "ld1 { v13.4s }, [x24], #0x10\n"
2492 "ld1 { v17.4s }, [x23], #0x10\n"
2493 "ld1 { v21.4s }, [x22], #0x10\n"
2494 "ld1 { v25.4s }, [x21], #0x10\n"
2495 "ld1 { v29.4s }, [x20], #0x10\n"
2496 "tbz x11, #2, 174f\n"
2497 "ld1 { v10.4s }, [x9], #0x10\n"
2498 "ld1 { v14.4s }, [x24], #0x10\n"
2499 "ld1 { v18.4s }, [x23], #0x10\n"
2500 "ld1 { v22.4s }, [x22], #0x10\n"
2501 "ld1 { v26.4s }, [x21], #0x10\n"
2502 "ld1 { v30.4s }, [x20], #0x10\n"
2503 "tbz x11, #1, 173f\n"
2504 "ldr d11, [x9], #0x8\n"
2505 "ldr d15, [x24], #0x8\n"
2507 "ldr d19, [x23], #0x8\n"
2508 "ldr d23, [x22], #0x8\n"
2509 "ldr d27, [x21], #0x8\n"
2510 "ldr d31, [x20], #0x8\n"
2511 "tbz x11, #0, 180f\n"
2512 "ld1 { v11.s }[2], [x9]\n"
2513 "ld1 { v15.s }[2], [x24]\n"
2514 "ld1 { v19.s }[2], [x23]\n"
2515 "ld1 { v23.s }[2], [x22]\n"
2516 "ld1 { v27.s }[2], [x21]\n"
2517 "ld1 { v31.s }[2], [x20]\n"
2521 "tbz x11, #0, 180f\n"
2522 "ldr s11, [x9, #0x0]\n"
2523 "ldr s15, [x24, #0x0]\n"
2524 "ldr s19, [x23, #0x0]\n"
2525 "ldr s23, [x22, #0x0]\n"
2526 "ldr s27, [x21, #0x0]\n"
2527 "ldr s31, [x20, #0x0]\n"
2530 "tbz x11, #1, 175f\n"
2531 "ldr d10, [x9], #0x8\n"
2532 "ldr d14, [x24], #0x8\n"
2534 "ldr d18, [x23], #0x8\n"
2535 "ldr d22, [x22], #0x8\n"
2536 "ldr d26, [x21], #0x8\n"
2537 "ldr d30, [x20], #0x8\n"
2538 "tbz x11, #0, 180f\n"
2539 "ld1 { v10.s }[2], [x9]\n"
2540 "ld1 { v14.s }[2], [x24]\n"
2541 "ld1 { v18.s }[2], [x23]\n"
2542 "ld1 { v22.s }[2], [x22]\n"
2543 "ld1 { v26.s }[2], [x21]\n"
2544 "ld1 { v30.s }[2], [x20]\n"
2548 "tbz x11, #0, 180f\n"
2549 "ldr s10, [x9, #0x0]\n"
2550 "ldr s14, [x24, #0x0]\n"
2551 "ldr s18, [x23, #0x0]\n"
2552 "ldr s22, [x22, #0x0]\n"
2553 "ldr s26, [x21, #0x0]\n"
2554 "ldr s30, [x20, #0x0]\n"
2557 "tbz x11, #2, 178f\n"
2558 "ld1 { v8.4s }, [x9], #0x10\n"
2559 "ld1 { v12.4s }, [x24], #0x10\n"
2560 "ld1 { v16.4s }, [x23], #0x10\n"
2561 "ld1 { v20.4s }, [x22], #0x10\n"
2562 "ld1 { v24.4s }, [x21], #0x10\n"
2563 "ld1 { v28.4s }, [x20], #0x10\n"
2564 "tbz x11, #1, 177f\n"
2565 "ldr d9, [x9], #0x8\n"
2566 "ldr d13, [x24], #0x8\n"
2568 "ldr d17, [x23], #0x8\n"
2569 "ldr d21, [x22], #0x8\n"
2570 "ldr d25, [x21], #0x8\n"
2571 "ldr d29, [x20], #0x8\n"
2572 "tbz x11, #0, 180f\n"
2573 "ld1 { v9.s }[2], [x9]\n"
2574 "ld1 { v13.s }[2], [x24]\n"
2575 "ld1 { v17.s }[2], [x23]\n"
2576 "ld1 { v21.s }[2], [x22]\n"
2577 "ld1 { v25.s }[2], [x21]\n"
2578 "ld1 { v29.s }[2], [x20]\n"
2582 "tbz x11, #0, 180f\n"
2583 "ldr s9, [x9, #0x0]\n"
2584 "ldr s13, [x24, #0x0]\n"
2585 "ldr s17, [x23, #0x0]\n"
2586 "ldr s21, [x22, #0x0]\n"
2587 "ldr s25, [x21, #0x0]\n"
2588 "ldr s29, [x20, #0x0]\n"
2591 "tbz x11, #1, 179f\n"
2592 "ldr d8, [x9], #0x8\n"
2593 "ldr d12, [x24], #0x8\n"
2595 "ldr d16, [x23], #0x8\n"
2596 "ldr d20, [x22], #0x8\n"
2597 "ldr d24, [x21], #0x8\n"
2598 "ldr d28, [x20], #0x8\n"
2599 "tbz x11, #0, 180f\n"
2600 "ld1 { v8.s }[2], [x9]\n"
2601 "ld1 { v12.s }[2], [x24]\n"
2602 "ld1 { v16.s }[2], [x23]\n"
2603 "ld1 { v20.s }[2], [x22]\n"
2604 "ld1 { v24.s }[2], [x21]\n"
2605 "ld1 { v28.s }[2], [x20]\n"
2608 "ldr s8, [x9, #0x0]\n"
2609 "ldr s12, [x24, #0x0]\n"
2611 "ldr s16, [x23, #0x0]\n"
2612 "ldr s20, [x22, #0x0]\n"
2613 "ldr s24, [x21, #0x0]\n"
2614 "ldr s28, [x20, #0x0]\n"
2619 "ldr q8, [x9, #0x0]\n"
2620 "ldr q9, [x9, #0x10]\n"
2621 "ldr q10, [x9, #0x20]\n"
2622 "ldr q11, [x9, #0x30]\n"
2623 "ldr q12, [x24, #0x0]\n"
2624 "ldr q13, [x24, #0x10]\n"
2625 "ldr q14, [x24, #0x20]\n"
2626 "ldr q15, [x24, #0x30]\n"
2627 "ldr q16, [x23, #0x0]\n"
2628 "ldr q17, [x23, #0x10]\n"
2629 "ldr q18, [x23, #0x20]\n"
2630 "ldr q19, [x23, #0x30]\n"
2631 "ldr q20, [x22, #0x0]\n"
2632 "ldr q21, [x22, #0x10]\n"
2633 "ldr q22, [x22, #0x20]\n"
2634 "ldr q23, [x22, #0x30]\n"
2635 "ldr q24, [x21, #0x0]\n"
2636 "ldr q25, [x21, #0x10]\n"
2637 "ldr q26, [x21, #0x20]\n"
2638 "ldr q27, [x21, #0x30]\n"
2639 "ldr q28, [x20, #0x0]\n"
2640 "ldr q29, [x20, #0x10]\n"
2641 "ldr q30, [x20, #0x20]\n"
2642 "ldr q31, [x20, #0x30]\n"
2645 "movi v8.4s, #0x0\n"
2646 "movi v9.4s, #0x0\n"
2647 "movi v10.4s, #0x0\n"
2648 "movi v11.4s, #0x0\n"
2649 "movi v12.4s, #0x0\n"
2650 "movi v13.4s, #0x0\n"
2651 "movi v14.4s, #0x0\n"
2652 "movi v15.4s, #0x0\n"
2653 "movi v16.4s, #0x0\n"
2654 "movi v17.4s, #0x0\n"
2655 "movi v18.4s, #0x0\n"
2656 "movi v19.4s, #0x0\n"
2657 "movi v20.4s, #0x0\n"
2658 "movi v21.4s, #0x0\n"
2659 "movi v22.4s, #0x0\n"
2660 "movi v23.4s, #0x0\n"
2661 "movi v24.4s, #0x0\n"
2662 "movi v25.4s, #0x0\n"
2663 "movi v26.4s, #0x0\n"
2664 "movi v27.4s, #0x0\n"
2665 "movi v28.4s, #0x0\n"
2666 "movi v29.4s, #0x0\n"
2667 "movi v30.4s, #0x0\n"
2668 "movi v31.4s, #0x0\n"
2672 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
2673 "ldr w27, [x20, x28, LSL #0x2]\n"
2674 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
2675 "tbz %x[flags], #3, 185f\n"
2676 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
2677 "add x20, x20, x21, LSL #3\n"
2678 "ldr x26, [x20, #0x0]\n"
2679 "ldr x25, [x20, #0x8]\n"
2680 "ldr x24, [x20, #0x10]\n"
2681 "ldr x23, [x20, #0x18]\n"
2682 "ldr x22, [x20, #0x20]\n"
2683 "ldr x21, [x20, #0x28]\n"
2685 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
2686 "add x26, x26, x20\n"
2687 "add x25, x25, x20\n"
2688 "add x24, x24, x20\n"
2689 "add x23, x23, x20\n"
2690 "add x22, x22, x20\n"
2691 "add x21, x21, x20\n"
2694 "mov x26, %x[input_ptr]\n"
2695 "add x25, x26, x21\n"
2696 "add x24, x25, x21\n"
2697 "add x23, x24, x21\n"
2698 "add x22, x23, x21\n"
2699 "add x21, x22, x21\n"
2703 "ldr q0, [x26, #0x0]\n"
2704 "ldr q1, [x25, #0x0]\n"
2706 "ldr q2, [x24, #0x0]\n"
2707 "ldr q3, [x23, #0x0]\n"
2708 "ldr q4, [x22, #0x0]\n"
2709 "ldr q5, [x21, #0x0]\n"
2710 "ldr q6, [x10, #0x0]\n"
2711 "ldr q7, [x10, #0x10]\n"
2714 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
2715 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
2716 "sub x27, x27, #0x10\n"
2717 "add x26, x26, #0x10\n"
2718 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
2719 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
2720 "add x25, x25, #0x10\n"
2721 "add x24, x24, #0x10\n"
2722 ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
2723 ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
2724 "ldr q6, [x10, #0x20]\n"
2725 "add x23, x23, #0x10\n"
2726 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
2727 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
2728 "add x22, x22, #0x10\n"
2729 "add x21, x21, #0x10\n"
2730 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
2731 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
2733 "prfm pldl1keep, [x26, #0x80]\n"
2734 ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
2735 ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
2736 "ldr q7, [x10, #0x30]\n"
2737 "prfm pldl1keep, [x25, #0x80]\n"
2738 ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
2739 ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
2740 "prfm pldl1keep, [x24, #0x80]\n"
2741 "prfm pldl1keep, [x23, #0x80]\n"
2742 ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
2743 ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
2744 "prfm pldl1keep, [x22, #0x80]\n"
2745 "prfm pldl1keep, [x21, #0x80]\n"
2746 ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
2747 ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
2748 "ldr q6, [x10, #0x40]\n"
2749 ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
2750 ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
2751 ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
2752 ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
2753 ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
2754 ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
2755 "ldr q7, [x10, #0x50]\n"
2756 ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
2757 ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
2758 ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
2759 ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
2760 ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
2761 ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
2762 "ldr q6, [x10, #0x60]\n"
2763 ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
2764 ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
2765 ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
2766 ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
2767 ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
2768 ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
2769 "ldr q7, [x10, #0x70]\n"
2770 ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
2771 ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
2772 ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
2773 ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
2774 ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
2775 ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
2776 "ldr q6, [x10, #0x80]\n"
2777 ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
2778 ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
2779 ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
2780 ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
2781 ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
2782 ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
2783 "ldr q7, [x10, #0x90]\n"
2784 ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
2785 ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
2786 ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
2787 ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
2788 ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
2789 ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
2790 "ldr q6, [x10, #0xa0]\n"
2791 ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
2792 ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
2793 ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
2794 ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
2795 ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
2796 ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
2797 "ldr q7, [x10, #0xb0]\n"
2798 ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
2799 ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
2800 ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
2801 ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
2802 ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
2803 ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
2804 "ldr q6, [x10, #0xc0]\n"
2805 ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
2806 ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
2807 ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
2808 ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
2809 ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
2810 ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
2811 "ldr q7, [x10, #0xd0]\n"
2812 ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
2813 ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
2814 ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
2815 ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
2816 ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
2817 ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
2818 "ldr q6, [x10, #0xe0]\n"
2819 ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
2820 ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
2821 ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
2822 ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
2823 ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
2824 ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
2825 "ldr q7, [x10, #0xf0]\n"
2826 "add x10, x10, #0x100\n"
2827 ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
2828 ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
2829 ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
2830 ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
2831 ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
2832 ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
2833 "ldr q6, [x10, #0x0]\n"
2834 ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
2835 "ldr q0, [x26, #0x0]\n"
2836 ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
2837 "ldr q1, [x25, #0x0]\n"
2838 ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
2839 "ldr q2, [x24, #0x0]\n"
2840 ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
2841 "ldr q3, [x23, #0x0]\n"
2842 ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
2843 "ldr q4, [x22, #0x0]\n"
2844 ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
2845 "ldr q5, [x21, #0x0]\n"
2846 "ldr q7, [x10, #0x10]\n"
2849 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
2850 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
2851 "add x26, x26, #0x10\n"
2852 "add x25, x25, #0x10\n"
2853 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
2854 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
2855 "add x24, x24, #0x10\n"
2856 "add x23, x23, #0x10\n"
2857 ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
2858 ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
2859 "ldr q6, [x10, #0x20]\n"
2860 "add x22, x22, #0x10\n"
2861 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
2862 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
2863 "add x21, x21, #0x10\n"
2864 "sub x27, x27, #0x10\n"
2865 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
2866 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
2867 "prfm pldl1keep, [x26, #0x80]\n"
2868 "prfm pldl1keep, [x25, #0x80]\n"
2869 ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
2870 ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
2871 "ldr q7, [x10, #0x30]\n"
2872 "prfm pldl1keep, [x24, #0x80]\n"
2873 ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
2874 ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
2875 "prfm pldl1keep, [x23, #0x80]\n"
2876 "prfm pldl1keep, [x22, #0x80]\n"
2877 ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
2878 ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
2879 "prfm pldl1keep, [x21, #0x80]\n"
2880 ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
2881 ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
2882 "ldr q6, [x10, #0x40]\n"
2883 ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
2884 ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
2885 ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
2886 ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
2887 ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
2888 ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
2889 "ldr q7, [x10, #0x50]\n"
2890 ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
2891 ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
2892 ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
2893 ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
2894 ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
2895 ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
2896 "ldr q6, [x10, #0x60]\n"
2897 ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
2898 ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
2899 ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
2900 ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
2901 ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
2902 ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
2903 "ldr q7, [x10, #0x70]\n"
2904 ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
2905 ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
2906 ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
2907 ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
2908 ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
2909 ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
2910 "ldr q6, [x10, #0x80]\n"
2911 ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
2912 ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
2913 ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
2914 ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
2915 ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
2916 ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
2917 "ldr q7, [x10, #0x90]\n"
2918 ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
2919 ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
2920 ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
2921 ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
2922 ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
2923 ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
2924 "ldr q6, [x10, #0xa0]\n"
2925 ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
2926 ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
2927 ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
2928 ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
2929 ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
2930 ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
2931 "ldr q7, [x10, #0xb0]\n"
2932 ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
2933 ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
2934 ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
2935 ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
2936 ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
2937 ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
2938 "ldr q6, [x10, #0xc0]\n"
2939 ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
2940 ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
2941 ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
2942 ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
2943 ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
2944 ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
2945 "ldr q7, [x10, #0xd0]\n"
2946 ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
2947 ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
2948 ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
2949 ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
2950 ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
2951 ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
2952 "ldr q6, [x10, #0xe0]\n"
2953 ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
2954 ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
2955 ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
2956 ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
2957 ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
2958 ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
2959 "ldr q7, [x10, #0xf0]\n"
2960 "add x10, x10, #0x100\n"
2961 ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
2962 ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
2963 ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
2964 ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
2965 ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
2966 ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
2967 ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
2968 ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
2969 ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
2970 ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
2971 ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
2972 ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
2978 "ldr s7, [x26], #0x4\n"
2979 "ldr s6, [x25], #0x4\n"
2980 "sub x27, x27, #0x4\n"
2982 "ldr s5, [x24], #0x4\n"
2983 "ldr s4, [x23], #0x4\n"
2984 "ldr s3, [x22], #0x4\n"
2985 "ldr s2, [x21], #0x4\n"
2986 "ldr q1, [x10, #0x0]\n"
2987 "ldr q0, [x10, #0x10]\n"
2988 ".inst 0x6f87e028 // udot v8.4s, v1.16b, v7.4b[0]\n"
2989 ".inst 0x6f86e02c // udot v12.4s, v1.16b, v6.4b[0]\n"
2990 ".inst 0x6f85e030 // udot v16.4s, v1.16b, v5.4b[0]\n"
2991 ".inst 0x6f84e034 // udot v20.4s, v1.16b, v4.4b[0]\n"
2992 ".inst 0x6f83e038 // udot v24.4s, v1.16b, v3.4b[0]\n"
2993 ".inst 0x6f82e03c // udot v28.4s, v1.16b, v2.4b[0]\n"
2994 "ldr q1, [x10, #0x20]\n"
2995 ".inst 0x6f87e009 // udot v9.4s, v0.16b, v7.4b[0]\n"
2996 ".inst 0x6f86e00d // udot v13.4s, v0.16b, v6.4b[0]\n"
2997 ".inst 0x6f85e011 // udot v17.4s, v0.16b, v5.4b[0]\n"
2998 ".inst 0x6f84e015 // udot v21.4s, v0.16b, v4.4b[0]\n"
2999 ".inst 0x6f83e019 // udot v25.4s, v0.16b, v3.4b[0]\n"
3000 ".inst 0x6f82e01d // udot v29.4s, v0.16b, v2.4b[0]\n"
3001 "ldr q0, [x10, #0x30]\n"
3002 "add x10, x10, #0x40\n"
3003 ".inst 0x6f87e02a // udot v10.4s, v1.16b, v7.4b[0]\n"
3004 ".inst 0x6f86e02e // udot v14.4s, v1.16b, v6.4b[0]\n"
3005 ".inst 0x6f85e032 // udot v18.4s, v1.16b, v5.4b[0]\n"
3006 ".inst 0x6f84e036 // udot v22.4s, v1.16b, v4.4b[0]\n"
3007 ".inst 0x6f83e03a // udot v26.4s, v1.16b, v3.4b[0]\n"
3008 ".inst 0x6f82e03e // udot v30.4s, v1.16b, v2.4b[0]\n"
3009 ".inst 0x6f87e00b // udot v11.4s, v0.16b, v7.4b[0]\n"
3010 ".inst 0x6f86e00f // udot v15.4s, v0.16b, v6.4b[0]\n"
3011 ".inst 0x6f85e013 // udot v19.4s, v0.16b, v5.4b[0]\n"
3012 ".inst 0x6f84e017 // udot v23.4s, v0.16b, v4.4b[0]\n"
3013 ".inst 0x6f83e01b // udot v27.4s, v0.16b, v3.4b[0]\n"
3014 ".inst 0x6f82e01f // udot v31.4s, v0.16b, v2.4b[0]\n"
3018 "tbz x27, #1, 192f\n"
3019 "ldr h0, [x26], #0x2\n"
3020 "ldr h1, [x25], #0x2\n"
3021 "ldr h2, [x24], #0x2\n"
3022 "ldr h3, [x23], #0x2\n"
3023 "ldr h4, [x22], #0x2\n"
3024 "ldr h5, [x21], #0x2\n"
3025 "tbz x27, #0, 193f\n"
3026 "ld1 { v0.b }[2], [x26]\n"
3027 "ld1 { v1.b }[2], [x25]\n"
3028 "ld1 { v2.b }[2], [x24]\n"
3029 "ld1 { v3.b }[2], [x23]\n"
3030 "ld1 { v4.b }[2], [x22]\n"
3031 "ld1 { v5.b }[2], [x21]\n"
3034 "ldr b0, [x26, #0x0]\n"
3035 "ldr b1, [x25, #0x0]\n"
3036 "ldr b2, [x24, #0x0]\n"
3037 "ldr b3, [x23, #0x0]\n"
3038 "ldr b4, [x22, #0x0]\n"
3039 "ldr b5, [x21, #0x0]\n"
3041 "ldr q7, [x10, #0x0]\n"
3042 "ldr q6, [x10, #0x10]\n"
3043 ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
3044 ".inst 0x6f81e0ec // udot v12.4s, v7.16b, v1.4b[0]\n"
3045 ".inst 0x6f82e0f0 // udot v16.4s, v7.16b, v2.4b[0]\n"
3046 ".inst 0x6f83e0f4 // udot v20.4s, v7.16b, v3.4b[0]\n"
3047 ".inst 0x6f84e0f8 // udot v24.4s, v7.16b, v4.4b[0]\n"
3048 ".inst 0x6f85e0fc // udot v28.4s, v7.16b, v5.4b[0]\n"
3049 "ldr q7, [x10, #0x20]\n"
3050 ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n"
3051 ".inst 0x6f81e0cd // udot v13.4s, v6.16b, v1.4b[0]\n"
3052 ".inst 0x6f82e0d1 // udot v17.4s, v6.16b, v2.4b[0]\n"
3053 ".inst 0x6f83e0d5 // udot v21.4s, v6.16b, v3.4b[0]\n"
3054 ".inst 0x6f84e0d9 // udot v25.4s, v6.16b, v4.4b[0]\n"
3055 ".inst 0x6f85e0dd // udot v29.4s, v6.16b, v5.4b[0]\n"
3056 "ldr q6, [x10, #0x30]\n"
3057 "add x10, x10, #0x40\n"
3058 ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n"
3059 ".inst 0x6f81e0ee // udot v14.4s, v7.16b, v1.4b[0]\n"
3060 ".inst 0x6f82e0f2 // udot v18.4s, v7.16b, v2.4b[0]\n"
3061 ".inst 0x6f83e0f6 // udot v22.4s, v7.16b, v3.4b[0]\n"
3062 ".inst 0x6f84e0fa // udot v26.4s, v7.16b, v4.4b[0]\n"
3063 ".inst 0x6f85e0fe // udot v30.4s, v7.16b, v5.4b[0]\n"
3064 ".inst 0x6f80e0cb // udot v11.4s, v6.16b, v0.4b[0]\n"
3065 ".inst 0x6f81e0cf // udot v15.4s, v6.16b, v1.4b[0]\n"
3066 ".inst 0x6f82e0d3 // udot v19.4s, v6.16b, v2.4b[0]\n"
3067 ".inst 0x6f83e0d7 // udot v23.4s, v6.16b, v3.4b[0]\n"
3068 ".inst 0x6f84e0db // udot v27.4s, v6.16b, v4.4b[0]\n"
3069 ".inst 0x6f85e0df // udot v31.4s, v6.16b, v5.4b[0]\n"
3071 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
3072 "add x28, x28, #0x1\n"
3075 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
3076 "add x24, x9, x20, LSL #2\n"
3077 "add x23, x24, x20, LSL #2\n"
3078 "prfm pstl1keep, [x9, #0x0]\n"
3079 "add x22, x23, x20, LSL #2\n"
3080 "add x21, x22, x20, LSL #2\n"
3081 "prfm pstl1keep, [x24, #0x0]\n"
3082 "prfm pstl1keep, [x23, #0x0]\n"
3083 "add x20, x21, x20, LSL #2\n"
3085 "prfm pstl1keep, [x22, #0x0]\n"
3086 "prfm pstl1keep, [x21, #0x0]\n"
3087 "prfm pstl1keep, [x20, #0x0]\n"
3089 "tbz x11, #3, 198f\n"
3090 "st1 { v8.4s }, [x9], #0x10\n"
3091 "st1 { v9.4s }, [x9], #0x10\n"
3092 "st1 { v12.4s }, [x24], #0x10\n"
3093 "st1 { v13.4s }, [x24], #0x10\n"
3094 "st1 { v16.4s }, [x23], #0x10\n"
3095 "st1 { v17.4s }, [x23], #0x10\n"
3096 "st1 { v20.4s }, [x22], #0x10\n"
3097 "st1 { v21.4s }, [x22], #0x10\n"
3098 "st1 { v24.4s }, [x21], #0x10\n"
3099 "st1 { v25.4s }, [x21], #0x10\n"
3100 "st1 { v28.4s }, [x20], #0x10\n"
3101 "st1 { v29.4s }, [x20], #0x10\n"
3102 "tbz x11, #2, 196f\n"
3103 "st1 { v10.4s }, [x9], #0x10\n"
3104 "st1 { v14.4s }, [x24], #0x10\n"
3105 "st1 { v18.4s }, [x23], #0x10\n"
3106 "st1 { v22.4s }, [x22], #0x10\n"
3107 "st1 { v26.4s }, [x21], #0x10\n"
3108 "st1 { v30.4s }, [x20], #0x10\n"
3109 "tbz x11, #1, 195f\n"
3110 "str d11, [x9], #0x8\n"
3111 "str d15, [x24], #0x8\n"
3112 "str d19, [x23], #0x8\n"
3113 "str d23, [x22], #0x8\n"
3114 "str d27, [x21], #0x8\n"
3115 "str d31, [x20], #0x8\n"
3116 "tbz x11, #0, 202f\n"
3117 "st1 { v11.s }[2], [x9]\n"
3118 "st1 { v15.s }[2], [x24]\n"
3119 "st1 { v19.s }[2], [x23]\n"
3120 "st1 { v23.s }[2], [x22]\n"
3121 "st1 { v27.s }[2], [x21]\n"
3122 "st1 { v31.s }[2], [x20]\n"
3125 "tbz x11, #0, 202f\n"
3126 "str s11, [x9, #0x0]\n"
3127 "str s15, [x24, #0x0]\n"
3128 "str s19, [x23, #0x0]\n"
3129 "str s23, [x22, #0x0]\n"
3130 "str s27, [x21, #0x0]\n"
3131 "str s31, [x20, #0x0]\n"
3134 "tbz x11, #1, 197f\n"
3135 "str d10, [x9], #0x8\n"
3136 "str d14, [x24], #0x8\n"
3137 "str d18, [x23], #0x8\n"
3138 "str d22, [x22], #0x8\n"
3139 "str d26, [x21], #0x8\n"
3140 "str d30, [x20], #0x8\n"
3141 "tbz x11, #0, 202f\n"
3142 "st1 { v10.s }[2], [x9]\n"
3143 "st1 { v14.s }[2], [x24]\n"
3144 "st1 { v18.s }[2], [x23]\n"
3145 "st1 { v22.s }[2], [x22]\n"
3146 "st1 { v26.s }[2], [x21]\n"
3147 "st1 { v30.s }[2], [x20]\n"
3150 "tbz x11, #0, 202f\n"
3151 "str s10, [x9, #0x0]\n"
3152 "str s14, [x24, #0x0]\n"
3153 "str s18, [x23, #0x0]\n"
3154 "str s22, [x22, #0x0]\n"
3155 "str s26, [x21, #0x0]\n"
3156 "str s30, [x20, #0x0]\n"
3159 "tbz x11, #2, 200f\n"
3160 "st1 { v8.4s }, [x9], #0x10\n"
3161 "st1 { v12.4s }, [x24], #0x10\n"
3162 "st1 { v16.4s }, [x23], #0x10\n"
3163 "st1 { v20.4s }, [x22], #0x10\n"
3164 "st1 { v24.4s }, [x21], #0x10\n"
3165 "st1 { v28.4s }, [x20], #0x10\n"
3166 "tbz x11, #1, 199f\n"
3167 "str d9, [x9], #0x8\n"
3168 "str d13, [x24], #0x8\n"
3169 "str d17, [x23], #0x8\n"
3170 "str d21, [x22], #0x8\n"
3171 "str d25, [x21], #0x8\n"
3172 "str d29, [x20], #0x8\n"
3173 "tbz x11, #0, 202f\n"
3174 "st1 { v9.s }[2], [x9]\n"
3175 "st1 { v13.s }[2], [x24]\n"
3176 "st1 { v17.s }[2], [x23]\n"
3177 "st1 { v21.s }[2], [x22]\n"
3178 "st1 { v25.s }[2], [x21]\n"
3179 "st1 { v29.s }[2], [x20]\n"
3182 "tbz x11, #0, 202f\n"
3183 "str s9, [x9, #0x0]\n"
3184 "str s13, [x24, #0x0]\n"
3185 "str s17, [x23, #0x0]\n"
3186 "str s21, [x22, #0x0]\n"
3187 "str s25, [x21, #0x0]\n"
3188 "str s29, [x20, #0x0]\n"
3191 "tbz x11, #1, 201f\n"
3192 "str d8, [x9], #0x8\n"
3193 "str d12, [x24], #0x8\n"
3194 "str d16, [x23], #0x8\n"
3195 "str d20, [x22], #0x8\n"
3196 "str d24, [x21], #0x8\n"
3197 "str d28, [x20], #0x8\n"
3198 "tbz x11, #0, 202f\n"
3199 "st1 { v8.s }[2], [x9]\n"
3200 "st1 { v12.s }[2], [x24]\n"
3201 "st1 { v16.s }[2], [x23]\n"
3202 "st1 { v20.s }[2], [x22]\n"
3203 "st1 { v24.s }[2], [x21]\n"
3204 "st1 { v28.s }[2], [x20]\n"
3207 "str s8, [x9, #0x0]\n"
3208 "str s12, [x24, #0x0]\n"
3209 "str s16, [x23, #0x0]\n"
3210 "str s20, [x22, #0x0]\n"
3211 "str s24, [x21, #0x0]\n"
3212 "str s28, [x20, #0x0]\n"
3216 "str q8, [x9, #0x0]\n"
3217 "str q9, [x9, #0x10]\n"
3218 "str q10, [x9, #0x20]\n"
3219 "str q11, [x9, #0x30]\n"
3220 "add x9, x9, #0x40\n"
3221 "str q12, [x24, #0x0]\n"
3222 "str q13, [x24, #0x10]\n"
3223 "str q14, [x24, #0x20]\n"
3224 "str q15, [x24, #0x30]\n"
3225 "str q16, [x23, #0x0]\n"
3226 "str q17, [x23, #0x10]\n"
3227 "str q18, [x23, #0x20]\n"
3228 "str q19, [x23, #0x30]\n"
3229 "str q20, [x22, #0x0]\n"
3230 "str q21, [x22, #0x10]\n"
3231 "str q22, [x22, #0x20]\n"
3232 "str q23, [x22, #0x30]\n"
3233 "str q24, [x21, #0x0]\n"
3234 "str q25, [x21, #0x10]\n"
3235 "str q26, [x21, #0x20]\n"
3236 "str q27, [x21, #0x30]\n"
3237 "str q28, [x20, #0x0]\n"
3238 "str q29, [x20, #0x10]\n"
3239 "str q30, [x20, #0x20]\n"
3240 "str q31, [x20, #0x30]\n"
3242 "subs x11, x11, #0x10\n"
3244 "subs %x[M], %x[M], #0x6\n"
3246 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
3247 "tbz %x[flags], #3, 205f\n"
3248 "add x21, x21, #0x6\n"
3249 "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
3253 "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
3256 : [
M]
"+&r" (
M), [input_ptr]
"+&r" (input_ptr), [output_ptr]
"+&r" (output_ptr)
3257 : [args_ptr]
"r" (&ka), [flags]
"r" (flags), [offsetof_B_ptr]
"I" (offsetof(KernelArgs, B_ptr)), [offsetof_N]
"I" (offsetof(KernelArgs,
N)), [offsetof_input_initial_col]
"I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset]
"I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings]
"I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset]
"I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths]
"I" (offsetof(KernelArgs, string_lengths))
3258 :
"cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"x9",
"x10",
"x11",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28"
3263 #endif // __aarch64__