27 #include "../../utils.hpp"
33 void a64_hybrid_u8u32_dot_6x16_a55 (
34 unsigned int num_strings,
const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
35 size_t M,
size_t N,
const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
40 unsigned int num_strings = {};
41 const unsigned int *string_lengths = {};
43 const uint8_t *B_ptr = {};
44 size_t output_offset = {};
45 size_t input_initial_col = {};
46 size_t input_offset = {};
49 unsigned long flags=0;
53 if (output_arg.is_indirect) {
54 output_ptr=(
void *)(output_arg.indirect.ptr);
55 ka.output_offset=output_arg.indirect.offset;
58 output_ptr=(
void *)(output_arg.direct.base);
59 ka.output_offset=output_arg.direct.stride;
62 if (A_arg.is_indirect) {
63 input_ptr=(
void *)(A_arg.indirect.ptr);
64 ka.input_offset=A_arg.indirect.start_row;
65 ka.input_initial_col=A_arg.indirect.start_col;
68 assert(num_strings==1);
69 input_ptr=(
void *)(A_arg.direct.base);
70 ka.input_offset=A_arg.direct.stride;
75 ka.num_strings = num_strings;
76 ka.string_lengths = string_lengths;
89 "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
90 "mov x17, %x[output_ptr]\n"
91 "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
93 "tbz %x[flags], #0, 12f\n"
97 "ld1 { v8.4s }, [x17], #0x10\n"
98 "ld1 { v9.4s }, [x17], #0x10\n"
100 "ld1 { v10.4s }, [x17], #0x10\n"
102 "ldr d11, [x17], #0x8\n"
105 "ld1 { v11.s }[2], [x17]\n"
110 "ldr s11, [x17, #0x0]\n"
114 "ldr d10, [x17], #0x8\n"
117 "ld1 { v10.s }[2], [x17]\n"
122 "ldr s10, [x17, #0x0]\n"
126 "ld1 { v8.4s }, [x17], #0x10\n"
128 "ldr d9, [x17], #0x8\n"
131 "ld1 { v9.s }[2], [x17]\n"
136 "ldr s9, [x17, #0x0]\n"
140 "ldr d8, [x17], #0x8\n"
143 "ld1 { v8.s }[2], [x17]\n"
146 "ldr s8, [x17, #0x0]\n"
149 "sub x17, x17, x25\n"
152 "ldr q8, [x17, #0x0]\n"
153 "ldr q9, [x17, #0x10]\n"
154 "ldr q10, [x17, #0x20]\n"
155 "ldr q11, [x17, #0x30]\n"
160 "movi v10.4s, #0x0\n"
161 "movi v11.4s, #0x0\n"
165 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
166 "ldr w14, [x20, x15, LSL #0x2]\n"
167 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
168 "tbz %x[flags], #3, 15f\n"
169 "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
170 "add x20, x20, x21, LSL #3\n"
171 "ldr x13, [x20, #0x0]\n"
173 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
174 "add x13, x13, x20\n"
177 "mov x13, %x[input_ptr]\n"
181 "ldr q0, [x13, #0x0]\n"
183 "ldr q6, [x16, #0x0]\n"
184 "ldr q7, [x16, #0x10]\n"
187 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
188 "ldr d17, [x16, #0x20]\n"
189 "ldr x20, [x16, #0x28]\n"
190 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
191 "ldr d16, [x16, #0x30]\n"
192 "mov v17.d[1], x20\n"
193 "ldr x20, [x16, #0x38]\n"
194 "mov v16.d[1], x20\n"
195 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
196 "ldr d17, [x16, #0x40]\n"
197 "ldr x20, [x16, #0x48]\n"
198 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
199 "ldr d16, [x16, #0x50]\n"
200 "mov v17.d[1], x20\n"
201 "ldr x20, [x16, #0x58]\n"
202 "mov v16.d[1], x20\n"
203 ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
204 "ldr d17, [x16, #0x60]\n"
205 "ldr x20, [x16, #0x68]\n"
206 ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
207 "ldr d16, [x16, #0x70]\n"
208 "mov v17.d[1], x20\n"
209 "ldr x20, [x16, #0x78]\n"
210 "mov v16.d[1], x20\n"
211 ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
212 "ldr d17, [x16, #0x80]\n"
213 "ldr x20, [x16, #0x88]\n"
214 ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
215 "ldr d16, [x16, #0x90]\n"
216 "mov v17.d[1], x20\n"
217 "ldr x20, [x16, #0x98]\n"
218 "mov v16.d[1], x20\n"
219 ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
220 "ldr d17, [x16, #0xa0]\n"
221 "ldr x20, [x16, #0xa8]\n"
222 ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
223 "ldr d16, [x16, #0xb0]\n"
224 "mov v17.d[1], x20\n"
225 "ldr x20, [x16, #0xb8]\n"
226 "mov v16.d[1], x20\n"
227 ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
228 "ldr d17, [x16, #0xc0]\n"
229 "ldr x20, [x16, #0xc8]\n"
230 ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
231 "ldr d16, [x16, #0xd0]\n"
232 "mov v17.d[1], x20\n"
233 "ldr x20, [x16, #0xd8]\n"
234 "mov v16.d[1], x20\n"
235 ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
236 "ldr d17, [x16, #0xe0]\n"
237 "ldr x20, [x16, #0xe8]\n"
238 ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
239 "ldr d16, [x16, #0xf0]\n"
240 "mov v17.d[1], x20\n"
241 "ldr x20, [x16, #0xf8]\n"
242 "mov v16.d[1], x20\n"
243 "add x13, x13, #0x10\n"
244 "add x16, x16, #0x100\n"
245 ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
246 "ldr d6, [x16, #0x0]\n"
247 "ldr x20, [x16, #0x8]\n"
248 ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
249 "ldr d0, [x13, #0x0]\n"
250 "sub x14, x14, #0x10\n"
251 "ldr d7, [x16, #0x10]\n"
253 "ldr x21, [x13, #0x8]\n"
255 "ldr x20, [x16, #0x18]\n"
258 "prfm pldl1keep, [x13, #0x80]\n"
261 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
262 "ldr q17, [x16, #0x20]\n"
263 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
264 "ldr q16, [x16, #0x30]\n"
265 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
266 "ldr q17, [x16, #0x40]\n"
267 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
268 "ldr q16, [x16, #0x50]\n"
269 ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
270 "ldr q17, [x16, #0x60]\n"
271 ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
272 "ldr q16, [x16, #0x70]\n"
273 ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
274 "ldr q17, [x16, #0x80]\n"
275 ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
276 "ldr q16, [x16, #0x90]\n"
277 ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
278 "ldr q17, [x16, #0xa0]\n"
279 ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
280 "ldr q16, [x16, #0xb0]\n"
281 ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
282 "ldr q17, [x16, #0xc0]\n"
283 ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
284 "ldr q16, [x16, #0xd0]\n"
285 ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
286 "ldr q17, [x16, #0xe0]\n"
287 ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
288 "ldr q16, [x16, #0xf0]\n"
289 "add x13, x13, #0x10\n"
290 "sub x14, x14, #0x10\n"
291 ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
292 "prfm pldl1keep, [x13, #0x80]\n"
293 ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
294 "add x16, x16, #0x100\n"
300 "ldr s18, [x13], #0x4\n"
301 "sub x14, x14, #0x4\n"
302 "ldr q16, [x16, #0x0]\n"
303 ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n"
304 "ldr q16, [x16, #0x10]\n"
305 ".inst 0x6f92e209 // udot v9.4s, v16.16b, v18.4b[0]\n"
306 "ldr q17, [x16, #0x20]\n"
308 "ldr q16, [x16, #0x30]\n"
309 ".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n"
310 ".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n"
311 "add x16, x16, #0x40\n"
316 "ldr h0, [x13], #0x2\n"
318 "ld1 { v0.b }[2], [x13]\n"
321 "ldr b0, [x13, #0x0]\n"
323 "ldr q16, [x16, #0x0]\n"
324 ".inst 0x6f80e208 // udot v8.4s, v16.16b, v0.4b[0]\n"
325 "ldr q16, [x16, #0x10]\n"
326 ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
327 "ldr q16, [x16, #0x20]\n"
328 ".inst 0x6f80e20a // udot v10.4s, v16.16b, v0.4b[0]\n"
329 "ldr q16, [x16, #0x30]\n"
330 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
331 "add x16, x16, #0x40\n"
333 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
334 "add x15, x15, #0x1\n"
338 "prfm pstl1keep, [x17, #0x0]\n"
341 "st1 { v8.4s }, [x17], #0x10\n"
342 "st1 { v9.4s }, [x17], #0x10\n"
344 "st1 { v10.4s }, [x17], #0x10\n"
346 "str d11, [x17], #0x8\n"
348 "st1 { v11.s }[2], [x17]\n"
352 "str s11, [x17, #0x0]\n"
356 "str d10, [x17], #0x8\n"
358 "st1 { v10.s }[2], [x17]\n"
362 "str s10, [x17, #0x0]\n"
366 "st1 { v8.4s }, [x17], #0x10\n"
368 "str d9, [x17], #0x8\n"
370 "st1 { v9.s }[2], [x17]\n"
374 "str s9, [x17, #0x0]\n"
378 "str d8, [x17], #0x8\n"
380 "st1 { v8.s }[2], [x17]\n"
383 "str s8, [x17, #0x0]\n"
387 "str q8, [x17, #0x0]\n"
388 "str q9, [x17, #0x10]\n"
389 "str q10, [x17, #0x20]\n"
390 "str q11, [x17, #0x30]\n"
391 "add x17, x17, #0x40\n"
393 "subs x8, x8, #0x10\n"
397 "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
398 "mov x17, %x[output_ptr]\n"
399 "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
401 "tbz %x[flags], #0, 46f\n"
402 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
404 "add x24, x17, x20, LSL #2\n"
407 "ld1 { v8.4s }, [x17], #0x10\n"
408 "ld1 { v12.4s }, [x24], #0x10\n"
409 "ld1 { v9.4s }, [x17], #0x10\n"
410 "ld1 { v13.4s }, [x24], #0x10\n"
412 "ld1 { v10.4s }, [x17], #0x10\n"
413 "ld1 { v14.4s }, [x24], #0x10\n"
415 "ldr d11, [x17], #0x8\n"
417 "ldr d15, [x24], #0x8\n"
419 "ld1 { v11.s }[2], [x17]\n"
420 "ld1 { v15.s }[2], [x24]\n"
425 "ldr s11, [x17, #0x0]\n"
426 "ldr s15, [x24, #0x0]\n"
430 "ldr d10, [x17], #0x8\n"
432 "ldr d14, [x24], #0x8\n"
434 "ld1 { v10.s }[2], [x17]\n"
435 "ld1 { v14.s }[2], [x24]\n"
440 "ldr s10, [x17, #0x0]\n"
441 "ldr s14, [x24, #0x0]\n"
445 "ld1 { v8.4s }, [x17], #0x10\n"
446 "ld1 { v12.4s }, [x24], #0x10\n"
448 "ldr d9, [x17], #0x8\n"
450 "ldr d13, [x24], #0x8\n"
452 "ld1 { v9.s }[2], [x17]\n"
453 "ld1 { v13.s }[2], [x24]\n"
458 "ldr s9, [x17, #0x0]\n"
459 "ldr s13, [x24, #0x0]\n"
463 "ldr d8, [x17], #0x8\n"
465 "ldr d12, [x24], #0x8\n"
467 "ld1 { v8.s }[2], [x17]\n"
468 "ld1 { v12.s }[2], [x24]\n"
471 "ldr s8, [x17, #0x0]\n"
473 "ldr s12, [x24, #0x0]\n"
475 "sub x17, x17, x25\n"
478 "ldr q8, [x17, #0x0]\n"
479 "ldr q9, [x17, #0x10]\n"
480 "ldr q10, [x17, #0x20]\n"
481 "ldr q11, [x17, #0x30]\n"
482 "ldr q12, [x24, #0x0]\n"
483 "ldr q13, [x24, #0x10]\n"
484 "ldr q14, [x24, #0x20]\n"
485 "ldr q15, [x24, #0x30]\n"
490 "movi v10.4s, #0x0\n"
491 "movi v11.4s, #0x0\n"
492 "movi v12.4s, #0x0\n"
493 "movi v13.4s, #0x0\n"
494 "movi v14.4s, #0x0\n"
495 "movi v15.4s, #0x0\n"
499 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
500 "ldr w14, [x20, x15, LSL #0x2]\n"
501 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
502 "tbz %x[flags], #3, 49f\n"
503 "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
504 "add x20, x20, x21, LSL #3\n"
505 "ldr x13, [x20, #0x0]\n"
506 "ldr x12, [x20, #0x8]\n"
508 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
509 "add x13, x13, x20\n"
510 "add x12, x12, x20\n"
513 "mov x13, %x[input_ptr]\n"
514 "add x12, x13, x21\n"
518 "ldr q0, [x13, #0x0]\n"
520 "ldr q1, [x12, #0x0]\n"
521 "ldr q6, [x16, #0x0]\n"
522 "ldr q7, [x16, #0x10]\n"
525 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
526 "ldr x21, [x16, #0x28]\n"
527 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
528 "ldr d17, [x16, #0x20]\n"
529 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
530 "ldr x20, [x16, #0x38]\n"
531 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
532 "ldr d16, [x16, #0x30]\n"
533 "mov v17.d[1], x21\n"
534 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
535 "mov v16.d[1], x20\n"
536 ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
537 "ldr d17, [x16, #0x40]\n"
538 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
539 "ldr x20, [x16, #0x48]\n"
540 ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
541 "ldr d16, [x16, #0x50]\n"
542 "mov v17.d[1], x20\n"
543 "ldr x20, [x16, #0x58]\n"
544 "mov v16.d[1], x20\n"
545 ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
546 "ldr x21, [x16, #0x68]\n"
547 ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
548 "ldr d17, [x16, #0x60]\n"
549 ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
550 "ldr x20, [x16, #0x78]\n"
551 ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
552 "ldr d16, [x16, #0x70]\n"
553 "mov v17.d[1], x21\n"
554 ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
555 "mov v16.d[1], x20\n"
556 ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
557 "ldr d17, [x16, #0x80]\n"
558 ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
559 "ldr x20, [x16, #0x88]\n"
560 ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
561 "ldr d16, [x16, #0x90]\n"
562 "mov v17.d[1], x20\n"
563 "ldr x20, [x16, #0x98]\n"
564 "mov v16.d[1], x20\n"
565 ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
566 "ldr x21, [x16, #0xa8]\n"
567 ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
568 "ldr d17, [x16, #0xa0]\n"
569 ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
570 "ldr x20, [x16, #0xb8]\n"
571 ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
572 "ldr d16, [x16, #0xb0]\n"
573 "mov v17.d[1], x21\n"
574 ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
575 "mov v16.d[1], x20\n"
576 ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
577 "ldr d17, [x16, #0xc0]\n"
578 ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
579 "ldr x20, [x16, #0xc8]\n"
580 ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
581 "ldr d16, [x16, #0xd0]\n"
582 "mov v17.d[1], x20\n"
583 "ldr x20, [x16, #0xd8]\n"
584 "mov v16.d[1], x20\n"
585 ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
586 "ldr x21, [x16, #0xe8]\n"
587 ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
588 "ldr d17, [x16, #0xe0]\n"
589 ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
590 "ldr x20, [x16, #0xf8]\n"
591 ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
592 "ldr d16, [x16, #0xf0]\n"
593 "mov v17.d[1], x21\n"
594 "add x13, x13, #0x10\n"
595 "mov v16.d[1], x20\n"
596 "add x12, x12, #0x10\n"
597 "add x16, x16, #0x100\n"
598 ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
599 ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
600 "ldr d6, [x16, #0x0]\n"
601 "ldr x21, [x16, #0x8]\n"
602 ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
603 "ldr d0, [x13, #0x0]\n"
604 ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
605 "ldr d1, [x12, #0x0]\n"
606 "sub x14, x14, #0x10\n"
607 "ldr d7, [x16, #0x10]\n"
609 "ldr x20, [x13, #0x8]\n"
611 "ldr x21, [x12, #0x8]\n"
613 "ldr x20, [x16, #0x18]\n"
615 "prfm pldl1keep, [x13, #0x80]\n"
617 "prfm pldl1keep, [x12, #0x80]\n"
620 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
621 "add x13, x13, #0x10\n"
622 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
623 "ldr q17, [x16, #0x20]\n"
624 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
625 "add x12, x12, #0x10\n"
626 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
627 "ldr q16, [x16, #0x30]\n"
628 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
629 "sub x14, x14, #0x10\n"
630 ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
631 "ldr q17, [x16, #0x40]\n"
632 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
633 "prfm pldl1keep, [x13, #0x80]\n"
634 ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
635 "ldr q16, [x16, #0x50]\n"
636 ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n"
637 "prfm pldl1keep, [x12, #0x80]\n"
638 ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n"
639 "ldr q17, [x16, #0x60]\n"
640 ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n"
641 ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n"
642 "ldr q16, [x16, #0x70]\n"
643 ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n"
644 ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n"
645 "ldr q17, [x16, #0x80]\n"
646 ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n"
647 ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n"
648 "ldr q16, [x16, #0x90]\n"
649 ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n"
650 ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n"
651 "ldr q17, [x16, #0xa0]\n"
652 ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n"
653 ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n"
654 "ldr q16, [x16, #0xb0]\n"
655 ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n"
656 ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n"
657 "ldr q17, [x16, #0xc0]\n"
658 ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n"
659 ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n"
660 "ldr q16, [x16, #0xd0]\n"
661 ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n"
662 ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n"
663 "ldr q17, [x16, #0xe0]\n"
664 ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n"
665 ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n"
666 "ldr q16, [x16, #0xf0]\n"
667 ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n"
668 "add x16, x16, #0x100\n"
669 ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n"
670 ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n"
671 ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n"
677 "ldr s19, [x13], #0x4\n"
678 "sub x14, x14, #0x4\n"
679 "ldr s18, [x12], #0x4\n"
681 "ldr q17, [x16, #0x0]\n"
682 ".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n"
683 "ldr q16, [x16, #0x10]\n"
684 ".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n"
685 "ldr q17, [x16, #0x20]\n"
686 ".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n"
687 ".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n"
688 "ldr q16, [x16, #0x30]\n"
689 ".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n"
690 "add x16, x16, #0x40\n"
691 ".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n"
692 ".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n"
693 ".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n"
698 "ldr h0, [x13], #0x2\n"
699 "ldr h1, [x12], #0x2\n"
701 "ld1 { v0.b }[2], [x13]\n"
702 "ld1 { v1.b }[2], [x12]\n"
705 "ldr b0, [x13, #0x0]\n"
706 "ldr b1, [x12, #0x0]\n"
708 "ldr q17, [x16, #0x0]\n"
709 ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n"
710 "ldr q16, [x16, #0x10]\n"
711 ".inst 0x6f81e22c // udot v12.4s, v17.16b, v1.4b[0]\n"
712 "ldr q17, [x16, #0x20]\n"
713 ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n"
714 ".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n"
715 "ldr q16, [x16, #0x30]\n"
716 ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n"
717 "add x16, x16, #0x40\n"
718 ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n"
719 ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n"
720 ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n"
722 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
723 "add x15, x15, #0x1\n"
726 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
727 "add x24, x17, x20, LSL #2\n"
729 "prfm pstl1keep, [x17, #0x0]\n"
730 "prfm pstl1keep, [x24, #0x0]\n"
733 "st1 { v8.4s }, [x17], #0x10\n"
734 "st1 { v9.4s }, [x17], #0x10\n"
735 "st1 { v12.4s }, [x24], #0x10\n"
736 "st1 { v13.4s }, [x24], #0x10\n"
738 "st1 { v10.4s }, [x17], #0x10\n"
739 "st1 { v14.4s }, [x24], #0x10\n"
741 "str d11, [x17], #0x8\n"
742 "str d15, [x24], #0x8\n"
744 "st1 { v11.s }[2], [x17]\n"
745 "st1 { v15.s }[2], [x24]\n"
749 "str s11, [x17, #0x0]\n"
750 "str s15, [x24, #0x0]\n"
754 "str d10, [x17], #0x8\n"
755 "str d14, [x24], #0x8\n"
757 "st1 { v10.s }[2], [x17]\n"
758 "st1 { v14.s }[2], [x24]\n"
762 "str s10, [x17, #0x0]\n"
763 "str s14, [x24, #0x0]\n"
767 "st1 { v8.4s }, [x17], #0x10\n"
768 "st1 { v12.4s }, [x24], #0x10\n"
770 "str d9, [x17], #0x8\n"
771 "str d13, [x24], #0x8\n"
773 "st1 { v9.s }[2], [x17]\n"
774 "st1 { v13.s }[2], [x24]\n"
778 "str s9, [x17, #0x0]\n"
779 "str s13, [x24, #0x0]\n"
783 "str d8, [x17], #0x8\n"
784 "str d12, [x24], #0x8\n"
786 "st1 { v8.s }[2], [x17]\n"
787 "st1 { v12.s }[2], [x24]\n"
790 "str s8, [x17, #0x0]\n"
791 "str s12, [x24, #0x0]\n"
795 "str q8, [x17, #0x0]\n"
796 "str q9, [x17, #0x10]\n"
797 "str q10, [x17, #0x20]\n"
798 "str q11, [x17, #0x30]\n"
799 "add x17, x17, #0x40\n"
800 "str q12, [x24, #0x0]\n"
801 "str q13, [x24, #0x10]\n"
802 "str q14, [x24, #0x20]\n"
803 "str q15, [x24, #0x30]\n"
805 "subs x8, x8, #0x10\n"
809 "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
810 "mov x17, %x[output_ptr]\n"
811 "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
813 "tbz %x[flags], #0, 80f\n"
814 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
815 "add x24, x17, x20, LSL #2\n"
817 "add x23, x24, x20, LSL #2\n"
820 "ld1 { v8.4s }, [x17], #0x10\n"
821 "ld1 { v12.4s }, [x24], #0x10\n"
822 "ld1 { v16.4s }, [x23], #0x10\n"
823 "ld1 { v9.4s }, [x17], #0x10\n"
824 "ld1 { v13.4s }, [x24], #0x10\n"
825 "ld1 { v17.4s }, [x23], #0x10\n"
827 "ld1 { v10.4s }, [x17], #0x10\n"
828 "ld1 { v14.4s }, [x24], #0x10\n"
829 "ld1 { v18.4s }, [x23], #0x10\n"
831 "ldr d11, [x17], #0x8\n"
833 "ldr d15, [x24], #0x8\n"
834 "ldr d19, [x23], #0x8\n"
836 "ld1 { v11.s }[2], [x17]\n"
837 "ld1 { v15.s }[2], [x24]\n"
838 "ld1 { v19.s }[2], [x23]\n"
843 "ldr s11, [x17, #0x0]\n"
844 "ldr s15, [x24, #0x0]\n"
845 "ldr s19, [x23, #0x0]\n"
849 "ldr d10, [x17], #0x8\n"
851 "ldr d14, [x24], #0x8\n"
852 "ldr d18, [x23], #0x8\n"
854 "ld1 { v10.s }[2], [x17]\n"
855 "ld1 { v14.s }[2], [x24]\n"
856 "ld1 { v18.s }[2], [x23]\n"
861 "ldr s10, [x17, #0x0]\n"
862 "ldr s14, [x24, #0x0]\n"
863 "ldr s18, [x23, #0x0]\n"
867 "ld1 { v8.4s }, [x17], #0x10\n"
868 "ld1 { v12.4s }, [x24], #0x10\n"
869 "ld1 { v16.4s }, [x23], #0x10\n"
871 "ldr d9, [x17], #0x8\n"
873 "ldr d13, [x24], #0x8\n"
874 "ldr d17, [x23], #0x8\n"
876 "ld1 { v9.s }[2], [x17]\n"
877 "ld1 { v13.s }[2], [x24]\n"
878 "ld1 { v17.s }[2], [x23]\n"
883 "ldr s9, [x17, #0x0]\n"
884 "ldr s13, [x24, #0x0]\n"
885 "ldr s17, [x23, #0x0]\n"
889 "ldr d8, [x17], #0x8\n"
891 "ldr d12, [x24], #0x8\n"
892 "ldr d16, [x23], #0x8\n"
894 "ld1 { v8.s }[2], [x17]\n"
895 "ld1 { v12.s }[2], [x24]\n"
896 "ld1 { v16.s }[2], [x23]\n"
899 "ldr s8, [x17, #0x0]\n"
901 "ldr s12, [x24, #0x0]\n"
902 "ldr s16, [x23, #0x0]\n"
904 "sub x17, x17, x25\n"
907 "ldr q8, [x17, #0x0]\n"
908 "ldr q9, [x17, #0x10]\n"
909 "ldr q10, [x17, #0x20]\n"
910 "ldr q11, [x17, #0x30]\n"
911 "ldr q12, [x24, #0x0]\n"
912 "ldr q13, [x24, #0x10]\n"
913 "ldr q14, [x24, #0x20]\n"
914 "ldr q15, [x24, #0x30]\n"
915 "ldr q16, [x23, #0x0]\n"
916 "ldr q17, [x23, #0x10]\n"
917 "ldr q18, [x23, #0x20]\n"
918 "ldr q19, [x23, #0x30]\n"
923 "movi v10.4s, #0x0\n"
924 "movi v11.4s, #0x0\n"
925 "movi v12.4s, #0x0\n"
926 "movi v13.4s, #0x0\n"
927 "movi v14.4s, #0x0\n"
928 "movi v15.4s, #0x0\n"
929 "movi v16.4s, #0x0\n"
930 "movi v17.4s, #0x0\n"
931 "movi v18.4s, #0x0\n"
932 "movi v19.4s, #0x0\n"
936 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
937 "ldr w14, [x20, x15, LSL #0x2]\n"
938 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
939 "tbz %x[flags], #3, 83f\n"
940 "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
941 "add x20, x20, x21, LSL #3\n"
942 "ldr x13, [x20, #0x0]\n"
943 "ldr x12, [x20, #0x8]\n"
944 "ldr x11, [x20, #0x10]\n"
946 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
947 "add x13, x13, x20\n"
948 "add x12, x12, x20\n"
949 "add x11, x11, x20\n"
952 "mov x13, %x[input_ptr]\n"
953 "add x12, x13, x21\n"
954 "add x11, x12, x21\n"
958 "ldr q0, [x13, #0x0]\n"
960 "ldr q1, [x12, #0x0]\n"
961 "ldr q2, [x11, #0x0]\n"
962 "ldr q6, [x16, #0x0]\n"
963 "ldr q7, [x16, #0x10]\n"
966 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
967 "ldr x21, [x16, #0x28]\n"
968 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
969 "ldr x20, [x16, #0x38]\n"
970 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
971 "ldr d21, [x16, #0x20]\n"
972 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
973 "mov v21.d[1], x21\n"
974 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
975 "ldr x21, [x16, #0x48]\n"
976 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
977 "ldr d20, [x16, #0x30]\n"
978 "mov v20.d[1], x20\n"
979 ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
980 ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
981 "ldr x20, [x16, #0x58]\n"
982 ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
983 "ldr d21, [x16, #0x40]\n"
984 ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
985 "mov v21.d[1], x21\n"
986 ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
987 "ldr x21, [x16, #0x68]\n"
988 ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
989 "ldr d20, [x16, #0x50]\n"
990 "mov v20.d[1], x20\n"
991 ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
992 ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
993 "ldr x20, [x16, #0x78]\n"
994 ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
995 "ldr d21, [x16, #0x60]\n"
996 ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
997 "mov v21.d[1], x21\n"
998 ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
999 "ldr x21, [x16, #0x88]\n"
1000 ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
1001 "ldr d20, [x16, #0x70]\n"
1002 "mov v20.d[1], x20\n"
1003 ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
1004 ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
1005 "ldr x20, [x16, #0x98]\n"
1006 ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
1007 "ldr d21, [x16, #0x80]\n"
1008 ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
1009 "mov v21.d[1], x21\n"
1010 ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
1011 "ldr x21, [x16, #0xa8]\n"
1012 ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
1013 "ldr d20, [x16, #0x90]\n"
1014 "mov v20.d[1], x20\n"
1015 ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
1016 ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
1017 "ldr x20, [x16, #0xb8]\n"
1018 ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
1019 "ldr d21, [x16, #0xa0]\n"
1020 ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
1021 "mov v21.d[1], x21\n"
1022 ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
1023 "ldr x21, [x16, #0xc8]\n"
1024 ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
1025 "ldr d20, [x16, #0xb0]\n"
1026 "mov v20.d[1], x20\n"
1027 ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
1028 ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
1029 "ldr x20, [x16, #0xd8]\n"
1030 ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
1031 "ldr d21, [x16, #0xc0]\n"
1032 ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
1033 "mov v21.d[1], x21\n"
1034 ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
1035 "ldr x21, [x16, #0xe8]\n"
1036 ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
1037 "ldr d20, [x16, #0xd0]\n"
1038 "mov v20.d[1], x20\n"
1039 ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
1040 ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
1041 "ldr x20, [x16, #0xf8]\n"
1042 ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
1043 "ldr d21, [x16, #0xe0]\n"
1044 ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
1045 "mov v21.d[1], x21\n"
1046 ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
1047 "add x13, x13, #0x10\n"
1048 ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
1049 "ldr d20, [x16, #0xf0]\n"
1050 "mov v20.d[1], x20\n"
1051 "add x12, x12, #0x10\n"
1052 "add x11, x11, #0x10\n"
1053 "add x16, x16, #0x100\n"
1054 ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
1055 "ldr x20, [x16, #0x8]\n"
1056 ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
1057 "ldr x23, [x13, #0x8]\n"
1058 ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
1059 "ldr d6, [x16, #0x0]\n"
1060 ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
1061 "ldr d0, [x13, #0x0]\n"
1062 ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
1063 "ldr d1, [x12, #0x0]\n"
1064 "ldr x22, [x12, #0x8]\n"
1065 ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
1066 "ldr d2, [x11, #0x0]\n"
1067 "sub x14, x14, #0x10\n"
1068 "ldr d7, [x16, #0x10]\n"
1070 "ldr x21, [x11, #0x8]\n"
1071 "mov v6.d[1], x20\n"
1072 "ldr x20, [x16, #0x18]\n"
1073 "mov v0.d[1], x23\n"
1074 "prfm pldl1keep, [x13, #0x80]\n"
1075 "mov v1.d[1], x22\n"
1076 "prfm pldl1keep, [x12, #0x80]\n"
1077 "mov v2.d[1], x21\n"
1078 "prfm pldl1keep, [x11, #0x80]\n"
1079 "mov v7.d[1], x20\n"
1082 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
1083 "add x13, x13, #0x10\n"
1084 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
1085 "add x12, x12, #0x10\n"
1086 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
1087 "ldr q21, [x16, #0x20]\n"
1088 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
1089 "add x11, x11, #0x10\n"
1090 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
1091 "sub x14, x14, #0x10\n"
1092 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
1093 "ldr q20, [x16, #0x30]\n"
1094 ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
1095 "prfm pldl1keep, [x13, #0x80]\n"
1096 ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
1097 "prfm pldl1keep, [x12, #0x80]\n"
1098 ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
1099 "ldr q21, [x16, #0x40]\n"
1100 ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
1101 "prfm pldl1keep, [x11, #0x80]\n"
1102 ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
1103 ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
1104 "ldr q20, [x16, #0x50]\n"
1105 ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n"
1106 ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n"
1107 ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n"
1108 "ldr q21, [x16, #0x60]\n"
1109 ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n"
1110 ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n"
1111 ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n"
1112 "ldr q20, [x16, #0x70]\n"
1113 ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n"
1114 ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n"
1115 ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n"
1116 "ldr q21, [x16, #0x80]\n"
1117 ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n"
1118 ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n"
1119 ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n"
1120 "ldr q20, [x16, #0x90]\n"
1121 ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n"
1122 ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n"
1123 ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n"
1124 "ldr q21, [x16, #0xa0]\n"
1125 ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n"
1126 ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n"
1127 ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n"
1128 "ldr q20, [x16, #0xb0]\n"
1129 ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n"
1130 ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n"
1131 ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n"
1132 "ldr q21, [x16, #0xc0]\n"
1133 ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n"
1134 ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n"
1135 ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n"
1136 "ldr q20, [x16, #0xd0]\n"
1137 ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n"
1138 ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n"
1139 ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n"
1140 "ldr q21, [x16, #0xe0]\n"
1141 ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n"
1142 ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n"
1143 ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n"
1144 "ldr q20, [x16, #0xf0]\n"
1145 ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n"
1146 "add x16, x16, #0x100\n"
1147 ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n"
1148 ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n"
1149 ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n"
1150 ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n"
1151 ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n"
1157 "ldr s24, [x13], #0x4\n"
1158 "sub x14, x14, #0x4\n"
1159 "ldr s23, [x12], #0x4\n"
1161 "ldr s22, [x11], #0x4\n"
1162 "ldr q21, [x16, #0x0]\n"
1163 ".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n"
1164 "ldr q20, [x16, #0x10]\n"
1165 ".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n"
1166 ".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n"
1167 "ldr q21, [x16, #0x20]\n"
1168 ".inst 0x6f98e289 // udot v9.4s, v20.16b, v24.4b[0]\n"
1169 ".inst 0x6f97e28d // udot v13.4s, v20.16b, v23.4b[0]\n"
1170 ".inst 0x6f96e291 // udot v17.4s, v20.16b, v22.4b[0]\n"
1171 "ldr q20, [x16, #0x30]\n"
1172 ".inst 0x6f98e2aa // udot v10.4s, v21.16b, v24.4b[0]\n"
1173 "add x16, x16, #0x40\n"
1174 ".inst 0x6f97e2ae // udot v14.4s, v21.16b, v23.4b[0]\n"
1175 ".inst 0x6f96e2b2 // udot v18.4s, v21.16b, v22.4b[0]\n"
1176 ".inst 0x6f98e28b // udot v11.4s, v20.16b, v24.4b[0]\n"
1177 ".inst 0x6f97e28f // udot v15.4s, v20.16b, v23.4b[0]\n"
1178 ".inst 0x6f96e293 // udot v19.4s, v20.16b, v22.4b[0]\n"
1182 "tbz x14, #1, 90f\n"
1183 "ldr h0, [x13], #0x2\n"
1184 "ldr h1, [x12], #0x2\n"
1185 "ldr h2, [x11], #0x2\n"
1186 "tbz x14, #0, 91f\n"
1187 "ld1 { v0.b }[2], [x13]\n"
1188 "ld1 { v1.b }[2], [x12]\n"
1189 "ld1 { v2.b }[2], [x11]\n"
1192 "ldr b0, [x13, #0x0]\n"
1193 "ldr b1, [x12, #0x0]\n"
1194 "ldr b2, [x11, #0x0]\n"
1196 "ldr q21, [x16, #0x0]\n"
1197 ".inst 0x6f80e2a8 // udot v8.4s, v21.16b, v0.4b[0]\n"
1198 "ldr q20, [x16, #0x10]\n"
1199 ".inst 0x6f81e2ac // udot v12.4s, v21.16b, v1.4b[0]\n"
1200 ".inst 0x6f82e2b0 // udot v16.4s, v21.16b, v2.4b[0]\n"
1201 "ldr q21, [x16, #0x20]\n"
1202 ".inst 0x6f80e289 // udot v9.4s, v20.16b, v0.4b[0]\n"
1203 ".inst 0x6f81e28d // udot v13.4s, v20.16b, v1.4b[0]\n"
1204 ".inst 0x6f82e291 // udot v17.4s, v20.16b, v2.4b[0]\n"
1205 "ldr q20, [x16, #0x30]\n"
1206 ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n"
1207 "add x16, x16, #0x40\n"
1208 ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n"
1209 ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n"
1210 ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n"
1211 ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n"
1212 ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n"
1214 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
1215 "add x15, x15, #0x1\n"
1218 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1219 "add x24, x17, x20, LSL #2\n"
1220 "add x23, x24, x20, LSL #2\n"
1222 "prfm pstl1keep, [x17, #0x0]\n"
1223 "prfm pstl1keep, [x24, #0x0]\n"
1224 "prfm pstl1keep, [x23, #0x0]\n"
1227 "st1 { v8.4s }, [x17], #0x10\n"
1228 "st1 { v9.4s }, [x17], #0x10\n"
1229 "st1 { v12.4s }, [x24], #0x10\n"
1230 "st1 { v13.4s }, [x24], #0x10\n"
1231 "st1 { v16.4s }, [x23], #0x10\n"
1232 "st1 { v17.4s }, [x23], #0x10\n"
1234 "st1 { v10.4s }, [x17], #0x10\n"
1235 "st1 { v14.4s }, [x24], #0x10\n"
1236 "st1 { v18.4s }, [x23], #0x10\n"
1238 "str d11, [x17], #0x8\n"
1239 "str d15, [x24], #0x8\n"
1240 "str d19, [x23], #0x8\n"
1241 "tbz x8, #0, 100f\n"
1242 "st1 { v11.s }[2], [x17]\n"
1243 "st1 { v15.s }[2], [x24]\n"
1244 "st1 { v19.s }[2], [x23]\n"
1247 "tbz x8, #0, 100f\n"
1248 "str s11, [x17, #0x0]\n"
1249 "str s15, [x24, #0x0]\n"
1250 "str s19, [x23, #0x0]\n"
1254 "str d10, [x17], #0x8\n"
1255 "str d14, [x24], #0x8\n"
1256 "str d18, [x23], #0x8\n"
1257 "tbz x8, #0, 100f\n"
1258 "st1 { v10.s }[2], [x17]\n"
1259 "st1 { v14.s }[2], [x24]\n"
1260 "st1 { v18.s }[2], [x23]\n"
1263 "tbz x8, #0, 100f\n"
1264 "str s10, [x17, #0x0]\n"
1265 "str s14, [x24, #0x0]\n"
1266 "str s18, [x23, #0x0]\n"
1270 "st1 { v8.4s }, [x17], #0x10\n"
1271 "st1 { v12.4s }, [x24], #0x10\n"
1272 "st1 { v16.4s }, [x23], #0x10\n"
1274 "str d9, [x17], #0x8\n"
1275 "str d13, [x24], #0x8\n"
1276 "str d17, [x23], #0x8\n"
1277 "tbz x8, #0, 100f\n"
1278 "st1 { v9.s }[2], [x17]\n"
1279 "st1 { v13.s }[2], [x24]\n"
1280 "st1 { v17.s }[2], [x23]\n"
1283 "tbz x8, #0, 100f\n"
1284 "str s9, [x17, #0x0]\n"
1285 "str s13, [x24, #0x0]\n"
1286 "str s17, [x23, #0x0]\n"
1290 "str d8, [x17], #0x8\n"
1291 "str d12, [x24], #0x8\n"
1292 "str d16, [x23], #0x8\n"
1293 "tbz x8, #0, 100f\n"
1294 "st1 { v8.s }[2], [x17]\n"
1295 "st1 { v12.s }[2], [x24]\n"
1296 "st1 { v16.s }[2], [x23]\n"
1299 "str s8, [x17, #0x0]\n"
1300 "str s12, [x24, #0x0]\n"
1301 "str s16, [x23, #0x0]\n"
1305 "str q8, [x17, #0x0]\n"
1306 "str q9, [x17, #0x10]\n"
1307 "str q10, [x17, #0x20]\n"
1308 "str q11, [x17, #0x30]\n"
1309 "add x17, x17, #0x40\n"
1310 "str q12, [x24, #0x0]\n"
1311 "str q13, [x24, #0x10]\n"
1312 "str q14, [x24, #0x20]\n"
1313 "str q15, [x24, #0x30]\n"
1314 "str q16, [x23, #0x0]\n"
1315 "str q17, [x23, #0x10]\n"
1316 "str q18, [x23, #0x20]\n"
1317 "str q19, [x23, #0x30]\n"
1319 "subs x8, x8, #0x10\n"
1323 "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
1324 "mov x17, %x[output_ptr]\n"
1325 "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1327 "tbz %x[flags], #0, 114f\n"
1328 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1329 "add x24, x17, x20, LSL #2\n"
1330 "add x23, x24, x20, LSL #2\n"
1332 "add x22, x23, x20, LSL #2\n"
1334 "tbz x8, #3, 108f\n"
1335 "ld1 { v8.4s }, [x17], #0x10\n"
1336 "ld1 { v12.4s }, [x24], #0x10\n"
1337 "ld1 { v16.4s }, [x23], #0x10\n"
1338 "ld1 { v20.4s }, [x22], #0x10\n"
1339 "ld1 { v9.4s }, [x17], #0x10\n"
1340 "ld1 { v13.4s }, [x24], #0x10\n"
1341 "ld1 { v17.4s }, [x23], #0x10\n"
1342 "ld1 { v21.4s }, [x22], #0x10\n"
1343 "tbz x8, #2, 106f\n"
1344 "ld1 { v10.4s }, [x17], #0x10\n"
1345 "ld1 { v14.4s }, [x24], #0x10\n"
1346 "ld1 { v18.4s }, [x23], #0x10\n"
1347 "ld1 { v22.4s }, [x22], #0x10\n"
1348 "tbz x8, #1, 105f\n"
1349 "ldr d11, [x17], #0x8\n"
1351 "ldr d15, [x24], #0x8\n"
1352 "ldr d19, [x23], #0x8\n"
1353 "ldr d23, [x22], #0x8\n"
1354 "tbz x8, #0, 112f\n"
1355 "ld1 { v11.s }[2], [x17]\n"
1356 "ld1 { v15.s }[2], [x24]\n"
1357 "ld1 { v19.s }[2], [x23]\n"
1358 "ld1 { v23.s }[2], [x22]\n"
1362 "tbz x8, #0, 112f\n"
1363 "ldr s11, [x17, #0x0]\n"
1364 "ldr s15, [x24, #0x0]\n"
1365 "ldr s19, [x23, #0x0]\n"
1366 "ldr s23, [x22, #0x0]\n"
1369 "tbz x8, #1, 107f\n"
1370 "ldr d10, [x17], #0x8\n"
1372 "ldr d14, [x24], #0x8\n"
1373 "ldr d18, [x23], #0x8\n"
1374 "ldr d22, [x22], #0x8\n"
1375 "tbz x8, #0, 112f\n"
1376 "ld1 { v10.s }[2], [x17]\n"
1377 "ld1 { v14.s }[2], [x24]\n"
1378 "ld1 { v18.s }[2], [x23]\n"
1379 "ld1 { v22.s }[2], [x22]\n"
1383 "tbz x8, #0, 112f\n"
1384 "ldr s10, [x17, #0x0]\n"
1385 "ldr s14, [x24, #0x0]\n"
1386 "ldr s18, [x23, #0x0]\n"
1387 "ldr s22, [x22, #0x0]\n"
1390 "tbz x8, #2, 110f\n"
1391 "ld1 { v8.4s }, [x17], #0x10\n"
1392 "ld1 { v12.4s }, [x24], #0x10\n"
1393 "ld1 { v16.4s }, [x23], #0x10\n"
1394 "ld1 { v20.4s }, [x22], #0x10\n"
1395 "tbz x8, #1, 109f\n"
1396 "ldr d9, [x17], #0x8\n"
1398 "ldr d13, [x24], #0x8\n"
1399 "ldr d17, [x23], #0x8\n"
1400 "ldr d21, [x22], #0x8\n"
1401 "tbz x8, #0, 112f\n"
1402 "ld1 { v9.s }[2], [x17]\n"
1403 "ld1 { v13.s }[2], [x24]\n"
1404 "ld1 { v17.s }[2], [x23]\n"
1405 "ld1 { v21.s }[2], [x22]\n"
1409 "tbz x8, #0, 112f\n"
1410 "ldr s9, [x17, #0x0]\n"
1411 "ldr s13, [x24, #0x0]\n"
1412 "ldr s17, [x23, #0x0]\n"
1413 "ldr s21, [x22, #0x0]\n"
1416 "tbz x8, #1, 111f\n"
1417 "ldr d8, [x17], #0x8\n"
1419 "ldr d12, [x24], #0x8\n"
1420 "ldr d16, [x23], #0x8\n"
1421 "ldr d20, [x22], #0x8\n"
1422 "tbz x8, #0, 112f\n"
1423 "ld1 { v8.s }[2], [x17]\n"
1424 "ld1 { v12.s }[2], [x24]\n"
1425 "ld1 { v16.s }[2], [x23]\n"
1426 "ld1 { v20.s }[2], [x22]\n"
1429 "ldr s8, [x17, #0x0]\n"
1431 "ldr s12, [x24, #0x0]\n"
1432 "ldr s16, [x23, #0x0]\n"
1433 "ldr s20, [x22, #0x0]\n"
1435 "sub x17, x17, x25\n"
1438 "ldr q8, [x17, #0x0]\n"
1439 "ldr q9, [x17, #0x10]\n"
1440 "ldr q10, [x17, #0x20]\n"
1441 "ldr q11, [x17, #0x30]\n"
1442 "ldr q12, [x24, #0x0]\n"
1443 "ldr q13, [x24, #0x10]\n"
1444 "ldr q14, [x24, #0x20]\n"
1445 "ldr q15, [x24, #0x30]\n"
1446 "ldr q16, [x23, #0x0]\n"
1447 "ldr q17, [x23, #0x10]\n"
1448 "ldr q18, [x23, #0x20]\n"
1449 "ldr q19, [x23, #0x30]\n"
1450 "ldr q20, [x22, #0x0]\n"
1451 "ldr q21, [x22, #0x10]\n"
1452 "ldr q22, [x22, #0x20]\n"
1453 "ldr q23, [x22, #0x30]\n"
1456 "movi v8.4s, #0x0\n"
1457 "movi v9.4s, #0x0\n"
1458 "movi v10.4s, #0x0\n"
1459 "movi v11.4s, #0x0\n"
1460 "movi v12.4s, #0x0\n"
1461 "movi v13.4s, #0x0\n"
1462 "movi v14.4s, #0x0\n"
1463 "movi v15.4s, #0x0\n"
1464 "movi v16.4s, #0x0\n"
1465 "movi v17.4s, #0x0\n"
1466 "movi v18.4s, #0x0\n"
1467 "movi v19.4s, #0x0\n"
1468 "movi v20.4s, #0x0\n"
1469 "movi v21.4s, #0x0\n"
1470 "movi v22.4s, #0x0\n"
1471 "movi v23.4s, #0x0\n"
1475 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1476 "ldr w14, [x20, x15, LSL #0x2]\n"
1477 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1478 "tbz %x[flags], #3, 117f\n"
1479 "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
1480 "add x20, x20, x21, LSL #3\n"
1481 "ldr x13, [x20, #0x0]\n"
1482 "ldr x12, [x20, #0x8]\n"
1483 "ldr x11, [x20, #0x10]\n"
1484 "ldr x10, [x20, #0x18]\n"
1486 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1487 "add x13, x13, x20\n"
1488 "add x12, x12, x20\n"
1489 "add x11, x11, x20\n"
1490 "add x10, x10, x20\n"
1493 "mov x13, %x[input_ptr]\n"
1494 "add x12, x13, x21\n"
1495 "add x11, x12, x21\n"
1496 "add x10, x11, x21\n"
1500 "ldr q0, [x13, #0x0]\n"
1502 "ldr q1, [x12, #0x0]\n"
1503 "ldr q2, [x11, #0x0]\n"
1504 "ldr q3, [x10, #0x0]\n"
1505 "ldr q6, [x16, #0x0]\n"
1506 "ldr q7, [x16, #0x10]\n"
1509 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
1510 "ldr x21, [x16, #0x28]\n"
1511 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
1512 "ldr x20, [x16, #0x38]\n"
1513 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
1514 "add x13, x13, #0x10\n"
1515 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
1516 "ldr d25, [x16, #0x20]\n"
1517 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
1518 "mov v25.d[1], x21\n"
1519 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
1520 "ldr x21, [x16, #0x48]\n"
1521 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
1522 "add x12, x12, #0x10\n"
1523 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
1524 "ldr d24, [x16, #0x30]\n"
1525 "mov v24.d[1], x20\n"
1526 ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
1527 ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
1528 "ldr x20, [x16, #0x58]\n"
1529 ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
1530 "add x11, x11, #0x10\n"
1531 ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
1532 "ldr d25, [x16, #0x40]\n"
1533 ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
1534 "mov v25.d[1], x21\n"
1535 ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
1536 "ldr x21, [x16, #0x68]\n"
1537 ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
1538 "add x10, x10, #0x10\n"
1539 ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
1540 "ldr d24, [x16, #0x50]\n"
1541 "mov v24.d[1], x20\n"
1542 ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
1543 ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
1544 "ldr x20, [x16, #0x78]\n"
1545 ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
1546 "ldr x25, [x13, #0x8]\n"
1547 ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
1548 "ldr d25, [x16, #0x60]\n"
1549 ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
1550 "mov v25.d[1], x21\n"
1551 ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
1552 "ldr x21, [x16, #0x88]\n"
1553 ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
1554 "ldr x24, [x12, #0x8]\n"
1555 ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
1556 "ldr d24, [x16, #0x70]\n"
1557 "mov v24.d[1], x20\n"
1558 ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
1559 ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
1560 "ldr x20, [x16, #0x98]\n"
1561 ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
1562 "ldr x23, [x11, #0x8]\n"
1563 ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
1564 "ldr d25, [x16, #0x80]\n"
1565 ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
1566 "mov v25.d[1], x21\n"
1567 ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
1568 "ldr x21, [x16, #0xa8]\n"
1569 ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
1570 "ldr x22, [x10, #0x8]\n"
1571 ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
1572 "ldr d24, [x16, #0x90]\n"
1573 "mov v24.d[1], x20\n"
1574 ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
1575 ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
1576 "ldr x20, [x16, #0xb8]\n"
1577 ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
1578 "sub x14, x14, #0x10\n"
1579 ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
1580 "ldr d25, [x16, #0xa0]\n"
1581 ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
1582 "mov v25.d[1], x21\n"
1583 ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
1584 "ldr x21, [x16, #0xc8]\n"
1585 ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
1587 ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
1588 "ldr d24, [x16, #0xb0]\n"
1589 "mov v24.d[1], x20\n"
1590 ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
1591 ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
1592 "ldr x20, [x16, #0xd8]\n"
1593 ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
1594 "prfm pldl1keep, [x13, #0x80]\n"
1595 ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
1596 "ldr d25, [x16, #0xc0]\n"
1597 ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
1598 "mov v25.d[1], x21\n"
1599 ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
1600 "ldr x21, [x16, #0xe8]\n"
1601 ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
1602 "prfm pldl1keep, [x12, #0x80]\n"
1603 ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
1604 "ldr d24, [x16, #0xd0]\n"
1605 "mov v24.d[1], x20\n"
1606 ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
1607 ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
1608 "ldr x20, [x16, #0xf8]\n"
1609 ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
1610 "prfm pldl1keep, [x11, #0x80]\n"
1611 ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
1612 "ldr d25, [x16, #0xe0]\n"
1613 ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
1614 "mov v25.d[1], x21\n"
1615 ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
1616 "prfm pldl1keep, [x10, #0x80]\n"
1617 ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
1618 ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
1619 "ldr d24, [x16, #0xf0]\n"
1620 "mov v24.d[1], x20\n"
1621 "add x16, x16, #0x100\n"
1622 ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
1623 "ldr x21, [x16, #0x8]\n"
1624 ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
1625 "ldr x20, [x16, #0x18]\n"
1626 ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
1627 ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
1628 "ldr d6, [x16, #0x0]\n"
1629 ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
1630 "ldr d0, [x13, #0x0]\n"
1631 ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
1632 "ldr d1, [x12, #0x0]\n"
1633 ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
1634 "ldr d2, [x11, #0x0]\n"
1635 ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
1636 "ldr d3, [x10, #0x0]\n"
1637 "ldr d7, [x16, #0x10]\n"
1638 "mov v6.d[1], x21\n"
1639 "mov v0.d[1], x25\n"
1640 "mov v1.d[1], x24\n"
1641 "mov v2.d[1], x23\n"
1642 "mov v3.d[1], x22\n"
1643 "mov v7.d[1], x20\n"
1646 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
1647 "add x13, x13, #0x10\n"
1648 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
1649 "add x12, x12, #0x10\n"
1650 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
1651 "add x11, x11, #0x10\n"
1652 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
1653 "ldr q25, [x16, #0x20]\n"
1654 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
1655 "add x10, x10, #0x10\n"
1656 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
1657 "sub x14, x14, #0x10\n"
1658 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
1659 "prfm pldl1keep, [x13, #0x80]\n"
1660 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
1661 "ldr q24, [x16, #0x30]\n"
1662 ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
1663 "prfm pldl1keep, [x12, #0x80]\n"
1664 ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
1665 "prfm pldl1keep, [x11, #0x80]\n"
1666 ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
1667 "prfm pldl1keep, [x10, #0x80]\n"
1668 ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
1669 "ldr q25, [x16, #0x40]\n"
1670 ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
1671 ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
1672 ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
1673 ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
1674 "ldr q24, [x16, #0x50]\n"
1675 ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n"
1676 ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n"
1677 ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n"
1678 ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n"
1679 "ldr q25, [x16, #0x60]\n"
1680 ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n"
1681 ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n"
1682 ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n"
1683 ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n"
1684 "ldr q24, [x16, #0x70]\n"
1685 ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n"
1686 ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n"
1687 ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n"
1688 ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n"
1689 "ldr q25, [x16, #0x80]\n"
1690 ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n"
1691 ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n"
1692 ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n"
1693 ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n"
1694 "ldr q24, [x16, #0x90]\n"
1695 ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n"
1696 ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n"
1697 ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n"
1698 ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n"
1699 "ldr q25, [x16, #0xa0]\n"
1700 ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n"
1701 ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n"
1702 ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n"
1703 ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n"
1704 "ldr q24, [x16, #0xb0]\n"
1705 ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n"
1706 ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n"
1707 ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n"
1708 ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n"
1709 "ldr q25, [x16, #0xc0]\n"
1710 ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n"
1711 ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n"
1712 ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n"
1713 ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n"
1714 "ldr q24, [x16, #0xd0]\n"
1715 ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n"
1716 ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n"
1717 ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n"
1718 ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n"
1719 "ldr q25, [x16, #0xe0]\n"
1720 ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n"
1721 ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n"
1722 ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n"
1723 ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n"
1724 "ldr q24, [x16, #0xf0]\n"
1725 ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n"
1726 "add x16, x16, #0x100\n"
1727 ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n"
1728 ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n"
1729 ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n"
1730 ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n"
1731 ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n"
1732 ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n"
1733 ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n"
1739 "ldr s29, [x13], #0x4\n"
1740 "sub x14, x14, #0x4\n"
1741 "ldr s28, [x12], #0x4\n"
1743 "ldr s27, [x11], #0x4\n"
1744 "ldr s26, [x10], #0x4\n"
1745 "ldr q25, [x16, #0x0]\n"
1746 ".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n"
1747 "ldr q24, [x16, #0x10]\n"
1748 ".inst 0x6f9ce32c // udot v12.4s, v25.16b, v28.4b[0]\n"
1749 ".inst 0x6f9be330 // udot v16.4s, v25.16b, v27.4b[0]\n"
1750 ".inst 0x6f9ae334 // udot v20.4s, v25.16b, v26.4b[0]\n"
1751 "ldr q25, [x16, #0x20]\n"
1752 ".inst 0x6f9de309 // udot v9.4s, v24.16b, v29.4b[0]\n"
1753 ".inst 0x6f9ce30d // udot v13.4s, v24.16b, v28.4b[0]\n"
1754 ".inst 0x6f9be311 // udot v17.4s, v24.16b, v27.4b[0]\n"
1755 ".inst 0x6f9ae315 // udot v21.4s, v24.16b, v26.4b[0]\n"
1756 "ldr q24, [x16, #0x30]\n"
1757 ".inst 0x6f9de32a // udot v10.4s, v25.16b, v29.4b[0]\n"
1758 "add x16, x16, #0x40\n"
1759 ".inst 0x6f9ce32e // udot v14.4s, v25.16b, v28.4b[0]\n"
1760 ".inst 0x6f9be332 // udot v18.4s, v25.16b, v27.4b[0]\n"
1761 ".inst 0x6f9ae336 // udot v22.4s, v25.16b, v26.4b[0]\n"
1762 ".inst 0x6f9de30b // udot v11.4s, v24.16b, v29.4b[0]\n"
1763 ".inst 0x6f9ce30f // udot v15.4s, v24.16b, v28.4b[0]\n"
1764 ".inst 0x6f9be313 // udot v19.4s, v24.16b, v27.4b[0]\n"
1765 ".inst 0x6f9ae317 // udot v23.4s, v24.16b, v26.4b[0]\n"
1769 "tbz x14, #1, 124f\n"
1770 "ldr h0, [x13], #0x2\n"
1771 "ldr h1, [x12], #0x2\n"
1772 "ldr h2, [x11], #0x2\n"
1773 "ldr h3, [x10], #0x2\n"
1774 "tbz x14, #0, 125f\n"
1775 "ld1 { v0.b }[2], [x13]\n"
1776 "ld1 { v1.b }[2], [x12]\n"
1777 "ld1 { v2.b }[2], [x11]\n"
1778 "ld1 { v3.b }[2], [x10]\n"
1781 "ldr b0, [x13, #0x0]\n"
1782 "ldr b1, [x12, #0x0]\n"
1783 "ldr b2, [x11, #0x0]\n"
1784 "ldr b3, [x10, #0x0]\n"
1786 "ldr q25, [x16, #0x0]\n"
1787 ".inst 0x6f80e328 // udot v8.4s, v25.16b, v0.4b[0]\n"
1788 "ldr q24, [x16, #0x10]\n"
1789 ".inst 0x6f81e32c // udot v12.4s, v25.16b, v1.4b[0]\n"
1790 ".inst 0x6f82e330 // udot v16.4s, v25.16b, v2.4b[0]\n"
1791 ".inst 0x6f83e334 // udot v20.4s, v25.16b, v3.4b[0]\n"
1792 "ldr q25, [x16, #0x20]\n"
1793 ".inst 0x6f80e309 // udot v9.4s, v24.16b, v0.4b[0]\n"
1794 ".inst 0x6f81e30d // udot v13.4s, v24.16b, v1.4b[0]\n"
1795 ".inst 0x6f82e311 // udot v17.4s, v24.16b, v2.4b[0]\n"
1796 ".inst 0x6f83e315 // udot v21.4s, v24.16b, v3.4b[0]\n"
1797 "ldr q24, [x16, #0x30]\n"
1798 ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n"
1799 "add x16, x16, #0x40\n"
1800 ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n"
1801 ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n"
1802 ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n"
1803 ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n"
1804 ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n"
1805 ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n"
1806 ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n"
1808 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
1809 "add x15, x15, #0x1\n"
1812 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1813 "add x24, x17, x20, LSL #2\n"
1814 "add x23, x24, x20, LSL #2\n"
1815 "add x22, x23, x20, LSL #2\n"
1817 "prfm pstl1keep, [x17, #0x0]\n"
1818 "prfm pstl1keep, [x24, #0x0]\n"
1819 "prfm pstl1keep, [x23, #0x0]\n"
1820 "prfm pstl1keep, [x22, #0x0]\n"
1822 "tbz x8, #3, 130f\n"
1823 "st1 { v8.4s }, [x17], #0x10\n"
1824 "st1 { v9.4s }, [x17], #0x10\n"
1825 "st1 { v12.4s }, [x24], #0x10\n"
1826 "st1 { v13.4s }, [x24], #0x10\n"
1827 "st1 { v16.4s }, [x23], #0x10\n"
1828 "st1 { v17.4s }, [x23], #0x10\n"
1829 "st1 { v20.4s }, [x22], #0x10\n"
1830 "st1 { v21.4s }, [x22], #0x10\n"
1831 "tbz x8, #2, 128f\n"
1832 "st1 { v10.4s }, [x17], #0x10\n"
1833 "st1 { v14.4s }, [x24], #0x10\n"
1834 "st1 { v18.4s }, [x23], #0x10\n"
1835 "st1 { v22.4s }, [x22], #0x10\n"
1836 "tbz x8, #1, 127f\n"
1837 "str d11, [x17], #0x8\n"
1838 "str d15, [x24], #0x8\n"
1839 "str d19, [x23], #0x8\n"
1840 "str d23, [x22], #0x8\n"
1841 "tbz x8, #0, 134f\n"
1842 "st1 { v11.s }[2], [x17]\n"
1843 "st1 { v15.s }[2], [x24]\n"
1844 "st1 { v19.s }[2], [x23]\n"
1845 "st1 { v23.s }[2], [x22]\n"
1848 "tbz x8, #0, 134f\n"
1849 "str s11, [x17, #0x0]\n"
1850 "str s15, [x24, #0x0]\n"
1851 "str s19, [x23, #0x0]\n"
1852 "str s23, [x22, #0x0]\n"
1855 "tbz x8, #1, 129f\n"
1856 "str d10, [x17], #0x8\n"
1857 "str d14, [x24], #0x8\n"
1858 "str d18, [x23], #0x8\n"
1859 "str d22, [x22], #0x8\n"
1860 "tbz x8, #0, 134f\n"
1861 "st1 { v10.s }[2], [x17]\n"
1862 "st1 { v14.s }[2], [x24]\n"
1863 "st1 { v18.s }[2], [x23]\n"
1864 "st1 { v22.s }[2], [x22]\n"
1867 "tbz x8, #0, 134f\n"
1868 "str s10, [x17, #0x0]\n"
1869 "str s14, [x24, #0x0]\n"
1870 "str s18, [x23, #0x0]\n"
1871 "str s22, [x22, #0x0]\n"
1874 "tbz x8, #2, 132f\n"
1875 "st1 { v8.4s }, [x17], #0x10\n"
1876 "st1 { v12.4s }, [x24], #0x10\n"
1877 "st1 { v16.4s }, [x23], #0x10\n"
1878 "st1 { v20.4s }, [x22], #0x10\n"
1879 "tbz x8, #1, 131f\n"
1880 "str d9, [x17], #0x8\n"
1881 "str d13, [x24], #0x8\n"
1882 "str d17, [x23], #0x8\n"
1883 "str d21, [x22], #0x8\n"
1884 "tbz x8, #0, 134f\n"
1885 "st1 { v9.s }[2], [x17]\n"
1886 "st1 { v13.s }[2], [x24]\n"
1887 "st1 { v17.s }[2], [x23]\n"
1888 "st1 { v21.s }[2], [x22]\n"
1891 "tbz x8, #0, 134f\n"
1892 "str s9, [x17, #0x0]\n"
1893 "str s13, [x24, #0x0]\n"
1894 "str s17, [x23, #0x0]\n"
1895 "str s21, [x22, #0x0]\n"
1898 "tbz x8, #1, 133f\n"
1899 "str d8, [x17], #0x8\n"
1900 "str d12, [x24], #0x8\n"
1901 "str d16, [x23], #0x8\n"
1902 "str d20, [x22], #0x8\n"
1903 "tbz x8, #0, 134f\n"
1904 "st1 { v8.s }[2], [x17]\n"
1905 "st1 { v12.s }[2], [x24]\n"
1906 "st1 { v16.s }[2], [x23]\n"
1907 "st1 { v20.s }[2], [x22]\n"
1910 "str s8, [x17, #0x0]\n"
1911 "str s12, [x24, #0x0]\n"
1912 "str s16, [x23, #0x0]\n"
1913 "str s20, [x22, #0x0]\n"
1917 "str q8, [x17, #0x0]\n"
1918 "str q9, [x17, #0x10]\n"
1919 "str q10, [x17, #0x20]\n"
1920 "str q11, [x17, #0x30]\n"
1921 "add x17, x17, #0x40\n"
1922 "str q12, [x24, #0x0]\n"
1923 "str q13, [x24, #0x10]\n"
1924 "str q14, [x24, #0x20]\n"
1925 "str q15, [x24, #0x30]\n"
1926 "str q16, [x23, #0x0]\n"
1927 "str q17, [x23, #0x10]\n"
1928 "str q18, [x23, #0x20]\n"
1929 "str q19, [x23, #0x30]\n"
1930 "str q20, [x22, #0x0]\n"
1931 "str q21, [x22, #0x10]\n"
1932 "str q22, [x22, #0x20]\n"
1933 "str q23, [x22, #0x30]\n"
1935 "subs x8, x8, #0x10\n"
1939 "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
1940 "mov x17, %x[output_ptr]\n"
1941 "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1943 "tbz %x[flags], #0, 148f\n"
1944 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1945 "add x24, x17, x20, LSL #2\n"
1946 "add x23, x24, x20, LSL #2\n"
1947 "add x22, x23, x20, LSL #2\n"
1949 "add x21, x22, x20, LSL #2\n"
1951 "tbz x8, #3, 142f\n"
1952 "ld1 { v8.4s }, [x17], #0x10\n"
1953 "ld1 { v12.4s }, [x24], #0x10\n"
1954 "ld1 { v16.4s }, [x23], #0x10\n"
1955 "ld1 { v20.4s }, [x22], #0x10\n"
1956 "ld1 { v24.4s }, [x21], #0x10\n"
1957 "ld1 { v9.4s }, [x17], #0x10\n"
1958 "ld1 { v13.4s }, [x24], #0x10\n"
1959 "ld1 { v17.4s }, [x23], #0x10\n"
1960 "ld1 { v21.4s }, [x22], #0x10\n"
1961 "ld1 { v25.4s }, [x21], #0x10\n"
1962 "tbz x8, #2, 140f\n"
1963 "ld1 { v10.4s }, [x17], #0x10\n"
1964 "ld1 { v14.4s }, [x24], #0x10\n"
1965 "ld1 { v18.4s }, [x23], #0x10\n"
1966 "ld1 { v22.4s }, [x22], #0x10\n"
1967 "ld1 { v26.4s }, [x21], #0x10\n"
1968 "tbz x8, #1, 139f\n"
1969 "ldr d11, [x17], #0x8\n"
1971 "ldr d15, [x24], #0x8\n"
1972 "ldr d19, [x23], #0x8\n"
1973 "ldr d23, [x22], #0x8\n"
1974 "ldr d27, [x21], #0x8\n"
1975 "tbz x8, #0, 146f\n"
1976 "ld1 { v11.s }[2], [x17]\n"
1977 "ld1 { v15.s }[2], [x24]\n"
1978 "ld1 { v19.s }[2], [x23]\n"
1979 "ld1 { v23.s }[2], [x22]\n"
1980 "ld1 { v27.s }[2], [x21]\n"
1984 "tbz x8, #0, 146f\n"
1985 "ldr s11, [x17, #0x0]\n"
1986 "ldr s15, [x24, #0x0]\n"
1987 "ldr s19, [x23, #0x0]\n"
1988 "ldr s23, [x22, #0x0]\n"
1989 "ldr s27, [x21, #0x0]\n"
1992 "tbz x8, #1, 141f\n"
1993 "ldr d10, [x17], #0x8\n"
1995 "ldr d14, [x24], #0x8\n"
1996 "ldr d18, [x23], #0x8\n"
1997 "ldr d22, [x22], #0x8\n"
1998 "ldr d26, [x21], #0x8\n"
1999 "tbz x8, #0, 146f\n"
2000 "ld1 { v10.s }[2], [x17]\n"
2001 "ld1 { v14.s }[2], [x24]\n"
2002 "ld1 { v18.s }[2], [x23]\n"
2003 "ld1 { v22.s }[2], [x22]\n"
2004 "ld1 { v26.s }[2], [x21]\n"
2008 "tbz x8, #0, 146f\n"
2009 "ldr s10, [x17, #0x0]\n"
2010 "ldr s14, [x24, #0x0]\n"
2011 "ldr s18, [x23, #0x0]\n"
2012 "ldr s22, [x22, #0x0]\n"
2013 "ldr s26, [x21, #0x0]\n"
2016 "tbz x8, #2, 144f\n"
2017 "ld1 { v8.4s }, [x17], #0x10\n"
2018 "ld1 { v12.4s }, [x24], #0x10\n"
2019 "ld1 { v16.4s }, [x23], #0x10\n"
2020 "ld1 { v20.4s }, [x22], #0x10\n"
2021 "ld1 { v24.4s }, [x21], #0x10\n"
2022 "tbz x8, #1, 143f\n"
2023 "ldr d9, [x17], #0x8\n"
2025 "ldr d13, [x24], #0x8\n"
2026 "ldr d17, [x23], #0x8\n"
2027 "ldr d21, [x22], #0x8\n"
2028 "ldr d25, [x21], #0x8\n"
2029 "tbz x8, #0, 146f\n"
2030 "ld1 { v9.s }[2], [x17]\n"
2031 "ld1 { v13.s }[2], [x24]\n"
2032 "ld1 { v17.s }[2], [x23]\n"
2033 "ld1 { v21.s }[2], [x22]\n"
2034 "ld1 { v25.s }[2], [x21]\n"
2038 "tbz x8, #0, 146f\n"
2039 "ldr s9, [x17, #0x0]\n"
2040 "ldr s13, [x24, #0x0]\n"
2041 "ldr s17, [x23, #0x0]\n"
2042 "ldr s21, [x22, #0x0]\n"
2043 "ldr s25, [x21, #0x0]\n"
2046 "tbz x8, #1, 145f\n"
2047 "ldr d8, [x17], #0x8\n"
2049 "ldr d12, [x24], #0x8\n"
2050 "ldr d16, [x23], #0x8\n"
2051 "ldr d20, [x22], #0x8\n"
2052 "ldr d24, [x21], #0x8\n"
2053 "tbz x8, #0, 146f\n"
2054 "ld1 { v8.s }[2], [x17]\n"
2055 "ld1 { v12.s }[2], [x24]\n"
2056 "ld1 { v16.s }[2], [x23]\n"
2057 "ld1 { v20.s }[2], [x22]\n"
2058 "ld1 { v24.s }[2], [x21]\n"
2061 "ldr s8, [x17, #0x0]\n"
2063 "ldr s12, [x24, #0x0]\n"
2064 "ldr s16, [x23, #0x0]\n"
2065 "ldr s20, [x22, #0x0]\n"
2066 "ldr s24, [x21, #0x0]\n"
2068 "sub x17, x17, x25\n"
2071 "ldr q8, [x17, #0x0]\n"
2072 "ldr q9, [x17, #0x10]\n"
2073 "ldr q10, [x17, #0x20]\n"
2074 "ldr q11, [x17, #0x30]\n"
2075 "ldr q12, [x24, #0x0]\n"
2076 "ldr q13, [x24, #0x10]\n"
2077 "ldr q14, [x24, #0x20]\n"
2078 "ldr q15, [x24, #0x30]\n"
2079 "ldr q16, [x23, #0x0]\n"
2080 "ldr q17, [x23, #0x10]\n"
2081 "ldr q18, [x23, #0x20]\n"
2082 "ldr q19, [x23, #0x30]\n"
2083 "ldr q20, [x22, #0x0]\n"
2084 "ldr q21, [x22, #0x10]\n"
2085 "ldr q22, [x22, #0x20]\n"
2086 "ldr q23, [x22, #0x30]\n"
2087 "ldr q24, [x21, #0x0]\n"
2088 "ldr q25, [x21, #0x10]\n"
2089 "ldr q26, [x21, #0x20]\n"
2090 "ldr q27, [x21, #0x30]\n"
2093 "movi v8.4s, #0x0\n"
2094 "movi v9.4s, #0x0\n"
2095 "movi v10.4s, #0x0\n"
2096 "movi v11.4s, #0x0\n"
2097 "movi v12.4s, #0x0\n"
2098 "movi v13.4s, #0x0\n"
2099 "movi v14.4s, #0x0\n"
2100 "movi v15.4s, #0x0\n"
2101 "movi v16.4s, #0x0\n"
2102 "movi v17.4s, #0x0\n"
2103 "movi v18.4s, #0x0\n"
2104 "movi v19.4s, #0x0\n"
2105 "movi v20.4s, #0x0\n"
2106 "movi v21.4s, #0x0\n"
2107 "movi v22.4s, #0x0\n"
2108 "movi v23.4s, #0x0\n"
2109 "movi v24.4s, #0x0\n"
2110 "movi v25.4s, #0x0\n"
2111 "movi v26.4s, #0x0\n"
2112 "movi v27.4s, #0x0\n"
2116 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
2117 "ldr w14, [x20, x15, LSL #0x2]\n"
2118 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
2119 "tbz %x[flags], #3, 151f\n"
2120 "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
2121 "add x20, x20, x21, LSL #3\n"
2122 "ldr x13, [x20, #0x0]\n"
2123 "ldr x12, [x20, #0x8]\n"
2124 "ldr x11, [x20, #0x10]\n"
2125 "ldr x10, [x20, #0x18]\n"
2126 "ldr x9, [x20, #0x20]\n"
2128 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
2129 "add x13, x13, x20\n"
2130 "add x12, x12, x20\n"
2131 "add x11, x11, x20\n"
2132 "add x10, x10, x20\n"
2136 "mov x13, %x[input_ptr]\n"
2137 "add x12, x13, x21\n"
2138 "add x11, x12, x21\n"
2139 "add x10, x11, x21\n"
2140 "add x9, x10, x21\n"
2144 "ldr q0, [x13, #0x0]\n"
2146 "ldr q1, [x12, #0x0]\n"
2147 "ldr q2, [x11, #0x0]\n"
2148 "ldr q3, [x10, #0x0]\n"
2149 "ldr q4, [x9, #0x0]\n"
2150 "ldr q6, [x16, #0x0]\n"
2151 "ldr q7, [x16, #0x10]\n"
2154 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
2155 "ldr x21, [x16, #0x28]\n"
2156 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
2157 "ldr x20, [x16, #0x38]\n"
2158 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
2159 "add x13, x13, #0x10\n"
2160 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
2161 "add x12, x12, #0x10\n"
2162 ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
2163 "ldr d29, [x16, #0x20]\n"
2164 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
2165 "mov v29.d[1], x21\n"
2166 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
2167 "ldr x21, [x16, #0x48]\n"
2168 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
2169 "add x11, x11, #0x10\n"
2170 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
2171 "add x10, x10, #0x10\n"
2172 ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
2173 "ldr d28, [x16, #0x30]\n"
2174 "mov v28.d[1], x20\n"
2175 ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
2176 ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
2177 "ldr x20, [x16, #0x58]\n"
2178 ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
2179 "add x9, x9, #0x10\n"
2180 ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
2181 "ldr x26, [x13, #0x8]\n"
2182 ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
2183 "ldr d29, [x16, #0x40]\n"
2184 ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
2185 "mov v29.d[1], x21\n"
2186 ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
2187 "ldr x21, [x16, #0x68]\n"
2188 ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
2189 "ldr x25, [x12, #0x8]\n"
2190 ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
2191 "ldr x24, [x11, #0x8]\n"
2192 ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
2193 "ldr d28, [x16, #0x50]\n"
2194 "mov v28.d[1], x20\n"
2195 ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
2196 ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
2197 "ldr x20, [x16, #0x78]\n"
2198 ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
2199 "ldr x23, [x10, #0x8]\n"
2200 ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
2201 "ldr x22, [x9, #0x8]\n"
2202 ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
2203 "ldr d29, [x16, #0x60]\n"
2204 ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
2205 "mov v29.d[1], x21\n"
2206 ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
2207 "ldr x21, [x16, #0x88]\n"
2208 ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
2209 "sub x14, x14, #0x10\n"
2210 ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
2212 ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
2213 "ldr d28, [x16, #0x70]\n"
2214 "mov v28.d[1], x20\n"
2215 ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
2216 ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
2217 "ldr x20, [x16, #0x98]\n"
2218 ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
2219 "prfm pldl1keep, [x13, #0x80]\n"
2220 ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
2221 "prfm pldl1keep, [x12, #0x80]\n"
2222 ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
2223 "ldr d29, [x16, #0x80]\n"
2224 ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
2225 "mov v29.d[1], x21\n"
2226 ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
2227 "ldr x21, [x16, #0xa8]\n"
2228 ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
2229 "prfm pldl1keep, [x11, #0x80]\n"
2230 ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
2231 "prfm pldl1keep, [x10, #0x80]\n"
2232 ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
2233 "ldr d28, [x16, #0x90]\n"
2234 "mov v28.d[1], x20\n"
2235 ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
2236 ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
2237 "ldr x20, [x16, #0xb8]\n"
2238 ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
2239 "prfm pldl1keep, [x9, #0x80]\n"
2240 ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
2241 ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
2242 "ldr d29, [x16, #0xa0]\n"
2243 ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
2244 "mov v29.d[1], x21\n"
2245 ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
2246 "ldr x21, [x16, #0xc8]\n"
2247 ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
2248 ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
2249 ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
2250 "ldr d28, [x16, #0xb0]\n"
2251 "mov v28.d[1], x20\n"
2252 ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
2253 ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
2254 "ldr x20, [x16, #0xd8]\n"
2255 ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
2256 ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
2257 ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
2258 "ldr d29, [x16, #0xc0]\n"
2259 ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
2260 "mov v29.d[1], x21\n"
2261 ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
2262 "ldr x21, [x16, #0xe8]\n"
2263 ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
2264 ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
2265 ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
2266 "ldr d28, [x16, #0xd0]\n"
2267 "mov v28.d[1], x20\n"
2268 ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
2269 ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
2270 "ldr x20, [x16, #0xf8]\n"
2271 ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
2272 ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
2273 ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
2274 "ldr d29, [x16, #0xe0]\n"
2275 ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
2276 "mov v29.d[1], x21\n"
2277 ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
2278 ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
2279 ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
2280 ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
2281 "ldr d28, [x16, #0xf0]\n"
2282 "mov v28.d[1], x20\n"
2283 "add x16, x16, #0x100\n"
2284 ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
2285 "ldr x21, [x16, #0x8]\n"
2286 ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
2287 "ldr x20, [x16, #0x18]\n"
2288 ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
2289 ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
2290 ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
2291 "ldr d6, [x16, #0x0]\n"
2292 ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
2293 "ldr d0, [x13, #0x0]\n"
2294 ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
2295 "ldr d1, [x12, #0x0]\n"
2296 ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
2297 "ldr d2, [x11, #0x0]\n"
2298 ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
2299 "ldr d3, [x10, #0x0]\n"
2300 ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
2301 "ldr d4, [x9, #0x0]\n"
2302 "ldr d7, [x16, #0x10]\n"
2303 "mov v6.d[1], x21\n"
2304 "mov v0.d[1], x26\n"
2305 "mov v1.d[1], x25\n"
2306 "mov v2.d[1], x24\n"
2307 "mov v3.d[1], x23\n"
2308 "mov v4.d[1], x22\n"
2309 "mov v7.d[1], x20\n"
2312 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
2313 "add x13, x13, #0x10\n"
2314 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
2315 "add x12, x12, #0x10\n"
2316 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
2317 "add x11, x11, #0x10\n"
2318 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
2319 "add x10, x10, #0x10\n"
2320 ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
2321 "ldr q29, [x16, #0x20]\n"
2322 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
2323 "add x9, x9, #0x10\n"
2324 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
2325 "sub x14, x14, #0x10\n"
2326 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
2327 "prfm pldl1keep, [x13, #0x80]\n"
2328 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
2329 "prfm pldl1keep, [x12, #0x80]\n"
2330 ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
2331 "ldr q28, [x16, #0x30]\n"
2332 ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
2333 "prfm pldl1keep, [x11, #0x80]\n"
2334 ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
2335 "prfm pldl1keep, [x10, #0x80]\n"
2336 ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
2337 "prfm pldl1keep, [x9, #0x80]\n"
2338 ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
2339 ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
2340 "ldr q29, [x16, #0x40]\n"
2341 ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
2342 ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
2343 ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
2344 ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
2345 ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
2346 "ldr q28, [x16, #0x50]\n"
2347 ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n"
2348 ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n"
2349 ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n"
2350 ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n"
2351 ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n"
2352 "ldr q29, [x16, #0x60]\n"
2353 ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n"
2354 ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n"
2355 ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n"
2356 ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n"
2357 ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
2358 "ldr q28, [x16, #0x70]\n"
2359 ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n"
2360 ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n"
2361 ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n"
2362 ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n"
2363 ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n"
2364 "ldr q29, [x16, #0x80]\n"
2365 ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n"
2366 ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n"
2367 ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n"
2368 ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n"
2369 ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n"
2370 "ldr q28, [x16, #0x90]\n"
2371 ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n"
2372 ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n"
2373 ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n"
2374 ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n"
2375 ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n"
2376 "ldr q29, [x16, #0xa0]\n"
2377 ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n"
2378 ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n"
2379 ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n"
2380 ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n"
2381 ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n"
2382 "ldr q28, [x16, #0xb0]\n"
2383 ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n"
2384 ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n"
2385 ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n"
2386 ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n"
2387 ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n"
2388 "ldr q29, [x16, #0xc0]\n"
2389 ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n"
2390 ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n"
2391 ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n"
2392 ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n"
2393 ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n"
2394 "ldr q28, [x16, #0xd0]\n"
2395 ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n"
2396 ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n"
2397 ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n"
2398 ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n"
2399 ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n"
2400 "ldr q29, [x16, #0xe0]\n"
2401 ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n"
2402 ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n"
2403 ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n"
2404 ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n"
2405 ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n"
2406 "ldr q28, [x16, #0xf0]\n"
2407 ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n"
2408 "add x16, x16, #0x100\n"
2409 ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n"
2410 ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n"
2411 ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n"
2412 ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n"
2413 ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n"
2414 ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n"
2415 ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n"
2416 ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n"
2417 ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n"
2423 "ldr s2, [x13], #0x4\n"
2424 "sub x14, x14, #0x4\n"
2425 "ldr s1, [x12], #0x4\n"
2427 "ldr s0, [x11], #0x4\n"
2428 "ldr s31, [x10], #0x4\n"
2429 "ldr s30, [x9], #0x4\n"
2430 "ldr q29, [x16, #0x0]\n"
2431 ".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n"
2432 "ldr q28, [x16, #0x10]\n"
2433 ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
2434 ".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n"
2435 ".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n"
2436 ".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n"
2437 "ldr q29, [x16, #0x20]\n"
2438 ".inst 0x6f82e389 // udot v9.4s, v28.16b, v2.4b[0]\n"
2439 ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
2440 ".inst 0x6f80e391 // udot v17.4s, v28.16b, v0.4b[0]\n"
2441 ".inst 0x6f9fe395 // udot v21.4s, v28.16b, v31.4b[0]\n"
2442 ".inst 0x6f9ee399 // udot v25.4s, v28.16b, v30.4b[0]\n"
2443 "ldr q28, [x16, #0x30]\n"
2444 ".inst 0x6f82e3aa // udot v10.4s, v29.16b, v2.4b[0]\n"
2445 "add x16, x16, #0x40\n"
2446 ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
2447 ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n"
2448 ".inst 0x6f9fe3b6 // udot v22.4s, v29.16b, v31.4b[0]\n"
2449 ".inst 0x6f9ee3ba // udot v26.4s, v29.16b, v30.4b[0]\n"
2450 ".inst 0x6f82e38b // udot v11.4s, v28.16b, v2.4b[0]\n"
2451 ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
2452 ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n"
2453 ".inst 0x6f9fe397 // udot v23.4s, v28.16b, v31.4b[0]\n"
2454 ".inst 0x6f9ee39b // udot v27.4s, v28.16b, v30.4b[0]\n"
2458 "tbz x14, #1, 158f\n"
2459 "ldr h0, [x13], #0x2\n"
2460 "ldr h1, [x12], #0x2\n"
2461 "ldr h2, [x11], #0x2\n"
2462 "ldr h3, [x10], #0x2\n"
2463 "ldr h4, [x9], #0x2\n"
2464 "tbz x14, #0, 159f\n"
2465 "ld1 { v0.b }[2], [x13]\n"
2466 "ld1 { v1.b }[2], [x12]\n"
2467 "ld1 { v2.b }[2], [x11]\n"
2468 "ld1 { v3.b }[2], [x10]\n"
2469 "ld1 { v4.b }[2], [x9]\n"
2472 "ldr b0, [x13, #0x0]\n"
2473 "ldr b1, [x12, #0x0]\n"
2474 "ldr b2, [x11, #0x0]\n"
2475 "ldr b3, [x10, #0x0]\n"
2476 "ldr b4, [x9, #0x0]\n"
2478 "ldr q29, [x16, #0x0]\n"
2479 ".inst 0x6f80e3a8 // udot v8.4s, v29.16b, v0.4b[0]\n"
2480 "ldr q28, [x16, #0x10]\n"
2481 ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n"
2482 ".inst 0x6f82e3b0 // udot v16.4s, v29.16b, v2.4b[0]\n"
2483 ".inst 0x6f83e3b4 // udot v20.4s, v29.16b, v3.4b[0]\n"
2484 ".inst 0x6f84e3b8 // udot v24.4s, v29.16b, v4.4b[0]\n"
2485 "ldr q29, [x16, #0x20]\n"
2486 ".inst 0x6f80e389 // udot v9.4s, v28.16b, v0.4b[0]\n"
2487 ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n"
2488 ".inst 0x6f82e391 // udot v17.4s, v28.16b, v2.4b[0]\n"
2489 ".inst 0x6f83e395 // udot v21.4s, v28.16b, v3.4b[0]\n"
2490 ".inst 0x6f84e399 // udot v25.4s, v28.16b, v4.4b[0]\n"
2491 "ldr q28, [x16, #0x30]\n"
2492 ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n"
2493 "add x16, x16, #0x40\n"
2494 ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n"
2495 ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n"
2496 ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n"
2497 ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n"
2498 ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n"
2499 ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n"
2500 ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n"
2501 ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n"
2502 ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n"
2504 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
2505 "add x15, x15, #0x1\n"
2508 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
2509 "add x24, x17, x20, LSL #2\n"
2510 "add x23, x24, x20, LSL #2\n"
2511 "add x22, x23, x20, LSL #2\n"
2512 "add x21, x22, x20, LSL #2\n"
2514 "prfm pstl1keep, [x17, #0x0]\n"
2515 "prfm pstl1keep, [x24, #0x0]\n"
2516 "prfm pstl1keep, [x23, #0x0]\n"
2517 "prfm pstl1keep, [x22, #0x0]\n"
2518 "prfm pstl1keep, [x21, #0x0]\n"
2520 "tbz x8, #3, 164f\n"
2521 "st1 { v8.4s }, [x17], #0x10\n"
2522 "st1 { v9.4s }, [x17], #0x10\n"
2523 "st1 { v12.4s }, [x24], #0x10\n"
2524 "st1 { v13.4s }, [x24], #0x10\n"
2525 "st1 { v16.4s }, [x23], #0x10\n"
2526 "st1 { v17.4s }, [x23], #0x10\n"
2527 "st1 { v20.4s }, [x22], #0x10\n"
2528 "st1 { v21.4s }, [x22], #0x10\n"
2529 "st1 { v24.4s }, [x21], #0x10\n"
2530 "st1 { v25.4s }, [x21], #0x10\n"
2531 "tbz x8, #2, 162f\n"
2532 "st1 { v10.4s }, [x17], #0x10\n"
2533 "st1 { v14.4s }, [x24], #0x10\n"
2534 "st1 { v18.4s }, [x23], #0x10\n"
2535 "st1 { v22.4s }, [x22], #0x10\n"
2536 "st1 { v26.4s }, [x21], #0x10\n"
2537 "tbz x8, #1, 161f\n"
2538 "str d11, [x17], #0x8\n"
2539 "str d15, [x24], #0x8\n"
2540 "str d19, [x23], #0x8\n"
2541 "str d23, [x22], #0x8\n"
2542 "str d27, [x21], #0x8\n"
2543 "tbz x8, #0, 168f\n"
2544 "st1 { v11.s }[2], [x17]\n"
2545 "st1 { v15.s }[2], [x24]\n"
2546 "st1 { v19.s }[2], [x23]\n"
2547 "st1 { v23.s }[2], [x22]\n"
2548 "st1 { v27.s }[2], [x21]\n"
2551 "tbz x8, #0, 168f\n"
2552 "str s11, [x17, #0x0]\n"
2553 "str s15, [x24, #0x0]\n"
2554 "str s19, [x23, #0x0]\n"
2555 "str s23, [x22, #0x0]\n"
2556 "str s27, [x21, #0x0]\n"
2559 "tbz x8, #1, 163f\n"
2560 "str d10, [x17], #0x8\n"
2561 "str d14, [x24], #0x8\n"
2562 "str d18, [x23], #0x8\n"
2563 "str d22, [x22], #0x8\n"
2564 "str d26, [x21], #0x8\n"
2565 "tbz x8, #0, 168f\n"
2566 "st1 { v10.s }[2], [x17]\n"
2567 "st1 { v14.s }[2], [x24]\n"
2568 "st1 { v18.s }[2], [x23]\n"
2569 "st1 { v22.s }[2], [x22]\n"
2570 "st1 { v26.s }[2], [x21]\n"
2573 "tbz x8, #0, 168f\n"
2574 "str s10, [x17, #0x0]\n"
2575 "str s14, [x24, #0x0]\n"
2576 "str s18, [x23, #0x0]\n"
2577 "str s22, [x22, #0x0]\n"
2578 "str s26, [x21, #0x0]\n"
2581 "tbz x8, #2, 166f\n"
2582 "st1 { v8.4s }, [x17], #0x10\n"
2583 "st1 { v12.4s }, [x24], #0x10\n"
2584 "st1 { v16.4s }, [x23], #0x10\n"
2585 "st1 { v20.4s }, [x22], #0x10\n"
2586 "st1 { v24.4s }, [x21], #0x10\n"
2587 "tbz x8, #1, 165f\n"
2588 "str d9, [x17], #0x8\n"
2589 "str d13, [x24], #0x8\n"
2590 "str d17, [x23], #0x8\n"
2591 "str d21, [x22], #0x8\n"
2592 "str d25, [x21], #0x8\n"
2593 "tbz x8, #0, 168f\n"
2594 "st1 { v9.s }[2], [x17]\n"
2595 "st1 { v13.s }[2], [x24]\n"
2596 "st1 { v17.s }[2], [x23]\n"
2597 "st1 { v21.s }[2], [x22]\n"
2598 "st1 { v25.s }[2], [x21]\n"
2601 "tbz x8, #0, 168f\n"
2602 "str s9, [x17, #0x0]\n"
2603 "str s13, [x24, #0x0]\n"
2604 "str s17, [x23, #0x0]\n"
2605 "str s21, [x22, #0x0]\n"
2606 "str s25, [x21, #0x0]\n"
2609 "tbz x8, #1, 167f\n"
2610 "str d8, [x17], #0x8\n"
2611 "str d12, [x24], #0x8\n"
2612 "str d16, [x23], #0x8\n"
2613 "str d20, [x22], #0x8\n"
2614 "str d24, [x21], #0x8\n"
2615 "tbz x8, #0, 168f\n"
2616 "st1 { v8.s }[2], [x17]\n"
2617 "st1 { v12.s }[2], [x24]\n"
2618 "st1 { v16.s }[2], [x23]\n"
2619 "st1 { v20.s }[2], [x22]\n"
2620 "st1 { v24.s }[2], [x21]\n"
2623 "str s8, [x17, #0x0]\n"
2624 "str s12, [x24, #0x0]\n"
2625 "str s16, [x23, #0x0]\n"
2626 "str s20, [x22, #0x0]\n"
2627 "str s24, [x21, #0x0]\n"
2631 "str q8, [x17, #0x0]\n"
2632 "str q9, [x17, #0x10]\n"
2633 "str q10, [x17, #0x20]\n"
2634 "str q11, [x17, #0x30]\n"
2635 "add x17, x17, #0x40\n"
2636 "str q12, [x24, #0x0]\n"
2637 "str q13, [x24, #0x10]\n"
2638 "str q14, [x24, #0x20]\n"
2639 "str q15, [x24, #0x30]\n"
2640 "str q16, [x23, #0x0]\n"
2641 "str q17, [x23, #0x10]\n"
2642 "str q18, [x23, #0x20]\n"
2643 "str q19, [x23, #0x30]\n"
2644 "str q20, [x22, #0x0]\n"
2645 "str q21, [x22, #0x10]\n"
2646 "str q22, [x22, #0x20]\n"
2647 "str q23, [x22, #0x30]\n"
2648 "str q24, [x21, #0x0]\n"
2649 "str q25, [x21, #0x10]\n"
2650 "str q26, [x21, #0x20]\n"
2651 "str q27, [x21, #0x30]\n"
2653 "subs x8, x8, #0x10\n"
2657 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
2659 "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
2660 "mov x17, %x[output_ptr]\n"
2661 "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
2662 "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
2664 "tbz %x[flags], #0, 182f\n"
2665 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
2666 "add x24, x17, x20, LSL #2\n"
2667 "add x23, x24, x20, LSL #2\n"
2668 "add x22, x23, x20, LSL #2\n"
2669 "add x21, x22, x20, LSL #2\n"
2671 "add x20, x21, x20, LSL #2\n"
2673 "tbz x8, #3, 176f\n"
2674 "ld1 { v8.4s }, [x17], #0x10\n"
2675 "ld1 { v12.4s }, [x24], #0x10\n"
2676 "ld1 { v16.4s }, [x23], #0x10\n"
2677 "ld1 { v20.4s }, [x22], #0x10\n"
2678 "ld1 { v24.4s }, [x21], #0x10\n"
2679 "ld1 { v28.4s }, [x20], #0x10\n"
2680 "ld1 { v9.4s }, [x17], #0x10\n"
2681 "ld1 { v13.4s }, [x24], #0x10\n"
2682 "ld1 { v17.4s }, [x23], #0x10\n"
2683 "ld1 { v21.4s }, [x22], #0x10\n"
2684 "ld1 { v25.4s }, [x21], #0x10\n"
2685 "ld1 { v29.4s }, [x20], #0x10\n"
2686 "tbz x8, #2, 174f\n"
2687 "ld1 { v10.4s }, [x17], #0x10\n"
2688 "ld1 { v14.4s }, [x24], #0x10\n"
2689 "ld1 { v18.4s }, [x23], #0x10\n"
2690 "ld1 { v22.4s }, [x22], #0x10\n"
2691 "ld1 { v26.4s }, [x21], #0x10\n"
2692 "ld1 { v30.4s }, [x20], #0x10\n"
2693 "tbz x8, #1, 173f\n"
2694 "ldr d11, [x17], #0x8\n"
2696 "ldr d15, [x24], #0x8\n"
2697 "ldr d19, [x23], #0x8\n"
2698 "ldr d23, [x22], #0x8\n"
2699 "ldr d27, [x21], #0x8\n"
2700 "ldr d31, [x20], #0x8\n"
2701 "tbz x8, #0, 180f\n"
2702 "ld1 { v11.s }[2], [x17]\n"
2703 "ld1 { v15.s }[2], [x24]\n"
2704 "ld1 { v19.s }[2], [x23]\n"
2705 "ld1 { v23.s }[2], [x22]\n"
2706 "ld1 { v27.s }[2], [x21]\n"
2707 "ld1 { v31.s }[2], [x20]\n"
2711 "tbz x8, #0, 180f\n"
2712 "ldr s11, [x17, #0x0]\n"
2713 "ldr s15, [x24, #0x0]\n"
2714 "ldr s19, [x23, #0x0]\n"
2715 "ldr s23, [x22, #0x0]\n"
2716 "ldr s27, [x21, #0x0]\n"
2717 "ldr s31, [x20, #0x0]\n"
2720 "tbz x8, #1, 175f\n"
2721 "ldr d10, [x17], #0x8\n"
2723 "ldr d14, [x24], #0x8\n"
2724 "ldr d18, [x23], #0x8\n"
2725 "ldr d22, [x22], #0x8\n"
2726 "ldr d26, [x21], #0x8\n"
2727 "ldr d30, [x20], #0x8\n"
2728 "tbz x8, #0, 180f\n"
2729 "ld1 { v10.s }[2], [x17]\n"
2730 "ld1 { v14.s }[2], [x24]\n"
2731 "ld1 { v18.s }[2], [x23]\n"
2732 "ld1 { v22.s }[2], [x22]\n"
2733 "ld1 { v26.s }[2], [x21]\n"
2734 "ld1 { v30.s }[2], [x20]\n"
2738 "tbz x8, #0, 180f\n"
2739 "ldr s10, [x17, #0x0]\n"
2740 "ldr s14, [x24, #0x0]\n"
2741 "ldr s18, [x23, #0x0]\n"
2742 "ldr s22, [x22, #0x0]\n"
2743 "ldr s26, [x21, #0x0]\n"
2744 "ldr s30, [x20, #0x0]\n"
2747 "tbz x8, #2, 178f\n"
2748 "ld1 { v8.4s }, [x17], #0x10\n"
2749 "ld1 { v12.4s }, [x24], #0x10\n"
2750 "ld1 { v16.4s }, [x23], #0x10\n"
2751 "ld1 { v20.4s }, [x22], #0x10\n"
2752 "ld1 { v24.4s }, [x21], #0x10\n"
2753 "ld1 { v28.4s }, [x20], #0x10\n"
2754 "tbz x8, #1, 177f\n"
2755 "ldr d9, [x17], #0x8\n"
2757 "ldr d13, [x24], #0x8\n"
2758 "ldr d17, [x23], #0x8\n"
2759 "ldr d21, [x22], #0x8\n"
2760 "ldr d25, [x21], #0x8\n"
2761 "ldr d29, [x20], #0x8\n"
2762 "tbz x8, #0, 180f\n"
2763 "ld1 { v9.s }[2], [x17]\n"
2764 "ld1 { v13.s }[2], [x24]\n"
2765 "ld1 { v17.s }[2], [x23]\n"
2766 "ld1 { v21.s }[2], [x22]\n"
2767 "ld1 { v25.s }[2], [x21]\n"
2768 "ld1 { v29.s }[2], [x20]\n"
2772 "tbz x8, #0, 180f\n"
2773 "ldr s9, [x17, #0x0]\n"
2774 "ldr s13, [x24, #0x0]\n"
2775 "ldr s17, [x23, #0x0]\n"
2776 "ldr s21, [x22, #0x0]\n"
2777 "ldr s25, [x21, #0x0]\n"
2778 "ldr s29, [x20, #0x0]\n"
2781 "tbz x8, #1, 179f\n"
2782 "ldr d8, [x17], #0x8\n"
2784 "ldr d12, [x24], #0x8\n"
2785 "ldr d16, [x23], #0x8\n"
2786 "ldr d20, [x22], #0x8\n"
2787 "ldr d24, [x21], #0x8\n"
2788 "ldr d28, [x20], #0x8\n"
2789 "tbz x8, #0, 180f\n"
2790 "ld1 { v8.s }[2], [x17]\n"
2791 "ld1 { v12.s }[2], [x24]\n"
2792 "ld1 { v16.s }[2], [x23]\n"
2793 "ld1 { v20.s }[2], [x22]\n"
2794 "ld1 { v24.s }[2], [x21]\n"
2795 "ld1 { v28.s }[2], [x20]\n"
2798 "ldr s8, [x17, #0x0]\n"
2800 "ldr s12, [x24, #0x0]\n"
2801 "ldr s16, [x23, #0x0]\n"
2802 "ldr s20, [x22, #0x0]\n"
2803 "ldr s24, [x21, #0x0]\n"
2804 "ldr s28, [x20, #0x0]\n"
2806 "sub x17, x17, x25\n"
2809 "ldr q8, [x17, #0x0]\n"
2810 "ldr q9, [x17, #0x10]\n"
2811 "ldr q10, [x17, #0x20]\n"
2812 "ldr q11, [x17, #0x30]\n"
2813 "ldr q12, [x24, #0x0]\n"
2814 "ldr q13, [x24, #0x10]\n"
2815 "ldr q14, [x24, #0x20]\n"
2816 "ldr q15, [x24, #0x30]\n"
2817 "ldr q16, [x23, #0x0]\n"
2818 "ldr q17, [x23, #0x10]\n"
2819 "ldr q18, [x23, #0x20]\n"
2820 "ldr q19, [x23, #0x30]\n"
2821 "ldr q20, [x22, #0x0]\n"
2822 "ldr q21, [x22, #0x10]\n"
2823 "ldr q22, [x22, #0x20]\n"
2824 "ldr q23, [x22, #0x30]\n"
2825 "ldr q24, [x21, #0x0]\n"
2826 "ldr q25, [x21, #0x10]\n"
2827 "ldr q26, [x21, #0x20]\n"
2828 "ldr q27, [x21, #0x30]\n"
2829 "ldr q28, [x20, #0x0]\n"
2830 "ldr q29, [x20, #0x10]\n"
2831 "ldr q30, [x20, #0x20]\n"
2832 "ldr q31, [x20, #0x30]\n"
2835 "movi v8.4s, #0x0\n"
2836 "movi v9.4s, #0x0\n"
2837 "movi v10.4s, #0x0\n"
2838 "movi v11.4s, #0x0\n"
2839 "movi v12.4s, #0x0\n"
2840 "movi v13.4s, #0x0\n"
2841 "movi v14.4s, #0x0\n"
2842 "movi v15.4s, #0x0\n"
2843 "movi v16.4s, #0x0\n"
2844 "movi v17.4s, #0x0\n"
2845 "movi v18.4s, #0x0\n"
2846 "movi v19.4s, #0x0\n"
2847 "movi v20.4s, #0x0\n"
2848 "movi v21.4s, #0x0\n"
2849 "movi v22.4s, #0x0\n"
2850 "movi v23.4s, #0x0\n"
2851 "movi v24.4s, #0x0\n"
2852 "movi v25.4s, #0x0\n"
2853 "movi v26.4s, #0x0\n"
2854 "movi v27.4s, #0x0\n"
2855 "movi v28.4s, #0x0\n"
2856 "movi v29.4s, #0x0\n"
2857 "movi v30.4s, #0x0\n"
2858 "movi v31.4s, #0x0\n"
2862 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
2863 "ldr w14, [x20, x15, LSL #0x2]\n"
2864 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
2865 "tbz %x[flags], #3, 185f\n"
2866 "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n"
2867 "add x20, x20, x21, LSL #3\n"
2868 "ldr x13, [x20, #0x0]\n"
2869 "ldr x12, [x20, #0x8]\n"
2870 "ldr x11, [x20, #0x10]\n"
2871 "ldr x10, [x20, #0x18]\n"
2872 "ldr x9, [x20, #0x20]\n"
2873 "ldr x28, [x20, #0x28]\n"
2875 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
2876 "add x13, x13, x20\n"
2877 "add x12, x12, x20\n"
2878 "add x11, x11, x20\n"
2879 "add x10, x10, x20\n"
2881 "add x28, x28, x20\n"
2884 "mov x13, %x[input_ptr]\n"
2885 "add x12, x13, x21\n"
2886 "add x11, x12, x21\n"
2887 "add x10, x11, x21\n"
2888 "add x9, x10, x21\n"
2889 "add x28, x9, x21\n"
2893 "ldr q0, [x13, #0x0]\n"
2895 "ldr q1, [x12, #0x0]\n"
2896 "ldr q2, [x11, #0x0]\n"
2897 "ldr q3, [x10, #0x0]\n"
2898 "ldr q4, [x9, #0x0]\n"
2899 "ldr q5, [x28, #0x0]\n"
2900 "ldr q6, [x16, #0x0]\n"
2901 "ldr q7, [x16, #0x10]\n"
2904 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
2905 "ldr x21, [x16, #0x28]\n"
2906 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
2907 "ldr x20, [x16, #0x38]\n"
2908 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
2909 "add x13, x13, #0x10\n"
2910 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
2911 "add x12, x12, #0x10\n"
2912 ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
2913 "add x11, x11, #0x10\n"
2914 ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
2915 "ldr d6, [x16, #0x20]\n"
2916 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
2917 "mov v6.d[1], x21\n"
2918 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
2919 "ldr x21, [x16, #0x48]\n"
2920 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
2921 "add x10, x10, #0x10\n"
2922 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
2923 "add x9, x9, #0x10\n"
2924 ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
2925 "add x28, x28, #0x10\n"
2926 ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
2927 "ldr d7, [x16, #0x30]\n"
2928 "mov v7.d[1], x20\n"
2929 ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
2930 ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
2931 "ldr x20, [x16, #0x58]\n"
2932 ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
2933 "ldr x27, [x13, #0x8]\n"
2934 ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
2935 "ldr x26, [x12, #0x8]\n"
2936 ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
2937 "ldr x25, [x11, #0x8]\n"
2938 ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
2939 "ldr d6, [x16, #0x40]\n"
2940 ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
2941 "mov v6.d[1], x21\n"
2942 ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
2943 "ldr x21, [x16, #0x68]\n"
2944 ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
2945 "ldr x24, [x10, #0x8]\n"
2946 ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
2947 "ldr x23, [x9, #0x8]\n"
2948 ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
2949 "ldr x22, [x28, #0x8]\n"
2950 ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
2951 "ldr d7, [x16, #0x50]\n"
2952 "mov v7.d[1], x20\n"
2953 ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
2954 ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
2955 "ldr x20, [x16, #0x78]\n"
2956 ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
2957 "sub x14, x14, #0x10\n"
2958 ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
2960 ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
2961 "prfm pldl1keep, [x13, #0x80]\n"
2962 ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
2963 "ldr d6, [x16, #0x60]\n"
2964 ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
2965 "mov v6.d[1], x21\n"
2966 ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
2967 "ldr x21, [x16, #0x88]\n"
2968 ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
2969 "prfm pldl1keep, [x12, #0x80]\n"
2970 ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
2971 "prfm pldl1keep, [x11, #0x80]\n"
2972 ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
2973 "prfm pldl1keep, [x10, #0x80]\n"
2974 ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
2975 "ldr d7, [x16, #0x70]\n"
2976 "mov v7.d[1], x20\n"
2977 ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
2978 ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
2979 "ldr x20, [x16, #0x98]\n"
2980 ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
2981 "prfm pldl1keep, [x9, #0x80]\n"
2982 ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
2983 "prfm pldl1keep, [x28, #0x80]\n"
2984 ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
2985 ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
2986 "ldr d6, [x16, #0x80]\n"
2987 ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
2988 "mov v6.d[1], x21\n"
2989 ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
2990 "ldr x21, [x16, #0xa8]\n"
2991 ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
2992 ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
2993 ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
2994 ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
2995 "ldr d7, [x16, #0x90]\n"
2996 "mov v7.d[1], x20\n"
2997 ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
2998 ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
2999 "ldr x20, [x16, #0xb8]\n"
3000 ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
3001 ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
3002 ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
3003 ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
3004 "ldr d6, [x16, #0xa0]\n"
3005 ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
3006 "mov v6.d[1], x21\n"
3007 ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
3008 "ldr x21, [x16, #0xc8]\n"
3009 ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
3010 ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
3011 ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
3012 ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
3013 "ldr d7, [x16, #0xb0]\n"
3014 "mov v7.d[1], x20\n"
3015 ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
3016 ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
3017 "ldr x20, [x16, #0xd8]\n"
3018 ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
3019 ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
3020 ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
3021 ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
3022 "ldr d6, [x16, #0xc0]\n"
3023 ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
3024 "mov v6.d[1], x21\n"
3025 ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
3026 "ldr x21, [x16, #0xe8]\n"
3027 ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
3028 ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
3029 ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
3030 ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
3031 "ldr d7, [x16, #0xd0]\n"
3032 "mov v7.d[1], x20\n"
3033 ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
3034 ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
3035 "ldr x20, [x16, #0xf8]\n"
3036 ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
3037 ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
3038 ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
3039 ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
3040 "ldr d6, [x16, #0xe0]\n"
3041 ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
3042 "mov v6.d[1], x21\n"
3043 ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
3044 ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
3045 ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
3046 ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
3047 ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
3048 "ldr d7, [x16, #0xf0]\n"
3049 "mov v7.d[1], x20\n"
3050 "add x16, x16, #0x100\n"
3051 ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
3052 "ldr x21, [x16, #0x8]\n"
3053 ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
3054 "ldr x20, [x16, #0x18]\n"
3055 ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
3056 ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
3057 ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
3058 ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
3059 "ldr d6, [x16, #0x0]\n"
3060 ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
3061 "ldr d0, [x13, #0x0]\n"
3062 ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
3063 "ldr d1, [x12, #0x0]\n"
3064 ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
3065 "ldr d2, [x11, #0x0]\n"
3066 ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
3067 "ldr d3, [x10, #0x0]\n"
3068 ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
3069 "ldr d4, [x9, #0x0]\n"
3070 ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
3071 "ldr d5, [x28, #0x0]\n"
3072 "ldr d7, [x16, #0x10]\n"
3073 "mov v6.d[1], x21\n"
3074 "mov v0.d[1], x27\n"
3075 "mov v1.d[1], x26\n"
3076 "mov v2.d[1], x25\n"
3077 "mov v3.d[1], x24\n"
3078 "mov v4.d[1], x23\n"
3079 "mov v5.d[1], x22\n"
3080 "mov v7.d[1], x20\n"
3083 ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
3084 "add x13, x13, #0x10\n"
3085 ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
3086 "add x12, x12, #0x10\n"
3087 ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
3088 "add x11, x11, #0x10\n"
3089 ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
3090 "add x10, x10, #0x10\n"
3091 ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
3092 "add x9, x9, #0x10\n"
3093 ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
3094 "ldr q6, [x16, #0x20]\n"
3095 ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
3096 "add x28, x28, #0x10\n"
3097 ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
3098 "sub x14, x14, #0x10\n"
3099 ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
3100 "prfm pldl1keep, [x13, #0x80]\n"
3101 ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
3102 "prfm pldl1keep, [x12, #0x80]\n"
3103 ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
3104 "prfm pldl1keep, [x11, #0x80]\n"
3105 ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
3106 "ldr q7, [x16, #0x30]\n"
3107 ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
3108 "prfm pldl1keep, [x10, #0x80]\n"
3109 ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
3110 "prfm pldl1keep, [x9, #0x80]\n"
3111 ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
3112 "prfm pldl1keep, [x28, #0x80]\n"
3113 ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
3114 ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
3115 ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
3116 "ldr q6, [x16, #0x40]\n"
3117 ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
3118 ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
3119 ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
3120 ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
3121 ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
3122 ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
3123 "ldr q7, [x16, #0x50]\n"
3124 ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
3125 ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
3126 ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
3127 ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
3128 ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
3129 ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
3130 "ldr q6, [x16, #0x60]\n"
3131 ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
3132 ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
3133 ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
3134 ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
3135 ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
3136 ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
3137 "ldr q7, [x16, #0x70]\n"
3138 ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
3139 ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
3140 ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
3141 ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
3142 ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
3143 ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
3144 "ldr q6, [x16, #0x80]\n"
3145 ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
3146 ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
3147 ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
3148 ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
3149 ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
3150 ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
3151 "ldr q7, [x16, #0x90]\n"
3152 ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
3153 ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
3154 ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
3155 ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
3156 ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
3157 ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
3158 "ldr q6, [x16, #0xa0]\n"
3159 ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
3160 ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
3161 ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
3162 ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
3163 ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
3164 ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
3165 "ldr q7, [x16, #0xb0]\n"
3166 ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
3167 ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
3168 ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
3169 ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
3170 ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
3171 ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
3172 "ldr q6, [x16, #0xc0]\n"
3173 ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
3174 ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
3175 ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
3176 ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
3177 ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
3178 ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
3179 "ldr q7, [x16, #0xd0]\n"
3180 ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
3181 ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
3182 ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
3183 ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
3184 ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
3185 ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
3186 "ldr q6, [x16, #0xe0]\n"
3187 ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
3188 ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
3189 ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
3190 ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
3191 ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
3192 ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
3193 "ldr q7, [x16, #0xf0]\n"
3194 ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
3195 "add x16, x16, #0x100\n"
3196 ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
3197 ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
3198 ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
3199 ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
3200 ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
3201 ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
3202 ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
3203 ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
3204 ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
3205 ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
3206 ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
3212 "ldr s7, [x13], #0x4\n"
3213 "sub x14, x14, #0x4\n"
3214 "ldr s6, [x12], #0x4\n"
3216 "ldr s5, [x11], #0x4\n"
3217 "ldr s4, [x10], #0x4\n"
3218 "ldr s3, [x9], #0x4\n"
3219 "ldr s2, [x28], #0x4\n"
3220 "ldr q1, [x16, #0x0]\n"
3221 ".inst 0x6f87e028 // udot v8.4s, v1.16b, v7.4b[0]\n"
3222 "ldr q0, [x16, #0x10]\n"
3223 ".inst 0x6f86e02c // udot v12.4s, v1.16b, v6.4b[0]\n"
3224 ".inst 0x6f85e030 // udot v16.4s, v1.16b, v5.4b[0]\n"
3225 ".inst 0x6f84e034 // udot v20.4s, v1.16b, v4.4b[0]\n"
3226 ".inst 0x6f83e038 // udot v24.4s, v1.16b, v3.4b[0]\n"
3227 ".inst 0x6f82e03c // udot v28.4s, v1.16b, v2.4b[0]\n"
3228 "ldr q1, [x16, #0x20]\n"
3229 ".inst 0x6f87e009 // udot v9.4s, v0.16b, v7.4b[0]\n"
3230 ".inst 0x6f86e00d // udot v13.4s, v0.16b, v6.4b[0]\n"
3231 ".inst 0x6f85e011 // udot v17.4s, v0.16b, v5.4b[0]\n"
3232 ".inst 0x6f84e015 // udot v21.4s, v0.16b, v4.4b[0]\n"
3233 ".inst 0x6f83e019 // udot v25.4s, v0.16b, v3.4b[0]\n"
3234 ".inst 0x6f82e01d // udot v29.4s, v0.16b, v2.4b[0]\n"
3235 "ldr q0, [x16, #0x30]\n"
3236 ".inst 0x6f87e02a // udot v10.4s, v1.16b, v7.4b[0]\n"
3237 "add x16, x16, #0x40\n"
3238 ".inst 0x6f86e02e // udot v14.4s, v1.16b, v6.4b[0]\n"
3239 ".inst 0x6f85e032 // udot v18.4s, v1.16b, v5.4b[0]\n"
3240 ".inst 0x6f84e036 // udot v22.4s, v1.16b, v4.4b[0]\n"
3241 ".inst 0x6f83e03a // udot v26.4s, v1.16b, v3.4b[0]\n"
3242 ".inst 0x6f82e03e // udot v30.4s, v1.16b, v2.4b[0]\n"
3243 ".inst 0x6f87e00b // udot v11.4s, v0.16b, v7.4b[0]\n"
3244 ".inst 0x6f86e00f // udot v15.4s, v0.16b, v6.4b[0]\n"
3245 ".inst 0x6f85e013 // udot v19.4s, v0.16b, v5.4b[0]\n"
3246 ".inst 0x6f84e017 // udot v23.4s, v0.16b, v4.4b[0]\n"
3247 ".inst 0x6f83e01b // udot v27.4s, v0.16b, v3.4b[0]\n"
3248 ".inst 0x6f82e01f // udot v31.4s, v0.16b, v2.4b[0]\n"
3252 "tbz x14, #1, 192f\n"
3253 "ldr h0, [x13], #0x2\n"
3254 "ldr h1, [x12], #0x2\n"
3255 "ldr h2, [x11], #0x2\n"
3256 "ldr h3, [x10], #0x2\n"
3257 "ldr h4, [x9], #0x2\n"
3258 "ldr h5, [x28], #0x2\n"
3259 "tbz x14, #0, 193f\n"
3260 "ld1 { v0.b }[2], [x13]\n"
3261 "ld1 { v1.b }[2], [x12]\n"
3262 "ld1 { v2.b }[2], [x11]\n"
3263 "ld1 { v3.b }[2], [x10]\n"
3264 "ld1 { v4.b }[2], [x9]\n"
3265 "ld1 { v5.b }[2], [x28]\n"
3268 "ldr b0, [x13, #0x0]\n"
3269 "ldr b1, [x12, #0x0]\n"
3270 "ldr b2, [x11, #0x0]\n"
3271 "ldr b3, [x10, #0x0]\n"
3272 "ldr b4, [x9, #0x0]\n"
3273 "ldr b5, [x28, #0x0]\n"
3275 "ldr q7, [x16, #0x0]\n"
3276 ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
3277 "ldr q6, [x16, #0x10]\n"
3278 ".inst 0x6f81e0ec // udot v12.4s, v7.16b, v1.4b[0]\n"
3279 ".inst 0x6f82e0f0 // udot v16.4s, v7.16b, v2.4b[0]\n"
3280 ".inst 0x6f83e0f4 // udot v20.4s, v7.16b, v3.4b[0]\n"
3281 ".inst 0x6f84e0f8 // udot v24.4s, v7.16b, v4.4b[0]\n"
3282 ".inst 0x6f85e0fc // udot v28.4s, v7.16b, v5.4b[0]\n"
3283 "ldr q7, [x16, #0x20]\n"
3284 ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n"
3285 ".inst 0x6f81e0cd // udot v13.4s, v6.16b, v1.4b[0]\n"
3286 ".inst 0x6f82e0d1 // udot v17.4s, v6.16b, v2.4b[0]\n"
3287 ".inst 0x6f83e0d5 // udot v21.4s, v6.16b, v3.4b[0]\n"
3288 ".inst 0x6f84e0d9 // udot v25.4s, v6.16b, v4.4b[0]\n"
3289 ".inst 0x6f85e0dd // udot v29.4s, v6.16b, v5.4b[0]\n"
3290 "ldr q6, [x16, #0x30]\n"
3291 ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n"
3292 "add x16, x16, #0x40\n"
3293 ".inst 0x6f81e0ee // udot v14.4s, v7.16b, v1.4b[0]\n"
3294 ".inst 0x6f82e0f2 // udot v18.4s, v7.16b, v2.4b[0]\n"
3295 ".inst 0x6f83e0f6 // udot v22.4s, v7.16b, v3.4b[0]\n"
3296 ".inst 0x6f84e0fa // udot v26.4s, v7.16b, v4.4b[0]\n"
3297 ".inst 0x6f85e0fe // udot v30.4s, v7.16b, v5.4b[0]\n"
3298 ".inst 0x6f80e0cb // udot v11.4s, v6.16b, v0.4b[0]\n"
3299 ".inst 0x6f81e0cf // udot v15.4s, v6.16b, v1.4b[0]\n"
3300 ".inst 0x6f82e0d3 // udot v19.4s, v6.16b, v2.4b[0]\n"
3301 ".inst 0x6f83e0d7 // udot v23.4s, v6.16b, v3.4b[0]\n"
3302 ".inst 0x6f84e0db // udot v27.4s, v6.16b, v4.4b[0]\n"
3303 ".inst 0x6f85e0df // udot v31.4s, v6.16b, v5.4b[0]\n"
3305 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
3306 "add x15, x15, #0x1\n"
3309 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
3310 "add x24, x17, x20, LSL #2\n"
3311 "add x23, x24, x20, LSL #2\n"
3312 "add x22, x23, x20, LSL #2\n"
3313 "add x21, x22, x20, LSL #2\n"
3314 "add x20, x21, x20, LSL #2\n"
3316 "prfm pstl1keep, [x17, #0x0]\n"
3317 "prfm pstl1keep, [x24, #0x0]\n"
3318 "prfm pstl1keep, [x23, #0x0]\n"
3319 "prfm pstl1keep, [x22, #0x0]\n"
3320 "prfm pstl1keep, [x21, #0x0]\n"
3321 "prfm pstl1keep, [x20, #0x0]\n"
3323 "tbz x8, #3, 198f\n"
3324 "st1 { v8.4s }, [x17], #0x10\n"
3325 "st1 { v9.4s }, [x17], #0x10\n"
3326 "st1 { v12.4s }, [x24], #0x10\n"
3327 "st1 { v13.4s }, [x24], #0x10\n"
3328 "st1 { v16.4s }, [x23], #0x10\n"
3329 "st1 { v17.4s }, [x23], #0x10\n"
3330 "st1 { v20.4s }, [x22], #0x10\n"
3331 "st1 { v21.4s }, [x22], #0x10\n"
3332 "st1 { v24.4s }, [x21], #0x10\n"
3333 "st1 { v25.4s }, [x21], #0x10\n"
3334 "st1 { v28.4s }, [x20], #0x10\n"
3335 "st1 { v29.4s }, [x20], #0x10\n"
3336 "tbz x8, #2, 196f\n"
3337 "st1 { v10.4s }, [x17], #0x10\n"
3338 "st1 { v14.4s }, [x24], #0x10\n"
3339 "st1 { v18.4s }, [x23], #0x10\n"
3340 "st1 { v22.4s }, [x22], #0x10\n"
3341 "st1 { v26.4s }, [x21], #0x10\n"
3342 "st1 { v30.4s }, [x20], #0x10\n"
3343 "tbz x8, #1, 195f\n"
3344 "str d11, [x17], #0x8\n"
3345 "str d15, [x24], #0x8\n"
3346 "str d19, [x23], #0x8\n"
3347 "str d23, [x22], #0x8\n"
3348 "str d27, [x21], #0x8\n"
3349 "str d31, [x20], #0x8\n"
3350 "tbz x8, #0, 202f\n"
3351 "st1 { v11.s }[2], [x17]\n"
3352 "st1 { v15.s }[2], [x24]\n"
3353 "st1 { v19.s }[2], [x23]\n"
3354 "st1 { v23.s }[2], [x22]\n"
3355 "st1 { v27.s }[2], [x21]\n"
3356 "st1 { v31.s }[2], [x20]\n"
3359 "tbz x8, #0, 202f\n"
3360 "str s11, [x17, #0x0]\n"
3361 "str s15, [x24, #0x0]\n"
3362 "str s19, [x23, #0x0]\n"
3363 "str s23, [x22, #0x0]\n"
3364 "str s27, [x21, #0x0]\n"
3365 "str s31, [x20, #0x0]\n"
3368 "tbz x8, #1, 197f\n"
3369 "str d10, [x17], #0x8\n"
3370 "str d14, [x24], #0x8\n"
3371 "str d18, [x23], #0x8\n"
3372 "str d22, [x22], #0x8\n"
3373 "str d26, [x21], #0x8\n"
3374 "str d30, [x20], #0x8\n"
3375 "tbz x8, #0, 202f\n"
3376 "st1 { v10.s }[2], [x17]\n"
3377 "st1 { v14.s }[2], [x24]\n"
3378 "st1 { v18.s }[2], [x23]\n"
3379 "st1 { v22.s }[2], [x22]\n"
3380 "st1 { v26.s }[2], [x21]\n"
3381 "st1 { v30.s }[2], [x20]\n"
3384 "tbz x8, #0, 202f\n"
3385 "str s10, [x17, #0x0]\n"
3386 "str s14, [x24, #0x0]\n"
3387 "str s18, [x23, #0x0]\n"
3388 "str s22, [x22, #0x0]\n"
3389 "str s26, [x21, #0x0]\n"
3390 "str s30, [x20, #0x0]\n"
3393 "tbz x8, #2, 200f\n"
3394 "st1 { v8.4s }, [x17], #0x10\n"
3395 "st1 { v12.4s }, [x24], #0x10\n"
3396 "st1 { v16.4s }, [x23], #0x10\n"
3397 "st1 { v20.4s }, [x22], #0x10\n"
3398 "st1 { v24.4s }, [x21], #0x10\n"
3399 "st1 { v28.4s }, [x20], #0x10\n"
3400 "tbz x8, #1, 199f\n"
3401 "str d9, [x17], #0x8\n"
3402 "str d13, [x24], #0x8\n"
3403 "str d17, [x23], #0x8\n"
3404 "str d21, [x22], #0x8\n"
3405 "str d25, [x21], #0x8\n"
3406 "str d29, [x20], #0x8\n"
3407 "tbz x8, #0, 202f\n"
3408 "st1 { v9.s }[2], [x17]\n"
3409 "st1 { v13.s }[2], [x24]\n"
3410 "st1 { v17.s }[2], [x23]\n"
3411 "st1 { v21.s }[2], [x22]\n"
3412 "st1 { v25.s }[2], [x21]\n"
3413 "st1 { v29.s }[2], [x20]\n"
3416 "tbz x8, #0, 202f\n"
3417 "str s9, [x17, #0x0]\n"
3418 "str s13, [x24, #0x0]\n"
3419 "str s17, [x23, #0x0]\n"
3420 "str s21, [x22, #0x0]\n"
3421 "str s25, [x21, #0x0]\n"
3422 "str s29, [x20, #0x0]\n"
3425 "tbz x8, #1, 201f\n"
3426 "str d8, [x17], #0x8\n"
3427 "str d12, [x24], #0x8\n"
3428 "str d16, [x23], #0x8\n"
3429 "str d20, [x22], #0x8\n"
3430 "str d24, [x21], #0x8\n"
3431 "str d28, [x20], #0x8\n"
3432 "tbz x8, #0, 202f\n"
3433 "st1 { v8.s }[2], [x17]\n"
3434 "st1 { v12.s }[2], [x24]\n"
3435 "st1 { v16.s }[2], [x23]\n"
3436 "st1 { v20.s }[2], [x22]\n"
3437 "st1 { v24.s }[2], [x21]\n"
3438 "st1 { v28.s }[2], [x20]\n"
3441 "str s8, [x17, #0x0]\n"
3442 "str s12, [x24, #0x0]\n"
3443 "str s16, [x23, #0x0]\n"
3444 "str s20, [x22, #0x0]\n"
3445 "str s24, [x21, #0x0]\n"
3446 "str s28, [x20, #0x0]\n"
3450 "str q8, [x17, #0x0]\n"
3451 "str q9, [x17, #0x10]\n"
3452 "str q10, [x17, #0x20]\n"
3453 "str q11, [x17, #0x30]\n"
3454 "add x17, x17, #0x40\n"
3455 "str q12, [x24, #0x0]\n"
3456 "str q13, [x24, #0x10]\n"
3457 "str q14, [x24, #0x20]\n"
3458 "str q15, [x24, #0x30]\n"
3459 "str q16, [x23, #0x0]\n"
3460 "str q17, [x23, #0x10]\n"
3461 "str q18, [x23, #0x20]\n"
3462 "str q19, [x23, #0x30]\n"
3463 "str q20, [x22, #0x0]\n"
3464 "str q21, [x22, #0x10]\n"
3465 "str q22, [x22, #0x20]\n"
3466 "str q23, [x22, #0x30]\n"
3467 "str q24, [x21, #0x0]\n"
3468 "str q25, [x21, #0x10]\n"
3469 "str q26, [x21, #0x20]\n"
3470 "str q27, [x21, #0x30]\n"
3471 "str q28, [x20, #0x0]\n"
3472 "str q29, [x20, #0x10]\n"
3473 "str q30, [x20, #0x20]\n"
3474 "str q31, [x20, #0x30]\n"
3476 "subs x8, x8, #0x10\n"
3478 "subs %x[M], %x[M], #0x6\n"
3480 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
3481 "tbz %x[flags], #3, 205f\n"
3482 "add x21, x21, #0x6\n"
3483 "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
3487 "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
3490 : [
M]
"+&r" (
M), [input_ptr]
"+&r" (input_ptr), [output_ptr]
"+&r" (output_ptr)
3491 : [args_ptr]
"r" (&ka), [flags]
"r" (flags), [offsetof_B_ptr]
"I" (offsetof(KernelArgs, B_ptr)), [offsetof_N]
"I" (offsetof(KernelArgs,
N)), [offsetof_input_initial_col]
"I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset]
"I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings]
"I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset]
"I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths]
"I" (offsetof(KernelArgs, string_lengths))
3492 :
"cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28"
3497 #endif // __aarch64__