24 #ifdef ARM_COMPUTE_ENABLE_SVE
27 #include "../../utils.hpp"
33 void sve_hybrid_s8s32_dot_6x4VL (
34 unsigned int num_strings,
const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
35 size_t M,
size_t N,
const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
40 unsigned int num_strings = {};
41 const unsigned int *string_lengths = {};
43 const int8_t *B_ptr = {};
44 size_t output_offset = {};
45 size_t input_initial_col = {};
46 size_t input_offset = {};
49 unsigned long flags=0;
53 if (output_arg.is_indirect) {
54 output_ptr=(
void *)(output_arg.indirect.ptr);
55 ka.output_offset=output_arg.indirect.offset;
58 output_ptr=(
void *)(output_arg.direct.base);
59 ka.output_offset=output_arg.direct.stride;
62 if (A_arg.is_indirect) {
63 input_ptr=(
void *)(A_arg.indirect.ptr);
64 ka.input_offset=A_arg.indirect.start_row;
65 ka.input_initial_col=A_arg.indirect.start_col;
68 assert(num_strings==1);
69 input_ptr=(
void *)(A_arg.direct.base);
70 ka.input_offset=A_arg.direct.stride;
75 ka.num_strings = num_strings;
76 ka.string_lengths = string_lengths;
90 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
91 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
92 "mov x9, %x[output_ptr]\n"
95 "whilelt p4.s, x20, x11\n"
97 "whilelt p3.s, x20, x11\n"
99 "whilelt p2.s, x20, x11\n"
101 "whilelt p1.s, x20, x11\n"
102 "tbz %x[flags], #0, 3f\n"
103 "ld1w { z8.s }, p4/Z, [x9]\n"
104 "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
105 "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
106 "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
116 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
117 "ldr w27, [x20, x28, LSL #0x2]\n"
118 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
119 "tbz %x[flags], #3, 6f\n"
120 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
121 "add x20, x20, x21, LSL #3\n"
122 "ldr x26, [x20, #0x0]\n"
124 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
125 "add x26, x26, x20\n"
128 "mov x26, %x[input_ptr]\n"
133 "whilelt p0.b, XZR, x27\n"
134 "ld1rqb { z0.b }, p0/Z, [x26]\n"
135 "ld1b { z16.b }, p5/Z, [x10]\n"
136 "sdot z8.s, z16.b, z0.b[0]\n"
137 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
138 "sdot z9.s, z16.b, z0.b[0]\n"
139 "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n"
140 "sdot z10.s, z16.b, z0.b[0]\n"
141 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
142 "sdot z11.s, z16.b, z0.b[0]\n"
143 "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n"
144 "sdot z8.s, z16.b, z0.b[1]\n"
145 "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
146 "sdot z9.s, z16.b, z0.b[1]\n"
147 "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n"
148 "sdot z10.s, z16.b, z0.b[1]\n"
149 "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
150 "addvl x10, x10, #16\n"
151 "sdot z11.s, z16.b, z0.b[1]\n"
152 "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
153 "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
154 "sdot z8.s, z17.b, z0.b[2]\n"
155 "sdot z9.s, z16.b, z0.b[2]\n"
156 "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
157 "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
158 "sdot z10.s, z17.b, z0.b[2]\n"
159 "sdot z11.s, z16.b, z0.b[2]\n"
160 "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
161 "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
162 "sdot z8.s, z17.b, z0.b[3]\n"
163 "sdot z9.s, z16.b, z0.b[3]\n"
164 "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
165 "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
166 "sub x27, x27, #0x10\n"
168 "sdot z10.s, z17.b, z0.b[3]\n"
169 "sdot z11.s, z16.b, z0.b[3]\n"
170 "add x26, x26, #0x10\n"
173 "whilelt p0.b, XZR, x27\n"
174 "ld1rqb { z0.b }, p0/Z, [x26]\n"
175 "ld1b { z16.b }, p5/Z, [x10]\n"
176 "sdot z8.s, z16.b, z0.b[0]\n"
177 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
178 "sdot z9.s, z16.b, z0.b[0]\n"
179 "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
180 "subs x27, x27, #0x4\n"
181 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
182 "sdot z10.s, z17.b, z0.b[0]\n"
183 "sdot z11.s, z16.b, z0.b[0]\n"
184 "addvl x10, x10, #4\n"
186 "ld1b { z17.b }, p5/Z, [x10]\n"
187 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
188 "sdot z8.s, z17.b, z0.b[1]\n"
189 "sdot z9.s, z16.b, z0.b[1]\n"
190 "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
191 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
192 "subs x27, x27, #0x4\n"
193 "sdot z10.s, z17.b, z0.b[1]\n"
194 "sdot z11.s, z16.b, z0.b[1]\n"
195 "addvl x10, x10, #4\n"
197 "ld1b { z17.b }, p5/Z, [x10]\n"
198 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
199 "sdot z8.s, z17.b, z0.b[2]\n"
200 "sdot z9.s, z16.b, z0.b[2]\n"
201 "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
202 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
203 "subs x27, x27, #0x4\n"
204 "sdot z10.s, z17.b, z0.b[2]\n"
205 "sdot z11.s, z16.b, z0.b[2]\n"
206 "addvl x10, x10, #4\n"
208 "ld1b { z17.b }, p5/Z, [x10]\n"
209 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
210 "sdot z8.s, z17.b, z0.b[3]\n"
211 "sdot z9.s, z16.b, z0.b[3]\n"
212 "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
213 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
214 "sdot z10.s, z17.b, z0.b[3]\n"
215 "sdot z11.s, z16.b, z0.b[3]\n"
216 "addvl x10, x10, #4\n"
218 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
219 "add x28, x28, #0x1\n"
222 "st1w { z8.s }, p4, [x9]\n"
223 "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
224 "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
225 "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
228 "decw x11, ALL, MUL #4\n"
233 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
234 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
235 "mov x9, %x[output_ptr]\n"
238 "whilelt p4.s, x20, x11\n"
240 "whilelt p3.s, x20, x11\n"
242 "whilelt p2.s, x20, x11\n"
244 "whilelt p1.s, x20, x11\n"
245 "tbz %x[flags], #0, 14f\n"
246 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
247 "add x20, x9, x20, LSL #2\n"
248 "ld1w { z8.s }, p4/Z, [x9]\n"
249 "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
250 "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
251 "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
252 "ld1w { z12.s }, p4/Z, [x20]\n"
253 "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
254 "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
255 "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
269 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
270 "ldr w27, [x20, x28, LSL #0x2]\n"
271 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
272 "tbz %x[flags], #3, 17f\n"
273 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
274 "add x20, x20, x21, LSL #3\n"
275 "ldr x26, [x20, #0x0]\n"
276 "ldr x25, [x20, #0x8]\n"
278 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
279 "add x26, x26, x20\n"
280 "add x25, x25, x20\n"
283 "mov x26, %x[input_ptr]\n"
284 "add x25, x26, x21\n"
289 "whilelt p0.b, XZR, x27\n"
290 "ld1rqb { z1.b }, p0/Z, [x26]\n"
291 "ld1rqb { z0.b }, p0/Z, [x25]\n"
292 "sub x27, x27, #0x10\n"
293 "ld1b { z17.b }, p5/Z, [x10]\n"
294 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
295 "sdot z8.s, z17.b, z1.b[0]\n"
296 "sdot z12.s, z17.b, z0.b[0]\n"
297 "sdot z9.s, z16.b, z1.b[0]\n"
298 "sdot z13.s, z16.b, z0.b[0]\n"
299 "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
300 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
301 "sdot z10.s, z17.b, z1.b[0]\n"
302 "sdot z14.s, z17.b, z0.b[0]\n"
303 "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n"
305 "sdot z11.s, z16.b, z1.b[0]\n"
306 "sdot z15.s, z16.b, z0.b[0]\n"
307 "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n"
308 "add x26, x26, #0x10\n"
309 "sdot z8.s, z17.b, z1.b[1]\n"
310 "sdot z12.s, z17.b, z0.b[1]\n"
311 "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n"
312 "add x25, x25, #0x10\n"
313 "sdot z9.s, z16.b, z1.b[1]\n"
314 "sdot z13.s, z16.b, z0.b[1]\n"
315 "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n"
316 "addvl x10, x10, #16\n"
317 "sdot z10.s, z17.b, z1.b[1]\n"
318 "sdot z14.s, z17.b, z0.b[1]\n"
319 "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n"
320 "sdot z11.s, z16.b, z1.b[1]\n"
321 "sdot z15.s, z16.b, z0.b[1]\n"
322 "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n"
323 "sdot z8.s, z17.b, z1.b[2]\n"
324 "sdot z12.s, z17.b, z0.b[2]\n"
325 "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n"
326 "sdot z9.s, z16.b, z1.b[2]\n"
327 "sdot z13.s, z16.b, z0.b[2]\n"
328 "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n"
329 "sdot z10.s, z17.b, z1.b[2]\n"
330 "sdot z14.s, z17.b, z0.b[2]\n"
331 "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n"
332 "sdot z11.s, z16.b, z1.b[2]\n"
333 "sdot z15.s, z16.b, z0.b[2]\n"
334 "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n"
335 "sdot z8.s, z17.b, z1.b[3]\n"
336 "sdot z12.s, z17.b, z0.b[3]\n"
337 "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n"
338 "sdot z9.s, z16.b, z1.b[3]\n"
339 "sdot z13.s, z16.b, z0.b[3]\n"
340 "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n"
341 "sdot z10.s, z17.b, z1.b[3]\n"
342 "sdot z14.s, z17.b, z0.b[3]\n"
343 "sdot z11.s, z16.b, z1.b[3]\n"
344 "sdot z15.s, z16.b, z0.b[3]\n"
347 "whilelt p0.b, XZR, x27\n"
348 "ld1rqb { z0.b }, p0/Z, [x26]\n"
349 "ld1rqb { z1.b }, p0/Z, [x25]\n"
350 "subs x27, x27, #0x4\n"
351 "ld1b { z17.b }, p5/Z, [x10]\n"
352 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
353 "sdot z8.s, z17.b, z0.b[0]\n"
354 "sdot z12.s, z17.b, z1.b[0]\n"
355 "sdot z9.s, z16.b, z0.b[0]\n"
356 "sdot z13.s, z16.b, z1.b[0]\n"
357 "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
358 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
359 "sdot z10.s, z17.b, z0.b[0]\n"
360 "sdot z14.s, z17.b, z1.b[0]\n"
361 "addvl x10, x10, #4\n"
362 "sdot z11.s, z16.b, z0.b[0]\n"
363 "sdot z15.s, z16.b, z1.b[0]\n"
365 "ld1b { z17.b }, p5/Z, [x10]\n"
366 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
367 "sdot z8.s, z17.b, z0.b[1]\n"
368 "sdot z12.s, z17.b, z1.b[1]\n"
369 "sdot z9.s, z16.b, z0.b[1]\n"
370 "sdot z13.s, z16.b, z1.b[1]\n"
371 "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
372 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
373 "subs x27, x27, #0x4\n"
374 "sdot z10.s, z17.b, z0.b[1]\n"
375 "sdot z14.s, z17.b, z1.b[1]\n"
376 "addvl x10, x10, #4\n"
377 "sdot z11.s, z16.b, z0.b[1]\n"
378 "sdot z15.s, z16.b, z1.b[1]\n"
380 "ld1b { z17.b }, p5/Z, [x10]\n"
381 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
382 "sdot z8.s, z17.b, z0.b[2]\n"
383 "sdot z12.s, z17.b, z1.b[2]\n"
384 "sdot z9.s, z16.b, z0.b[2]\n"
385 "sdot z13.s, z16.b, z1.b[2]\n"
386 "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
387 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
388 "subs x27, x27, #0x4\n"
389 "sdot z10.s, z17.b, z0.b[2]\n"
390 "sdot z14.s, z17.b, z1.b[2]\n"
391 "addvl x10, x10, #4\n"
392 "sdot z11.s, z16.b, z0.b[2]\n"
393 "sdot z15.s, z16.b, z1.b[2]\n"
395 "ld1b { z17.b }, p5/Z, [x10]\n"
396 "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n"
397 "sdot z8.s, z17.b, z0.b[3]\n"
398 "sdot z12.s, z17.b, z1.b[3]\n"
399 "sdot z9.s, z16.b, z0.b[3]\n"
400 "sdot z13.s, z16.b, z1.b[3]\n"
401 "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n"
402 "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n"
403 "sdot z10.s, z17.b, z0.b[3]\n"
404 "sdot z14.s, z17.b, z1.b[3]\n"
405 "addvl x10, x10, #4\n"
406 "sdot z11.s, z16.b, z0.b[3]\n"
407 "sdot z15.s, z16.b, z1.b[3]\n"
409 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
410 "add x28, x28, #0x1\n"
413 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
414 "add x20, x9, x20, LSL #2\n"
415 "st1w { z8.s }, p4, [x9]\n"
416 "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
417 "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
418 "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
420 "st1w { z12.s }, p4, [x20]\n"
421 "st1w { z13.s }, p3, [x20, #1, MUL VL]\n"
422 "st1w { z14.s }, p2, [x20, #2, MUL VL]\n"
423 "st1w { z15.s }, p1, [x20, #3, MUL VL]\n"
425 "decw x11, ALL, MUL #4\n"
430 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
431 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
432 "mov x9, %x[output_ptr]\n"
435 "whilelt p4.s, x20, x11\n"
437 "whilelt p3.s, x20, x11\n"
439 "whilelt p2.s, x20, x11\n"
441 "whilelt p1.s, x20, x11\n"
442 "tbz %x[flags], #0, 25f\n"
443 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
444 "add x21, x9, x20, LSL #2\n"
445 "add x20, x21, x20, LSL #2\n"
446 "ld1w { z8.s }, p4/Z, [x9]\n"
447 "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
448 "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
449 "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
450 "ld1w { z12.s }, p4/Z, [x21]\n"
451 "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
452 "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
453 "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
454 "ld1w { z16.s }, p4/Z, [x20]\n"
455 "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
456 "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
457 "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
475 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
476 "ldr w27, [x20, x28, LSL #0x2]\n"
477 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
478 "tbz %x[flags], #3, 28f\n"
479 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
480 "add x20, x20, x21, LSL #3\n"
481 "ldr x26, [x20, #0x0]\n"
482 "ldr x25, [x20, #0x8]\n"
483 "ldr x24, [x20, #0x10]\n"
485 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
486 "add x26, x26, x20\n"
487 "add x25, x25, x20\n"
488 "add x24, x24, x20\n"
491 "mov x26, %x[input_ptr]\n"
492 "add x25, x26, x21\n"
493 "add x24, x25, x21\n"
498 "whilelt p0.b, XZR, x27\n"
499 "ld1rqb { z2.b }, p0/Z, [x26]\n"
500 "ld1rqb { z1.b }, p0/Z, [x25]\n"
501 "sub x27, x27, #0x10\n"
502 "ld1rqb { z0.b }, p0/Z, [x24]\n"
503 "ld1b { z21.b }, p5/Z, [x10]\n"
504 "sdot z8.s, z21.b, z2.b[0]\n"
505 "sdot z12.s, z21.b, z1.b[0]\n"
506 "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
507 "sdot z16.s, z21.b, z0.b[0]\n"
508 "sdot z9.s, z20.b, z2.b[0]\n"
509 "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
510 "sdot z13.s, z20.b, z1.b[0]\n"
511 "sdot z17.s, z20.b, z0.b[0]\n"
512 "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
514 "sdot z10.s, z21.b, z2.b[0]\n"
515 "sdot z14.s, z21.b, z1.b[0]\n"
516 "add x26, x26, #0x10\n"
517 "add x25, x25, #0x10\n"
518 "sdot z18.s, z21.b, z0.b[0]\n"
519 "sdot z11.s, z20.b, z2.b[0]\n"
520 "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n"
521 "add x24, x24, #0x10\n"
522 "sdot z15.s, z20.b, z1.b[0]\n"
523 "sdot z19.s, z20.b, z0.b[0]\n"
524 "ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n"
525 "sdot z8.s, z21.b, z2.b[1]\n"
526 "sdot z12.s, z21.b, z1.b[1]\n"
527 "sdot z16.s, z21.b, z0.b[1]\n"
528 "sdot z9.s, z20.b, z2.b[1]\n"
529 "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n"
530 "sdot z13.s, z20.b, z1.b[1]\n"
531 "sdot z17.s, z20.b, z0.b[1]\n"
532 "ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n"
533 "addvl x10, x10, #16\n"
534 "sdot z10.s, z21.b, z2.b[1]\n"
535 "sdot z14.s, z21.b, z1.b[1]\n"
536 "sdot z18.s, z21.b, z0.b[1]\n"
537 "sdot z11.s, z20.b, z2.b[1]\n"
538 "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n"
539 "sdot z15.s, z20.b, z1.b[1]\n"
540 "sdot z19.s, z20.b, z0.b[1]\n"
541 "ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n"
542 "sdot z8.s, z21.b, z2.b[2]\n"
543 "sdot z12.s, z21.b, z1.b[2]\n"
544 "sdot z16.s, z21.b, z0.b[2]\n"
545 "sdot z9.s, z20.b, z2.b[2]\n"
546 "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n"
547 "sdot z13.s, z20.b, z1.b[2]\n"
548 "sdot z17.s, z20.b, z0.b[2]\n"
549 "ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n"
550 "sdot z10.s, z21.b, z2.b[2]\n"
551 "sdot z14.s, z21.b, z1.b[2]\n"
552 "sdot z18.s, z21.b, z0.b[2]\n"
553 "sdot z11.s, z20.b, z2.b[2]\n"
554 "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n"
555 "sdot z15.s, z20.b, z1.b[2]\n"
556 "sdot z19.s, z20.b, z0.b[2]\n"
557 "ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n"
558 "sdot z8.s, z21.b, z2.b[3]\n"
559 "sdot z12.s, z21.b, z1.b[3]\n"
560 "sdot z16.s, z21.b, z0.b[3]\n"
561 "sdot z9.s, z20.b, z2.b[3]\n"
562 "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n"
563 "sdot z13.s, z20.b, z1.b[3]\n"
564 "sdot z17.s, z20.b, z0.b[3]\n"
565 "ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n"
566 "sdot z10.s, z21.b, z2.b[3]\n"
567 "sdot z14.s, z21.b, z1.b[3]\n"
568 "sdot z18.s, z21.b, z0.b[3]\n"
569 "sdot z11.s, z20.b, z2.b[3]\n"
570 "sdot z15.s, z20.b, z1.b[3]\n"
571 "sdot z19.s, z20.b, z0.b[3]\n"
574 "whilelt p0.b, XZR, x27\n"
575 "ld1rqb { z0.b }, p0/Z, [x26]\n"
576 "ld1rqb { z1.b }, p0/Z, [x25]\n"
577 "subs x27, x27, #0x4\n"
578 "ld1rqb { z2.b }, p0/Z, [x24]\n"
579 "ld1b { z21.b }, p5/Z, [x10]\n"
580 "sdot z8.s, z21.b, z0.b[0]\n"
581 "sdot z12.s, z21.b, z1.b[0]\n"
582 "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
583 "sdot z16.s, z21.b, z2.b[0]\n"
584 "sdot z9.s, z20.b, z0.b[0]\n"
585 "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
586 "sdot z13.s, z20.b, z1.b[0]\n"
587 "sdot z17.s, z20.b, z2.b[0]\n"
588 "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
589 "addvl x10, x10, #4\n"
590 "sdot z10.s, z21.b, z0.b[0]\n"
591 "sdot z14.s, z21.b, z1.b[0]\n"
592 "sdot z18.s, z21.b, z2.b[0]\n"
593 "sdot z11.s, z20.b, z0.b[0]\n"
594 "sdot z15.s, z20.b, z1.b[0]\n"
595 "sdot z19.s, z20.b, z2.b[0]\n"
597 "ld1b { z21.b }, p5/Z, [x10]\n"
598 "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
599 "sdot z8.s, z21.b, z0.b[1]\n"
600 "sdot z12.s, z21.b, z1.b[1]\n"
601 "sdot z16.s, z21.b, z2.b[1]\n"
602 "sdot z9.s, z20.b, z0.b[1]\n"
603 "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
604 "subs x27, x27, #0x4\n"
605 "sdot z13.s, z20.b, z1.b[1]\n"
606 "sdot z17.s, z20.b, z2.b[1]\n"
607 "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
608 "addvl x10, x10, #4\n"
609 "sdot z10.s, z21.b, z0.b[1]\n"
610 "sdot z14.s, z21.b, z1.b[1]\n"
611 "sdot z18.s, z21.b, z2.b[1]\n"
612 "sdot z11.s, z20.b, z0.b[1]\n"
613 "sdot z15.s, z20.b, z1.b[1]\n"
614 "sdot z19.s, z20.b, z2.b[1]\n"
616 "ld1b { z21.b }, p5/Z, [x10]\n"
617 "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
618 "sdot z8.s, z21.b, z0.b[2]\n"
619 "sdot z12.s, z21.b, z1.b[2]\n"
620 "sdot z16.s, z21.b, z2.b[2]\n"
621 "sdot z9.s, z20.b, z0.b[2]\n"
622 "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
623 "subs x27, x27, #0x4\n"
624 "sdot z13.s, z20.b, z1.b[2]\n"
625 "sdot z17.s, z20.b, z2.b[2]\n"
626 "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
627 "addvl x10, x10, #4\n"
628 "sdot z10.s, z21.b, z0.b[2]\n"
629 "sdot z14.s, z21.b, z1.b[2]\n"
630 "sdot z18.s, z21.b, z2.b[2]\n"
631 "sdot z11.s, z20.b, z0.b[2]\n"
632 "sdot z15.s, z20.b, z1.b[2]\n"
633 "sdot z19.s, z20.b, z2.b[2]\n"
635 "ld1b { z21.b }, p5/Z, [x10]\n"
636 "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n"
637 "sdot z8.s, z21.b, z0.b[3]\n"
638 "sdot z12.s, z21.b, z1.b[3]\n"
639 "sdot z16.s, z21.b, z2.b[3]\n"
640 "sdot z9.s, z20.b, z0.b[3]\n"
641 "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n"
642 "sdot z13.s, z20.b, z1.b[3]\n"
643 "sdot z17.s, z20.b, z2.b[3]\n"
644 "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n"
645 "addvl x10, x10, #4\n"
646 "sdot z10.s, z21.b, z0.b[3]\n"
647 "sdot z14.s, z21.b, z1.b[3]\n"
648 "sdot z18.s, z21.b, z2.b[3]\n"
649 "sdot z11.s, z20.b, z0.b[3]\n"
650 "sdot z15.s, z20.b, z1.b[3]\n"
651 "sdot z19.s, z20.b, z2.b[3]\n"
653 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
654 "add x28, x28, #0x1\n"
657 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
658 "add x21, x9, x20, LSL #2\n"
659 "add x20, x21, x20, LSL #2\n"
660 "st1w { z8.s }, p4, [x9]\n"
661 "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
662 "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
663 "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
665 "st1w { z12.s }, p4, [x21]\n"
666 "st1w { z13.s }, p3, [x21, #1, MUL VL]\n"
667 "st1w { z14.s }, p2, [x21, #2, MUL VL]\n"
668 "st1w { z15.s }, p1, [x21, #3, MUL VL]\n"
669 "st1w { z16.s }, p4, [x20]\n"
670 "st1w { z17.s }, p3, [x20, #1, MUL VL]\n"
671 "st1w { z18.s }, p2, [x20, #2, MUL VL]\n"
672 "st1w { z19.s }, p1, [x20, #3, MUL VL]\n"
674 "decw x11, ALL, MUL #4\n"
679 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
680 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
681 "mov x9, %x[output_ptr]\n"
684 "whilelt p4.s, x20, x11\n"
686 "whilelt p3.s, x20, x11\n"
688 "whilelt p2.s, x20, x11\n"
690 "whilelt p1.s, x20, x11\n"
691 "tbz %x[flags], #0, 36f\n"
692 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
693 "add x22, x9, x20, LSL #2\n"
694 "add x21, x22, x20, LSL #2\n"
695 "ld1w { z8.s }, p4/Z, [x9]\n"
696 "add x20, x21, x20, LSL #2\n"
697 "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
698 "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
699 "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
700 "ld1w { z12.s }, p4/Z, [x22]\n"
701 "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
702 "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
703 "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
704 "ld1w { z16.s }, p4/Z, [x21]\n"
705 "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
706 "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
707 "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
708 "ld1w { z20.s }, p4/Z, [x20]\n"
709 "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
710 "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
711 "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
733 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
734 "ldr w27, [x20, x28, LSL #0x2]\n"
735 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
736 "tbz %x[flags], #3, 39f\n"
737 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
738 "add x20, x20, x21, LSL #3\n"
739 "ldr x26, [x20, #0x0]\n"
740 "ldr x25, [x20, #0x8]\n"
741 "ldr x24, [x20, #0x10]\n"
742 "ldr x23, [x20, #0x18]\n"
744 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
745 "add x26, x26, x20\n"
746 "add x25, x25, x20\n"
747 "add x24, x24, x20\n"
748 "add x23, x23, x20\n"
751 "mov x26, %x[input_ptr]\n"
752 "add x25, x26, x21\n"
753 "add x24, x25, x21\n"
754 "add x23, x24, x21\n"
759 "whilelt p0.b, XZR, x27\n"
760 "ld1rqb { z3.b }, p0/Z, [x26]\n"
761 "ld1rqb { z2.b }, p0/Z, [x25]\n"
762 "sub x27, x27, #0x10\n"
763 "ld1rqb { z1.b }, p0/Z, [x24]\n"
764 "ld1rqb { z0.b }, p0/Z, [x23]\n"
766 "add x26, x26, #0x10\n"
767 "ld1b { z25.b }, p5/Z, [x10]\n"
768 "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
769 "sdot z8.s, z25.b, z3.b[0]\n"
770 "sdot z12.s, z25.b, z2.b[0]\n"
771 "sdot z16.s, z25.b, z1.b[0]\n"
772 "sdot z20.s, z25.b, z0.b[0]\n"
773 "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
774 "add x25, x25, #0x10\n"
775 "sdot z9.s, z24.b, z3.b[0]\n"
776 "sdot z13.s, z24.b, z2.b[0]\n"
777 "add x24, x24, #0x10\n"
778 "add x23, x23, #0x10\n"
779 "sdot z17.s, z24.b, z1.b[0]\n"
780 "sdot z21.s, z24.b, z0.b[0]\n"
781 "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
782 "sdot z10.s, z25.b, z3.b[0]\n"
783 "sdot z14.s, z25.b, z2.b[0]\n"
784 "sdot z18.s, z25.b, z1.b[0]\n"
785 "sdot z22.s, z25.b, z0.b[0]\n"
786 "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n"
787 "sdot z11.s, z24.b, z3.b[0]\n"
788 "sdot z15.s, z24.b, z2.b[0]\n"
789 "sdot z19.s, z24.b, z1.b[0]\n"
790 "sdot z23.s, z24.b, z0.b[0]\n"
791 "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n"
792 "sdot z8.s, z25.b, z3.b[1]\n"
793 "sdot z12.s, z25.b, z2.b[1]\n"
794 "sdot z16.s, z25.b, z1.b[1]\n"
795 "sdot z20.s, z25.b, z0.b[1]\n"
796 "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n"
797 "sdot z9.s, z24.b, z3.b[1]\n"
798 "sdot z13.s, z24.b, z2.b[1]\n"
799 "sdot z17.s, z24.b, z1.b[1]\n"
800 "sdot z21.s, z24.b, z0.b[1]\n"
801 "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n"
802 "addvl x10, x10, #16\n"
803 "sdot z10.s, z25.b, z3.b[1]\n"
804 "sdot z14.s, z25.b, z2.b[1]\n"
805 "sdot z18.s, z25.b, z1.b[1]\n"
806 "sdot z22.s, z25.b, z0.b[1]\n"
807 "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n"
808 "sdot z11.s, z24.b, z3.b[1]\n"
809 "sdot z15.s, z24.b, z2.b[1]\n"
810 "sdot z19.s, z24.b, z1.b[1]\n"
811 "sdot z23.s, z24.b, z0.b[1]\n"
812 "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n"
813 "sdot z8.s, z25.b, z3.b[2]\n"
814 "sdot z12.s, z25.b, z2.b[2]\n"
815 "sdot z16.s, z25.b, z1.b[2]\n"
816 "sdot z20.s, z25.b, z0.b[2]\n"
817 "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n"
818 "sdot z9.s, z24.b, z3.b[2]\n"
819 "sdot z13.s, z24.b, z2.b[2]\n"
820 "sdot z17.s, z24.b, z1.b[2]\n"
821 "sdot z21.s, z24.b, z0.b[2]\n"
822 "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n"
823 "sdot z10.s, z25.b, z3.b[2]\n"
824 "sdot z14.s, z25.b, z2.b[2]\n"
825 "sdot z18.s, z25.b, z1.b[2]\n"
826 "sdot z22.s, z25.b, z0.b[2]\n"
827 "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n"
828 "sdot z11.s, z24.b, z3.b[2]\n"
829 "sdot z15.s, z24.b, z2.b[2]\n"
830 "sdot z19.s, z24.b, z1.b[2]\n"
831 "sdot z23.s, z24.b, z0.b[2]\n"
832 "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n"
833 "sdot z8.s, z25.b, z3.b[3]\n"
834 "sdot z12.s, z25.b, z2.b[3]\n"
835 "sdot z16.s, z25.b, z1.b[3]\n"
836 "sdot z20.s, z25.b, z0.b[3]\n"
837 "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n"
838 "sdot z9.s, z24.b, z3.b[3]\n"
839 "sdot z13.s, z24.b, z2.b[3]\n"
840 "sdot z17.s, z24.b, z1.b[3]\n"
841 "sdot z21.s, z24.b, z0.b[3]\n"
842 "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n"
843 "sdot z10.s, z25.b, z3.b[3]\n"
844 "sdot z14.s, z25.b, z2.b[3]\n"
845 "sdot z18.s, z25.b, z1.b[3]\n"
846 "sdot z22.s, z25.b, z0.b[3]\n"
847 "sdot z11.s, z24.b, z3.b[3]\n"
848 "sdot z15.s, z24.b, z2.b[3]\n"
849 "sdot z19.s, z24.b, z1.b[3]\n"
850 "sdot z23.s, z24.b, z0.b[3]\n"
853 "whilelt p0.b, XZR, x27\n"
854 "ld1rqb { z0.b }, p0/Z, [x26]\n"
855 "ld1rqb { z1.b }, p0/Z, [x25]\n"
856 "subs x27, x27, #0x4\n"
857 "ld1rqb { z2.b }, p0/Z, [x24]\n"
858 "ld1rqb { z3.b }, p0/Z, [x23]\n"
859 "ld1b { z25.b }, p5/Z, [x10]\n"
860 "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
861 "sdot z8.s, z25.b, z0.b[0]\n"
862 "sdot z12.s, z25.b, z1.b[0]\n"
863 "sdot z16.s, z25.b, z2.b[0]\n"
864 "sdot z20.s, z25.b, z3.b[0]\n"
865 "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
866 "sdot z9.s, z24.b, z0.b[0]\n"
867 "sdot z13.s, z24.b, z1.b[0]\n"
868 "sdot z17.s, z24.b, z2.b[0]\n"
869 "sdot z21.s, z24.b, z3.b[0]\n"
870 "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
871 "addvl x10, x10, #4\n"
872 "sdot z10.s, z25.b, z0.b[0]\n"
873 "sdot z14.s, z25.b, z1.b[0]\n"
874 "sdot z18.s, z25.b, z2.b[0]\n"
875 "sdot z22.s, z25.b, z3.b[0]\n"
876 "sdot z11.s, z24.b, z0.b[0]\n"
877 "sdot z15.s, z24.b, z1.b[0]\n"
878 "sdot z19.s, z24.b, z2.b[0]\n"
879 "sdot z23.s, z24.b, z3.b[0]\n"
881 "ld1b { z25.b }, p5/Z, [x10]\n"
882 "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
883 "sdot z8.s, z25.b, z0.b[1]\n"
884 "sdot z12.s, z25.b, z1.b[1]\n"
885 "sdot z16.s, z25.b, z2.b[1]\n"
886 "sdot z20.s, z25.b, z3.b[1]\n"
887 "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
888 "subs x27, x27, #0x4\n"
889 "sdot z9.s, z24.b, z0.b[1]\n"
890 "sdot z13.s, z24.b, z1.b[1]\n"
891 "sdot z17.s, z24.b, z2.b[1]\n"
892 "sdot z21.s, z24.b, z3.b[1]\n"
893 "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
894 "addvl x10, x10, #4\n"
895 "sdot z10.s, z25.b, z0.b[1]\n"
896 "sdot z14.s, z25.b, z1.b[1]\n"
897 "sdot z18.s, z25.b, z2.b[1]\n"
898 "sdot z22.s, z25.b, z3.b[1]\n"
899 "sdot z11.s, z24.b, z0.b[1]\n"
900 "sdot z15.s, z24.b, z1.b[1]\n"
901 "sdot z19.s, z24.b, z2.b[1]\n"
902 "sdot z23.s, z24.b, z3.b[1]\n"
904 "ld1b { z25.b }, p5/Z, [x10]\n"
905 "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
906 "sdot z8.s, z25.b, z0.b[2]\n"
907 "sdot z12.s, z25.b, z1.b[2]\n"
908 "sdot z16.s, z25.b, z2.b[2]\n"
909 "sdot z20.s, z25.b, z3.b[2]\n"
910 "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
911 "subs x27, x27, #0x4\n"
912 "sdot z9.s, z24.b, z0.b[2]\n"
913 "sdot z13.s, z24.b, z1.b[2]\n"
914 "sdot z17.s, z24.b, z2.b[2]\n"
915 "sdot z21.s, z24.b, z3.b[2]\n"
916 "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
917 "addvl x10, x10, #4\n"
918 "sdot z10.s, z25.b, z0.b[2]\n"
919 "sdot z14.s, z25.b, z1.b[2]\n"
920 "sdot z18.s, z25.b, z2.b[2]\n"
921 "sdot z22.s, z25.b, z3.b[2]\n"
922 "sdot z11.s, z24.b, z0.b[2]\n"
923 "sdot z15.s, z24.b, z1.b[2]\n"
924 "sdot z19.s, z24.b, z2.b[2]\n"
925 "sdot z23.s, z24.b, z3.b[2]\n"
927 "ld1b { z25.b }, p5/Z, [x10]\n"
928 "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n"
929 "sdot z8.s, z25.b, z0.b[3]\n"
930 "sdot z12.s, z25.b, z1.b[3]\n"
931 "sdot z16.s, z25.b, z2.b[3]\n"
932 "sdot z20.s, z25.b, z3.b[3]\n"
933 "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n"
934 "sdot z9.s, z24.b, z0.b[3]\n"
935 "sdot z13.s, z24.b, z1.b[3]\n"
936 "sdot z17.s, z24.b, z2.b[3]\n"
937 "sdot z21.s, z24.b, z3.b[3]\n"
938 "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n"
939 "addvl x10, x10, #4\n"
940 "sdot z10.s, z25.b, z0.b[3]\n"
941 "sdot z14.s, z25.b, z1.b[3]\n"
942 "sdot z18.s, z25.b, z2.b[3]\n"
943 "sdot z22.s, z25.b, z3.b[3]\n"
944 "sdot z11.s, z24.b, z0.b[3]\n"
945 "sdot z15.s, z24.b, z1.b[3]\n"
946 "sdot z19.s, z24.b, z2.b[3]\n"
947 "sdot z23.s, z24.b, z3.b[3]\n"
949 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
950 "add x28, x28, #0x1\n"
953 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
954 "add x22, x9, x20, LSL #2\n"
955 "add x21, x22, x20, LSL #2\n"
956 "st1w { z8.s }, p4, [x9]\n"
957 "add x20, x21, x20, LSL #2\n"
958 "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
959 "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
960 "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
962 "st1w { z12.s }, p4, [x22]\n"
963 "st1w { z13.s }, p3, [x22, #1, MUL VL]\n"
964 "st1w { z14.s }, p2, [x22, #2, MUL VL]\n"
965 "st1w { z15.s }, p1, [x22, #3, MUL VL]\n"
966 "st1w { z16.s }, p4, [x21]\n"
967 "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
968 "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
969 "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
970 "st1w { z20.s }, p4, [x20]\n"
971 "st1w { z21.s }, p3, [x20, #1, MUL VL]\n"
972 "st1w { z22.s }, p2, [x20, #2, MUL VL]\n"
973 "st1w { z23.s }, p1, [x20, #3, MUL VL]\n"
975 "decw x11, ALL, MUL #4\n"
980 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
981 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
982 "mov x9, %x[output_ptr]\n"
985 "whilelt p4.s, x20, x11\n"
987 "whilelt p3.s, x20, x11\n"
989 "whilelt p2.s, x20, x11\n"
991 "whilelt p1.s, x20, x11\n"
992 "tbz %x[flags], #0, 47f\n"
993 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
994 "add x23, x9, x20, LSL #2\n"
995 "add x22, x23, x20, LSL #2\n"
996 "ld1w { z8.s }, p4/Z, [x9]\n"
997 "add x21, x22, x20, LSL #2\n"
998 "add x20, x21, x20, LSL #2\n"
999 "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
1000 "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
1001 "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
1002 "ld1w { z12.s }, p4/Z, [x23]\n"
1003 "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
1004 "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
1005 "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
1006 "ld1w { z16.s }, p4/Z, [x22]\n"
1007 "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
1008 "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
1009 "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
1010 "ld1w { z20.s }, p4/Z, [x21]\n"
1011 "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
1012 "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
1013 "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
1014 "ld1w { z24.s }, p4/Z, [x20]\n"
1015 "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
1016 "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
1017 "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
1043 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1044 "ldr w27, [x20, x28, LSL #0x2]\n"
1045 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1046 "tbz %x[flags], #3, 50f\n"
1047 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
1048 "add x20, x20, x21, LSL #3\n"
1049 "ldr x26, [x20, #0x0]\n"
1050 "ldr x25, [x20, #0x8]\n"
1051 "ldr x24, [x20, #0x10]\n"
1052 "ldr x23, [x20, #0x18]\n"
1053 "ldr x22, [x20, #0x20]\n"
1055 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1056 "add x26, x26, x20\n"
1057 "add x25, x25, x20\n"
1058 "add x24, x24, x20\n"
1059 "add x23, x23, x20\n"
1060 "add x22, x22, x20\n"
1063 "mov x26, %x[input_ptr]\n"
1064 "add x25, x26, x21\n"
1065 "add x24, x25, x21\n"
1066 "add x23, x24, x21\n"
1067 "add x22, x23, x21\n"
1072 "whilelt p0.b, XZR, x27\n"
1073 "ld1rqb { z4.b }, p0/Z, [x26]\n"
1074 "ld1rqb { z3.b }, p0/Z, [x25]\n"
1075 "sub x27, x27, #0x10\n"
1076 "ld1rqb { z2.b }, p0/Z, [x24]\n"
1077 "ld1rqb { z1.b }, p0/Z, [x23]\n"
1079 "add x26, x26, #0x10\n"
1080 "ld1rqb { z0.b }, p0/Z, [x22]\n"
1081 "ld1b { z29.b }, p5/Z, [x10]\n"
1082 "sdot z8.s, z29.b, z4.b[0]\n"
1083 "sdot z12.s, z29.b, z3.b[0]\n"
1084 "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
1085 "sdot z16.s, z29.b, z2.b[0]\n"
1086 "sdot z20.s, z29.b, z1.b[0]\n"
1087 "add x25, x25, #0x10\n"
1088 "sdot z24.s, z29.b, z0.b[0]\n"
1089 "sdot z9.s, z28.b, z4.b[0]\n"
1090 "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
1091 "add x24, x24, #0x10\n"
1092 "sdot z13.s, z28.b, z3.b[0]\n"
1093 "sdot z17.s, z28.b, z2.b[0]\n"
1094 "add x23, x23, #0x10\n"
1095 "add x22, x22, #0x10\n"
1096 "sdot z21.s, z28.b, z1.b[0]\n"
1097 "sdot z25.s, z28.b, z0.b[0]\n"
1098 "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
1099 "sdot z10.s, z29.b, z4.b[0]\n"
1100 "sdot z14.s, z29.b, z3.b[0]\n"
1101 "sdot z18.s, z29.b, z2.b[0]\n"
1102 "sdot z22.s, z29.b, z1.b[0]\n"
1103 "sdot z26.s, z29.b, z0.b[0]\n"
1104 "sdot z11.s, z28.b, z4.b[0]\n"
1105 "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n"
1106 "sdot z15.s, z28.b, z3.b[0]\n"
1107 "sdot z19.s, z28.b, z2.b[0]\n"
1108 "sdot z23.s, z28.b, z1.b[0]\n"
1109 "sdot z27.s, z28.b, z0.b[0]\n"
1110 "ld1b { z28.b }, p5/Z, [x10, #5, MUL VL]\n"
1111 "sdot z8.s, z29.b, z4.b[1]\n"
1112 "sdot z12.s, z29.b, z3.b[1]\n"
1113 "sdot z16.s, z29.b, z2.b[1]\n"
1114 "sdot z20.s, z29.b, z1.b[1]\n"
1115 "sdot z24.s, z29.b, z0.b[1]\n"
1116 "sdot z9.s, z28.b, z4.b[1]\n"
1117 "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n"
1118 "sdot z13.s, z28.b, z3.b[1]\n"
1119 "sdot z17.s, z28.b, z2.b[1]\n"
1120 "sdot z21.s, z28.b, z1.b[1]\n"
1121 "sdot z25.s, z28.b, z0.b[1]\n"
1122 "ld1b { z28.b }, p5/Z, [x10, #7, MUL VL]\n"
1123 "addvl x10, x10, #16\n"
1124 "sdot z10.s, z29.b, z4.b[1]\n"
1125 "sdot z14.s, z29.b, z3.b[1]\n"
1126 "sdot z18.s, z29.b, z2.b[1]\n"
1127 "sdot z22.s, z29.b, z1.b[1]\n"
1128 "sdot z26.s, z29.b, z0.b[1]\n"
1129 "sdot z11.s, z28.b, z4.b[1]\n"
1130 "ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n"
1131 "sdot z15.s, z28.b, z3.b[1]\n"
1132 "sdot z19.s, z28.b, z2.b[1]\n"
1133 "sdot z23.s, z28.b, z1.b[1]\n"
1134 "sdot z27.s, z28.b, z0.b[1]\n"
1135 "ld1b { z28.b }, p5/Z, [x10, #-7, MUL VL]\n"
1136 "sdot z8.s, z29.b, z4.b[2]\n"
1137 "sdot z12.s, z29.b, z3.b[2]\n"
1138 "sdot z16.s, z29.b, z2.b[2]\n"
1139 "sdot z20.s, z29.b, z1.b[2]\n"
1140 "sdot z24.s, z29.b, z0.b[2]\n"
1141 "sdot z9.s, z28.b, z4.b[2]\n"
1142 "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n"
1143 "sdot z13.s, z28.b, z3.b[2]\n"
1144 "sdot z17.s, z28.b, z2.b[2]\n"
1145 "sdot z21.s, z28.b, z1.b[2]\n"
1146 "sdot z25.s, z28.b, z0.b[2]\n"
1147 "ld1b { z28.b }, p5/Z, [x10, #-5, MUL VL]\n"
1148 "sdot z10.s, z29.b, z4.b[2]\n"
1149 "sdot z14.s, z29.b, z3.b[2]\n"
1150 "sdot z18.s, z29.b, z2.b[2]\n"
1151 "sdot z22.s, z29.b, z1.b[2]\n"
1152 "sdot z26.s, z29.b, z0.b[2]\n"
1153 "sdot z11.s, z28.b, z4.b[2]\n"
1154 "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n"
1155 "sdot z15.s, z28.b, z3.b[2]\n"
1156 "sdot z19.s, z28.b, z2.b[2]\n"
1157 "sdot z23.s, z28.b, z1.b[2]\n"
1158 "sdot z27.s, z28.b, z0.b[2]\n"
1159 "ld1b { z28.b }, p5/Z, [x10, #-3, MUL VL]\n"
1160 "sdot z8.s, z29.b, z4.b[3]\n"
1161 "sdot z12.s, z29.b, z3.b[3]\n"
1162 "sdot z16.s, z29.b, z2.b[3]\n"
1163 "sdot z20.s, z29.b, z1.b[3]\n"
1164 "sdot z24.s, z29.b, z0.b[3]\n"
1165 "sdot z9.s, z28.b, z4.b[3]\n"
1166 "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n"
1167 "sdot z13.s, z28.b, z3.b[3]\n"
1168 "sdot z17.s, z28.b, z2.b[3]\n"
1169 "sdot z21.s, z28.b, z1.b[3]\n"
1170 "sdot z25.s, z28.b, z0.b[3]\n"
1171 "ld1b { z28.b }, p5/Z, [x10, #-1, MUL VL]\n"
1172 "sdot z10.s, z29.b, z4.b[3]\n"
1173 "sdot z14.s, z29.b, z3.b[3]\n"
1174 "sdot z18.s, z29.b, z2.b[3]\n"
1175 "sdot z22.s, z29.b, z1.b[3]\n"
1176 "sdot z26.s, z29.b, z0.b[3]\n"
1177 "sdot z11.s, z28.b, z4.b[3]\n"
1178 "sdot z15.s, z28.b, z3.b[3]\n"
1179 "sdot z19.s, z28.b, z2.b[3]\n"
1180 "sdot z23.s, z28.b, z1.b[3]\n"
1181 "sdot z27.s, z28.b, z0.b[3]\n"
1184 "whilelt p0.b, XZR, x27\n"
1185 "ld1rqb { z0.b }, p0/Z, [x26]\n"
1186 "ld1rqb { z1.b }, p0/Z, [x25]\n"
1187 "subs x27, x27, #0x4\n"
1188 "ld1rqb { z2.b }, p0/Z, [x24]\n"
1189 "ld1rqb { z3.b }, p0/Z, [x23]\n"
1190 "ld1rqb { z4.b }, p0/Z, [x22]\n"
1191 "ld1b { z29.b }, p5/Z, [x10]\n"
1192 "sdot z8.s, z29.b, z0.b[0]\n"
1193 "sdot z12.s, z29.b, z1.b[0]\n"
1194 "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
1195 "sdot z16.s, z29.b, z2.b[0]\n"
1196 "sdot z20.s, z29.b, z3.b[0]\n"
1197 "sdot z24.s, z29.b, z4.b[0]\n"
1198 "sdot z9.s, z28.b, z0.b[0]\n"
1199 "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
1200 "sdot z13.s, z28.b, z1.b[0]\n"
1201 "sdot z17.s, z28.b, z2.b[0]\n"
1202 "sdot z21.s, z28.b, z3.b[0]\n"
1203 "sdot z25.s, z28.b, z4.b[0]\n"
1204 "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
1205 "addvl x10, x10, #4\n"
1206 "sdot z10.s, z29.b, z0.b[0]\n"
1207 "sdot z14.s, z29.b, z1.b[0]\n"
1208 "sdot z18.s, z29.b, z2.b[0]\n"
1209 "sdot z22.s, z29.b, z3.b[0]\n"
1210 "sdot z26.s, z29.b, z4.b[0]\n"
1211 "sdot z11.s, z28.b, z0.b[0]\n"
1212 "sdot z15.s, z28.b, z1.b[0]\n"
1213 "sdot z19.s, z28.b, z2.b[0]\n"
1214 "sdot z23.s, z28.b, z3.b[0]\n"
1215 "sdot z27.s, z28.b, z4.b[0]\n"
1217 "ld1b { z29.b }, p5/Z, [x10]\n"
1218 "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
1219 "sdot z8.s, z29.b, z0.b[1]\n"
1220 "sdot z12.s, z29.b, z1.b[1]\n"
1221 "sdot z16.s, z29.b, z2.b[1]\n"
1222 "sdot z20.s, z29.b, z3.b[1]\n"
1223 "subs x27, x27, #0x4\n"
1224 "sdot z24.s, z29.b, z4.b[1]\n"
1225 "sdot z9.s, z28.b, z0.b[1]\n"
1226 "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
1227 "sdot z13.s, z28.b, z1.b[1]\n"
1228 "sdot z17.s, z28.b, z2.b[1]\n"
1229 "sdot z21.s, z28.b, z3.b[1]\n"
1230 "sdot z25.s, z28.b, z4.b[1]\n"
1231 "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
1232 "addvl x10, x10, #4\n"
1233 "sdot z10.s, z29.b, z0.b[1]\n"
1234 "sdot z14.s, z29.b, z1.b[1]\n"
1235 "sdot z18.s, z29.b, z2.b[1]\n"
1236 "sdot z22.s, z29.b, z3.b[1]\n"
1237 "sdot z26.s, z29.b, z4.b[1]\n"
1238 "sdot z11.s, z28.b, z0.b[1]\n"
1239 "sdot z15.s, z28.b, z1.b[1]\n"
1240 "sdot z19.s, z28.b, z2.b[1]\n"
1241 "sdot z23.s, z28.b, z3.b[1]\n"
1242 "sdot z27.s, z28.b, z4.b[1]\n"
1244 "ld1b { z29.b }, p5/Z, [x10]\n"
1245 "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
1246 "sdot z8.s, z29.b, z0.b[2]\n"
1247 "sdot z12.s, z29.b, z1.b[2]\n"
1248 "sdot z16.s, z29.b, z2.b[2]\n"
1249 "sdot z20.s, z29.b, z3.b[2]\n"
1250 "subs x27, x27, #0x4\n"
1251 "sdot z24.s, z29.b, z4.b[2]\n"
1252 "sdot z9.s, z28.b, z0.b[2]\n"
1253 "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
1254 "sdot z13.s, z28.b, z1.b[2]\n"
1255 "sdot z17.s, z28.b, z2.b[2]\n"
1256 "sdot z21.s, z28.b, z3.b[2]\n"
1257 "sdot z25.s, z28.b, z4.b[2]\n"
1258 "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
1259 "addvl x10, x10, #4\n"
1260 "sdot z10.s, z29.b, z0.b[2]\n"
1261 "sdot z14.s, z29.b, z1.b[2]\n"
1262 "sdot z18.s, z29.b, z2.b[2]\n"
1263 "sdot z22.s, z29.b, z3.b[2]\n"
1264 "sdot z26.s, z29.b, z4.b[2]\n"
1265 "sdot z11.s, z28.b, z0.b[2]\n"
1266 "sdot z15.s, z28.b, z1.b[2]\n"
1267 "sdot z19.s, z28.b, z2.b[2]\n"
1268 "sdot z23.s, z28.b, z3.b[2]\n"
1269 "sdot z27.s, z28.b, z4.b[2]\n"
1271 "ld1b { z29.b }, p5/Z, [x10]\n"
1272 "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n"
1273 "sdot z8.s, z29.b, z0.b[3]\n"
1274 "sdot z12.s, z29.b, z1.b[3]\n"
1275 "sdot z16.s, z29.b, z2.b[3]\n"
1276 "sdot z20.s, z29.b, z3.b[3]\n"
1277 "sdot z24.s, z29.b, z4.b[3]\n"
1278 "sdot z9.s, z28.b, z0.b[3]\n"
1279 "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n"
1280 "sdot z13.s, z28.b, z1.b[3]\n"
1281 "sdot z17.s, z28.b, z2.b[3]\n"
1282 "sdot z21.s, z28.b, z3.b[3]\n"
1283 "sdot z25.s, z28.b, z4.b[3]\n"
1284 "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n"
1285 "addvl x10, x10, #4\n"
1286 "sdot z10.s, z29.b, z0.b[3]\n"
1287 "sdot z14.s, z29.b, z1.b[3]\n"
1288 "sdot z18.s, z29.b, z2.b[3]\n"
1289 "sdot z22.s, z29.b, z3.b[3]\n"
1290 "sdot z26.s, z29.b, z4.b[3]\n"
1291 "sdot z11.s, z28.b, z0.b[3]\n"
1292 "sdot z15.s, z28.b, z1.b[3]\n"
1293 "sdot z19.s, z28.b, z2.b[3]\n"
1294 "sdot z23.s, z28.b, z3.b[3]\n"
1295 "sdot z27.s, z28.b, z4.b[3]\n"
1297 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
1298 "add x28, x28, #0x1\n"
1301 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1302 "add x23, x9, x20, LSL #2\n"
1303 "add x22, x23, x20, LSL #2\n"
1304 "st1w { z8.s }, p4, [x9]\n"
1305 "add x21, x22, x20, LSL #2\n"
1306 "add x20, x21, x20, LSL #2\n"
1307 "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
1308 "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
1309 "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
1310 "addvl x9, x9, #4\n"
1311 "st1w { z12.s }, p4, [x23]\n"
1312 "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
1313 "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
1314 "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
1315 "st1w { z16.s }, p4, [x22]\n"
1316 "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
1317 "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
1318 "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
1319 "st1w { z20.s }, p4, [x21]\n"
1320 "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
1321 "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
1322 "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
1323 "st1w { z24.s }, p4, [x20]\n"
1324 "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
1325 "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
1326 "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
1328 "decw x11, ALL, MUL #4\n"
1333 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
1335 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
1336 "mov x9, %x[output_ptr]\n"
1337 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1338 "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
1341 "whilelt p4.s, x20, x11\n"
1343 "whilelt p3.s, x20, x11\n"
1345 "whilelt p2.s, x20, x11\n"
1347 "whilelt p1.s, x20, x11\n"
1348 "tbz %x[flags], #0, 58f\n"
1349 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1350 "add x24, x9, x20, LSL #2\n"
1351 "add x23, x24, x20, LSL #2\n"
1352 "ld1w { z8.s }, p4/Z, [x9]\n"
1353 "add x22, x23, x20, LSL #2\n"
1354 "add x21, x22, x20, LSL #2\n"
1355 "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n"
1356 "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n"
1357 "add x20, x21, x20, LSL #2\n"
1358 "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n"
1359 "ld1w { z12.s }, p4/Z, [x24]\n"
1360 "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
1361 "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
1362 "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
1363 "ld1w { z16.s }, p4/Z, [x23]\n"
1364 "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
1365 "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
1366 "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
1367 "ld1w { z20.s }, p4/Z, [x22]\n"
1368 "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
1369 "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
1370 "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
1371 "ld1w { z24.s }, p4/Z, [x21]\n"
1372 "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
1373 "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
1374 "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
1375 "ld1w { z28.s }, p4/Z, [x20]\n"
1376 "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
1377 "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
1378 "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
1408 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1409 "ldr w27, [x20, x28, LSL #0x2]\n"
1410 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1411 "tbz %x[flags], #3, 61f\n"
1412 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
1413 "add x20, x20, x21, LSL #3\n"
1414 "ldr x26, [x20, #0x0]\n"
1415 "ldr x25, [x20, #0x8]\n"
1416 "ldr x24, [x20, #0x10]\n"
1417 "ldr x23, [x20, #0x18]\n"
1418 "ldr x22, [x20, #0x20]\n"
1419 "ldr x21, [x20, #0x28]\n"
1421 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1422 "add x26, x26, x20\n"
1423 "add x25, x25, x20\n"
1424 "add x24, x24, x20\n"
1425 "add x23, x23, x20\n"
1426 "add x22, x22, x20\n"
1427 "add x21, x21, x20\n"
1430 "mov x26, %x[input_ptr]\n"
1431 "add x25, x26, x21\n"
1432 "add x24, x25, x21\n"
1433 "add x23, x24, x21\n"
1434 "add x22, x23, x21\n"
1435 "add x21, x22, x21\n"
1440 "whilelt p0.b, XZR, x27\n"
1441 "ld1rqb { z7.b }, p0/Z, [x26]\n"
1442 "ld1rqb { z6.b }, p0/Z, [x25]\n"
1443 "sub x27, x27, #0x10\n"
1444 "ld1rqb { z5.b }, p0/Z, [x24]\n"
1445 "ld1rqb { z4.b }, p0/Z, [x23]\n"
1447 "add x26, x26, #0x10\n"
1448 "ld1rqb { z3.b }, p0/Z, [x22]\n"
1449 "ld1rqb { z2.b }, p0/Z, [x21]\n"
1450 "add x25, x25, #0x10\n"
1451 "add x24, x24, #0x10\n"
1452 "ld1b { z1.b }, p5/Z, [x10]\n"
1453 "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n"
1454 "sdot z8.s, z1.b, z7.b[0]\n"
1455 "sdot z12.s, z1.b, z6.b[0]\n"
1456 "sdot z16.s, z1.b, z5.b[0]\n"
1457 "sdot z20.s, z1.b, z4.b[0]\n"
1458 "add x23, x23, #0x10\n"
1459 "add x22, x22, #0x10\n"
1460 "sdot z24.s, z1.b, z3.b[0]\n"
1461 "sdot z28.s, z1.b, z2.b[0]\n"
1462 "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n"
1463 "add x21, x21, #0x10\n"
1464 "sdot z9.s, z0.b, z7.b[0]\n"
1465 "sdot z13.s, z0.b, z6.b[0]\n"
1466 "sdot z17.s, z0.b, z5.b[0]\n"
1467 "sdot z21.s, z0.b, z4.b[0]\n"
1468 "sdot z25.s, z0.b, z3.b[0]\n"
1469 "sdot z29.s, z0.b, z2.b[0]\n"
1470 "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n"
1471 "sdot z10.s, z1.b, z7.b[0]\n"
1472 "sdot z14.s, z1.b, z6.b[0]\n"
1473 "sdot z18.s, z1.b, z5.b[0]\n"
1474 "sdot z22.s, z1.b, z4.b[0]\n"
1475 "sdot z26.s, z1.b, z3.b[0]\n"
1476 "sdot z30.s, z1.b, z2.b[0]\n"
1477 "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n"
1478 "sdot z11.s, z0.b, z7.b[0]\n"
1479 "sdot z15.s, z0.b, z6.b[0]\n"
1480 "sdot z19.s, z0.b, z5.b[0]\n"
1481 "sdot z23.s, z0.b, z4.b[0]\n"
1482 "sdot z27.s, z0.b, z3.b[0]\n"
1483 "sdot z31.s, z0.b, z2.b[0]\n"
1484 "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n"
1485 "sdot z8.s, z1.b, z7.b[1]\n"
1486 "sdot z12.s, z1.b, z6.b[1]\n"
1487 "sdot z16.s, z1.b, z5.b[1]\n"
1488 "sdot z20.s, z1.b, z4.b[1]\n"
1489 "sdot z24.s, z1.b, z3.b[1]\n"
1490 "sdot z28.s, z1.b, z2.b[1]\n"
1491 "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n"
1492 "sdot z9.s, z0.b, z7.b[1]\n"
1493 "sdot z13.s, z0.b, z6.b[1]\n"
1494 "sdot z17.s, z0.b, z5.b[1]\n"
1495 "sdot z21.s, z0.b, z4.b[1]\n"
1496 "sdot z25.s, z0.b, z3.b[1]\n"
1497 "sdot z29.s, z0.b, z2.b[1]\n"
1498 "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n"
1499 "addvl x10, x10, #16\n"
1500 "sdot z10.s, z1.b, z7.b[1]\n"
1501 "sdot z14.s, z1.b, z6.b[1]\n"
1502 "sdot z18.s, z1.b, z5.b[1]\n"
1503 "sdot z22.s, z1.b, z4.b[1]\n"
1504 "sdot z26.s, z1.b, z3.b[1]\n"
1505 "sdot z30.s, z1.b, z2.b[1]\n"
1506 "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n"
1507 "sdot z11.s, z0.b, z7.b[1]\n"
1508 "sdot z15.s, z0.b, z6.b[1]\n"
1509 "sdot z19.s, z0.b, z5.b[1]\n"
1510 "sdot z23.s, z0.b, z4.b[1]\n"
1511 "sdot z27.s, z0.b, z3.b[1]\n"
1512 "sdot z31.s, z0.b, z2.b[1]\n"
1513 "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n"
1514 "sdot z8.s, z1.b, z7.b[2]\n"
1515 "sdot z12.s, z1.b, z6.b[2]\n"
1516 "sdot z16.s, z1.b, z5.b[2]\n"
1517 "sdot z20.s, z1.b, z4.b[2]\n"
1518 "sdot z24.s, z1.b, z3.b[2]\n"
1519 "sdot z28.s, z1.b, z2.b[2]\n"
1520 "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n"
1521 "sdot z9.s, z0.b, z7.b[2]\n"
1522 "sdot z13.s, z0.b, z6.b[2]\n"
1523 "sdot z17.s, z0.b, z5.b[2]\n"
1524 "sdot z21.s, z0.b, z4.b[2]\n"
1525 "sdot z25.s, z0.b, z3.b[2]\n"
1526 "sdot z29.s, z0.b, z2.b[2]\n"
1527 "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n"
1528 "sdot z10.s, z1.b, z7.b[2]\n"
1529 "sdot z14.s, z1.b, z6.b[2]\n"
1530 "sdot z18.s, z1.b, z5.b[2]\n"
1531 "sdot z22.s, z1.b, z4.b[2]\n"
1532 "sdot z26.s, z1.b, z3.b[2]\n"
1533 "sdot z30.s, z1.b, z2.b[2]\n"
1534 "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n"
1535 "sdot z11.s, z0.b, z7.b[2]\n"
1536 "sdot z15.s, z0.b, z6.b[2]\n"
1537 "sdot z19.s, z0.b, z5.b[2]\n"
1538 "sdot z23.s, z0.b, z4.b[2]\n"
1539 "sdot z27.s, z0.b, z3.b[2]\n"
1540 "sdot z31.s, z0.b, z2.b[2]\n"
1541 "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n"
1542 "sdot z8.s, z1.b, z7.b[3]\n"
1543 "sdot z12.s, z1.b, z6.b[3]\n"
1544 "sdot z16.s, z1.b, z5.b[3]\n"
1545 "sdot z20.s, z1.b, z4.b[3]\n"
1546 "sdot z24.s, z1.b, z3.b[3]\n"
1547 "sdot z28.s, z1.b, z2.b[3]\n"
1548 "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n"
1549 "sdot z9.s, z0.b, z7.b[3]\n"
1550 "sdot z13.s, z0.b, z6.b[3]\n"
1551 "sdot z17.s, z0.b, z5.b[3]\n"
1552 "sdot z21.s, z0.b, z4.b[3]\n"
1553 "sdot z25.s, z0.b, z3.b[3]\n"
1554 "sdot z29.s, z0.b, z2.b[3]\n"
1555 "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n"
1556 "sdot z10.s, z1.b, z7.b[3]\n"
1557 "sdot z14.s, z1.b, z6.b[3]\n"
1558 "sdot z18.s, z1.b, z5.b[3]\n"
1559 "sdot z22.s, z1.b, z4.b[3]\n"
1560 "sdot z26.s, z1.b, z3.b[3]\n"
1561 "sdot z30.s, z1.b, z2.b[3]\n"
1562 "sdot z11.s, z0.b, z7.b[3]\n"
1563 "sdot z15.s, z0.b, z6.b[3]\n"
1564 "sdot z19.s, z0.b, z5.b[3]\n"
1565 "sdot z23.s, z0.b, z4.b[3]\n"
1566 "sdot z27.s, z0.b, z3.b[3]\n"
1567 "sdot z31.s, z0.b, z2.b[3]\n"
1570 "whilelt p0.b, XZR, x27\n"
1571 "ld1rqb { z0.b }, p0/Z, [x26]\n"
1572 "ld1rqb { z1.b }, p0/Z, [x25]\n"
1573 "subs x27, x27, #0x4\n"
1574 "ld1rqb { z2.b }, p0/Z, [x24]\n"
1575 "ld1rqb { z3.b }, p0/Z, [x23]\n"
1576 "ld1rqb { z4.b }, p0/Z, [x22]\n"
1577 "ld1rqb { z5.b }, p0/Z, [x21]\n"
1578 "ld1b { z7.b }, p5/Z, [x10]\n"
1579 "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
1580 "sdot z8.s, z7.b, z0.b[0]\n"
1581 "sdot z12.s, z7.b, z1.b[0]\n"
1582 "sdot z16.s, z7.b, z2.b[0]\n"
1583 "sdot z20.s, z7.b, z3.b[0]\n"
1584 "sdot z24.s, z7.b, z4.b[0]\n"
1585 "sdot z28.s, z7.b, z5.b[0]\n"
1586 "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
1587 "sdot z9.s, z6.b, z0.b[0]\n"
1588 "sdot z13.s, z6.b, z1.b[0]\n"
1589 "sdot z17.s, z6.b, z2.b[0]\n"
1590 "sdot z21.s, z6.b, z3.b[0]\n"
1591 "sdot z25.s, z6.b, z4.b[0]\n"
1592 "sdot z29.s, z6.b, z5.b[0]\n"
1593 "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
1594 "addvl x10, x10, #4\n"
1595 "sdot z10.s, z7.b, z0.b[0]\n"
1596 "sdot z14.s, z7.b, z1.b[0]\n"
1597 "sdot z18.s, z7.b, z2.b[0]\n"
1598 "sdot z22.s, z7.b, z3.b[0]\n"
1599 "sdot z26.s, z7.b, z4.b[0]\n"
1600 "sdot z30.s, z7.b, z5.b[0]\n"
1601 "sdot z11.s, z6.b, z0.b[0]\n"
1602 "sdot z15.s, z6.b, z1.b[0]\n"
1603 "sdot z19.s, z6.b, z2.b[0]\n"
1604 "sdot z23.s, z6.b, z3.b[0]\n"
1605 "sdot z27.s, z6.b, z4.b[0]\n"
1606 "sdot z31.s, z6.b, z5.b[0]\n"
1608 "ld1b { z7.b }, p5/Z, [x10]\n"
1609 "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
1610 "sdot z8.s, z7.b, z0.b[1]\n"
1611 "sdot z12.s, z7.b, z1.b[1]\n"
1612 "sdot z16.s, z7.b, z2.b[1]\n"
1613 "sdot z20.s, z7.b, z3.b[1]\n"
1614 "subs x27, x27, #0x4\n"
1615 "sdot z24.s, z7.b, z4.b[1]\n"
1616 "sdot z28.s, z7.b, z5.b[1]\n"
1617 "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
1618 "sdot z9.s, z6.b, z0.b[1]\n"
1619 "sdot z13.s, z6.b, z1.b[1]\n"
1620 "sdot z17.s, z6.b, z2.b[1]\n"
1621 "sdot z21.s, z6.b, z3.b[1]\n"
1622 "sdot z25.s, z6.b, z4.b[1]\n"
1623 "sdot z29.s, z6.b, z5.b[1]\n"
1624 "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
1625 "addvl x10, x10, #4\n"
1626 "sdot z10.s, z7.b, z0.b[1]\n"
1627 "sdot z14.s, z7.b, z1.b[1]\n"
1628 "sdot z18.s, z7.b, z2.b[1]\n"
1629 "sdot z22.s, z7.b, z3.b[1]\n"
1630 "sdot z26.s, z7.b, z4.b[1]\n"
1631 "sdot z30.s, z7.b, z5.b[1]\n"
1632 "sdot z11.s, z6.b, z0.b[1]\n"
1633 "sdot z15.s, z6.b, z1.b[1]\n"
1634 "sdot z19.s, z6.b, z2.b[1]\n"
1635 "sdot z23.s, z6.b, z3.b[1]\n"
1636 "sdot z27.s, z6.b, z4.b[1]\n"
1637 "sdot z31.s, z6.b, z5.b[1]\n"
1639 "ld1b { z7.b }, p5/Z, [x10]\n"
1640 "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
1641 "sdot z8.s, z7.b, z0.b[2]\n"
1642 "sdot z12.s, z7.b, z1.b[2]\n"
1643 "sdot z16.s, z7.b, z2.b[2]\n"
1644 "sdot z20.s, z7.b, z3.b[2]\n"
1645 "subs x27, x27, #0x4\n"
1646 "sdot z24.s, z7.b, z4.b[2]\n"
1647 "sdot z28.s, z7.b, z5.b[2]\n"
1648 "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
1649 "sdot z9.s, z6.b, z0.b[2]\n"
1650 "sdot z13.s, z6.b, z1.b[2]\n"
1651 "sdot z17.s, z6.b, z2.b[2]\n"
1652 "sdot z21.s, z6.b, z3.b[2]\n"
1653 "sdot z25.s, z6.b, z4.b[2]\n"
1654 "sdot z29.s, z6.b, z5.b[2]\n"
1655 "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
1656 "addvl x10, x10, #4\n"
1657 "sdot z10.s, z7.b, z0.b[2]\n"
1658 "sdot z14.s, z7.b, z1.b[2]\n"
1659 "sdot z18.s, z7.b, z2.b[2]\n"
1660 "sdot z22.s, z7.b, z3.b[2]\n"
1661 "sdot z26.s, z7.b, z4.b[2]\n"
1662 "sdot z30.s, z7.b, z5.b[2]\n"
1663 "sdot z11.s, z6.b, z0.b[2]\n"
1664 "sdot z15.s, z6.b, z1.b[2]\n"
1665 "sdot z19.s, z6.b, z2.b[2]\n"
1666 "sdot z23.s, z6.b, z3.b[2]\n"
1667 "sdot z27.s, z6.b, z4.b[2]\n"
1668 "sdot z31.s, z6.b, z5.b[2]\n"
1670 "ld1b { z7.b }, p5/Z, [x10]\n"
1671 "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n"
1672 "sdot z8.s, z7.b, z0.b[3]\n"
1673 "sdot z12.s, z7.b, z1.b[3]\n"
1674 "sdot z16.s, z7.b, z2.b[3]\n"
1675 "sdot z20.s, z7.b, z3.b[3]\n"
1676 "sdot z24.s, z7.b, z4.b[3]\n"
1677 "sdot z28.s, z7.b, z5.b[3]\n"
1678 "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n"
1679 "sdot z9.s, z6.b, z0.b[3]\n"
1680 "sdot z13.s, z6.b, z1.b[3]\n"
1681 "sdot z17.s, z6.b, z2.b[3]\n"
1682 "sdot z21.s, z6.b, z3.b[3]\n"
1683 "sdot z25.s, z6.b, z4.b[3]\n"
1684 "sdot z29.s, z6.b, z5.b[3]\n"
1685 "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n"
1686 "addvl x10, x10, #4\n"
1687 "sdot z10.s, z7.b, z0.b[3]\n"
1688 "sdot z14.s, z7.b, z1.b[3]\n"
1689 "sdot z18.s, z7.b, z2.b[3]\n"
1690 "sdot z22.s, z7.b, z3.b[3]\n"
1691 "sdot z26.s, z7.b, z4.b[3]\n"
1692 "sdot z30.s, z7.b, z5.b[3]\n"
1693 "sdot z11.s, z6.b, z0.b[3]\n"
1694 "sdot z15.s, z6.b, z1.b[3]\n"
1695 "sdot z19.s, z6.b, z2.b[3]\n"
1696 "sdot z23.s, z6.b, z3.b[3]\n"
1697 "sdot z27.s, z6.b, z4.b[3]\n"
1698 "sdot z31.s, z6.b, z5.b[3]\n"
1700 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
1701 "add x28, x28, #0x1\n"
1704 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1705 "add x24, x9, x20, LSL #2\n"
1706 "add x23, x24, x20, LSL #2\n"
1707 "st1w { z8.s }, p4, [x9]\n"
1708 "add x22, x23, x20, LSL #2\n"
1709 "add x21, x22, x20, LSL #2\n"
1710 "st1w { z9.s }, p3, [x9, #1, MUL VL]\n"
1711 "add x20, x21, x20, LSL #2\n"
1712 "st1w { z10.s }, p2, [x9, #2, MUL VL]\n"
1713 "st1w { z11.s }, p1, [x9, #3, MUL VL]\n"
1714 "addvl x9, x9, #4\n"
1715 "st1w { z12.s }, p4, [x24]\n"
1716 "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
1717 "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
1718 "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
1719 "st1w { z16.s }, p4, [x23]\n"
1720 "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
1721 "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
1722 "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
1723 "st1w { z20.s }, p4, [x22]\n"
1724 "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
1725 "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
1726 "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
1727 "st1w { z24.s }, p4, [x21]\n"
1728 "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
1729 "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
1730 "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
1731 "st1w { z28.s }, p4, [x20]\n"
1732 "st1w { z29.s }, p3, [x20, #1, MUL VL]\n"
1733 "st1w { z30.s }, p2, [x20, #2, MUL VL]\n"
1734 "st1w { z31.s }, p1, [x20, #3, MUL VL]\n"
1736 "decw x11, ALL, MUL #4\n"
1739 "subs %x[M], %x[M], #0x6\n"
1741 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1742 "tbz %x[flags], #3, 67f\n"
1743 "add x21, x21, #0x6\n"
1744 "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1748 "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
1751 : [
M]
"+&r" (
M), [input_ptr]
"+&r" (input_ptr), [output_ptr]
"+&r" (output_ptr)
1752 : [args_ptr]
"r" (&ka), [flags]
"r" (flags), [offsetof_B_ptr]
"I" (offsetof(KernelArgs, B_ptr)), [offsetof_N]
"I" (offsetof(KernelArgs,
N)), [offsetof_input_initial_col]
"I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset]
"I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings]
"I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset]
"I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths]
"I" (offsetof(KernelArgs, string_lengths))
1753 :
"cc",
"memory",
"p0",
"p1",
"p2",
"p3",
"p4",
"p5",
"x9",
"x10",
"x11",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"z21",
"z22",
"z23",
"z24",
"z25",
"z26",
"z27",
"z28",
"z29",
"z30",
"z31"
1758 #endif // ARM_COMPUTE_ENABLE_SVE