24 #ifdef ARM_COMPUTE_ENABLE_SVE
27 #include "../../utils.hpp"
// Entry point of the sve_hybrid_s8s32_dot_6x4VL_a64fx kernel. It takes an int8_t input
// descriptor (A_arg), an int8_t B_ptr and an int32_t output descriptor (output_arg),
// together with the sizes M and N and a list of string lengths.
// NOTE(review): this listing has gaps (the end of the parameter list, the opening of the
// argument struct, the `else` lines and any updates to `flags` are not visible), so the
// comments below describe only the statements that are shown.
33 void sve_hybrid_s8s32_dot_6x4VL_a64fx (
34 unsigned int num_strings,
const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
35 size_t M,
size_t N,
const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
// Zero-initialised members; presumably the fields of the KernelArgs object `ka` whose
// offsets are handed to the asm through offsetof() -- confirm against the full file.
40 unsigned int num_strings = {};
41 const unsigned int *string_lengths = {};
43 const int8_t *B_ptr = {};
44 size_t output_offset = {};
45 size_t input_initial_col = {};
46 size_t input_offset = {};
// Bit mask passed to the asm as the [flags] operand; it starts cleared.
49 unsigned long flags=0;
// Output selection: the indirect form supplies ptr/offset, the other form base/stride.
// In both cases the pair is stored in output_ptr and ka.output_offset.
53 if (output_arg.is_indirect) {
54 output_ptr=(
void *)(output_arg.indirect.ptr);
55 ka.output_offset=output_arg.indirect.offset;
58 output_ptr=(
void *)(output_arg.direct.base);
59 ka.output_offset=output_arg.direct.stride;
// Input selection: the indirect form supplies ptr, start_row (kept in ka.input_offset)
// and start_col (kept in ka.input_initial_col).
62 if (A_arg.is_indirect) {
63 input_ptr=(
void *)(A_arg.indirect.ptr);
64 ka.input_offset=A_arg.indirect.start_row;
65 ka.input_initial_col=A_arg.indirect.start_col;
// NOTE(review): the assert appears to belong to the direct branch, i.e. a direct input
// is only accepted with exactly one string -- the `else` line is not visible here.
68 assert(num_strings==1);
69 input_ptr=(
void *)(A_arg.direct.base);
70 ka.input_offset=A_arg.direct.stride;
// Copy the string description into the argument block read by the asm.
75 ka.num_strings = num_strings;
76 ka.string_lengths = string_lengths;
// Asm section that works on one row of accumulators (z8-z11) fed from a single input
// pointer (x26).
// NOTE(review): labels, branches and several register initialisations of the original
// listing are not visible in this extract; the comments describe only what is shown.
// x11 <- N, x10 <- B_ptr (both read from the argument block), x9 <- output_ptr.
90 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
91 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
92 "mov x9, %x[output_ptr]\n"
// Four predicates over 32-bit lanes, each built by comparing x20 with x11.
// NOTE(review): x20 is presumably initialised and stepped in the elided lines between
// these instructions -- confirm against the full listing.
95 "whilelt p3.s, x20, x11\n"
97 "whilelt p2.s, x20, x11\n"
99 "whilelt p1.s, x20, x11\n"
101 "whilelt p0.s, x20, x11\n"
// If bit 0 of flags is clear, branch to 3f; otherwise z8-z11 are loaded from the output
// row at x9.
102 "tbz %x[flags], #0, 3f\n"
103 "ld1w { z8.s }, p3/Z, [x9]\n"
104 "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
105 "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
106 "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
// w27 <- string_lengths[x28] (32-bit entries); x21 <- input_offset.
116 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
117 "ldr w27, [x20, x28, LSL #0x2]\n"
118 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// If bit 3 of flags is set, input_ptr is read as a table of 8-byte entries: entry x28 is
// fetched, x21 further 8-byte entries are skipped, and the row pointer lands in x26.
119 "tbz %x[flags], #3, 6f\n"
120 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
121 "add x20, x20, x21, LSL #3\n"
122 "ldr x26, [x20, #0x0]\n"
// Add input_initial_col to the row pointer.
124 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
125 "add x26, x26, x20\n"
// Presumably the path taken when bit 3 is clear (label elided): use input_ptr directly.
128 "mov x26, %x[input_ptr]\n"
// x27 counts down in steps of 4. z0 receives one 32-bit word from the input replicated
// to every lane; z6/z7 receive two vectors from x10.
// NOTE(review): the initialisation of p4 is not visible in this extract.
130 "subs x27, x27, #0x4\n"
131 "ld1rw { z0.s }, p4/Z, [x26]\n"
132 "ld1b { z6.b }, p4/Z, [x10]\n"
133 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Dot-product accumulate into z8-z11, fetch the third and fourth vectors from x10, then
// advance x10 by four vector lengths and x26 by 4 bytes and reload for the next step.
136 "sdot z8.s, z6.b, z0.b\n"
137 "sdot z9.s, z7.b, z0.b\n"
138 "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
139 "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
140 "addvl x10, x10, #4\n"
141 "add x26, x26, #0x4\n"
142 "sdot z10.s, z17.b, z0.b\n"
143 "sdot z11.s, z16.b, z0.b\n"
144 "subs x27, x27, #0x4\n"
145 "ld1rw { z0.s }, p4/Z, [x26]\n"
146 "ld1b { z6.b }, p4/Z, [x10]\n"
147 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Same accumulate sequence without reloading the input; x28 is incremented and
// num_strings is loaded into w20, presumably for a comparison in an elided line.
150 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
151 "sdot z8.s, z6.b, z0.b\n"
152 "sdot z9.s, z7.b, z0.b\n"
153 "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
154 "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
155 "add x28, x28, #0x1\n"
157 "sdot z10.s, z17.b, z0.b\n"
158 "sdot z11.s, z16.b, z0.b\n"
159 "addvl x10, x10, #4\n"
// Write the four accumulator vectors back to the output row under p3..p0.
161 "st1w { z8.s }, p3, [x9]\n"
162 "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
163 "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
164 "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
// Reduce x11 by four vectors' worth of 32-bit elements.
167 "decw x11, ALL, MUL #4\n"
// Asm section that works on two rows of accumulators (z8-z11 and z12-z15) fed from two
// input pointers (x26, x25).
// NOTE(review): labels, branches and several register initialisations of the original
// listing are not visible in this extract; the comments describe only what is shown.
// x11 <- N, x10 <- B_ptr (both read from the argument block), x9 <- output_ptr.
172 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
173 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
174 "mov x9, %x[output_ptr]\n"
// Four predicates over 32-bit lanes, each built by comparing x20 with x11.
// NOTE(review): x20 is presumably initialised and stepped in elided lines.
177 "whilelt p3.s, x20, x11\n"
179 "whilelt p2.s, x20, x11\n"
181 "whilelt p1.s, x20, x11\n"
183 "whilelt p0.s, x20, x11\n"
// If bit 0 of flags is clear, branch to 13f; otherwise both accumulator rows are loaded
// from the output. The second row starts at x9 + output_offset * 4 bytes.
184 "tbz %x[flags], #0, 13f\n"
185 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
186 "add x20, x9, x20, LSL #2\n"
187 "ld1w { z8.s }, p3/Z, [x9]\n"
188 "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
189 "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
190 "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
191 "ld1w { z12.s }, p3/Z, [x20]\n"
192 "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n"
193 "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n"
194 "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n"
// w27 <- string_lengths[x28] (32-bit entries); x21 <- input_offset.
208 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
209 "ldr w27, [x20, x28, LSL #0x2]\n"
210 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// If bit 3 of flags is set, input_ptr is read as a table of 8-byte entries: entry x28 is
// fetched, x21 further entries are skipped, and two consecutive row pointers are loaded.
211 "tbz %x[flags], #3, 16f\n"
212 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
213 "add x20, x20, x21, LSL #3\n"
214 "ldr x26, [x20, #0x0]\n"
215 "ldr x25, [x20, #0x8]\n"
// Add input_initial_col to both row pointers.
217 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
218 "add x26, x26, x20\n"
219 "add x25, x25, x20\n"
// Presumably the path taken when bit 3 is clear (label elided): rows are x21 bytes apart
// starting at input_ptr.
222 "mov x26, %x[input_ptr]\n"
223 "add x25, x26, x21\n"
// x27 counts down in steps of 4. z0/z1 each receive one 32-bit word from their row,
// replicated to every lane; z6/z7 receive two vectors from x10.
// NOTE(review): the initialisation of p4 is not visible in this extract.
225 "subs x27, x27, #0x4\n"
226 "ld1rw { z0.s }, p4/Z, [x26]\n"
227 "ld1rw { z1.s }, p4/Z, [x25]\n"
228 "ld1b { z6.b }, p4/Z, [x10]\n"
229 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Dot-product accumulate each loaded vector against both rows, advance x10 by four
// vector lengths and each row pointer by 4 bytes, then reload for the next step.
232 "sdot z8.s, z6.b, z0.b\n"
233 "sdot z12.s, z6.b, z1.b\n"
234 "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
235 "add x26, x26, #0x4\n"
236 "sdot z9.s, z7.b, z0.b\n"
237 "sdot z13.s, z7.b, z1.b\n"
238 "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
239 "addvl x10, x10, #4\n"
240 "subs x27, x27, #0x4\n"
241 "add x25, x25, #0x4\n"
242 "sdot z10.s, z17.b, z0.b\n"
243 "sdot z14.s, z17.b, z1.b\n"
244 "sdot z11.s, z16.b, z0.b\n"
245 "sdot z15.s, z16.b, z1.b\n"
246 "ld1rw { z0.s }, p4/Z, [x26]\n"
247 "ld1rw { z1.s }, p4/Z, [x25]\n"
248 "ld1b { z6.b }, p4/Z, [x10]\n"
249 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Same accumulate sequence without reloading the inputs; x28 is incremented and
// num_strings is loaded into w20, presumably for a comparison in an elided line.
252 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
253 "sdot z8.s, z6.b, z0.b\n"
254 "sdot z12.s, z6.b, z1.b\n"
255 "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n"
256 "sdot z9.s, z7.b, z0.b\n"
257 "sdot z13.s, z7.b, z1.b\n"
258 "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n"
259 "add x28, x28, #0x1\n"
261 "sdot z10.s, z17.b, z0.b\n"
262 "sdot z14.s, z17.b, z1.b\n"
263 "addvl x10, x10, #4\n"
264 "sdot z11.s, z16.b, z0.b\n"
265 "sdot z15.s, z16.b, z1.b\n"
// Recompute the second output row address and store both accumulator rows.
267 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
268 "add x20, x9, x20, LSL #2\n"
269 "st1w { z8.s }, p3, [x9]\n"
270 "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
271 "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
272 "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
274 "st1w { z12.s }, p3, [x20]\n"
275 "st1w { z13.s }, p2, [x20, #1, MUL VL]\n"
276 "st1w { z14.s }, p1, [x20, #2, MUL VL]\n"
277 "st1w { z15.s }, p0, [x20, #3, MUL VL]\n"
// Reduce x11 by four vectors' worth of 32-bit elements.
279 "decw x11, ALL, MUL #4\n"
// Asm section that works on three rows of accumulators (z8-z11, z12-z15, z16-z19) fed
// from three input pointers (x26, x25, x24).
// NOTE(review): labels, branches and several register initialisations of the original
// listing are not visible in this extract; the comments describe only what is shown.
// x11 <- N, x10 <- B_ptr (both read from the argument block), x9 <- output_ptr.
284 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
285 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
286 "mov x9, %x[output_ptr]\n"
// Four predicates over 32-bit lanes, each built by comparing x20 with x11.
// NOTE(review): x20 is presumably initialised and stepped in elided lines.
289 "whilelt p3.s, x20, x11\n"
291 "whilelt p2.s, x20, x11\n"
293 "whilelt p1.s, x20, x11\n"
295 "whilelt p0.s, x20, x11\n"
// If bit 0 of flags is clear, branch to 23f; otherwise all three accumulator rows are
// loaded from the output. Consecutive rows are output_offset * 4 bytes apart.
296 "tbz %x[flags], #0, 23f\n"
297 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
298 "add x21, x9, x20, LSL #2\n"
299 "add x20, x21, x20, LSL #2\n"
300 "ld1w { z8.s }, p3/Z, [x9]\n"
301 "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
302 "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
303 "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
304 "ld1w { z12.s }, p3/Z, [x21]\n"
305 "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n"
306 "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n"
307 "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n"
308 "ld1w { z16.s }, p3/Z, [x20]\n"
309 "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n"
310 "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n"
311 "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n"
// w27 <- string_lengths[x28] (32-bit entries); x21 <- input_offset.
329 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
330 "ldr w27, [x20, x28, LSL #0x2]\n"
331 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// If bit 3 of flags is set, input_ptr is read as a table of 8-byte entries: entry x28 is
// fetched, x21 further entries are skipped, and three consecutive row pointers are loaded.
332 "tbz %x[flags], #3, 26f\n"
333 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
334 "add x20, x20, x21, LSL #3\n"
335 "ldr x26, [x20, #0x0]\n"
336 "ldr x25, [x20, #0x8]\n"
337 "ldr x24, [x20, #0x10]\n"
// Add input_initial_col to every row pointer.
339 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
340 "add x26, x26, x20\n"
341 "add x25, x25, x20\n"
342 "add x24, x24, x20\n"
// Presumably the path taken when bit 3 is clear (label elided): rows are x21 bytes apart
// starting at input_ptr.
345 "mov x26, %x[input_ptr]\n"
346 "add x25, x26, x21\n"
347 "add x24, x25, x21\n"
// x27 counts down in steps of 4. z0-z2 each receive one 32-bit word from their row,
// replicated to every lane; z6/z7 receive two vectors from x10.
// NOTE(review): the initialisation of p4 is not visible in this extract.
349 "subs x27, x27, #0x4\n"
350 "ld1rw { z0.s }, p4/Z, [x26]\n"
351 "ld1rw { z1.s }, p4/Z, [x25]\n"
352 "ld1rw { z2.s }, p4/Z, [x24]\n"
353 "ld1b { z6.b }, p4/Z, [x10]\n"
354 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Dot-product accumulate each loaded vector against all three rows (z21/z20 hold the
// third and fourth vectors), advance x10 by four vector lengths and each row pointer by
// 4 bytes, then reload for the next step.
357 "sdot z8.s, z6.b, z0.b\n"
358 "sdot z12.s, z6.b, z1.b\n"
359 "add x26, x26, #0x4\n"
360 "subs x27, x27, #0x4\n"
361 "sdot z16.s, z6.b, z2.b\n"
362 "sdot z9.s, z7.b, z0.b\n"
363 "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
364 "add x25, x25, #0x4\n"
365 "sdot z13.s, z7.b, z1.b\n"
366 "sdot z17.s, z7.b, z2.b\n"
367 "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
368 "addvl x10, x10, #4\n"
369 "add x24, x24, #0x4\n"
370 "sdot z10.s, z21.b, z0.b\n"
371 "sdot z14.s, z21.b, z1.b\n"
372 "sdot z18.s, z21.b, z2.b\n"
373 "sdot z11.s, z20.b, z0.b\n"
374 "ld1rw { z0.s }, p4/Z, [x26]\n"
375 "ld1b { z6.b }, p4/Z, [x10]\n"
376 "sdot z15.s, z20.b, z1.b\n"
377 "sdot z19.s, z20.b, z2.b\n"
378 "ld1rw { z1.s }, p4/Z, [x25]\n"
379 "ld1rw { z2.s }, p4/Z, [x24]\n"
380 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Same accumulate sequence without reloading the inputs; x28 is incremented and
// num_strings is loaded into w20, presumably for a comparison in an elided line.
383 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
384 "sdot z8.s, z6.b, z0.b\n"
385 "sdot z12.s, z6.b, z1.b\n"
386 "add x28, x28, #0x1\n"
387 "sdot z16.s, z6.b, z2.b\n"
388 "sdot z9.s, z7.b, z0.b\n"
389 "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n"
391 "sdot z13.s, z7.b, z1.b\n"
392 "sdot z17.s, z7.b, z2.b\n"
393 "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n"
394 "addvl x10, x10, #4\n"
395 "sdot z10.s, z21.b, z0.b\n"
396 "sdot z14.s, z21.b, z1.b\n"
397 "sdot z18.s, z21.b, z2.b\n"
398 "sdot z11.s, z20.b, z0.b\n"
399 "sdot z15.s, z20.b, z1.b\n"
400 "sdot z19.s, z20.b, z2.b\n"
// Recompute the output row addresses and store all three accumulator rows.
402 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
403 "add x21, x9, x20, LSL #2\n"
404 "add x20, x21, x20, LSL #2\n"
405 "st1w { z8.s }, p3, [x9]\n"
406 "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
407 "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
408 "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
410 "st1w { z12.s }, p3, [x21]\n"
411 "st1w { z13.s }, p2, [x21, #1, MUL VL]\n"
412 "st1w { z14.s }, p1, [x21, #2, MUL VL]\n"
413 "st1w { z15.s }, p0, [x21, #3, MUL VL]\n"
414 "st1w { z16.s }, p3, [x20]\n"
415 "st1w { z17.s }, p2, [x20, #1, MUL VL]\n"
416 "st1w { z18.s }, p1, [x20, #2, MUL VL]\n"
417 "st1w { z19.s }, p0, [x20, #3, MUL VL]\n"
// Reduce x11 by four vectors' worth of 32-bit elements.
419 "decw x11, ALL, MUL #4\n"
// Asm section that works on four rows of accumulators (z8-z11, z12-z15, z16-z19,
// z20-z23) fed from four input pointers (x26, x25, x24, x23).
// NOTE(review): labels, branches and several register initialisations of the original
// listing are not visible in this extract; the comments describe only what is shown.
// x11 <- N, x10 <- B_ptr (both read from the argument block), x9 <- output_ptr.
424 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
425 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
426 "mov x9, %x[output_ptr]\n"
// Four predicates over 32-bit lanes, each built by comparing x20 with x11.
// NOTE(review): x20 is presumably initialised and stepped in elided lines.
429 "whilelt p3.s, x20, x11\n"
431 "whilelt p2.s, x20, x11\n"
433 "whilelt p1.s, x20, x11\n"
435 "whilelt p0.s, x20, x11\n"
// If bit 0 of flags is clear, branch to 33f; otherwise all four accumulator rows are
// loaded from the output. Consecutive rows are output_offset * 4 bytes apart.
436 "tbz %x[flags], #0, 33f\n"
437 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
438 "add x22, x9, x20, LSL #2\n"
439 "add x21, x22, x20, LSL #2\n"
440 "ld1w { z8.s }, p3/Z, [x9]\n"
441 "add x20, x21, x20, LSL #2\n"
442 "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
443 "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
444 "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
445 "ld1w { z12.s }, p3/Z, [x22]\n"
446 "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n"
447 "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n"
448 "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n"
449 "ld1w { z16.s }, p3/Z, [x21]\n"
450 "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n"
451 "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
452 "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n"
453 "ld1w { z20.s }, p3/Z, [x20]\n"
454 "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n"
455 "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n"
456 "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n"
// w27 <- string_lengths[x28] (32-bit entries); x21 <- input_offset.
478 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
479 "ldr w27, [x20, x28, LSL #0x2]\n"
480 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// If bit 3 of flags is set, input_ptr is read as a table of 8-byte entries: entry x28 is
// fetched, x21 further entries are skipped, and four consecutive row pointers are loaded.
481 "tbz %x[flags], #3, 36f\n"
482 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
483 "add x20, x20, x21, LSL #3\n"
484 "ldr x26, [x20, #0x0]\n"
485 "ldr x25, [x20, #0x8]\n"
486 "ldr x24, [x20, #0x10]\n"
487 "ldr x23, [x20, #0x18]\n"
// Add input_initial_col to every row pointer.
489 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
490 "add x26, x26, x20\n"
491 "add x25, x25, x20\n"
492 "add x24, x24, x20\n"
493 "add x23, x23, x20\n"
// Presumably the path taken when bit 3 is clear (label elided): rows are x21 bytes apart
// starting at input_ptr.
496 "mov x26, %x[input_ptr]\n"
497 "add x25, x26, x21\n"
498 "add x24, x25, x21\n"
499 "add x23, x24, x21\n"
// x27 counts down in steps of 4. z0-z3 each receive one 32-bit word from their row,
// replicated to every lane; z6/z7 receive two vectors from x10.
// NOTE(review): the initialisation of p4 is not visible in this extract.
501 "subs x27, x27, #0x4\n"
502 "ld1rw { z0.s }, p4/Z, [x26]\n"
503 "ld1rw { z1.s }, p4/Z, [x25]\n"
504 "ld1rw { z2.s }, p4/Z, [x24]\n"
505 "ld1rw { z3.s }, p4/Z, [x23]\n"
506 "ld1b { z6.b }, p4/Z, [x10]\n"
507 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Dot-product accumulate each loaded vector against all four rows (z25/z24 hold the
// third and fourth vectors), advance x10 by four vector lengths and each row pointer by
// 4 bytes, then reload for the next step.
510 "sdot z8.s, z6.b, z0.b\n"
511 "sdot z12.s, z6.b, z1.b\n"
512 "add x26, x26, #0x4\n"
513 "subs x27, x27, #0x4\n"
514 "sdot z16.s, z6.b, z2.b\n"
515 "sdot z20.s, z6.b, z3.b\n"
516 "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
517 "add x25, x25, #0x4\n"
518 "sdot z9.s, z7.b, z0.b\n"
519 "sdot z13.s, z7.b, z1.b\n"
520 "add x24, x24, #0x4\n"
521 "add x23, x23, #0x4\n"
522 "sdot z17.s, z7.b, z2.b\n"
523 "sdot z21.s, z7.b, z3.b\n"
524 "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
525 "addvl x10, x10, #4\n"
526 "sdot z10.s, z25.b, z0.b\n"
527 "sdot z14.s, z25.b, z1.b\n"
528 "sdot z18.s, z25.b, z2.b\n"
529 "sdot z22.s, z25.b, z3.b\n"
530 "ld1b { z6.b }, p4/Z, [x10]\n"
531 "sdot z11.s, z24.b, z0.b\n"
532 "sdot z15.s, z24.b, z1.b\n"
533 "ld1rw { z0.s }, p4/Z, [x26]\n"
534 "ld1rw { z1.s }, p4/Z, [x25]\n"
535 "sdot z19.s, z24.b, z2.b\n"
536 "sdot z23.s, z24.b, z3.b\n"
537 "ld1rw { z2.s }, p4/Z, [x24]\n"
538 "ld1rw { z3.s }, p4/Z, [x23]\n"
539 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Same accumulate sequence without reloading the inputs; x28 is incremented and
// num_strings is loaded into w20, presumably for a comparison in an elided line.
542 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
543 "sdot z8.s, z6.b, z0.b\n"
544 "sdot z12.s, z6.b, z1.b\n"
545 "add x28, x28, #0x1\n"
546 "sdot z16.s, z6.b, z2.b\n"
547 "sdot z20.s, z6.b, z3.b\n"
548 "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n"
550 "sdot z9.s, z7.b, z0.b\n"
551 "sdot z13.s, z7.b, z1.b\n"
552 "sdot z17.s, z7.b, z2.b\n"
553 "sdot z21.s, z7.b, z3.b\n"
554 "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n"
555 "addvl x10, x10, #4\n"
556 "sdot z10.s, z25.b, z0.b\n"
557 "sdot z14.s, z25.b, z1.b\n"
558 "sdot z18.s, z25.b, z2.b\n"
559 "sdot z22.s, z25.b, z3.b\n"
560 "sdot z11.s, z24.b, z0.b\n"
561 "sdot z15.s, z24.b, z1.b\n"
562 "sdot z19.s, z24.b, z2.b\n"
563 "sdot z23.s, z24.b, z3.b\n"
// Recompute the output row addresses and store all four accumulator rows.
565 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
566 "add x22, x9, x20, LSL #2\n"
567 "add x21, x22, x20, LSL #2\n"
568 "st1w { z8.s }, p3, [x9]\n"
569 "add x20, x21, x20, LSL #2\n"
570 "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
571 "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
572 "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
574 "st1w { z12.s }, p3, [x22]\n"
575 "st1w { z13.s }, p2, [x22, #1, MUL VL]\n"
576 "st1w { z14.s }, p1, [x22, #2, MUL VL]\n"
577 "st1w { z15.s }, p0, [x22, #3, MUL VL]\n"
578 "st1w { z16.s }, p3, [x21]\n"
579 "st1w { z17.s }, p2, [x21, #1, MUL VL]\n"
580 "st1w { z18.s }, p1, [x21, #2, MUL VL]\n"
581 "st1w { z19.s }, p0, [x21, #3, MUL VL]\n"
582 "st1w { z20.s }, p3, [x20]\n"
583 "st1w { z21.s }, p2, [x20, #1, MUL VL]\n"
584 "st1w { z22.s }, p1, [x20, #2, MUL VL]\n"
585 "st1w { z23.s }, p0, [x20, #3, MUL VL]\n"
// Reduce x11 by four vectors' worth of 32-bit elements.
587 "decw x11, ALL, MUL #4\n"
// Asm section that works on five rows of accumulators (z8-z11, z12-z15, z16-z19,
// z20-z23, z24-z27) fed from five input pointers (x26, x25, x24, x23, x22).
// NOTE(review): labels, branches and several register initialisations of the original
// listing are not visible in this extract; the comments describe only what is shown.
// x11 <- N, x10 <- B_ptr (both read from the argument block), x9 <- output_ptr.
592 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
593 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
594 "mov x9, %x[output_ptr]\n"
// Four predicates over 32-bit lanes, each built by comparing x20 with x11.
// NOTE(review): x20 is presumably initialised and stepped in elided lines.
597 "whilelt p3.s, x20, x11\n"
599 "whilelt p2.s, x20, x11\n"
601 "whilelt p1.s, x20, x11\n"
603 "whilelt p0.s, x20, x11\n"
// If bit 0 of flags is clear, branch to 43f; otherwise all five accumulator rows are
// loaded from the output. Consecutive rows are output_offset * 4 bytes apart.
604 "tbz %x[flags], #0, 43f\n"
605 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
606 "add x23, x9, x20, LSL #2\n"
607 "add x22, x23, x20, LSL #2\n"
608 "ld1w { z8.s }, p3/Z, [x9]\n"
609 "add x21, x22, x20, LSL #2\n"
610 "add x20, x21, x20, LSL #2\n"
611 "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
612 "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
613 "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
614 "ld1w { z12.s }, p3/Z, [x23]\n"
615 "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
616 "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
617 "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
618 "ld1w { z16.s }, p3/Z, [x22]\n"
619 "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
620 "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
621 "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
622 "ld1w { z20.s }, p3/Z, [x21]\n"
623 "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
624 "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
625 "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
626 "ld1w { z24.s }, p3/Z, [x20]\n"
627 "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
628 "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
629 "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
// w27 <- string_lengths[x28] (32-bit entries); x21 <- input_offset.
655 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
656 "ldr w27, [x20, x28, LSL #0x2]\n"
657 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// If bit 3 of flags is set, input_ptr is read as a table of 8-byte entries: entry x28 is
// fetched, x21 further entries are skipped, and five consecutive row pointers are loaded.
658 "tbz %x[flags], #3, 46f\n"
659 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
660 "add x20, x20, x21, LSL #3\n"
661 "ldr x26, [x20, #0x0]\n"
662 "ldr x25, [x20, #0x8]\n"
663 "ldr x24, [x20, #0x10]\n"
664 "ldr x23, [x20, #0x18]\n"
665 "ldr x22, [x20, #0x20]\n"
// Add input_initial_col to every row pointer.
667 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
668 "add x26, x26, x20\n"
669 "add x25, x25, x20\n"
670 "add x24, x24, x20\n"
671 "add x23, x23, x20\n"
672 "add x22, x22, x20\n"
// Presumably the path taken when bit 3 is clear (label elided): rows are x21 bytes apart
// starting at input_ptr.
675 "mov x26, %x[input_ptr]\n"
676 "add x25, x26, x21\n"
677 "add x24, x25, x21\n"
678 "add x23, x24, x21\n"
679 "add x22, x23, x21\n"
// x27 counts down in steps of 4. z0-z4 each receive one 32-bit word from their row,
// replicated to every lane; z6/z7 receive two vectors from x10.
// NOTE(review): the initialisation of p4 is not visible in this extract.
681 "subs x27, x27, #0x4\n"
682 "ld1rw { z0.s }, p4/Z, [x26]\n"
683 "ld1rw { z1.s }, p4/Z, [x25]\n"
684 "ld1rw { z2.s }, p4/Z, [x24]\n"
685 "ld1rw { z3.s }, p4/Z, [x23]\n"
686 "ld1rw { z4.s }, p4/Z, [x22]\n"
687 "ld1b { z6.b }, p4/Z, [x10]\n"
688 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Dot-product accumulate each loaded vector against all five rows (z29/z28 hold the
// third and fourth vectors), advance x10 by four vector lengths and each row pointer by
// 4 bytes, then reload for the next step.
691 "sdot z8.s, z6.b, z0.b\n"
692 "sdot z12.s, z6.b, z1.b\n"
693 "add x26, x26, #0x4\n"
694 "subs x27, x27, #0x4\n"
695 "sdot z16.s, z6.b, z2.b\n"
696 "sdot z20.s, z6.b, z3.b\n"
697 "add x25, x25, #0x4\n"
698 "add x24, x24, #0x4\n"
699 "sdot z24.s, z6.b, z4.b\n"
700 "sdot z9.s, z7.b, z0.b\n"
701 "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
702 "add x23, x23, #0x4\n"
703 "sdot z13.s, z7.b, z1.b\n"
704 "sdot z17.s, z7.b, z2.b\n"
705 "add x22, x22, #0x4\n"
706 "sdot z21.s, z7.b, z3.b\n"
707 "sdot z25.s, z7.b, z4.b\n"
708 "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
709 "addvl x10, x10, #4\n"
710 "sdot z10.s, z29.b, z0.b\n"
711 "sdot z14.s, z29.b, z1.b\n"
712 "sdot z18.s, z29.b, z2.b\n"
713 "sdot z22.s, z29.b, z3.b\n"
714 "sdot z26.s, z29.b, z4.b\n"
715 "sdot z11.s, z28.b, z0.b\n"
716 "ld1rw { z0.s }, p4/Z, [x26]\n"
717 "ld1b { z6.b }, p4/Z, [x10]\n"
718 "sdot z15.s, z28.b, z1.b\n"
719 "sdot z19.s, z28.b, z2.b\n"
720 "ld1rw { z1.s }, p4/Z, [x25]\n"
721 "ld1rw { z2.s }, p4/Z, [x24]\n"
722 "sdot z23.s, z28.b, z3.b\n"
723 "sdot z27.s, z28.b, z4.b\n"
724 "ld1rw { z3.s }, p4/Z, [x23]\n"
725 "ld1rw { z4.s }, p4/Z, [x22]\n"
726 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Same accumulate sequence without reloading the inputs; x28 is incremented and
// num_strings is loaded into w20, presumably for a comparison in an elided line.
729 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
730 "sdot z8.s, z6.b, z0.b\n"
731 "sdot z12.s, z6.b, z1.b\n"
732 "add x28, x28, #0x1\n"
733 "sdot z16.s, z6.b, z2.b\n"
734 "sdot z20.s, z6.b, z3.b\n"
736 "sdot z24.s, z6.b, z4.b\n"
737 "sdot z9.s, z7.b, z0.b\n"
738 "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n"
739 "sdot z13.s, z7.b, z1.b\n"
740 "sdot z17.s, z7.b, z2.b\n"
741 "sdot z21.s, z7.b, z3.b\n"
742 "sdot z25.s, z7.b, z4.b\n"
743 "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n"
744 "addvl x10, x10, #4\n"
745 "sdot z10.s, z29.b, z0.b\n"
746 "sdot z14.s, z29.b, z1.b\n"
747 "sdot z18.s, z29.b, z2.b\n"
748 "sdot z22.s, z29.b, z3.b\n"
749 "sdot z26.s, z29.b, z4.b\n"
750 "sdot z11.s, z28.b, z0.b\n"
751 "sdot z15.s, z28.b, z1.b\n"
752 "sdot z19.s, z28.b, z2.b\n"
753 "sdot z23.s, z28.b, z3.b\n"
754 "sdot z27.s, z28.b, z4.b\n"
// Recompute the output row addresses and store all five accumulator rows.
756 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
757 "add x23, x9, x20, LSL #2\n"
758 "add x22, x23, x20, LSL #2\n"
759 "st1w { z8.s }, p3, [x9]\n"
760 "add x21, x22, x20, LSL #2\n"
761 "add x20, x21, x20, LSL #2\n"
762 "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
763 "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
764 "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
766 "st1w { z12.s }, p3, [x23]\n"
767 "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
768 "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
769 "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
770 "st1w { z16.s }, p3, [x22]\n"
771 "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
772 "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
773 "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
774 "st1w { z20.s }, p3, [x21]\n"
775 "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
776 "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
777 "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
778 "st1w { z24.s }, p3, [x20]\n"
779 "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
780 "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
781 "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
// Reduce x11 by four vectors' worth of 32-bit elements.
783 "decw x11, ALL, MUL #4\n"
// Asm section that works on six rows of accumulators (z8-z31, four vectors per row) fed
// from six input pointers (x26, x25, x24, x23, x22, x21).
// NOTE(review): labels, branches and several register initialisations of the original
// listing are not visible in this extract; the comments describe only what is shown.
// x21 <- output_offset, x11 <- N, x10 <- B_ptr, x9 <- output_ptr. The output_ptr operand
// itself is then advanced by x21 * x20.
// NOTE(review): x20 is set in an elided line; it is presumably the byte size of six
// int32 rows' worth of stride -- confirm against the full listing.
788 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
790 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
791 "mov x9, %x[output_ptr]\n"
792 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
793 "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
// Four predicates over 32-bit lanes, each built by comparing x20 with x11.
// NOTE(review): x20 is presumably re-initialised and stepped in elided lines.
796 "whilelt p3.s, x20, x11\n"
798 "whilelt p2.s, x20, x11\n"
800 "whilelt p1.s, x20, x11\n"
802 "whilelt p0.s, x20, x11\n"
// If bit 0 of flags is clear, branch to 53f; otherwise all six accumulator rows are
// loaded from the output. Consecutive rows are output_offset * 4 bytes apart.
803 "tbz %x[flags], #0, 53f\n"
804 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
805 "add x24, x9, x20, LSL #2\n"
806 "add x23, x24, x20, LSL #2\n"
807 "ld1w { z8.s }, p3/Z, [x9]\n"
808 "add x22, x23, x20, LSL #2\n"
809 "add x21, x22, x20, LSL #2\n"
810 "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n"
811 "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n"
812 "add x20, x21, x20, LSL #2\n"
813 "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n"
814 "ld1w { z12.s }, p3/Z, [x24]\n"
815 "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
816 "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
817 "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
818 "ld1w { z16.s }, p3/Z, [x23]\n"
819 "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
820 "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
821 "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
822 "ld1w { z20.s }, p3/Z, [x22]\n"
823 "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
824 "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
825 "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
826 "ld1w { z24.s }, p3/Z, [x21]\n"
827 "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
828 "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
829 "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
830 "ld1w { z28.s }, p3/Z, [x20]\n"
831 "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
832 "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
833 "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
// w27 <- string_lengths[x28] (32-bit entries); x21 <- input_offset.
863 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
864 "ldr w27, [x20, x28, LSL #0x2]\n"
865 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// If bit 3 of flags is set, input_ptr is read as a table of 8-byte entries: entry x28 is
// fetched, x21 further entries are skipped, and six consecutive row pointers are loaded.
866 "tbz %x[flags], #3, 56f\n"
867 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
868 "add x20, x20, x21, LSL #3\n"
869 "ldr x26, [x20, #0x0]\n"
870 "ldr x25, [x20, #0x8]\n"
871 "ldr x24, [x20, #0x10]\n"
872 "ldr x23, [x20, #0x18]\n"
873 "ldr x22, [x20, #0x20]\n"
874 "ldr x21, [x20, #0x28]\n"
// Add input_initial_col to every row pointer.
876 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
877 "add x26, x26, x20\n"
878 "add x25, x25, x20\n"
879 "add x24, x24, x20\n"
880 "add x23, x23, x20\n"
881 "add x22, x22, x20\n"
882 "add x21, x21, x20\n"
// Presumably the path taken when bit 3 is clear (label elided): rows are x21 bytes apart
// starting at input_ptr. x21 is consumed last because it also becomes the sixth pointer.
885 "mov x26, %x[input_ptr]\n"
886 "add x25, x26, x21\n"
887 "add x24, x25, x21\n"
888 "add x23, x24, x21\n"
889 "add x22, x23, x21\n"
890 "add x21, x22, x21\n"
// x27 counts down in steps of 4. z0-z5 each receive one 32-bit word from their row,
// replicated to every lane; z6/z7 receive two vectors from x10.
// NOTE(review): the initialisation of p4 is not visible in this extract.
892 "subs x27, x27, #0x4\n"
893 "ld1rw { z0.s }, p4/Z, [x26]\n"
894 "ld1rw { z1.s }, p4/Z, [x25]\n"
895 "ld1rw { z2.s }, p4/Z, [x24]\n"
896 "ld1rw { z3.s }, p4/Z, [x23]\n"
897 "ld1rw { z4.s }, p4/Z, [x22]\n"
898 "ld1rw { z5.s }, p4/Z, [x21]\n"
899 "ld1b { z6.b }, p4/Z, [x10]\n"
900 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Dot-product accumulate against all six rows. With z8-z31 all in use as accumulators,
// z6 and z7 are reloaded in place with the third and fourth vectors from x10 once the
// first and second have been consumed. x10 then advances by four vector lengths and each
// row pointer by 4 bytes before the reload for the next step.
903 "sdot z8.s, z6.b, z0.b\n"
904 "sdot z12.s, z6.b, z1.b\n"
905 "add x26, x26, #0x4\n"
906 "subs x27, x27, #0x4\n"
907 "sdot z16.s, z6.b, z2.b\n"
908 "sdot z20.s, z6.b, z3.b\n"
909 "add x25, x25, #0x4\n"
910 "add x24, x24, #0x4\n"
911 "sdot z24.s, z6.b, z4.b\n"
912 "sdot z28.s, z6.b, z5.b\n"
913 "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
914 "add x23, x23, #0x4\n"
915 "sdot z9.s, z7.b, z0.b\n"
916 "sdot z13.s, z7.b, z1.b\n"
917 "add x22, x22, #0x4\n"
918 "add x21, x21, #0x4\n"
919 "sdot z17.s, z7.b, z2.b\n"
920 "sdot z21.s, z7.b, z3.b\n"
921 "sdot z25.s, z7.b, z4.b\n"
922 "sdot z29.s, z7.b, z5.b\n"
923 "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
924 "addvl x10, x10, #4\n"
925 "sdot z10.s, z6.b, z0.b\n"
926 "sdot z14.s, z6.b, z1.b\n"
927 "sdot z18.s, z6.b, z2.b\n"
928 "sdot z22.s, z6.b, z3.b\n"
929 "sdot z26.s, z6.b, z4.b\n"
930 "sdot z30.s, z6.b, z5.b\n"
931 "ld1b { z6.b }, p4/Z, [x10]\n"
932 "sdot z11.s, z7.b, z0.b\n"
933 "sdot z15.s, z7.b, z1.b\n"
934 "ld1rw { z0.s }, p4/Z, [x26]\n"
935 "ld1rw { z1.s }, p4/Z, [x25]\n"
936 "sdot z19.s, z7.b, z2.b\n"
937 "sdot z23.s, z7.b, z3.b\n"
938 "ld1rw { z2.s }, p4/Z, [x24]\n"
939 "ld1rw { z3.s }, p4/Z, [x23]\n"
940 "sdot z27.s, z7.b, z4.b\n"
941 "sdot z31.s, z7.b, z5.b\n"
942 "ld1rw { z4.s }, p4/Z, [x22]\n"
943 "ld1rw { z5.s }, p4/Z, [x21]\n"
944 "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n"
// Same accumulate sequence without reloading the inputs; x28 is incremented and
// num_strings is loaded into w20, presumably for a comparison in an elided line.
947 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
948 "sdot z8.s, z6.b, z0.b\n"
949 "sdot z12.s, z6.b, z1.b\n"
950 "add x28, x28, #0x1\n"
951 "sdot z16.s, z6.b, z2.b\n"
952 "sdot z20.s, z6.b, z3.b\n"
954 "sdot z24.s, z6.b, z4.b\n"
955 "sdot z28.s, z6.b, z5.b\n"
956 "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n"
957 "sdot z9.s, z7.b, z0.b\n"
958 "sdot z13.s, z7.b, z1.b\n"
959 "sdot z17.s, z7.b, z2.b\n"
960 "sdot z21.s, z7.b, z3.b\n"
961 "sdot z25.s, z7.b, z4.b\n"
962 "sdot z29.s, z7.b, z5.b\n"
963 "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n"
964 "addvl x10, x10, #4\n"
965 "sdot z10.s, z6.b, z0.b\n"
966 "sdot z14.s, z6.b, z1.b\n"
967 "sdot z18.s, z6.b, z2.b\n"
968 "sdot z22.s, z6.b, z3.b\n"
969 "sdot z26.s, z6.b, z4.b\n"
970 "sdot z30.s, z6.b, z5.b\n"
971 "sdot z11.s, z7.b, z0.b\n"
972 "sdot z15.s, z7.b, z1.b\n"
973 "sdot z19.s, z7.b, z2.b\n"
974 "sdot z23.s, z7.b, z3.b\n"
975 "sdot z27.s, z7.b, z4.b\n"
976 "sdot z31.s, z7.b, z5.b\n"
// Recompute the output row addresses and store all six accumulator rows.
978 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
979 "add x24, x9, x20, LSL #2\n"
980 "add x23, x24, x20, LSL #2\n"
981 "st1w { z8.s }, p3, [x9]\n"
982 "add x22, x23, x20, LSL #2\n"
983 "add x21, x22, x20, LSL #2\n"
984 "st1w { z9.s }, p2, [x9, #1, MUL VL]\n"
985 "add x20, x21, x20, LSL #2\n"
986 "st1w { z10.s }, p1, [x9, #2, MUL VL]\n"
987 "st1w { z11.s }, p0, [x9, #3, MUL VL]\n"
989 "st1w { z12.s }, p3, [x24]\n"
990 "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
991 "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
992 "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
993 "st1w { z16.s }, p3, [x23]\n"
994 "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
995 "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
996 "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
997 "st1w { z20.s }, p3, [x22]\n"
998 "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
999 "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
1000 "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
1001 "st1w { z24.s }, p3, [x21]\n"
1002 "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
1003 "st1w { z26.s }, p1, [x21, #2, MUL VL]\n"
1004 "st1w { z27.s }, p0, [x21, #3, MUL VL]\n"
1005 "st1w { z28.s }, p3, [x20]\n"
1006 "st1w { z29.s }, p2, [x20, #1, MUL VL]\n"
1007 "st1w { z30.s }, p1, [x20, #2, MUL VL]\n"
1008 "st1w { z31.s }, p0, [x20, #3, MUL VL]\n"
// Reduce x11 by four vectors' worth of 32-bit elements.
1010 "decw x11, ALL, MUL #4\n"
// End of a six-row pass: M is reduced by 6 and the input position is moved on.
// NOTE(review): the branches and labels around these instructions are not visible here.
1013 "subs %x[M], %x[M], #0x6\n"
// When bit 3 of flags is set, input_offset in the argument block is increased by 6 and
// written back. When it is clear, control branches to 61f.
1015 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1016 "tbz %x[flags], #3, 61f\n"
1017 "add x21, x21, #0x6\n"
1018 "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// Presumably the 61: path (label elided): input_ptr += x20 * x21, where x21 still holds
// input_offset and x20 is set in an elided line.
1022 "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
// Output operands: M, input_ptr and output_ptr are read-write and marked early-clobber.
1025 : [
M]
"+&r" (
M), [input_ptr]
"+&r" (input_ptr), [output_ptr]
"+&r" (output_ptr)
// Input operands: the address of the argument block `ka`, the flags word, and the byte
// offsets of the KernelArgs members as immediate ("I") constants.
1026 : [args_ptr]
"r" (&ka), [flags]
"r" (flags), [offsetof_B_ptr]
"I" (offsetof(KernelArgs, B_ptr)), [offsetof_N]
"I" (offsetof(KernelArgs,
N)), [offsetof_input_initial_col]
"I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset]
"I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings]
"I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset]
"I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths]
"I" (offsetof(KernelArgs, string_lengths))
// Clobbers: condition flags, memory, predicates p0-p4, general registers x9-x11 and
// x20-x28, and every vector register z0-z31.
1027 :
"cc",
"memory",
"p0",
"p1",
"p2",
"p3",
"p4",
"x9",
"x10",
"x11",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"z21",
"z22",
"z23",
"z24",
"z25",
"z26",
"z27",
"z28",
"z29",
"z30",
"z31"
1032 #endif // ARM_COMPUTE_ENABLE_SVE