24 #ifdef ARM_COMPUTE_ENABLE_SVE
27 #include "../../utils.hpp"
// FP32 multiply-accumulate kernel written with SVE inline assembly.
// The visible part of the signature takes the string count and lengths, an
// IndirectInputArg<float> describing A, the sizes M and N, the B pointer and
// an IndirectOutputArg<float> describing the output.
// NOTE(review): this excerpt is missing lines (the remaining parameters, the
// struct header and several closing braces), so the comments below describe
// only what the visible lines establish.
34 void sve_hybrid_fp32_mla_8x1VL_a64fx (
35 unsigned int num_strings,
const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
36 size_t M,
size_t N,
const float *B_ptr, IndirectOutputArg<float> output_arg,
// Clamp bounds start at +infinity / -infinity; ka.maxval is assigned from
// act.param1 further down.
41 float maxval =
static_cast<float>(std::numeric_limits<float>::infinity());
42 float minval = -
static_cast<float>(std::numeric_limits<float>::infinity());
// NOTE(review): the value-initialised declarations below share their names
// with the fields the asm addresses through offsetof(KernelArgs, ...), so they
// are presumably KernelArgs members - the struct header is not visible here.
43 unsigned int num_strings = {};
44 const unsigned int *string_lengths = {};
46 const float *B_ptr = {};
47 size_t output_offset = {};
48 size_t input_initial_col = {};
49 size_t input_offset = {};
// Bit mask passed to the asm as the [flags] operand and tested there with tbz;
// bits 0, 1 and 3 are the ones tested in the visible assembly.
52 unsigned long flags=0;
// Output: pick the pointer and the per-row offset from the indirect or the
// direct member of output_arg.
56 if (output_arg.is_indirect) {
57 output_ptr=(
void *)(output_arg.indirect.ptr);
58 ka.output_offset=output_arg.indirect.offset;
61 output_ptr=(
void *)(output_arg.direct.base);
62 ka.output_offset=output_arg.direct.stride;
// Input: the indirect form supplies a pointer plus start row/column; the
// direct form supplies a base pointer and stride and asserts a single string.
65 if (A_arg.is_indirect) {
66 input_ptr=(
void *)(A_arg.indirect.ptr);
67 ka.input_offset=A_arg.indirect.start_row;
68 ka.input_initial_col=A_arg.indirect.start_col;
71 assert(num_strings==1);
72 input_ptr=(
void *)(A_arg.direct.base);
73 ka.input_offset=A_arg.direct.stride;
// Copy the string description into the argument block read by the asm.
78 ka.num_strings = num_strings;
79 ka.string_lengths = string_lengths;
87 ka.maxval =
static_cast<float>(act.param1);
// ---- One-row path: a single accumulator vector, z24 ----
// Working registers: x14 = bias, x13 = N, x12 = B_ptr, x11 = output_ptr.
108 "mov x14, %x[bias]\n"
109 "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
110 "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
111 "mov x11, %x[output_ptr]\n"
// p0 is the predicate later used for output loads and stores; it is built by
// comparing x20 against x13 (x20 is set on a line not present in this excerpt).
114 "whilelt p0.s, x20, x13\n"
// Start the accumulator from one vector at x14 and step x14 by one vector.
116 "ld1w { z24.s }, p1/Z, [x14]\n"
117 "addvl x14, x14, #1\n"
// If flags bit 0 is clear, branch past the next load; otherwise the
// accumulator is loaded from the output location at x11.
120 "tbz %x[flags], #0, 4f\n"
121 "ld1w { z24.s }, p0/Z, [x11]\n"
// Per-string setup: w9 = string_lengths[x10], x21 = input_offset.
128 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
129 "ldr w9, [x20, x10, LSL #0x2]\n"
130 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// With flags bit 3 set: read 8-byte entry x10 of the table at input_ptr, step
// it by x21 8-byte entries, load the row pointer into x28 and add
// input_initial_col * 4 bytes to it.
131 "tbz %x[flags], #3, 7f\n"
132 "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
133 "add x20, x20, x21, LSL #3\n"
134 "ldr x28, [x20, #0x0]\n"
136 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
137 "add x28, x28, x20, LSL #2\n"
// NOTE(review): this looks like the bit-3-clear path (input_ptr used directly
// as the row pointer); the label and branch lines are not visible.
140 "mov x28, %x[input_ptr]\n"
// Decrement the length counter and broadcast the first A element into z0.
142 "subs x9, x9, #0x1\n"
143 "ld1rw { z0.s }, p1/Z, [x28]\n"
// Multiply-accumulate step: load one vector of B, z24 += z16 * z0, advance A
// by 4 bytes and B by one vector, then broadcast the next A element.
146 "ld1w { z16.s }, p1/Z, [x12]\n"
147 "add x28, x28, #0x4\n"
148 "subs x9, x9, #0x1\n"
149 "fmla z24.s, p1/M, z16.s, z0.s\n"
150 "addvl x12, x12, #1\n"
151 "ld1rw { z0.s }, p1/Z, [x28]\n"
// Last step for this string: same multiply-accumulate without a further A
// load; x10 (string index) is incremented and num_strings is reloaded into w20
// (the comparison that presumably follows is not visible).
154 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
155 "ld1w { z16.s }, p1/Z, [x12]\n"
156 "add x10, x10, #0x1\n"
158 "fmla z24.s, p1/M, z16.s, z0.s\n"
159 "addvl x12, x12, #1\n"
// If flags bit 1 is set, clamp: fmin against the value at offset_max, then
// fmax against the value at offset_min.
161 "tbz %x[flags], #1, 11f\n"
162 "add x20, %x[args_ptr], %[offset_max]\n"
163 "ld1rw { z17.s }, p1/Z, [x20]\n"
164 "add x20, %x[args_ptr], %[offset_min]\n"
165 "ld1rw { z16.s }, p1/Z, [x20]\n"
166 "fmin z24.s, p1/M, z24.s, z17.s\n"
167 "fmax z24.s, p1/M, z24.s, z16.s\n"
// Store the result under p0 and step the output pointer by one vector.
169 "st1w { z24.s }, p0, [x11]\n"
170 "addvl x11, x11, #1\n"
// ---- Two-row path: accumulators z24 and z25 ----
// Working registers: x14 = bias, x13 = N, x12 = B_ptr, x11 = output_ptr.
177 "mov x14, %x[bias]\n"
178 "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
179 "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
180 "mov x11, %x[output_ptr]\n"
// p0: predicate used for the output loads and stores of this column block.
183 "whilelt p0.s, x20, x13\n"
185 "ld1w { z24.s }, p1/Z, [x14]\n"
187 "addvl x14, x14, #1\n"
// If flags bit 0 is set, load both rows from the output; the second row lives
// output_offset * 4 bytes after the first.
190 "tbz %x[flags], #0, 16f\n"
191 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
192 "add x20, x11, x20, LSL #2\n"
193 "ld1w { z24.s }, p0/Z, [x11]\n"
194 "ld1w { z25.s }, p0/Z, [x20]\n"
// Per-string setup: w9 = string_lengths[x10], x21 = input_offset.
202 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
203 "ldr w9, [x20, x10, LSL #0x2]\n"
204 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// Flags bit 3 set: fetch two consecutive row pointers from the pointer table
// (x28, x27) and offset each by input_initial_col * 4 bytes.
205 "tbz %x[flags], #3, 19f\n"
206 "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
207 "add x20, x20, x21, LSL #3\n"
208 "ldr x28, [x20, #0x0]\n"
209 "ldr x27, [x20, #0x8]\n"
211 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
212 "add x28, x28, x20, LSL #2\n"
213 "add x27, x27, x20, LSL #2\n"
// Otherwise (label not visible) rows are derived from input_ptr, each
// x21 * 4 bytes after the previous one.
216 "mov x28, %x[input_ptr]\n"
217 "add x27, x28, x21, LSL #2\n"
// Broadcast the first A element of each row.
219 "subs x9, x9, #0x1\n"
220 "ld1rw { z0.s }, p1/Z, [x28]\n"
221 "ld1rw { z1.s }, p1/Z, [x27]\n"
// Multiply-accumulate step: one B vector (z16) is applied to both rows, then
// the pointers advance and the next A elements are broadcast.
224 "ld1w { z16.s }, p1/Z, [x12]\n"
225 "add x28, x28, #0x4\n"
226 "subs x9, x9, #0x1\n"
227 "fmla z24.s, p1/M, z16.s, z0.s\n"
228 "add x27, x27, #0x4\n"
229 "fmla z25.s, p1/M, z16.s, z1.s\n"
230 "addvl x12, x12, #1\n"
231 "ld1rw { z0.s }, p1/Z, [x28]\n"
232 "ld1rw { z1.s }, p1/Z, [x27]\n"
// Last step for this string; x10 (string index) is incremented.
235 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
236 "ld1w { z16.s }, p1/Z, [x12]\n"
237 "add x10, x10, #0x1\n"
239 "fmla z24.s, p1/M, z16.s, z0.s\n"
240 "fmla z25.s, p1/M, z16.s, z1.s\n"
241 "addvl x12, x12, #1\n"
// Compute the output address of row 1 (x27), optionally clamp, then store.
243 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
244 "add x27, x11, x20, LSL #2\n"
245 "tbz %x[flags], #1, 23f\n"
246 "add x20, %x[args_ptr], %[offset_max]\n"
247 "ld1rw { z17.s }, p1/Z, [x20]\n"
248 "add x20, %x[args_ptr], %[offset_min]\n"
249 "ld1rw { z16.s }, p1/Z, [x20]\n"
250 "fmin z24.s, p1/M, z24.s, z17.s\n"
251 "fmin z25.s, p1/M, z25.s, z17.s\n"
252 "fmax z24.s, p1/M, z24.s, z16.s\n"
253 "fmax z25.s, p1/M, z25.s, z16.s\n"
255 "st1w { z24.s }, p0, [x11]\n"
256 "addvl x11, x11, #1\n"
257 "st1w { z25.s }, p0, [x27]\n"
// ---- Three-row path: accumulators z24..z26 ----
// Working registers: x14 = bias, x13 = N, x12 = B_ptr, x11 = output_ptr.
264 "mov x14, %x[bias]\n"
265 "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
266 "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
267 "mov x11, %x[output_ptr]\n"
// p0: predicate used for the output loads and stores of this column block.
270 "whilelt p0.s, x20, x13\n"
272 "ld1w { z24.s }, p1/Z, [x14]\n"
275 "addvl x14, x14, #1\n"
// If flags bit 0 is set, load the three rows from the output; consecutive rows
// are output_offset * 4 bytes apart.
278 "tbz %x[flags], #0, 28f\n"
279 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
280 "add x21, x11, x20, LSL #2\n"
281 "add x20, x21, x20, LSL #2\n"
282 "ld1w { z24.s }, p0/Z, [x11]\n"
283 "ld1w { z25.s }, p0/Z, [x21]\n"
284 "ld1w { z26.s }, p0/Z, [x20]\n"
// Per-string setup: w9 = string_lengths[x10], x21 = input_offset.
293 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
294 "ldr w9, [x20, x10, LSL #0x2]\n"
295 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// Flags bit 3 set: fetch three consecutive row pointers (x28, x27, x26) and
// offset each by input_initial_col * 4 bytes.
296 "tbz %x[flags], #3, 31f\n"
297 "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
298 "add x20, x20, x21, LSL #3\n"
299 "ldr x28, [x20, #0x0]\n"
300 "ldr x27, [x20, #0x8]\n"
301 "ldr x26, [x20, #0x10]\n"
303 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
304 "add x28, x28, x20, LSL #2\n"
305 "add x27, x27, x20, LSL #2\n"
306 "add x26, x26, x20, LSL #2\n"
// Otherwise (label not visible) rows are derived from input_ptr, each
// x21 * 4 bytes after the previous one.
309 "mov x28, %x[input_ptr]\n"
310 "add x27, x28, x21, LSL #2\n"
311 "add x26, x27, x21, LSL #2\n"
// Broadcast the first A element of each row.
313 "subs x9, x9, #0x1\n"
314 "ld1rw { z0.s }, p1/Z, [x28]\n"
315 "ld1rw { z1.s }, p1/Z, [x27]\n"
316 "ld1rw { z2.s }, p1/Z, [x26]\n"
// Multiply-accumulate step: one B vector (z16) is applied to every row, then
// the pointers advance and the next A elements are broadcast.
319 "ld1w { z16.s }, p1/Z, [x12]\n"
320 "add x28, x28, #0x4\n"
321 "subs x9, x9, #0x1\n"
322 "fmla z24.s, p1/M, z16.s, z0.s\n"
323 "add x27, x27, #0x4\n"
324 "add x26, x26, #0x4\n"
325 "fmla z25.s, p1/M, z16.s, z1.s\n"
326 "fmla z26.s, p1/M, z16.s, z2.s\n"
327 "addvl x12, x12, #1\n"
328 "ld1rw { z0.s }, p1/Z, [x28]\n"
329 "ld1rw { z1.s }, p1/Z, [x27]\n"
330 "ld1rw { z2.s }, p1/Z, [x26]\n"
// Last step for this string; x10 (string index) is incremented.
333 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
334 "ld1w { z16.s }, p1/Z, [x12]\n"
335 "add x10, x10, #0x1\n"
337 "fmla z24.s, p1/M, z16.s, z0.s\n"
338 "fmla z25.s, p1/M, z16.s, z1.s\n"
339 "addvl x12, x12, #1\n"
340 "fmla z26.s, p1/M, z16.s, z2.s\n"
// Compute output addresses of rows 1..2 (x27, x26), optionally clamp with
// fmin (offset_max) then fmax (offset_min), and store under p0.
342 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
343 "add x27, x11, x20, LSL #2\n"
344 "add x26, x27, x20, LSL #2\n"
345 "tbz %x[flags], #1, 35f\n"
346 "add x20, %x[args_ptr], %[offset_max]\n"
347 "ld1rw { z17.s }, p1/Z, [x20]\n"
348 "add x20, %x[args_ptr], %[offset_min]\n"
349 "ld1rw { z16.s }, p1/Z, [x20]\n"
350 "fmin z24.s, p1/M, z24.s, z17.s\n"
351 "fmin z25.s, p1/M, z25.s, z17.s\n"
352 "fmin z26.s, p1/M, z26.s, z17.s\n"
353 "fmax z24.s, p1/M, z24.s, z16.s\n"
354 "fmax z25.s, p1/M, z25.s, z16.s\n"
355 "fmax z26.s, p1/M, z26.s, z16.s\n"
357 "st1w { z24.s }, p0, [x11]\n"
358 "addvl x11, x11, #1\n"
359 "st1w { z25.s }, p0, [x27]\n"
360 "st1w { z26.s }, p0, [x26]\n"
// ---- Four-row path: accumulators z24..z27 ----
// Working registers: x14 = bias, x13 = N, x12 = B_ptr, x11 = output_ptr.
367 "mov x14, %x[bias]\n"
368 "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
369 "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
370 "mov x11, %x[output_ptr]\n"
// p0: predicate used for the output loads and stores of this column block.
373 "whilelt p0.s, x20, x13\n"
375 "ld1w { z24.s }, p1/Z, [x14]\n"
378 "addvl x14, x14, #1\n"
// If flags bit 0 is set, load the four rows from the output; consecutive rows
// are output_offset * 4 bytes apart.
382 "tbz %x[flags], #0, 40f\n"
383 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
384 "add x22, x11, x20, LSL #2\n"
385 "add x21, x22, x20, LSL #2\n"
386 "ld1w { z24.s }, p0/Z, [x11]\n"
387 "add x20, x21, x20, LSL #2\n"
388 "ld1w { z25.s }, p0/Z, [x22]\n"
389 "ld1w { z26.s }, p0/Z, [x21]\n"
390 "ld1w { z27.s }, p0/Z, [x20]\n"
// Per-string setup: w9 = string_lengths[x10], x21 = input_offset.
400 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
401 "ldr w9, [x20, x10, LSL #0x2]\n"
402 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// Flags bit 3 set: fetch four consecutive row pointers (x28..x25) and offset
// each by input_initial_col * 4 bytes.
403 "tbz %x[flags], #3, 43f\n"
404 "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
405 "add x20, x20, x21, LSL #3\n"
406 "ldr x28, [x20, #0x0]\n"
407 "ldr x27, [x20, #0x8]\n"
408 "ldr x26, [x20, #0x10]\n"
409 "ldr x25, [x20, #0x18]\n"
411 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
412 "add x28, x28, x20, LSL #2\n"
413 "add x27, x27, x20, LSL #2\n"
414 "add x26, x26, x20, LSL #2\n"
415 "add x25, x25, x20, LSL #2\n"
// Otherwise (label not visible) rows are derived from input_ptr, each
// x21 * 4 bytes after the previous one.
418 "mov x28, %x[input_ptr]\n"
419 "add x27, x28, x21, LSL #2\n"
420 "add x26, x27, x21, LSL #2\n"
421 "add x25, x26, x21, LSL #2\n"
// Broadcast the first A element of each row.
423 "subs x9, x9, #0x1\n"
424 "ld1rw { z0.s }, p1/Z, [x28]\n"
425 "ld1rw { z1.s }, p1/Z, [x27]\n"
426 "ld1rw { z2.s }, p1/Z, [x26]\n"
427 "ld1rw { z3.s }, p1/Z, [x25]\n"
// Multiply-accumulate step: one B vector (z16) is applied to every row, then
// the pointers advance and the next A elements are broadcast.
430 "ld1w { z16.s }, p1/Z, [x12]\n"
431 "add x28, x28, #0x4\n"
432 "subs x9, x9, #0x1\n"
433 "fmla z24.s, p1/M, z16.s, z0.s\n"
434 "add x27, x27, #0x4\n"
435 "add x26, x26, #0x4\n"
436 "fmla z25.s, p1/M, z16.s, z1.s\n"
437 "fmla z26.s, p1/M, z16.s, z2.s\n"
438 "add x25, x25, #0x4\n"
439 "fmla z27.s, p1/M, z16.s, z3.s\n"
440 "addvl x12, x12, #1\n"
441 "ld1rw { z0.s }, p1/Z, [x28]\n"
442 "ld1rw { z1.s }, p1/Z, [x27]\n"
443 "ld1rw { z2.s }, p1/Z, [x26]\n"
444 "ld1rw { z3.s }, p1/Z, [x25]\n"
// Last step for this string; x10 (string index) is incremented.
447 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
448 "ld1w { z16.s }, p1/Z, [x12]\n"
449 "add x10, x10, #0x1\n"
451 "fmla z24.s, p1/M, z16.s, z0.s\n"
452 "fmla z25.s, p1/M, z16.s, z1.s\n"
453 "addvl x12, x12, #1\n"
454 "fmla z26.s, p1/M, z16.s, z2.s\n"
455 "fmla z27.s, p1/M, z16.s, z3.s\n"
// Compute output addresses of rows 1..3 (x27..x25), optionally clamp with
// fmin (offset_max) then fmax (offset_min), and store under p0.
457 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
458 "add x27, x11, x20, LSL #2\n"
459 "add x26, x27, x20, LSL #2\n"
460 "add x25, x26, x20, LSL #2\n"
461 "tbz %x[flags], #1, 47f\n"
462 "add x20, %x[args_ptr], %[offset_max]\n"
463 "ld1rw { z17.s }, p1/Z, [x20]\n"
464 "add x20, %x[args_ptr], %[offset_min]\n"
465 "ld1rw { z16.s }, p1/Z, [x20]\n"
466 "fmin z24.s, p1/M, z24.s, z17.s\n"
467 "fmin z25.s, p1/M, z25.s, z17.s\n"
468 "fmin z26.s, p1/M, z26.s, z17.s\n"
469 "fmin z27.s, p1/M, z27.s, z17.s\n"
470 "fmax z24.s, p1/M, z24.s, z16.s\n"
471 "fmax z25.s, p1/M, z25.s, z16.s\n"
472 "fmax z26.s, p1/M, z26.s, z16.s\n"
473 "fmax z27.s, p1/M, z27.s, z16.s\n"
475 "st1w { z24.s }, p0, [x11]\n"
476 "addvl x11, x11, #1\n"
477 "st1w { z25.s }, p0, [x27]\n"
478 "st1w { z26.s }, p0, [x26]\n"
479 "st1w { z27.s }, p0, [x25]\n"
// ---- Five-row path: accumulators z24..z28 ----
// Working registers: x14 = bias, x13 = N, x12 = B_ptr, x11 = output_ptr.
486 "mov x14, %x[bias]\n"
487 "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
488 "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
489 "mov x11, %x[output_ptr]\n"
// p0: predicate used for the output loads and stores of this column block.
492 "whilelt p0.s, x20, x13\n"
494 "ld1w { z24.s }, p1/Z, [x14]\n"
497 "addvl x14, x14, #1\n"
// If flags bit 0 is set, load the five rows from the output; consecutive rows
// are output_offset * 4 bytes apart.
502 "tbz %x[flags], #0, 52f\n"
503 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
504 "add x23, x11, x20, LSL #2\n"
505 "add x22, x23, x20, LSL #2\n"
506 "ld1w { z24.s }, p0/Z, [x11]\n"
507 "add x21, x22, x20, LSL #2\n"
508 "add x20, x21, x20, LSL #2\n"
509 "ld1w { z25.s }, p0/Z, [x23]\n"
510 "ld1w { z26.s }, p0/Z, [x22]\n"
511 "ld1w { z27.s }, p0/Z, [x21]\n"
512 "ld1w { z28.s }, p0/Z, [x20]\n"
// Per-string setup: w9 = string_lengths[x10], x21 = input_offset.
523 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
524 "ldr w9, [x20, x10, LSL #0x2]\n"
525 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// Flags bit 3 set: fetch five consecutive row pointers (x28..x24) and offset
// each by input_initial_col * 4 bytes.
526 "tbz %x[flags], #3, 55f\n"
527 "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
528 "add x20, x20, x21, LSL #3\n"
529 "ldr x28, [x20, #0x0]\n"
530 "ldr x27, [x20, #0x8]\n"
531 "ldr x26, [x20, #0x10]\n"
532 "ldr x25, [x20, #0x18]\n"
533 "ldr x24, [x20, #0x20]\n"
535 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
536 "add x28, x28, x20, LSL #2\n"
537 "add x27, x27, x20, LSL #2\n"
538 "add x26, x26, x20, LSL #2\n"
539 "add x25, x25, x20, LSL #2\n"
540 "add x24, x24, x20, LSL #2\n"
// Otherwise (label not visible) rows are derived from input_ptr, each
// x21 * 4 bytes after the previous one.
543 "mov x28, %x[input_ptr]\n"
544 "add x27, x28, x21, LSL #2\n"
545 "add x26, x27, x21, LSL #2\n"
546 "add x25, x26, x21, LSL #2\n"
547 "add x24, x25, x21, LSL #2\n"
// Broadcast the first A element of each row.
549 "subs x9, x9, #0x1\n"
550 "ld1rw { z0.s }, p1/Z, [x28]\n"
551 "ld1rw { z1.s }, p1/Z, [x27]\n"
552 "ld1rw { z2.s }, p1/Z, [x26]\n"
553 "ld1rw { z3.s }, p1/Z, [x25]\n"
554 "ld1rw { z4.s }, p1/Z, [x24]\n"
// Multiply-accumulate step: one B vector (z16) is applied to every row, then
// the pointers advance and the next A elements are broadcast.
557 "ld1w { z16.s }, p1/Z, [x12]\n"
558 "add x28, x28, #0x4\n"
559 "subs x9, x9, #0x1\n"
560 "fmla z24.s, p1/M, z16.s, z0.s\n"
561 "add x27, x27, #0x4\n"
562 "add x26, x26, #0x4\n"
563 "fmla z25.s, p1/M, z16.s, z1.s\n"
564 "fmla z26.s, p1/M, z16.s, z2.s\n"
565 "add x25, x25, #0x4\n"
566 "add x24, x24, #0x4\n"
567 "fmla z27.s, p1/M, z16.s, z3.s\n"
568 "ld1rw { z0.s }, p1/Z, [x28]\n"
569 "addvl x12, x12, #1\n"
570 "fmla z28.s, p1/M, z16.s, z4.s\n"
571 "ld1rw { z1.s }, p1/Z, [x27]\n"
572 "ld1rw { z2.s }, p1/Z, [x26]\n"
573 "ld1rw { z3.s }, p1/Z, [x25]\n"
574 "ld1rw { z4.s }, p1/Z, [x24]\n"
// Last step for this string; x10 (string index) is incremented.
577 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
578 "ld1w { z16.s }, p1/Z, [x12]\n"
579 "add x10, x10, #0x1\n"
581 "fmla z24.s, p1/M, z16.s, z0.s\n"
582 "fmla z25.s, p1/M, z16.s, z1.s\n"
583 "addvl x12, x12, #1\n"
584 "fmla z26.s, p1/M, z16.s, z2.s\n"
585 "fmla z27.s, p1/M, z16.s, z3.s\n"
586 "fmla z28.s, p1/M, z16.s, z4.s\n"
// Compute output addresses of rows 1..4 (x27..x24), optionally clamp with
// fmin (offset_max) then fmax (offset_min), and store under p0.
588 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
589 "add x27, x11, x20, LSL #2\n"
590 "add x26, x27, x20, LSL #2\n"
591 "add x25, x26, x20, LSL #2\n"
592 "add x24, x25, x20, LSL #2\n"
593 "tbz %x[flags], #1, 59f\n"
594 "add x20, %x[args_ptr], %[offset_max]\n"
595 "ld1rw { z17.s }, p1/Z, [x20]\n"
596 "add x20, %x[args_ptr], %[offset_min]\n"
597 "ld1rw { z16.s }, p1/Z, [x20]\n"
598 "fmin z24.s, p1/M, z24.s, z17.s\n"
599 "fmin z25.s, p1/M, z25.s, z17.s\n"
600 "fmin z26.s, p1/M, z26.s, z17.s\n"
601 "fmin z27.s, p1/M, z27.s, z17.s\n"
602 "fmin z28.s, p1/M, z28.s, z17.s\n"
603 "fmax z24.s, p1/M, z24.s, z16.s\n"
604 "fmax z25.s, p1/M, z25.s, z16.s\n"
605 "fmax z26.s, p1/M, z26.s, z16.s\n"
606 "fmax z27.s, p1/M, z27.s, z16.s\n"
607 "fmax z28.s, p1/M, z28.s, z16.s\n"
609 "st1w { z24.s }, p0, [x11]\n"
610 "addvl x11, x11, #1\n"
611 "st1w { z25.s }, p0, [x27]\n"
612 "st1w { z26.s }, p0, [x26]\n"
613 "st1w { z27.s }, p0, [x25]\n"
614 "st1w { z28.s }, p0, [x24]\n"
// ---- Six-row path: accumulators z24..z29 ----
// Working registers: x14 = bias, x13 = N, x12 = B_ptr, x11 = output_ptr.
621 "mov x14, %x[bias]\n"
622 "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
623 "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
624 "mov x11, %x[output_ptr]\n"
// p0: predicate used for the output loads and stores of this column block.
627 "whilelt p0.s, x20, x13\n"
629 "ld1w { z24.s }, p1/Z, [x14]\n"
632 "addvl x14, x14, #1\n"
// If flags bit 0 is set, load the six rows from the output. Here the row
// stride (output_offset) is kept in x24 and x20..x23 are reused as row
// addresses, each output_offset * 4 bytes after the previous one.
638 "tbz %x[flags], #0, 64f\n"
639 "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
640 "add x23, x11, x24, LSL #2\n"
641 "add x20, x23, x24, LSL #2\n"
642 "ld1w { z24.s }, p0/Z, [x11]\n"
643 "add x22, x20, x24, LSL #2\n"
644 "add x21, x22, x24, LSL #2\n"
645 "ld1w { z25.s }, p0/Z, [x23]\n"
646 "ld1w { z26.s }, p0/Z, [x20]\n"
647 "add x20, x21, x24, LSL #2\n"
648 "ld1w { z27.s }, p0/Z, [x22]\n"
649 "ld1w { z28.s }, p0/Z, [x21]\n"
650 "ld1w { z29.s }, p0/Z, [x20]\n"
// Per-string setup: w9 = string_lengths[x10], x21 = input_offset.
662 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
663 "ldr w9, [x20, x10, LSL #0x2]\n"
664 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// Flags bit 3 set: fetch six consecutive row pointers (x28..x23) and offset
// each by input_initial_col * 4 bytes.
665 "tbz %x[flags], #3, 67f\n"
666 "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
667 "add x20, x20, x21, LSL #3\n"
668 "ldr x28, [x20, #0x0]\n"
669 "ldr x27, [x20, #0x8]\n"
670 "ldr x26, [x20, #0x10]\n"
671 "ldr x25, [x20, #0x18]\n"
672 "ldr x24, [x20, #0x20]\n"
673 "ldr x23, [x20, #0x28]\n"
675 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
676 "add x28, x28, x20, LSL #2\n"
677 "add x27, x27, x20, LSL #2\n"
678 "add x26, x26, x20, LSL #2\n"
679 "add x25, x25, x20, LSL #2\n"
680 "add x24, x24, x20, LSL #2\n"
681 "add x23, x23, x20, LSL #2\n"
// Otherwise (label not visible) rows are derived from input_ptr, each
// x21 * 4 bytes after the previous one.
684 "mov x28, %x[input_ptr]\n"
685 "add x27, x28, x21, LSL #2\n"
686 "add x26, x27, x21, LSL #2\n"
687 "add x25, x26, x21, LSL #2\n"
688 "add x24, x25, x21, LSL #2\n"
689 "add x23, x24, x21, LSL #2\n"
// Broadcast the first A element of each row.
691 "subs x9, x9, #0x1\n"
692 "ld1rw { z0.s }, p1/Z, [x28]\n"
693 "ld1rw { z1.s }, p1/Z, [x27]\n"
694 "ld1rw { z2.s }, p1/Z, [x26]\n"
695 "ld1rw { z3.s }, p1/Z, [x25]\n"
696 "ld1rw { z4.s }, p1/Z, [x24]\n"
697 "ld1rw { z5.s }, p1/Z, [x23]\n"
// Multiply-accumulate step: one B vector (z16) is applied to every row, then
// the pointers advance and the next A elements are broadcast.
700 "ld1w { z16.s }, p1/Z, [x12]\n"
701 "add x28, x28, #0x4\n"
702 "subs x9, x9, #0x1\n"
703 "fmla z24.s, p1/M, z16.s, z0.s\n"
704 "add x27, x27, #0x4\n"
705 "add x26, x26, #0x4\n"
706 "fmla z25.s, p1/M, z16.s, z1.s\n"
707 "fmla z26.s, p1/M, z16.s, z2.s\n"
708 "add x25, x25, #0x4\n"
709 "add x24, x24, #0x4\n"
710 "fmla z27.s, p1/M, z16.s, z3.s\n"
711 "fmla z28.s, p1/M, z16.s, z4.s\n"
712 "add x23, x23, #0x4\n"
713 "addvl x12, x12, #1\n"
714 "fmla z29.s, p1/M, z16.s, z5.s\n"
715 "ld1rw { z0.s }, p1/Z, [x28]\n"
716 "ld1rw { z1.s }, p1/Z, [x27]\n"
717 "ld1rw { z2.s }, p1/Z, [x26]\n"
718 "ld1rw { z3.s }, p1/Z, [x25]\n"
719 "ld1rw { z4.s }, p1/Z, [x24]\n"
720 "ld1rw { z5.s }, p1/Z, [x23]\n"
// Last step for this string; x10 (string index) is incremented.
723 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
724 "ld1w { z16.s }, p1/Z, [x12]\n"
725 "add x10, x10, #0x1\n"
727 "fmla z24.s, p1/M, z16.s, z0.s\n"
728 "fmla z25.s, p1/M, z16.s, z1.s\n"
729 "addvl x12, x12, #1\n"
730 "fmla z26.s, p1/M, z16.s, z2.s\n"
731 "fmla z27.s, p1/M, z16.s, z3.s\n"
732 "fmla z28.s, p1/M, z16.s, z4.s\n"
733 "fmla z29.s, p1/M, z16.s, z5.s\n"
// Compute output addresses of rows 1..5 (x27..x23), optionally clamp with
// fmin (offset_max) then fmax (offset_min), and store under p0.
735 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
736 "add x27, x11, x20, LSL #2\n"
737 "add x26, x27, x20, LSL #2\n"
738 "add x25, x26, x20, LSL #2\n"
739 "add x24, x25, x20, LSL #2\n"
740 "add x23, x24, x20, LSL #2\n"
741 "tbz %x[flags], #1, 71f\n"
742 "add x20, %x[args_ptr], %[offset_max]\n"
743 "ld1rw { z17.s }, p1/Z, [x20]\n"
744 "add x20, %x[args_ptr], %[offset_min]\n"
745 "ld1rw { z16.s }, p1/Z, [x20]\n"
746 "fmin z24.s, p1/M, z24.s, z17.s\n"
747 "fmin z25.s, p1/M, z25.s, z17.s\n"
748 "fmin z26.s, p1/M, z26.s, z17.s\n"
749 "fmin z27.s, p1/M, z27.s, z17.s\n"
750 "fmin z28.s, p1/M, z28.s, z17.s\n"
751 "fmin z29.s, p1/M, z29.s, z17.s\n"
752 "fmax z24.s, p1/M, z24.s, z16.s\n"
753 "fmax z25.s, p1/M, z25.s, z16.s\n"
754 "fmax z26.s, p1/M, z26.s, z16.s\n"
755 "fmax z27.s, p1/M, z27.s, z16.s\n"
756 "fmax z28.s, p1/M, z28.s, z16.s\n"
757 "fmax z29.s, p1/M, z29.s, z16.s\n"
759 "st1w { z24.s }, p0, [x11]\n"
760 "addvl x11, x11, #1\n"
761 "st1w { z25.s }, p0, [x27]\n"
762 "st1w { z26.s }, p0, [x26]\n"
763 "st1w { z27.s }, p0, [x25]\n"
764 "st1w { z28.s }, p0, [x24]\n"
765 "st1w { z29.s }, p0, [x23]\n"
// ---- Seven-row path: accumulators z24..z30 ----
// Working registers: x14 = bias, x13 = N, x12 = B_ptr, x11 = output_ptr.
772 "mov x14, %x[bias]\n"
773 "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
774 "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
775 "mov x11, %x[output_ptr]\n"
// p0: predicate used for the output loads and stores of this column block.
778 "whilelt p0.s, x20, x13\n"
780 "ld1w { z24.s }, p1/Z, [x14]\n"
783 "addvl x14, x14, #1\n"
// If flags bit 0 is set, load the seven rows from the output. The row stride
// (output_offset) is kept in x24 and x20..x23 are reused as row addresses,
// each output_offset * 4 bytes after the previous one.
790 "tbz %x[flags], #0, 76f\n"
791 "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
792 "add x21, x11, x24, LSL #2\n"
793 "add x20, x21, x24, LSL #2\n"
794 "ld1w { z24.s }, p0/Z, [x11]\n"
795 "add x23, x20, x24, LSL #2\n"
796 "add x22, x23, x24, LSL #2\n"
797 "ld1w { z25.s }, p0/Z, [x21]\n"
798 "ld1w { z26.s }, p0/Z, [x20]\n"
799 "add x21, x22, x24, LSL #2\n"
800 "add x20, x21, x24, LSL #2\n"
801 "ld1w { z27.s }, p0/Z, [x23]\n"
802 "ld1w { z28.s }, p0/Z, [x22]\n"
803 "ld1w { z29.s }, p0/Z, [x21]\n"
804 "ld1w { z30.s }, p0/Z, [x20]\n"
// Per-string setup: w9 = string_lengths[x10], x21 = input_offset.
817 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
818 "ldr w9, [x20, x10, LSL #0x2]\n"
819 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// Flags bit 3 set: fetch seven consecutive row pointers (x28..x22) and offset
// each by input_initial_col * 4 bytes.
820 "tbz %x[flags], #3, 79f\n"
821 "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
822 "add x20, x20, x21, LSL #3\n"
823 "ldr x28, [x20, #0x0]\n"
824 "ldr x27, [x20, #0x8]\n"
825 "ldr x26, [x20, #0x10]\n"
826 "ldr x25, [x20, #0x18]\n"
827 "ldr x24, [x20, #0x20]\n"
828 "ldr x23, [x20, #0x28]\n"
829 "ldr x22, [x20, #0x30]\n"
831 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
832 "add x28, x28, x20, LSL #2\n"
833 "add x27, x27, x20, LSL #2\n"
834 "add x26, x26, x20, LSL #2\n"
835 "add x25, x25, x20, LSL #2\n"
836 "add x24, x24, x20, LSL #2\n"
837 "add x23, x23, x20, LSL #2\n"
838 "add x22, x22, x20, LSL #2\n"
// Otherwise (label not visible) rows are derived from input_ptr, each
// x21 * 4 bytes after the previous one.
841 "mov x28, %x[input_ptr]\n"
842 "add x27, x28, x21, LSL #2\n"
843 "add x26, x27, x21, LSL #2\n"
844 "add x25, x26, x21, LSL #2\n"
845 "add x24, x25, x21, LSL #2\n"
846 "add x23, x24, x21, LSL #2\n"
847 "add x22, x23, x21, LSL #2\n"
// Broadcast the first A element of each row.
849 "subs x9, x9, #0x1\n"
850 "ld1rw { z0.s }, p1/Z, [x28]\n"
851 "ld1rw { z1.s }, p1/Z, [x27]\n"
852 "ld1rw { z2.s }, p1/Z, [x26]\n"
853 "ld1rw { z3.s }, p1/Z, [x25]\n"
854 "ld1rw { z4.s }, p1/Z, [x24]\n"
855 "ld1rw { z5.s }, p1/Z, [x23]\n"
856 "ld1rw { z6.s }, p1/Z, [x22]\n"
// Multiply-accumulate step: one B vector (z16) is applied to every row, then
// the pointers advance and the next A elements are broadcast.
859 "ld1w { z16.s }, p1/Z, [x12]\n"
860 "add x28, x28, #0x4\n"
861 "subs x9, x9, #0x1\n"
862 "fmla z24.s, p1/M, z16.s, z0.s\n"
863 "add x27, x27, #0x4\n"
864 "add x26, x26, #0x4\n"
865 "fmla z25.s, p1/M, z16.s, z1.s\n"
866 "fmla z26.s, p1/M, z16.s, z2.s\n"
867 "add x25, x25, #0x4\n"
868 "add x24, x24, #0x4\n"
869 "fmla z27.s, p1/M, z16.s, z3.s\n"
870 "ld1rw { z0.s }, p1/Z, [x28]\n"
871 "add x23, x23, #0x4\n"
872 "add x22, x22, #0x4\n"
873 "fmla z28.s, p1/M, z16.s, z4.s\n"
874 "fmla z29.s, p1/M, z16.s, z5.s\n"
875 "addvl x12, x12, #1\n"
876 "ld1rw { z1.s }, p1/Z, [x27]\n"
877 "fmla z30.s, p1/M, z16.s, z6.s\n"
878 "ld1rw { z2.s }, p1/Z, [x26]\n"
879 "ld1rw { z3.s }, p1/Z, [x25]\n"
880 "ld1rw { z4.s }, p1/Z, [x24]\n"
881 "ld1rw { z5.s }, p1/Z, [x23]\n"
882 "ld1rw { z6.s }, p1/Z, [x22]\n"
// Last step for this string; x10 (string index) is incremented.
885 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
886 "ld1w { z16.s }, p1/Z, [x12]\n"
887 "add x10, x10, #0x1\n"
889 "fmla z24.s, p1/M, z16.s, z0.s\n"
890 "fmla z25.s, p1/M, z16.s, z1.s\n"
891 "addvl x12, x12, #1\n"
892 "fmla z26.s, p1/M, z16.s, z2.s\n"
893 "fmla z27.s, p1/M, z16.s, z3.s\n"
894 "fmla z28.s, p1/M, z16.s, z4.s\n"
895 "fmla z29.s, p1/M, z16.s, z5.s\n"
896 "fmla z30.s, p1/M, z16.s, z6.s\n"
// Compute output addresses of rows 1..6 (x27..x22), optionally clamp with
// fmin (offset_max) then fmax (offset_min), and store under p0.
898 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
899 "add x27, x11, x20, LSL #2\n"
900 "add x26, x27, x20, LSL #2\n"
901 "add x25, x26, x20, LSL #2\n"
902 "add x24, x25, x20, LSL #2\n"
903 "add x23, x24, x20, LSL #2\n"
904 "add x22, x23, x20, LSL #2\n"
905 "tbz %x[flags], #1, 83f\n"
906 "add x20, %x[args_ptr], %[offset_max]\n"
907 "ld1rw { z17.s }, p1/Z, [x20]\n"
908 "add x20, %x[args_ptr], %[offset_min]\n"
909 "ld1rw { z16.s }, p1/Z, [x20]\n"
910 "fmin z24.s, p1/M, z24.s, z17.s\n"
911 "fmin z25.s, p1/M, z25.s, z17.s\n"
912 "fmin z26.s, p1/M, z26.s, z17.s\n"
913 "fmin z27.s, p1/M, z27.s, z17.s\n"
914 "fmin z28.s, p1/M, z28.s, z17.s\n"
915 "fmin z29.s, p1/M, z29.s, z17.s\n"
916 "fmin z30.s, p1/M, z30.s, z17.s\n"
917 "fmax z24.s, p1/M, z24.s, z16.s\n"
918 "fmax z25.s, p1/M, z25.s, z16.s\n"
919 "fmax z26.s, p1/M, z26.s, z16.s\n"
920 "fmax z27.s, p1/M, z27.s, z16.s\n"
921 "fmax z28.s, p1/M, z28.s, z16.s\n"
922 "fmax z29.s, p1/M, z29.s, z16.s\n"
923 "fmax z30.s, p1/M, z30.s, z16.s\n"
925 "st1w { z24.s }, p0, [x11]\n"
926 "addvl x11, x11, #1\n"
927 "st1w { z25.s }, p0, [x27]\n"
928 "st1w { z26.s }, p0, [x26]\n"
929 "st1w { z27.s }, p0, [x25]\n"
930 "st1w { z28.s }, p0, [x24]\n"
931 "st1w { z29.s }, p0, [x23]\n"
932 "st1w { z30.s }, p0, [x22]\n"
// ---- Eight-row path: accumulators z24..z31 ----
// Unlike the smaller paths, this one also bumps the output_ptr operand:
// x11 keeps the current output pointer while output_ptr is advanced by
// output_offset (x21) times x20 via madd.
// NOTE(review): x20 is set on a line not present in this excerpt.
939 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
941 "mov x14, %x[bias]\n"
942 "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
943 "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
944 "mov x11, %x[output_ptr]\n"
945 "madd %x[output_ptr], x21, x20, %x[output_ptr]\n"
// p0: predicate used for the output loads and stores of this column block.
948 "whilelt p0.s, x20, x13\n"
950 "ld1w { z24.s }, p1/Z, [x14]\n"
953 "addvl x14, x14, #1\n"
// If flags bit 0 is set, load the eight rows from the output. The row stride
// (output_offset) is kept in x24 and x20..x23 are reused as row addresses,
// each output_offset * 4 bytes after the previous one.
961 "tbz %x[flags], #0, 88f\n"
962 "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
963 "add x22, x11, x24, LSL #2\n"
964 "add x21, x22, x24, LSL #2\n"
965 "ld1w { z24.s }, p0/Z, [x11]\n"
966 "add x23, x21, x24, LSL #2\n"
967 "add x20, x23, x24, LSL #2\n"
968 "ld1w { z25.s }, p0/Z, [x22]\n"
969 "ld1w { z26.s }, p0/Z, [x21]\n"
970 "add x22, x20, x24, LSL #2\n"
971 "add x21, x22, x24, LSL #2\n"
972 "ld1w { z27.s }, p0/Z, [x23]\n"
973 "ld1w { z28.s }, p0/Z, [x20]\n"
974 "add x20, x21, x24, LSL #2\n"
975 "ld1w { z29.s }, p0/Z, [x22]\n"
976 "ld1w { z30.s }, p0/Z, [x21]\n"
977 "ld1w { z31.s }, p0/Z, [x20]\n"
// Per-string setup: w9 = string_lengths[x10], x21 = input_offset.
991 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
992 "ldr w9, [x20, x10, LSL #0x2]\n"
993 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
// Flags bit 3 set: fetch eight consecutive row pointers (x28..x21) and offset
// each by input_initial_col * 4 bytes. x21 is overwritten by the last pointer.
994 "tbz %x[flags], #3, 91f\n"
995 "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n"
996 "add x20, x20, x21, LSL #3\n"
997 "ldr x28, [x20, #0x0]\n"
998 "ldr x27, [x20, #0x8]\n"
999 "ldr x26, [x20, #0x10]\n"
1000 "ldr x25, [x20, #0x18]\n"
1001 "ldr x24, [x20, #0x20]\n"
1002 "ldr x23, [x20, #0x28]\n"
1003 "ldr x22, [x20, #0x30]\n"
1004 "ldr x21, [x20, #0x38]\n"
1006 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1007 "add x28, x28, x20, LSL #2\n"
1008 "add x27, x27, x20, LSL #2\n"
1009 "add x26, x26, x20, LSL #2\n"
1010 "add x25, x25, x20, LSL #2\n"
1011 "add x24, x24, x20, LSL #2\n"
1012 "add x23, x23, x20, LSL #2\n"
1013 "add x22, x22, x20, LSL #2\n"
1014 "add x21, x21, x20, LSL #2\n"
// Otherwise (label not visible) rows are derived from input_ptr, each
// x21 * 4 bytes after the previous one; the final add turns x21 itself into
// the address of the eighth row.
1017 "mov x28, %x[input_ptr]\n"
1018 "add x27, x28, x21, LSL #2\n"
1019 "add x26, x27, x21, LSL #2\n"
1020 "add x25, x26, x21, LSL #2\n"
1021 "add x24, x25, x21, LSL #2\n"
1022 "add x23, x24, x21, LSL #2\n"
1023 "add x22, x23, x21, LSL #2\n"
1024 "add x21, x22, x21, LSL #2\n"
// Broadcast the first A element of each row.
1026 "subs x9, x9, #0x1\n"
1027 "ld1rw { z0.s }, p1/Z, [x28]\n"
1028 "ld1rw { z1.s }, p1/Z, [x27]\n"
1029 "ld1rw { z2.s }, p1/Z, [x26]\n"
1030 "ld1rw { z3.s }, p1/Z, [x25]\n"
1031 "ld1rw { z4.s }, p1/Z, [x24]\n"
1032 "ld1rw { z5.s }, p1/Z, [x23]\n"
1033 "ld1rw { z6.s }, p1/Z, [x22]\n"
1034 "ld1rw { z7.s }, p1/Z, [x21]\n"
// Multiply-accumulate step: one B vector (z16) is applied to every row, then
// the pointers advance and the next A elements are broadcast.
1037 "ld1w { z16.s }, p1/Z, [x12]\n"
1038 "add x28, x28, #0x4\n"
1039 "subs x9, x9, #0x1\n"
1040 "fmla z24.s, p1/M, z16.s, z0.s\n"
1041 "add x27, x27, #0x4\n"
1042 "add x26, x26, #0x4\n"
1043 "fmla z25.s, p1/M, z16.s, z1.s\n"
1044 "fmla z26.s, p1/M, z16.s, z2.s\n"
1045 "add x25, x25, #0x4\n"
1046 "add x24, x24, #0x4\n"
1047 "fmla z27.s, p1/M, z16.s, z3.s\n"
1048 "fmla z28.s, p1/M, z16.s, z4.s\n"
1049 "add x23, x23, #0x4\n"
1050 "add x22, x22, #0x4\n"
1051 "fmla z29.s, p1/M, z16.s, z5.s\n"
1052 "ld1rw { z0.s }, p1/Z, [x28]\n"
1053 "add x21, x21, #0x4\n"
1054 "addvl x12, x12, #1\n"
1055 "ld1rw { z1.s }, p1/Z, [x27]\n"
1056 "fmla z30.s, p1/M, z16.s, z6.s\n"
1057 "fmla z31.s, p1/M, z16.s, z7.s\n"
1058 "ld1rw { z2.s }, p1/Z, [x26]\n"
1059 "ld1rw { z3.s }, p1/Z, [x25]\n"
1060 "ld1rw { z4.s }, p1/Z, [x24]\n"
1061 "ld1rw { z5.s }, p1/Z, [x23]\n"
1062 "ld1rw { z6.s }, p1/Z, [x22]\n"
1063 "ld1rw { z7.s }, p1/Z, [x21]\n"
// Last step for this string; x10 (string index) is incremented.
1066 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
1067 "ld1w { z16.s }, p1/Z, [x12]\n"
1068 "add x10, x10, #0x1\n"
1070 "fmla z24.s, p1/M, z16.s, z0.s\n"
1071 "fmla z25.s, p1/M, z16.s, z1.s\n"
1072 "addvl x12, x12, #1\n"
1073 "fmla z26.s, p1/M, z16.s, z2.s\n"
1074 "fmla z27.s, p1/M, z16.s, z3.s\n"
1075 "fmla z28.s, p1/M, z16.s, z4.s\n"
1076 "fmla z29.s, p1/M, z16.s, z5.s\n"
1077 "fmla z30.s, p1/M, z16.s, z6.s\n"
1078 "fmla z31.s, p1/M, z16.s, z7.s\n"
// Compute output addresses of rows 1..7 (x27..x21), optionally clamp with
// fmin (offset_max) then fmax (offset_min), and store under p0.
1080 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1081 "add x27, x11, x20, LSL #2\n"
1082 "add x26, x27, x20, LSL #2\n"
1083 "add x25, x26, x20, LSL #2\n"
1084 "add x24, x25, x20, LSL #2\n"
1085 "add x23, x24, x20, LSL #2\n"
1086 "add x22, x23, x20, LSL #2\n"
1087 "add x21, x22, x20, LSL #2\n"
1088 "tbz %x[flags], #1, 95f\n"
1089 "add x20, %x[args_ptr], %[offset_max]\n"
1090 "ld1rw { z17.s }, p1/Z, [x20]\n"
1091 "add x20, %x[args_ptr], %[offset_min]\n"
1092 "ld1rw { z16.s }, p1/Z, [x20]\n"
1093 "fmin z24.s, p1/M, z24.s, z17.s\n"
1094 "fmin z25.s, p1/M, z25.s, z17.s\n"
1095 "fmin z26.s, p1/M, z26.s, z17.s\n"
1096 "fmin z27.s, p1/M, z27.s, z17.s\n"
1097 "fmin z28.s, p1/M, z28.s, z17.s\n"
1098 "fmin z29.s, p1/M, z29.s, z17.s\n"
1099 "fmin z30.s, p1/M, z30.s, z17.s\n"
1100 "fmin z31.s, p1/M, z31.s, z17.s\n"
1101 "fmax z24.s, p1/M, z24.s, z16.s\n"
1102 "fmax z25.s, p1/M, z25.s, z16.s\n"
1103 "fmax z26.s, p1/M, z26.s, z16.s\n"
1104 "fmax z27.s, p1/M, z27.s, z16.s\n"
1105 "fmax z28.s, p1/M, z28.s, z16.s\n"
1106 "fmax z29.s, p1/M, z29.s, z16.s\n"
1107 "fmax z30.s, p1/M, z30.s, z16.s\n"
1108 "fmax z31.s, p1/M, z31.s, z16.s\n"
1110 "st1w { z24.s }, p0, [x11]\n"
1111 "addvl x11, x11, #1\n"
1112 "st1w { z25.s }, p0, [x27]\n"
1113 "st1w { z26.s }, p0, [x26]\n"
1114 "st1w { z27.s }, p0, [x25]\n"
1115 "st1w { z28.s }, p0, [x24]\n"
1116 "st1w { z29.s }, p0, [x23]\n"
1117 "st1w { z30.s }, p0, [x22]\n"
1118 "st1w { z31.s }, p0, [x21]\n"
// Row-block advance: eight rows are subtracted from M. With flags bit 3 set,
// input_offset is increased by 8 and written back to the argument block.
// NOTE(review): the final madd appears to be the bit-3-clear path, moving
// input_ptr by x20 * x21; x20 is set on a line not present in this excerpt.
1123 "subs %x[M], %x[M], #0x8\n"
1125 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1126 "tbz %x[flags], #3, 97f\n"
1127 "add x21, x21, #0x8\n"
1128 "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1132 "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
// Output operands: M, input_ptr and output_ptr are read-write registers with
// the early-clobber modifier ("+&r"), since the asm modifies them.
1135 : [
M]
"+&r" (
M), [input_ptr]
"+&r" (input_ptr), [output_ptr]
"+&r" (output_ptr)
// Input operands: the address of the argument block (ka), the bias pointer
// and the flags word in registers, plus the KernelArgs field offsets as
// immediates ("I") used for args_ptr-relative addressing.
1136 : [args_ptr]
"r" (&ka), [
bias]
"r" (
bias), [flags]
"r" (flags), [offset_max]
"I" (offsetof(KernelArgs, maxval)), [offset_min]
"I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr]
"I" (offsetof(KernelArgs, B_ptr)), [offsetof_N]
"I" (offsetof(KernelArgs,
N)), [offsetof_input_initial_col]
"I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset]
"I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings]
"I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset]
"I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths]
"I" (offsetof(KernelArgs, string_lengths))
// Clobbers: condition flags, memory, predicates p0-p1, the scratch general
// registers x9-x14 and x20-x28, and the vector registers used for A
// broadcasts (z0-z7), B / clamp values (z16-z17) and accumulators (z24-z31).
1137 :
"cc",
"memory",
"p0",
"p1",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z16",
"z17",
"z24",
"z25",
"z26",
"z27",
"z28",
"z29",
"z30",
"z31"
1142 #endif // ARM_COMPUTE_ENABLE_SVE