31 #include "../../asmlib.hpp"
32 #include "../../utils.hpp"
36 void a64_smallK_hybrid_fp32_mla_8x4(
const float *A,
int lda,
const float *B,
float *C,
int ldc,
int M,
int N,
int K,
const float *
bias,
Activation act,
bool) {
37 const long loops_count =
iceildiv(
N, (
int)4) - 1;
38 const long ldab = lda *
sizeof(float);
39 const long ldcb = ldc *
sizeof(float);
42 memset(nullbias, 0, (4 *
sizeof(
float)));
44 float minval = -
static_cast<float>(std::numeric_limits<float>::infinity());
45 float maxval =
static_cast<float>(std::numeric_limits<float>::infinity());
46 const float *
const minptr = &minval;
47 const float *
const maxptr = &maxval;
55 maxval =
static_cast<float>(act.param1);
62 for (
int y0=0; y0<
M; y0+=8) {
63 long loops = loops_count;
64 long oob_rows = std::max(8 - (
M-y0), 0);
65 const float *b_ptr0 =
B;
66 const float *biasptr =
bias ?
bias : nullbias;
67 const uint64_t biasinc =
bias ? 4*
sizeof(float) : 0;
68 const float *a_ptr0 =
A + (y0 * lda);
70 float *c_ptr0 = C + (y0 * ldc);
89 "add a_ptr1, %[a_ptr0], %[lda]\n"
90 "add c_ptr1, %[c_ptr0], %[ldc]\n"
91 "add a_ptr2, a_ptr1, %[lda]\n"
92 "add c_ptr2, c_ptr1, %[ldc]\n"
93 "add a_ptr3, a_ptr2, %[lda]\n"
94 "add c_ptr3, c_ptr2, %[ldc]\n"
95 "add a_ptr4, a_ptr3, %[lda]\n"
96 "add c_ptr4, c_ptr3, %[ldc]\n"
97 "add a_ptr5, a_ptr4, %[lda]\n"
98 "add c_ptr5, c_ptr4, %[ldc]\n"
99 "add a_ptr6, a_ptr5, %[lda]\n"
100 "add c_ptr6, c_ptr5, %[ldc]\n"
101 "add a_ptr7, a_ptr6, %[lda]\n"
102 "add c_ptr7, c_ptr6, %[ldc]\n"
103 "cbz %[oob_rows], 1f\n"
104 "subs %[oob_rows], %[oob_rows], #0x1\n"
105 "add c_ptr7, %[c_ptr0], #0x0\n"
106 "add a_ptr7, %[a_ptr0], #0x0\n"
108 "subs %[oob_rows], %[oob_rows], #0x1\n"
109 "add c_ptr6, %[c_ptr0], #0x0\n"
110 "add a_ptr6, %[a_ptr0], #0x0\n"
112 "subs %[oob_rows], %[oob_rows], #0x1\n"
113 "add c_ptr5, %[c_ptr0], #0x0\n"
114 "add a_ptr5, %[a_ptr0], #0x0\n"
116 "subs %[oob_rows], %[oob_rows], #0x1\n"
117 "add c_ptr4, %[c_ptr0], #0x0\n"
118 "add a_ptr4, %[a_ptr0], #0x0\n"
120 "subs %[oob_rows], %[oob_rows], #0x1\n"
121 "add c_ptr3, %[c_ptr0], #0x0\n"
122 "add a_ptr3, %[a_ptr0], #0x0\n"
124 "subs %[oob_rows], %[oob_rows], #0x1\n"
125 "add c_ptr2, %[c_ptr0], #0x0\n"
126 "add a_ptr2, %[a_ptr0], #0x0\n"
128 "subs %[oob_rows], %[oob_rows], #0x1\n"
129 "add c_ptr1, %[c_ptr0], #0x0\n"
130 "add a_ptr1, %[a_ptr0], #0x0\n"
132 "ldr q16, [%[b_ptr0]]\n"
133 "ldr s0, [%[a_ptr0]]\n"
141 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
142 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
143 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
144 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
145 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
146 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
147 "add %[b_ptr0], %[b_ptr0], #0x10\n"
149 "ldr q24, [%[biasptr]]\n"
150 "add %[biasptr], %[biasptr], %[biasinc]\n"
151 "subs %[loops], %[loops], #0x1\n"
152 "mov v25.16b, v24.16b\n"
153 "mov v26.16b, v24.16b\n"
154 "mov v27.16b, v24.16b\n"
155 "mov v28.16b, v24.16b\n"
156 "mov v29.16b, v24.16b\n"
157 "mov v30.16b, v24.16b\n"
158 "mov v31.16b, v24.16b\n"
159 "fmla v24.4s, v16.4s, v0.s[0]\n"
160 "fmla v25.4s, v16.4s, v1.s[0]\n"
161 "fmla v26.4s, v16.4s, v2.s[0]\n"
162 "fmla v27.4s, v16.4s, v3.s[0]\n"
163 "fmla v28.4s, v16.4s, v4.s[0]\n"
164 "fmla v29.4s, v16.4s, v5.s[0]\n"
165 "fmla v30.4s, v16.4s, v6.s[0]\n"
166 "fmla v31.4s, v16.4s, v7.s[0]\n"
169 "ld1r {v22.4s}, [%[minptr]]\n"
170 "subs %[loops], %[loops], #0x1\n"
171 "ld1r {v23.4s}, [%[maxptr]]\n"
172 "ldr q16, [%[b_ptr0]]\n"
173 "add %[b_ptr0], %[b_ptr0], #0x10\n"
174 "fmax v24.4s, v24.4s, v22.4s\n"
175 "fmax v25.4s, v25.4s, v22.4s\n"
176 "fmax v26.4s, v26.4s, v22.4s\n"
177 "fmax v27.4s, v27.4s, v22.4s\n"
178 "fmin v24.4s, v24.4s, v23.4s\n"
179 "fmin v25.4s, v25.4s, v23.4s\n"
180 "fmin v26.4s, v26.4s, v23.4s\n"
181 "fmin v27.4s, v27.4s, v23.4s\n"
182 "str q24, [%[c_ptr0]]\n"
183 "fmax v28.4s, v28.4s, v22.4s\n"
184 "ldr q24, [%[biasptr]]\n"
185 "fmax v29.4s, v29.4s, v22.4s\n"
186 "add %[c_ptr0], %[c_ptr0], #0x10\n"
187 "fmax v30.4s, v30.4s, v22.4s\n"
188 "str q25, [c_ptr1]\n"
189 "fmin v28.4s, v28.4s, v23.4s\n"
190 "add c_ptr1, c_ptr1, #0x10\n"
191 "fmin v29.4s, v29.4s, v23.4s\n"
192 "str q26, [c_ptr2]\n"
193 "fmin v30.4s, v30.4s, v23.4s\n"
194 "add c_ptr2, c_ptr2, #0x10\n"
195 "fmax v31.4s, v31.4s, v22.4s\n"
196 "str q27, [c_ptr3]\n"
197 "mov v25.16b, v24.16b\n"
198 "add c_ptr3, c_ptr3, #0x10\n"
199 "mov v26.16b, v24.16b\n"
200 "str q28, [c_ptr4]\n"
201 "fmin v31.4s, v31.4s, v23.4s\n"
202 "add c_ptr4, c_ptr4, #0x10\n"
203 "mov v27.16b, v24.16b\n"
204 "str q29, [c_ptr5]\n"
205 "mov v28.16b, v24.16b\n"
206 "add c_ptr5, c_ptr5, #0x10\n"
207 "mov v29.16b, v24.16b\n"
208 "str q30, [c_ptr6]\n"
209 "mov v30.16b, v24.16b\n"
210 "add c_ptr6, c_ptr6, #0x10\n"
211 "fmla v25.4s, v16.4s, v1.s[0]\n"
212 "str q31, [c_ptr7]\n"
213 "mov v31.16b, v24.16b\n"
214 "add c_ptr7, c_ptr7, #0x10\n"
215 "fmla v24.4s, v16.4s, v0.s[0]\n"
216 "add %[biasptr], %[biasptr], %[biasinc]\n"
217 "fmla v26.4s, v16.4s, v2.s[0]\n"
218 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
219 "fmla v27.4s, v16.4s, v3.s[0]\n"
220 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
221 "fmla v28.4s, v16.4s, v4.s[0]\n"
222 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
223 "fmla v29.4s, v16.4s, v5.s[0]\n"
224 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
225 "fmla v30.4s, v16.4s, v6.s[0]\n"
226 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
227 "fmla v31.4s, v16.4s, v7.s[0]\n"
228 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
229 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
230 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
233 "ld1r {v22.4s}, [%[minptr]]\n"
234 "ld1r {v23.4s}, [%[maxptr]]\n"
235 "ldr q16, [%[b_ptr0]]\n"
236 "add %[b_ptr0], %[b_ptr0], #0x10\n"
237 "fmax v24.4s, v24.4s, v22.4s\n"
238 "fmax v25.4s, v25.4s, v22.4s\n"
239 "fmax v26.4s, v26.4s, v22.4s\n"
240 "fmax v27.4s, v27.4s, v22.4s\n"
241 "fmin v24.4s, v24.4s, v23.4s\n"
242 "fmin v25.4s, v25.4s, v23.4s\n"
243 "fmin v26.4s, v26.4s, v23.4s\n"
244 "fmin v27.4s, v27.4s, v23.4s\n"
245 "str q24, [%[c_ptr0]]\n"
246 "fmax v28.4s, v28.4s, v22.4s\n"
247 "ldr q24, [%[biasptr]]\n"
248 "fmax v29.4s, v29.4s, v22.4s\n"
249 "add %[c_ptr0], %[c_ptr0], #0x10\n"
250 "fmax v30.4s, v30.4s, v22.4s\n"
251 "str q25, [c_ptr1]\n"
252 "fmin v28.4s, v28.4s, v23.4s\n"
253 "add c_ptr1, c_ptr1, #0x10\n"
254 "fmin v29.4s, v29.4s, v23.4s\n"
255 "str q26, [c_ptr2]\n"
256 "fmin v30.4s, v30.4s, v23.4s\n"
257 "add c_ptr2, c_ptr2, #0x10\n"
258 "fmax v31.4s, v31.4s, v22.4s\n"
259 "str q27, [c_ptr3]\n"
260 "mov v25.16b, v24.16b\n"
261 "add c_ptr3, c_ptr3, #0x10\n"
262 "mov v26.16b, v24.16b\n"
263 "str q28, [c_ptr4]\n"
264 "fmin v31.4s, v31.4s, v23.4s\n"
265 "add c_ptr4, c_ptr4, #0x10\n"
266 "mov v27.16b, v24.16b\n"
267 "str q29, [c_ptr5]\n"
268 "mov v28.16b, v24.16b\n"
269 "add c_ptr5, c_ptr5, #0x10\n"
270 "mov v29.16b, v24.16b\n"
271 "str q30, [c_ptr6]\n"
272 "mov v30.16b, v24.16b\n"
273 "add c_ptr6, c_ptr6, #0x10\n"
274 "fmla v25.4s, v16.4s, v1.s[0]\n"
275 "str q31, [c_ptr7]\n"
276 "mov v31.16b, v24.16b\n"
277 "add c_ptr7, c_ptr7, #0x10\n"
278 "fmla v24.4s, v16.4s, v0.s[0]\n"
279 "add %[biasptr], %[biasptr], %[biasinc]\n"
280 "fmla v26.4s, v16.4s, v2.s[0]\n"
281 "fmla v27.4s, v16.4s, v3.s[0]\n"
282 "fmla v28.4s, v16.4s, v4.s[0]\n"
283 "fmla v29.4s, v16.4s, v5.s[0]\n"
284 "fmla v30.4s, v16.4s, v6.s[0]\n"
285 "fmla v31.4s, v16.4s, v7.s[0]\n"
288 "ldr q24, [%[biasptr]]\n"
289 "add %[biasptr], %[biasptr], %[biasinc]\n"
290 "mov v25.16b, v24.16b\n"
291 "mov v26.16b, v24.16b\n"
292 "mov v27.16b, v24.16b\n"
293 "mov v28.16b, v24.16b\n"
294 "mov v29.16b, v24.16b\n"
295 "mov v30.16b, v24.16b\n"
296 "mov v31.16b, v24.16b\n"
297 "fmla v24.4s, v16.4s, v0.s[0]\n"
298 "fmla v25.4s, v16.4s, v1.s[0]\n"
299 "fmla v26.4s, v16.4s, v2.s[0]\n"
300 "fmla v27.4s, v16.4s, v3.s[0]\n"
301 "fmla v28.4s, v16.4s, v4.s[0]\n"
302 "fmla v29.4s, v16.4s, v5.s[0]\n"
303 "fmla v30.4s, v16.4s, v6.s[0]\n"
304 "fmla v31.4s, v16.4s, v7.s[0]\n"
306 "ld1r {v22.4s}, [%[minptr]]\n"
307 "ld1r {v23.4s}, [%[maxptr]]\n"
308 "fmax v24.4s, v24.4s, v22.4s\n"
309 "fmax v25.4s, v25.4s, v22.4s\n"
310 "fmax v26.4s, v26.4s, v22.4s\n"
311 "fmax v27.4s, v27.4s, v22.4s\n"
312 "fmin v24.4s, v24.4s, v23.4s\n"
313 "fmin v25.4s, v25.4s, v23.4s\n"
314 "fmin v26.4s, v26.4s, v23.4s\n"
315 "fmin v27.4s, v27.4s, v23.4s\n"
316 "str q24, [%[c_ptr0]]\n"
317 "fmax v28.4s, v28.4s, v22.4s\n"
318 "add %[c_ptr0], %[c_ptr0], #0x10\n"
319 "fmax v29.4s, v29.4s, v22.4s\n"
320 "str q25, [c_ptr1]\n"
321 "fmax v30.4s, v30.4s, v22.4s\n"
322 "fmin v28.4s, v28.4s, v23.4s\n"
323 "fmax v31.4s, v31.4s, v22.4s\n"
324 "str q26, [c_ptr2]\n"
325 "fmin v29.4s, v29.4s, v23.4s\n"
326 "fmin v30.4s, v30.4s, v23.4s\n"
327 "fmin v31.4s, v31.4s, v23.4s\n"
328 "str q27, [c_ptr3]\n"
329 "str q28, [c_ptr4]\n"
330 "str q29, [c_ptr5]\n"
331 "str q30, [c_ptr6]\n"
332 "str q31, [c_ptr7]\n"
347 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
348 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
349 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
368 "add a_ptr1, %[a_ptr0], %[lda]\n"
369 "add c_ptr1, %[c_ptr0], %[ldc]\n"
370 "add a_ptr2, a_ptr1, %[lda]\n"
371 "add c_ptr2, c_ptr1, %[ldc]\n"
372 "add a_ptr3, a_ptr2, %[lda]\n"
373 "add c_ptr3, c_ptr2, %[ldc]\n"
374 "add a_ptr4, a_ptr3, %[lda]\n"
375 "add c_ptr4, c_ptr3, %[ldc]\n"
376 "add a_ptr5, a_ptr4, %[lda]\n"
377 "add c_ptr5, c_ptr4, %[ldc]\n"
378 "add a_ptr6, a_ptr5, %[lda]\n"
379 "add c_ptr6, c_ptr5, %[ldc]\n"
380 "add a_ptr7, a_ptr6, %[lda]\n"
381 "add c_ptr7, c_ptr6, %[ldc]\n"
382 "cbz %[oob_rows], 1f\n"
383 "subs %[oob_rows], %[oob_rows], #0x1\n"
384 "add c_ptr7, %[c_ptr0], #0x0\n"
385 "add a_ptr7, %[a_ptr0], #0x0\n"
387 "subs %[oob_rows], %[oob_rows], #0x1\n"
388 "add c_ptr6, %[c_ptr0], #0x0\n"
389 "add a_ptr6, %[a_ptr0], #0x0\n"
391 "subs %[oob_rows], %[oob_rows], #0x1\n"
392 "add c_ptr5, %[c_ptr0], #0x0\n"
393 "add a_ptr5, %[a_ptr0], #0x0\n"
395 "subs %[oob_rows], %[oob_rows], #0x1\n"
396 "add c_ptr4, %[c_ptr0], #0x0\n"
397 "add a_ptr4, %[a_ptr0], #0x0\n"
399 "subs %[oob_rows], %[oob_rows], #0x1\n"
400 "add c_ptr3, %[c_ptr0], #0x0\n"
401 "add a_ptr3, %[a_ptr0], #0x0\n"
403 "subs %[oob_rows], %[oob_rows], #0x1\n"
404 "add c_ptr2, %[c_ptr0], #0x0\n"
405 "add a_ptr2, %[a_ptr0], #0x0\n"
407 "subs %[oob_rows], %[oob_rows], #0x1\n"
408 "add c_ptr1, %[c_ptr0], #0x0\n"
409 "add a_ptr1, %[a_ptr0], #0x0\n"
411 "ldr q16, [%[b_ptr0]]\n"
412 "ldr d0, [%[a_ptr0]]\n"
413 "ldr q17, [%[b_ptr0], #0x10]\n"
421 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
422 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
423 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
424 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
425 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
426 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
427 "add %[b_ptr0], %[b_ptr0], #0x20\n"
429 "ldr q24, [%[biasptr]]\n"
430 "add %[biasptr], %[biasptr], %[biasinc]\n"
431 "subs %[loops], %[loops], #0x1\n"
432 "mov v25.16b, v24.16b\n"
433 "mov v26.16b, v24.16b\n"
434 "mov v27.16b, v24.16b\n"
435 "mov v28.16b, v24.16b\n"
436 "mov v29.16b, v24.16b\n"
437 "mov v30.16b, v24.16b\n"
438 "mov v31.16b, v24.16b\n"
439 "fmla v24.4s, v16.4s, v0.s[0]\n"
440 "fmla v25.4s, v16.4s, v1.s[0]\n"
441 "fmla v26.4s, v16.4s, v2.s[0]\n"
442 "fmla v27.4s, v16.4s, v3.s[0]\n"
443 "fmla v28.4s, v16.4s, v4.s[0]\n"
444 "fmla v29.4s, v16.4s, v5.s[0]\n"
445 "fmla v30.4s, v16.4s, v6.s[0]\n"
446 "fmla v31.4s, v16.4s, v7.s[0]\n"
447 "fmla v24.4s, v17.4s, v0.s[1]\n"
448 "fmla v25.4s, v17.4s, v1.s[1]\n"
449 "fmla v26.4s, v17.4s, v2.s[1]\n"
450 "fmla v27.4s, v17.4s, v3.s[1]\n"
451 "fmla v28.4s, v17.4s, v4.s[1]\n"
452 "fmla v29.4s, v17.4s, v5.s[1]\n"
453 "fmla v30.4s, v17.4s, v6.s[1]\n"
454 "fmla v31.4s, v17.4s, v7.s[1]\n"
457 "ld1r {v22.4s}, [%[minptr]]\n"
458 "subs %[loops], %[loops], #0x1\n"
459 "ld1r {v23.4s}, [%[maxptr]]\n"
460 "ldr q16, [%[b_ptr0]]\n"
461 "fmax v24.4s, v24.4s, v22.4s\n"
462 "ldr q17, [%[b_ptr0], #0x10]\n"
463 "fmax v25.4s, v25.4s, v22.4s\n"
464 "add %[b_ptr0], %[b_ptr0], #0x20\n"
465 "fmax v26.4s, v26.4s, v22.4s\n"
466 "fmin v24.4s, v24.4s, v23.4s\n"
467 "fmin v25.4s, v25.4s, v23.4s\n"
468 "fmax v27.4s, v27.4s, v22.4s\n"
469 "fmin v26.4s, v26.4s, v23.4s\n"
470 "str q24, [%[c_ptr0]]\n"
471 "fmax v28.4s, v28.4s, v22.4s\n"
472 "ldr q24, [%[biasptr]]\n"
473 "fmax v29.4s, v29.4s, v22.4s\n"
474 "add %[c_ptr0], %[c_ptr0], #0x10\n"
475 "fmin v27.4s, v27.4s, v23.4s\n"
476 "str q25, [c_ptr1]\n"
477 "fmin v28.4s, v28.4s, v23.4s\n"
478 "add c_ptr1, c_ptr1, #0x10\n"
479 "fmin v29.4s, v29.4s, v23.4s\n"
480 "str q26, [c_ptr2]\n"
481 "fmax v30.4s, v30.4s, v22.4s\n"
482 "add c_ptr2, c_ptr2, #0x10\n"
483 "fmax v31.4s, v31.4s, v22.4s\n"
484 "str q27, [c_ptr3]\n"
485 "mov v25.16b, v24.16b\n"
486 "add c_ptr3, c_ptr3, #0x10\n"
487 "fmin v30.4s, v30.4s, v23.4s\n"
488 "str q28, [c_ptr4]\n"
489 "fmin v31.4s, v31.4s, v23.4s\n"
490 "add c_ptr4, c_ptr4, #0x10\n"
491 "mov v26.16b, v24.16b\n"
492 "str q29, [c_ptr5]\n"
493 "mov v27.16b, v24.16b\n"
494 "add c_ptr5, c_ptr5, #0x10\n"
495 "mov v28.16b, v24.16b\n"
496 "str q30, [c_ptr6]\n"
497 "mov v29.16b, v24.16b\n"
498 "add c_ptr6, c_ptr6, #0x10\n"
499 "mov v30.16b, v24.16b\n"
500 "str q31, [c_ptr7]\n"
501 "mov v31.16b, v24.16b\n"
502 "add c_ptr7, c_ptr7, #0x10\n"
503 "fmla v24.4s, v16.4s, v0.s[0]\n"
504 "add %[biasptr], %[biasptr], %[biasinc]\n"
505 "fmla v25.4s, v16.4s, v1.s[0]\n"
506 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
507 "fmla v26.4s, v16.4s, v2.s[0]\n"
508 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
509 "fmla v27.4s, v16.4s, v3.s[0]\n"
510 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
511 "fmla v28.4s, v16.4s, v4.s[0]\n"
512 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
513 "fmla v29.4s, v16.4s, v5.s[0]\n"
514 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
515 "fmla v30.4s, v16.4s, v6.s[0]\n"
516 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
517 "fmla v31.4s, v16.4s, v7.s[0]\n"
518 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
519 "fmla v24.4s, v17.4s, v0.s[1]\n"
520 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
521 "fmla v25.4s, v17.4s, v1.s[1]\n"
522 "fmla v26.4s, v17.4s, v2.s[1]\n"
523 "fmla v27.4s, v17.4s, v3.s[1]\n"
524 "fmla v28.4s, v17.4s, v4.s[1]\n"
525 "fmla v29.4s, v17.4s, v5.s[1]\n"
526 "fmla v30.4s, v17.4s, v6.s[1]\n"
527 "fmla v31.4s, v17.4s, v7.s[1]\n"
530 "ld1r {v22.4s}, [%[minptr]]\n"
531 "ld1r {v23.4s}, [%[maxptr]]\n"
532 "ldr q16, [%[b_ptr0]]\n"
533 "ldr q17, [%[b_ptr0], #0x10]\n"
534 "add %[b_ptr0], %[b_ptr0], #0x20\n"
535 "fmax v24.4s, v24.4s, v22.4s\n"
536 "fmax v25.4s, v25.4s, v22.4s\n"
537 "fmax v26.4s, v26.4s, v22.4s\n"
538 "fmax v27.4s, v27.4s, v22.4s\n"
539 "fmin v24.4s, v24.4s, v23.4s\n"
540 "fmin v25.4s, v25.4s, v23.4s\n"
541 "fmin v26.4s, v26.4s, v23.4s\n"
542 "fmin v27.4s, v27.4s, v23.4s\n"
543 "str q24, [%[c_ptr0]]\n"
544 "fmax v28.4s, v28.4s, v22.4s\n"
545 "ldr q24, [%[biasptr]]\n"
546 "fmax v29.4s, v29.4s, v22.4s\n"
547 "add %[c_ptr0], %[c_ptr0], #0x10\n"
548 "fmax v30.4s, v30.4s, v22.4s\n"
549 "str q25, [c_ptr1]\n"
550 "fmin v28.4s, v28.4s, v23.4s\n"
551 "add c_ptr1, c_ptr1, #0x10\n"
552 "fmin v29.4s, v29.4s, v23.4s\n"
553 "str q26, [c_ptr2]\n"
554 "fmin v30.4s, v30.4s, v23.4s\n"
555 "add c_ptr2, c_ptr2, #0x10\n"
556 "fmax v31.4s, v31.4s, v22.4s\n"
557 "str q27, [c_ptr3]\n"
558 "mov v25.16b, v24.16b\n"
559 "add c_ptr3, c_ptr3, #0x10\n"
560 "mov v26.16b, v24.16b\n"
561 "str q28, [c_ptr4]\n"
562 "fmin v31.4s, v31.4s, v23.4s\n"
563 "add c_ptr4, c_ptr4, #0x10\n"
564 "mov v27.16b, v24.16b\n"
565 "str q29, [c_ptr5]\n"
566 "mov v28.16b, v24.16b\n"
567 "add c_ptr5, c_ptr5, #0x10\n"
568 "mov v29.16b, v24.16b\n"
569 "str q30, [c_ptr6]\n"
570 "mov v30.16b, v24.16b\n"
571 "add c_ptr6, c_ptr6, #0x10\n"
572 "fmla v25.4s, v16.4s, v1.s[0]\n"
573 "str q31, [c_ptr7]\n"
574 "mov v31.16b, v24.16b\n"
575 "add c_ptr7, c_ptr7, #0x10\n"
576 "fmla v24.4s, v16.4s, v0.s[0]\n"
577 "add %[biasptr], %[biasptr], %[biasinc]\n"
578 "fmla v26.4s, v16.4s, v2.s[0]\n"
579 "fmla v27.4s, v16.4s, v3.s[0]\n"
580 "fmla v28.4s, v16.4s, v4.s[0]\n"
581 "fmla v29.4s, v16.4s, v5.s[0]\n"
582 "fmla v30.4s, v16.4s, v6.s[0]\n"
583 "fmla v31.4s, v16.4s, v7.s[0]\n"
584 "fmla v24.4s, v17.4s, v0.s[1]\n"
585 "fmla v25.4s, v17.4s, v1.s[1]\n"
586 "fmla v26.4s, v17.4s, v2.s[1]\n"
587 "fmla v27.4s, v17.4s, v3.s[1]\n"
588 "fmla v28.4s, v17.4s, v4.s[1]\n"
589 "fmla v29.4s, v17.4s, v5.s[1]\n"
590 "fmla v30.4s, v17.4s, v6.s[1]\n"
591 "fmla v31.4s, v17.4s, v7.s[1]\n"
594 "ldr q24, [%[biasptr]]\n"
595 "add %[biasptr], %[biasptr], %[biasinc]\n"
596 "mov v25.16b, v24.16b\n"
597 "mov v26.16b, v24.16b\n"
598 "mov v27.16b, v24.16b\n"
599 "mov v28.16b, v24.16b\n"
600 "mov v29.16b, v24.16b\n"
601 "mov v30.16b, v24.16b\n"
602 "mov v31.16b, v24.16b\n"
603 "fmla v24.4s, v16.4s, v0.s[0]\n"
604 "fmla v25.4s, v16.4s, v1.s[0]\n"
605 "fmla v26.4s, v16.4s, v2.s[0]\n"
606 "fmla v27.4s, v16.4s, v3.s[0]\n"
607 "fmla v28.4s, v16.4s, v4.s[0]\n"
608 "fmla v29.4s, v16.4s, v5.s[0]\n"
609 "fmla v30.4s, v16.4s, v6.s[0]\n"
610 "fmla v31.4s, v16.4s, v7.s[0]\n"
611 "fmla v24.4s, v17.4s, v0.s[1]\n"
612 "fmla v25.4s, v17.4s, v1.s[1]\n"
613 "fmla v26.4s, v17.4s, v2.s[1]\n"
614 "fmla v27.4s, v17.4s, v3.s[1]\n"
615 "fmla v28.4s, v17.4s, v4.s[1]\n"
616 "fmla v29.4s, v17.4s, v5.s[1]\n"
617 "fmla v30.4s, v17.4s, v6.s[1]\n"
618 "fmla v31.4s, v17.4s, v7.s[1]\n"
620 "ld1r {v22.4s}, [%[minptr]]\n"
621 "ld1r {v23.4s}, [%[maxptr]]\n"
622 "fmax v24.4s, v24.4s, v22.4s\n"
623 "fmax v25.4s, v25.4s, v22.4s\n"
624 "fmax v26.4s, v26.4s, v22.4s\n"
625 "fmax v27.4s, v27.4s, v22.4s\n"
626 "fmin v24.4s, v24.4s, v23.4s\n"
627 "fmin v25.4s, v25.4s, v23.4s\n"
628 "fmin v26.4s, v26.4s, v23.4s\n"
629 "fmin v27.4s, v27.4s, v23.4s\n"
630 "str q24, [%[c_ptr0]]\n"
631 "fmax v28.4s, v28.4s, v22.4s\n"
632 "add %[c_ptr0], %[c_ptr0], #0x10\n"
633 "fmax v29.4s, v29.4s, v22.4s\n"
634 "str q25, [c_ptr1]\n"
635 "fmax v30.4s, v30.4s, v22.4s\n"
636 "fmin v28.4s, v28.4s, v23.4s\n"
637 "fmax v31.4s, v31.4s, v22.4s\n"
638 "str q26, [c_ptr2]\n"
639 "fmin v29.4s, v29.4s, v23.4s\n"
640 "fmin v30.4s, v30.4s, v23.4s\n"
641 "fmin v31.4s, v31.4s, v23.4s\n"
642 "str q27, [c_ptr3]\n"
643 "str q28, [c_ptr4]\n"
644 "str q29, [c_ptr5]\n"
645 "str q30, [c_ptr6]\n"
646 "str q31, [c_ptr7]\n"
661 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
662 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
663 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
682 "add a_ptr1, %[a_ptr0], %[lda]\n"
683 "add c_ptr1, %[c_ptr0], %[ldc]\n"
684 "add a_ptr2, a_ptr1, %[lda]\n"
685 "add c_ptr2, c_ptr1, %[ldc]\n"
686 "add a_ptr3, a_ptr2, %[lda]\n"
687 "add c_ptr3, c_ptr2, %[ldc]\n"
688 "add a_ptr4, a_ptr3, %[lda]\n"
689 "add c_ptr4, c_ptr3, %[ldc]\n"
690 "add a_ptr5, a_ptr4, %[lda]\n"
691 "add c_ptr5, c_ptr4, %[ldc]\n"
692 "add a_ptr6, a_ptr5, %[lda]\n"
693 "add c_ptr6, c_ptr5, %[ldc]\n"
694 "add a_ptr7, a_ptr6, %[lda]\n"
695 "add c_ptr7, c_ptr6, %[ldc]\n"
696 "cbz %[oob_rows], 1f\n"
697 "subs %[oob_rows], %[oob_rows], #0x1\n"
698 "add c_ptr7, %[c_ptr0], #0x0\n"
699 "add a_ptr7, %[a_ptr0], #0x0\n"
701 "subs %[oob_rows], %[oob_rows], #0x1\n"
702 "add c_ptr6, %[c_ptr0], #0x0\n"
703 "add a_ptr6, %[a_ptr0], #0x0\n"
705 "subs %[oob_rows], %[oob_rows], #0x1\n"
706 "add c_ptr5, %[c_ptr0], #0x0\n"
707 "add a_ptr5, %[a_ptr0], #0x0\n"
709 "subs %[oob_rows], %[oob_rows], #0x1\n"
710 "add c_ptr4, %[c_ptr0], #0x0\n"
711 "add a_ptr4, %[a_ptr0], #0x0\n"
713 "subs %[oob_rows], %[oob_rows], #0x1\n"
714 "add c_ptr3, %[c_ptr0], #0x0\n"
715 "add a_ptr3, %[a_ptr0], #0x0\n"
717 "subs %[oob_rows], %[oob_rows], #0x1\n"
718 "add c_ptr2, %[c_ptr0], #0x0\n"
719 "add a_ptr2, %[a_ptr0], #0x0\n"
721 "subs %[oob_rows], %[oob_rows], #0x1\n"
722 "add c_ptr1, %[c_ptr0], #0x0\n"
723 "add a_ptr1, %[a_ptr0], #0x0\n"
725 "ldr q16, [%[b_ptr0]]\n"
726 "ldr d0, [%[a_ptr0]], #0x8\n"
727 "ldr q17, [%[b_ptr0], #0x10]\n"
728 "ldr d1, [a_ptr1], #0x8\n"
729 "ldr q18, [%[b_ptr0], #0x20]\n"
730 "ldr d2, [a_ptr2], #0x8\n"
731 "ldr d3, [a_ptr3], #0x8\n"
732 "ldr d4, [a_ptr4], #0x8\n"
733 "ldr d5, [a_ptr5], #0x8\n"
734 "ldr d6, [a_ptr6], #0x8\n"
735 "ldr d7, [a_ptr7], #0x8\n"
736 "ld1 {v0.s}[2], [%[a_ptr0]]\n"
737 "ld1 {v1.s}[2], [a_ptr1]\n"
738 "ld1 {v2.s}[2], [a_ptr2]\n"
739 "ld1 {v3.s}[2], [a_ptr3]\n"
740 "ld1 {v4.s}[2], [a_ptr4]\n"
741 "ld1 {v5.s}[2], [a_ptr5]\n"
742 "ld1 {v6.s}[2], [a_ptr6]\n"
743 "ld1 {v7.s}[2], [a_ptr7]\n"
744 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
745 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
746 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
747 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
748 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
749 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
750 "add %[b_ptr0], %[b_ptr0], #0x30\n"
752 "ldr q24, [%[biasptr]]\n"
753 "add %[biasptr], %[biasptr], %[biasinc]\n"
754 "subs %[loops], %[loops], #0x1\n"
755 "mov v25.16b, v24.16b\n"
756 "mov v26.16b, v24.16b\n"
757 "mov v27.16b, v24.16b\n"
758 "mov v28.16b, v24.16b\n"
759 "mov v29.16b, v24.16b\n"
760 "mov v30.16b, v24.16b\n"
761 "mov v31.16b, v24.16b\n"
762 "fmla v24.4s, v16.4s, v0.s[0]\n"
763 "fmla v25.4s, v16.4s, v1.s[0]\n"
764 "fmla v26.4s, v16.4s, v2.s[0]\n"
765 "fmla v27.4s, v16.4s, v3.s[0]\n"
766 "fmla v28.4s, v16.4s, v4.s[0]\n"
767 "fmla v29.4s, v16.4s, v5.s[0]\n"
768 "fmla v30.4s, v16.4s, v6.s[0]\n"
769 "fmla v31.4s, v16.4s, v7.s[0]\n"
770 "fmla v24.4s, v17.4s, v0.s[1]\n"
771 "fmla v25.4s, v17.4s, v1.s[1]\n"
772 "fmla v26.4s, v17.4s, v2.s[1]\n"
773 "fmla v27.4s, v17.4s, v3.s[1]\n"
774 "fmla v28.4s, v17.4s, v4.s[1]\n"
775 "fmla v29.4s, v17.4s, v5.s[1]\n"
776 "fmla v30.4s, v17.4s, v6.s[1]\n"
777 "fmla v31.4s, v17.4s, v7.s[1]\n"
778 "fmla v24.4s, v18.4s, v0.s[2]\n"
779 "fmla v25.4s, v18.4s, v1.s[2]\n"
780 "fmla v26.4s, v18.4s, v2.s[2]\n"
781 "fmla v27.4s, v18.4s, v3.s[2]\n"
782 "fmla v28.4s, v18.4s, v4.s[2]\n"
783 "fmla v29.4s, v18.4s, v5.s[2]\n"
784 "fmla v30.4s, v18.4s, v6.s[2]\n"
785 "fmla v31.4s, v18.4s, v7.s[2]\n"
788 "ld1r {v22.4s}, [%[minptr]]\n"
789 "subs %[loops], %[loops], #0x1\n"
790 "ld1r {v23.4s}, [%[maxptr]]\n"
791 "ldr q16, [%[b_ptr0]]\n"
792 "fmax v24.4s, v24.4s, v22.4s\n"
793 "ldr q17, [%[b_ptr0], #0x10]\n"
794 "fmax v25.4s, v25.4s, v22.4s\n"
795 "ldr q18, [%[b_ptr0], #0x20]\n"
796 "fmax v26.4s, v26.4s, v22.4s\n"
797 "add %[b_ptr0], %[b_ptr0], #0x30\n"
798 "fmin v24.4s, v24.4s, v23.4s\n"
799 "fmin v25.4s, v25.4s, v23.4s\n"
800 "fmin v26.4s, v26.4s, v23.4s\n"
801 "fmax v27.4s, v27.4s, v22.4s\n"
802 "str q24, [%[c_ptr0]]\n"
803 "fmax v28.4s, v28.4s, v22.4s\n"
804 "ldr q24, [%[biasptr]]\n"
805 "fmax v29.4s, v29.4s, v22.4s\n"
806 "add %[c_ptr0], %[c_ptr0], #0x10\n"
807 "fmin v27.4s, v27.4s, v23.4s\n"
808 "str q25, [c_ptr1]\n"
809 "fmin v28.4s, v28.4s, v23.4s\n"
810 "add c_ptr1, c_ptr1, #0x10\n"
811 "fmin v29.4s, v29.4s, v23.4s\n"
812 "str q26, [c_ptr2]\n"
813 "fmax v30.4s, v30.4s, v22.4s\n"
814 "add c_ptr2, c_ptr2, #0x10\n"
815 "fmax v31.4s, v31.4s, v22.4s\n"
816 "str q27, [c_ptr3]\n"
817 "mov v25.16b, v24.16b\n"
818 "add c_ptr3, c_ptr3, #0x10\n"
819 "fmin v30.4s, v30.4s, v23.4s\n"
820 "str q28, [c_ptr4]\n"
821 "fmin v31.4s, v31.4s, v23.4s\n"
822 "add c_ptr4, c_ptr4, #0x10\n"
823 "mov v26.16b, v24.16b\n"
824 "str q29, [c_ptr5]\n"
825 "mov v27.16b, v24.16b\n"
826 "add c_ptr5, c_ptr5, #0x10\n"
827 "mov v28.16b, v24.16b\n"
828 "str q30, [c_ptr6]\n"
829 "mov v29.16b, v24.16b\n"
830 "add c_ptr6, c_ptr6, #0x10\n"
831 "mov v30.16b, v24.16b\n"
832 "str q31, [c_ptr7]\n"
833 "mov v31.16b, v24.16b\n"
834 "add c_ptr7, c_ptr7, #0x10\n"
835 "fmla v24.4s, v16.4s, v0.s[0]\n"
836 "add %[biasptr], %[biasptr], %[biasinc]\n"
837 "fmla v25.4s, v16.4s, v1.s[0]\n"
838 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
839 "fmla v26.4s, v16.4s, v2.s[0]\n"
840 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
841 "fmla v27.4s, v16.4s, v3.s[0]\n"
842 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
843 "fmla v28.4s, v16.4s, v4.s[0]\n"
844 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
845 "fmla v29.4s, v16.4s, v5.s[0]\n"
846 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
847 "fmla v30.4s, v16.4s, v6.s[0]\n"
848 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
849 "fmla v31.4s, v16.4s, v7.s[0]\n"
850 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
851 "fmla v24.4s, v17.4s, v0.s[1]\n"
852 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
853 "fmla v25.4s, v17.4s, v1.s[1]\n"
854 "fmla v26.4s, v17.4s, v2.s[1]\n"
855 "fmla v27.4s, v17.4s, v3.s[1]\n"
856 "fmla v28.4s, v17.4s, v4.s[1]\n"
857 "fmla v29.4s, v17.4s, v5.s[1]\n"
858 "fmla v30.4s, v17.4s, v6.s[1]\n"
859 "fmla v31.4s, v17.4s, v7.s[1]\n"
860 "fmla v24.4s, v18.4s, v0.s[2]\n"
861 "fmla v25.4s, v18.4s, v1.s[2]\n"
862 "fmla v26.4s, v18.4s, v2.s[2]\n"
863 "fmla v27.4s, v18.4s, v3.s[2]\n"
864 "fmla v28.4s, v18.4s, v4.s[2]\n"
865 "fmla v29.4s, v18.4s, v5.s[2]\n"
866 "fmla v30.4s, v18.4s, v6.s[2]\n"
867 "fmla v31.4s, v18.4s, v7.s[2]\n"
870 "ld1r {v22.4s}, [%[minptr]]\n"
871 "ld1r {v23.4s}, [%[maxptr]]\n"
872 "ldr q16, [%[b_ptr0]]\n"
873 "ldr q17, [%[b_ptr0], #0x10]\n"
874 "fmax v24.4s, v24.4s, v22.4s\n"
875 "ldr q18, [%[b_ptr0], #0x20]\n"
876 "fmax v25.4s, v25.4s, v22.4s\n"
877 "add %[b_ptr0], %[b_ptr0], #0x30\n"
878 "fmax v26.4s, v26.4s, v22.4s\n"
879 "fmin v24.4s, v24.4s, v23.4s\n"
880 "fmin v25.4s, v25.4s, v23.4s\n"
881 "fmax v27.4s, v27.4s, v22.4s\n"
882 "fmin v26.4s, v26.4s, v23.4s\n"
883 "str q24, [%[c_ptr0]]\n"
884 "fmax v28.4s, v28.4s, v22.4s\n"
885 "ldr q24, [%[biasptr]]\n"
886 "fmax v29.4s, v29.4s, v22.4s\n"
887 "add %[c_ptr0], %[c_ptr0], #0x10\n"
888 "fmin v27.4s, v27.4s, v23.4s\n"
889 "str q25, [c_ptr1]\n"
890 "fmin v28.4s, v28.4s, v23.4s\n"
891 "add c_ptr1, c_ptr1, #0x10\n"
892 "fmin v29.4s, v29.4s, v23.4s\n"
893 "str q26, [c_ptr2]\n"
894 "fmax v30.4s, v30.4s, v22.4s\n"
895 "add c_ptr2, c_ptr2, #0x10\n"
896 "fmax v31.4s, v31.4s, v22.4s\n"
897 "str q27, [c_ptr3]\n"
898 "mov v25.16b, v24.16b\n"
899 "add c_ptr3, c_ptr3, #0x10\n"
900 "fmin v30.4s, v30.4s, v23.4s\n"
901 "str q28, [c_ptr4]\n"
902 "fmin v31.4s, v31.4s, v23.4s\n"
903 "add c_ptr4, c_ptr4, #0x10\n"
904 "mov v26.16b, v24.16b\n"
905 "str q29, [c_ptr5]\n"
906 "mov v27.16b, v24.16b\n"
907 "add c_ptr5, c_ptr5, #0x10\n"
908 "mov v28.16b, v24.16b\n"
909 "str q30, [c_ptr6]\n"
910 "mov v29.16b, v24.16b\n"
911 "add c_ptr6, c_ptr6, #0x10\n"
912 "mov v30.16b, v24.16b\n"
913 "str q31, [c_ptr7]\n"
914 "mov v31.16b, v24.16b\n"
915 "add c_ptr7, c_ptr7, #0x10\n"
916 "fmla v24.4s, v16.4s, v0.s[0]\n"
917 "add %[biasptr], %[biasptr], %[biasinc]\n"
918 "fmla v25.4s, v16.4s, v1.s[0]\n"
919 "fmla v26.4s, v16.4s, v2.s[0]\n"
920 "fmla v27.4s, v16.4s, v3.s[0]\n"
921 "fmla v28.4s, v16.4s, v4.s[0]\n"
922 "fmla v29.4s, v16.4s, v5.s[0]\n"
923 "fmla v30.4s, v16.4s, v6.s[0]\n"
924 "fmla v31.4s, v16.4s, v7.s[0]\n"
925 "fmla v24.4s, v17.4s, v0.s[1]\n"
926 "fmla v25.4s, v17.4s, v1.s[1]\n"
927 "fmla v26.4s, v17.4s, v2.s[1]\n"
928 "fmla v27.4s, v17.4s, v3.s[1]\n"
929 "fmla v28.4s, v17.4s, v4.s[1]\n"
930 "fmla v29.4s, v17.4s, v5.s[1]\n"
931 "fmla v30.4s, v17.4s, v6.s[1]\n"
932 "fmla v31.4s, v17.4s, v7.s[1]\n"
933 "fmla v24.4s, v18.4s, v0.s[2]\n"
934 "fmla v25.4s, v18.4s, v1.s[2]\n"
935 "fmla v26.4s, v18.4s, v2.s[2]\n"
936 "fmla v27.4s, v18.4s, v3.s[2]\n"
937 "fmla v28.4s, v18.4s, v4.s[2]\n"
938 "fmla v29.4s, v18.4s, v5.s[2]\n"
939 "fmla v30.4s, v18.4s, v6.s[2]\n"
940 "fmla v31.4s, v18.4s, v7.s[2]\n"
943 "ldr q24, [%[biasptr]]\n"
944 "add %[biasptr], %[biasptr], %[biasinc]\n"
945 "mov v25.16b, v24.16b\n"
946 "mov v26.16b, v24.16b\n"
947 "mov v27.16b, v24.16b\n"
948 "mov v28.16b, v24.16b\n"
949 "mov v29.16b, v24.16b\n"
950 "mov v30.16b, v24.16b\n"
951 "mov v31.16b, v24.16b\n"
952 "fmla v24.4s, v16.4s, v0.s[0]\n"
953 "fmla v25.4s, v16.4s, v1.s[0]\n"
954 "fmla v26.4s, v16.4s, v2.s[0]\n"
955 "fmla v27.4s, v16.4s, v3.s[0]\n"
956 "fmla v28.4s, v16.4s, v4.s[0]\n"
957 "fmla v29.4s, v16.4s, v5.s[0]\n"
958 "fmla v30.4s, v16.4s, v6.s[0]\n"
959 "fmla v31.4s, v16.4s, v7.s[0]\n"
960 "fmla v24.4s, v17.4s, v0.s[1]\n"
961 "fmla v25.4s, v17.4s, v1.s[1]\n"
962 "fmla v26.4s, v17.4s, v2.s[1]\n"
963 "fmla v27.4s, v17.4s, v3.s[1]\n"
964 "fmla v28.4s, v17.4s, v4.s[1]\n"
965 "fmla v29.4s, v17.4s, v5.s[1]\n"
966 "fmla v30.4s, v17.4s, v6.s[1]\n"
967 "fmla v31.4s, v17.4s, v7.s[1]\n"
968 "fmla v24.4s, v18.4s, v0.s[2]\n"
969 "fmla v25.4s, v18.4s, v1.s[2]\n"
970 "fmla v26.4s, v18.4s, v2.s[2]\n"
971 "fmla v27.4s, v18.4s, v3.s[2]\n"
972 "fmla v28.4s, v18.4s, v4.s[2]\n"
973 "fmla v29.4s, v18.4s, v5.s[2]\n"
974 "fmla v30.4s, v18.4s, v6.s[2]\n"
975 "fmla v31.4s, v18.4s, v7.s[2]\n"
977 "ld1r {v22.4s}, [%[minptr]]\n"
978 "ld1r {v23.4s}, [%[maxptr]]\n"
979 "fmax v24.4s, v24.4s, v22.4s\n"
980 "fmax v25.4s, v25.4s, v22.4s\n"
981 "fmax v26.4s, v26.4s, v22.4s\n"
982 "fmax v27.4s, v27.4s, v22.4s\n"
983 "fmin v24.4s, v24.4s, v23.4s\n"
984 "fmin v25.4s, v25.4s, v23.4s\n"
985 "fmin v26.4s, v26.4s, v23.4s\n"
986 "fmin v27.4s, v27.4s, v23.4s\n"
987 "str q24, [%[c_ptr0]]\n"
988 "fmax v28.4s, v28.4s, v22.4s\n"
989 "add %[c_ptr0], %[c_ptr0], #0x10\n"
990 "fmax v29.4s, v29.4s, v22.4s\n"
991 "str q25, [c_ptr1]\n"
992 "fmax v30.4s, v30.4s, v22.4s\n"
993 "fmin v28.4s, v28.4s, v23.4s\n"
994 "fmax v31.4s, v31.4s, v22.4s\n"
995 "str q26, [c_ptr2]\n"
996 "fmin v29.4s, v29.4s, v23.4s\n"
997 "fmin v30.4s, v30.4s, v23.4s\n"
998 "fmin v31.4s, v31.4s, v23.4s\n"
999 "str q27, [c_ptr3]\n"
1000 "str q28, [c_ptr4]\n"
1001 "str q29, [c_ptr5]\n"
1002 "str q30, [c_ptr6]\n"
1003 "str q31, [c_ptr7]\n"
1018 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
1019 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
1020 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1039 "add a_ptr1, %[a_ptr0], %[lda]\n"
1040 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1041 "add a_ptr2, a_ptr1, %[lda]\n"
1042 "add c_ptr2, c_ptr1, %[ldc]\n"
1043 "add a_ptr3, a_ptr2, %[lda]\n"
1044 "add c_ptr3, c_ptr2, %[ldc]\n"
1045 "add a_ptr4, a_ptr3, %[lda]\n"
1046 "add c_ptr4, c_ptr3, %[ldc]\n"
1047 "add a_ptr5, a_ptr4, %[lda]\n"
1048 "add c_ptr5, c_ptr4, %[ldc]\n"
1049 "add a_ptr6, a_ptr5, %[lda]\n"
1050 "add c_ptr6, c_ptr5, %[ldc]\n"
1051 "add a_ptr7, a_ptr6, %[lda]\n"
1052 "add c_ptr7, c_ptr6, %[ldc]\n"
1053 "cbz %[oob_rows], 1f\n"
1054 "subs %[oob_rows], %[oob_rows], #0x1\n"
1055 "add c_ptr7, %[c_ptr0], #0x0\n"
1056 "add a_ptr7, %[a_ptr0], #0x0\n"
1058 "subs %[oob_rows], %[oob_rows], #0x1\n"
1059 "add c_ptr6, %[c_ptr0], #0x0\n"
1060 "add a_ptr6, %[a_ptr0], #0x0\n"
1062 "subs %[oob_rows], %[oob_rows], #0x1\n"
1063 "add c_ptr5, %[c_ptr0], #0x0\n"
1064 "add a_ptr5, %[a_ptr0], #0x0\n"
1066 "subs %[oob_rows], %[oob_rows], #0x1\n"
1067 "add c_ptr4, %[c_ptr0], #0x0\n"
1068 "add a_ptr4, %[a_ptr0], #0x0\n"
1070 "subs %[oob_rows], %[oob_rows], #0x1\n"
1071 "add c_ptr3, %[c_ptr0], #0x0\n"
1072 "add a_ptr3, %[a_ptr0], #0x0\n"
1074 "subs %[oob_rows], %[oob_rows], #0x1\n"
1075 "add c_ptr2, %[c_ptr0], #0x0\n"
1076 "add a_ptr2, %[a_ptr0], #0x0\n"
1078 "subs %[oob_rows], %[oob_rows], #0x1\n"
1079 "add c_ptr1, %[c_ptr0], #0x0\n"
1080 "add a_ptr1, %[a_ptr0], #0x0\n"
1082 "ldr q0, [%[a_ptr0]]\n"
1083 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1084 "ldr q1, [a_ptr1]\n"
1085 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1086 "ldr q2, [a_ptr2]\n"
1087 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1088 "ldr q3, [a_ptr3]\n"
1089 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1090 "ldr q4, [a_ptr4]\n"
1091 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1092 "ldr q5, [a_ptr5]\n"
1093 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1094 "ldr q6, [a_ptr6]\n"
1095 "ldr q7, [a_ptr7]\n"
1096 "ldr q16, [%[b_ptr0]]\n"
1097 "ldr q17, [%[b_ptr0], #0x10]\n"
1098 "ldr q18, [%[b_ptr0], #0x20]\n"
1099 "ldr q19, [%[b_ptr0], #0x30]\n"
1100 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1101 "cbz %[loops], 2f\n"
1102 "ldr q24, [%[biasptr]]\n"
1103 "add %[biasptr], %[biasptr], %[biasinc]\n"
1104 "subs %[loops], %[loops], #0x1\n"
1105 "mov v25.16b, v24.16b\n"
1106 "mov v26.16b, v24.16b\n"
1107 "mov v27.16b, v24.16b\n"
1108 "mov v28.16b, v24.16b\n"
1109 "mov v29.16b, v24.16b\n"
1110 "mov v30.16b, v24.16b\n"
1111 "mov v31.16b, v24.16b\n"
1112 "fmla v24.4s, v16.4s, v0.s[0]\n"
1113 "fmla v25.4s, v16.4s, v1.s[0]\n"
1114 "fmla v26.4s, v16.4s, v2.s[0]\n"
1115 "fmla v27.4s, v16.4s, v3.s[0]\n"
1116 "fmla v28.4s, v16.4s, v4.s[0]\n"
1117 "fmla v29.4s, v16.4s, v5.s[0]\n"
1118 "fmla v30.4s, v16.4s, v6.s[0]\n"
1119 "fmla v31.4s, v16.4s, v7.s[0]\n"
1120 "fmla v24.4s, v17.4s, v0.s[1]\n"
1121 "fmla v25.4s, v17.4s, v1.s[1]\n"
1122 "fmla v26.4s, v17.4s, v2.s[1]\n"
1123 "fmla v27.4s, v17.4s, v3.s[1]\n"
1124 "fmla v28.4s, v17.4s, v4.s[1]\n"
1125 "fmla v29.4s, v17.4s, v5.s[1]\n"
1126 "fmla v30.4s, v17.4s, v6.s[1]\n"
1127 "fmla v31.4s, v17.4s, v7.s[1]\n"
1128 "fmla v24.4s, v18.4s, v0.s[2]\n"
1129 "fmla v25.4s, v18.4s, v1.s[2]\n"
1130 "fmla v26.4s, v18.4s, v2.s[2]\n"
1131 "fmla v27.4s, v18.4s, v3.s[2]\n"
1132 "fmla v28.4s, v18.4s, v4.s[2]\n"
1133 "fmla v29.4s, v18.4s, v5.s[2]\n"
1134 "fmla v30.4s, v18.4s, v6.s[2]\n"
1135 "fmla v31.4s, v18.4s, v7.s[2]\n"
1136 "fmla v24.4s, v19.4s, v0.s[3]\n"
1137 "fmla v25.4s, v19.4s, v1.s[3]\n"
1138 "fmla v26.4s, v19.4s, v2.s[3]\n"
1139 "fmla v27.4s, v19.4s, v3.s[3]\n"
1140 "fmla v28.4s, v19.4s, v4.s[3]\n"
1141 "fmla v29.4s, v19.4s, v5.s[3]\n"
1142 "fmla v30.4s, v19.4s, v6.s[3]\n"
1143 "fmla v31.4s, v19.4s, v7.s[3]\n"
1146 "ld1r {v22.4s}, [%[minptr]]\n"
1147 "subs %[loops], %[loops], #0x1\n"
1148 "ld1r {v23.4s}, [%[maxptr]]\n"
1149 "ldr q16, [%[b_ptr0]]\n"
1150 "fmax v24.4s, v24.4s, v22.4s\n"
1151 "ldr q17, [%[b_ptr0], #0x10]\n"
1152 "fmax v25.4s, v25.4s, v22.4s\n"
1153 "ldr q18, [%[b_ptr0], #0x20]\n"
1154 "fmax v26.4s, v26.4s, v22.4s\n"
1155 "ldr q19, [%[b_ptr0], #0x30]\n"
1156 "fmax v27.4s, v27.4s, v22.4s\n"
1157 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1158 "fmin v24.4s, v24.4s, v23.4s\n"
1159 "fmin v25.4s, v25.4s, v23.4s\n"
1160 "fmin v26.4s, v26.4s, v23.4s\n"
1161 "fmin v27.4s, v27.4s, v23.4s\n"
1162 "str q24, [%[c_ptr0]]\n"
1163 "fmax v28.4s, v28.4s, v22.4s\n"
1164 "ldr q24, [%[biasptr]]\n"
1165 "fmax v29.4s, v29.4s, v22.4s\n"
1166 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1167 "fmax v30.4s, v30.4s, v22.4s\n"
1168 "str q25, [c_ptr1]\n"
1169 "fmin v28.4s, v28.4s, v23.4s\n"
1170 "add c_ptr1, c_ptr1, #0x10\n"
1171 "fmin v29.4s, v29.4s, v23.4s\n"
1172 "str q26, [c_ptr2]\n"
1173 "fmin v30.4s, v30.4s, v23.4s\n"
1174 "add c_ptr2, c_ptr2, #0x10\n"
1175 "fmax v31.4s, v31.4s, v22.4s\n"
1176 "str q27, [c_ptr3]\n"
1177 "mov v25.16b, v24.16b\n"
1178 "add c_ptr3, c_ptr3, #0x10\n"
1179 "mov v26.16b, v24.16b\n"
1180 "str q28, [c_ptr4]\n"
1181 "fmin v31.4s, v31.4s, v23.4s\n"
1182 "add c_ptr4, c_ptr4, #0x10\n"
1183 "mov v27.16b, v24.16b\n"
1184 "str q29, [c_ptr5]\n"
1185 "mov v28.16b, v24.16b\n"
1186 "add c_ptr5, c_ptr5, #0x10\n"
1187 "mov v29.16b, v24.16b\n"
1188 "str q30, [c_ptr6]\n"
1189 "mov v30.16b, v24.16b\n"
1190 "add c_ptr6, c_ptr6, #0x10\n"
1191 "fmla v25.4s, v16.4s, v1.s[0]\n"
1192 "str q31, [c_ptr7]\n"
1193 "mov v31.16b, v24.16b\n"
1194 "add c_ptr7, c_ptr7, #0x10\n"
1195 "fmla v24.4s, v16.4s, v0.s[0]\n"
1196 "add %[biasptr], %[biasptr], %[biasinc]\n"
1197 "fmla v26.4s, v16.4s, v2.s[0]\n"
1198 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1199 "fmla v27.4s, v16.4s, v3.s[0]\n"
1200 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1201 "fmla v28.4s, v16.4s, v4.s[0]\n"
1202 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1203 "fmla v29.4s, v16.4s, v5.s[0]\n"
1204 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1205 "fmla v30.4s, v16.4s, v6.s[0]\n"
1206 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1207 "fmla v31.4s, v16.4s, v7.s[0]\n"
1208 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1209 "fmla v24.4s, v17.4s, v0.s[1]\n"
1210 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1211 "fmla v25.4s, v17.4s, v1.s[1]\n"
1212 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1213 "fmla v26.4s, v17.4s, v2.s[1]\n"
1214 "fmla v27.4s, v17.4s, v3.s[1]\n"
1215 "fmla v28.4s, v17.4s, v4.s[1]\n"
1216 "fmla v29.4s, v17.4s, v5.s[1]\n"
1217 "fmla v30.4s, v17.4s, v6.s[1]\n"
1218 "fmla v31.4s, v17.4s, v7.s[1]\n"
1219 "fmla v24.4s, v18.4s, v0.s[2]\n"
1220 "fmla v25.4s, v18.4s, v1.s[2]\n"
1221 "fmla v26.4s, v18.4s, v2.s[2]\n"
1222 "fmla v27.4s, v18.4s, v3.s[2]\n"
1223 "fmla v28.4s, v18.4s, v4.s[2]\n"
1224 "fmla v29.4s, v18.4s, v5.s[2]\n"
1225 "fmla v30.4s, v18.4s, v6.s[2]\n"
1226 "fmla v31.4s, v18.4s, v7.s[2]\n"
1227 "fmla v24.4s, v19.4s, v0.s[3]\n"
1228 "fmla v25.4s, v19.4s, v1.s[3]\n"
1229 "fmla v26.4s, v19.4s, v2.s[3]\n"
1230 "fmla v27.4s, v19.4s, v3.s[3]\n"
1231 "fmla v28.4s, v19.4s, v4.s[3]\n"
1232 "fmla v29.4s, v19.4s, v5.s[3]\n"
1233 "fmla v30.4s, v19.4s, v6.s[3]\n"
1234 "fmla v31.4s, v19.4s, v7.s[3]\n"
1237 "ld1r {v22.4s}, [%[minptr]]\n"
1238 "ld1r {v23.4s}, [%[maxptr]]\n"
1239 "ldr q16, [%[b_ptr0]]\n"
1240 "ldr q17, [%[b_ptr0], #0x10]\n"
1241 "fmax v24.4s, v24.4s, v22.4s\n"
1242 "ldr q18, [%[b_ptr0], #0x20]\n"
1243 "fmax v25.4s, v25.4s, v22.4s\n"
1244 "ldr q19, [%[b_ptr0], #0x30]\n"
1245 "fmax v26.4s, v26.4s, v22.4s\n"
1246 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1247 "fmin v24.4s, v24.4s, v23.4s\n"
1248 "fmin v25.4s, v25.4s, v23.4s\n"
1249 "fmin v26.4s, v26.4s, v23.4s\n"
1250 "fmax v27.4s, v27.4s, v22.4s\n"
1251 "str q24, [%[c_ptr0]]\n"
1252 "fmax v28.4s, v28.4s, v22.4s\n"
1253 "ldr q24, [%[biasptr]]\n"
1254 "fmax v29.4s, v29.4s, v22.4s\n"
1255 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1256 "fmin v27.4s, v27.4s, v23.4s\n"
1257 "str q25, [c_ptr1]\n"
1258 "fmin v28.4s, v28.4s, v23.4s\n"
1259 "add c_ptr1, c_ptr1, #0x10\n"
1260 "fmin v29.4s, v29.4s, v23.4s\n"
1261 "str q26, [c_ptr2]\n"
1262 "fmax v30.4s, v30.4s, v22.4s\n"
1263 "add c_ptr2, c_ptr2, #0x10\n"
1264 "fmax v31.4s, v31.4s, v22.4s\n"
1265 "str q27, [c_ptr3]\n"
1266 "mov v25.16b, v24.16b\n"
1267 "add c_ptr3, c_ptr3, #0x10\n"
1268 "fmin v30.4s, v30.4s, v23.4s\n"
1269 "str q28, [c_ptr4]\n"
1270 "fmin v31.4s, v31.4s, v23.4s\n"
1271 "add c_ptr4, c_ptr4, #0x10\n"
1272 "mov v26.16b, v24.16b\n"
1273 "str q29, [c_ptr5]\n"
1274 "mov v27.16b, v24.16b\n"
1275 "add c_ptr5, c_ptr5, #0x10\n"
1276 "mov v28.16b, v24.16b\n"
1277 "str q30, [c_ptr6]\n"
1278 "mov v29.16b, v24.16b\n"
1279 "add c_ptr6, c_ptr6, #0x10\n"
1280 "mov v30.16b, v24.16b\n"
1281 "str q31, [c_ptr7]\n"
1282 "mov v31.16b, v24.16b\n"
1283 "add c_ptr7, c_ptr7, #0x10\n"
1284 "fmla v24.4s, v16.4s, v0.s[0]\n"
1285 "add %[biasptr], %[biasptr], %[biasinc]\n"
1286 "fmla v25.4s, v16.4s, v1.s[0]\n"
1287 "fmla v26.4s, v16.4s, v2.s[0]\n"
1288 "fmla v27.4s, v16.4s, v3.s[0]\n"
1289 "fmla v28.4s, v16.4s, v4.s[0]\n"
1290 "fmla v29.4s, v16.4s, v5.s[0]\n"
1291 "fmla v30.4s, v16.4s, v6.s[0]\n"
1292 "fmla v31.4s, v16.4s, v7.s[0]\n"
1293 "fmla v24.4s, v17.4s, v0.s[1]\n"
1294 "fmla v25.4s, v17.4s, v1.s[1]\n"
1295 "fmla v26.4s, v17.4s, v2.s[1]\n"
1296 "fmla v27.4s, v17.4s, v3.s[1]\n"
1297 "fmla v28.4s, v17.4s, v4.s[1]\n"
1298 "fmla v29.4s, v17.4s, v5.s[1]\n"
1299 "fmla v30.4s, v17.4s, v6.s[1]\n"
1300 "fmla v31.4s, v17.4s, v7.s[1]\n"
1301 "fmla v24.4s, v18.4s, v0.s[2]\n"
1302 "fmla v25.4s, v18.4s, v1.s[2]\n"
1303 "fmla v26.4s, v18.4s, v2.s[2]\n"
1304 "fmla v27.4s, v18.4s, v3.s[2]\n"
1305 "fmla v28.4s, v18.4s, v4.s[2]\n"
1306 "fmla v29.4s, v18.4s, v5.s[2]\n"
1307 "fmla v30.4s, v18.4s, v6.s[2]\n"
1308 "fmla v31.4s, v18.4s, v7.s[2]\n"
1309 "fmla v24.4s, v19.4s, v0.s[3]\n"
1310 "fmla v25.4s, v19.4s, v1.s[3]\n"
1311 "fmla v26.4s, v19.4s, v2.s[3]\n"
1312 "fmla v27.4s, v19.4s, v3.s[3]\n"
1313 "fmla v28.4s, v19.4s, v4.s[3]\n"
1314 "fmla v29.4s, v19.4s, v5.s[3]\n"
1315 "fmla v30.4s, v19.4s, v6.s[3]\n"
1316 "fmla v31.4s, v19.4s, v7.s[3]\n"
1319 "ldr q24, [%[biasptr]]\n"
1320 "add %[biasptr], %[biasptr], %[biasinc]\n"
1321 "mov v25.16b, v24.16b\n"
1322 "mov v26.16b, v24.16b\n"
1323 "mov v27.16b, v24.16b\n"
1324 "mov v28.16b, v24.16b\n"
1325 "mov v29.16b, v24.16b\n"
1326 "mov v30.16b, v24.16b\n"
1327 "mov v31.16b, v24.16b\n"
1328 "fmla v24.4s, v16.4s, v0.s[0]\n"
1329 "fmla v25.4s, v16.4s, v1.s[0]\n"
1330 "fmla v26.4s, v16.4s, v2.s[0]\n"
1331 "fmla v27.4s, v16.4s, v3.s[0]\n"
1332 "fmla v28.4s, v16.4s, v4.s[0]\n"
1333 "fmla v29.4s, v16.4s, v5.s[0]\n"
1334 "fmla v30.4s, v16.4s, v6.s[0]\n"
1335 "fmla v31.4s, v16.4s, v7.s[0]\n"
1336 "fmla v24.4s, v17.4s, v0.s[1]\n"
1337 "fmla v25.4s, v17.4s, v1.s[1]\n"
1338 "fmla v26.4s, v17.4s, v2.s[1]\n"
1339 "fmla v27.4s, v17.4s, v3.s[1]\n"
1340 "fmla v28.4s, v17.4s, v4.s[1]\n"
1341 "fmla v29.4s, v17.4s, v5.s[1]\n"
1342 "fmla v30.4s, v17.4s, v6.s[1]\n"
1343 "fmla v31.4s, v17.4s, v7.s[1]\n"
1344 "fmla v24.4s, v18.4s, v0.s[2]\n"
1345 "fmla v25.4s, v18.4s, v1.s[2]\n"
1346 "fmla v26.4s, v18.4s, v2.s[2]\n"
1347 "fmla v27.4s, v18.4s, v3.s[2]\n"
1348 "fmla v28.4s, v18.4s, v4.s[2]\n"
1349 "fmla v29.4s, v18.4s, v5.s[2]\n"
1350 "fmla v30.4s, v18.4s, v6.s[2]\n"
1351 "fmla v31.4s, v18.4s, v7.s[2]\n"
1352 "fmla v24.4s, v19.4s, v0.s[3]\n"
1353 "fmla v25.4s, v19.4s, v1.s[3]\n"
1354 "fmla v26.4s, v19.4s, v2.s[3]\n"
1355 "fmla v27.4s, v19.4s, v3.s[3]\n"
1356 "fmla v28.4s, v19.4s, v4.s[3]\n"
1357 "fmla v29.4s, v19.4s, v5.s[3]\n"
1358 "fmla v30.4s, v19.4s, v6.s[3]\n"
1359 "fmla v31.4s, v19.4s, v7.s[3]\n"
1361 "ld1r {v22.4s}, [%[minptr]]\n"
1362 "ld1r {v23.4s}, [%[maxptr]]\n"
1363 "fmax v24.4s, v24.4s, v22.4s\n"
1364 "fmax v25.4s, v25.4s, v22.4s\n"
1365 "fmax v26.4s, v26.4s, v22.4s\n"
1366 "fmax v27.4s, v27.4s, v22.4s\n"
1367 "fmin v24.4s, v24.4s, v23.4s\n"
1368 "fmin v25.4s, v25.4s, v23.4s\n"
1369 "fmin v26.4s, v26.4s, v23.4s\n"
1370 "fmin v27.4s, v27.4s, v23.4s\n"
1371 "str q24, [%[c_ptr0]]\n"
1372 "fmax v28.4s, v28.4s, v22.4s\n"
1373 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1374 "fmax v29.4s, v29.4s, v22.4s\n"
1375 "str q25, [c_ptr1]\n"
1376 "fmax v30.4s, v30.4s, v22.4s\n"
1377 "fmin v28.4s, v28.4s, v23.4s\n"
1378 "fmax v31.4s, v31.4s, v22.4s\n"
1379 "str q26, [c_ptr2]\n"
1380 "fmin v29.4s, v29.4s, v23.4s\n"
1381 "fmin v30.4s, v30.4s, v23.4s\n"
1382 "fmin v31.4s, v31.4s, v23.4s\n"
1383 "str q27, [c_ptr3]\n"
1384 "str q28, [c_ptr4]\n"
1385 "str q29, [c_ptr5]\n"
1386 "str q30, [c_ptr6]\n"
1387 "str q31, [c_ptr7]\n"
1402 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
1403 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
1404 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1423 "add a_ptr1, %[a_ptr0], %[lda]\n"
1424 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1425 "add a_ptr2, a_ptr1, %[lda]\n"
1426 "add c_ptr2, c_ptr1, %[ldc]\n"
1427 "add a_ptr3, a_ptr2, %[lda]\n"
1428 "add c_ptr3, c_ptr2, %[ldc]\n"
1429 "add a_ptr4, a_ptr3, %[lda]\n"
1430 "add c_ptr4, c_ptr3, %[ldc]\n"
1431 "add a_ptr5, a_ptr4, %[lda]\n"
1432 "add c_ptr5, c_ptr4, %[ldc]\n"
1433 "add a_ptr6, a_ptr5, %[lda]\n"
1434 "add c_ptr6, c_ptr5, %[ldc]\n"
1435 "add a_ptr7, a_ptr6, %[lda]\n"
1436 "add c_ptr7, c_ptr6, %[ldc]\n"
1437 "cbz %[oob_rows], 1f\n"
1438 "subs %[oob_rows], %[oob_rows], #0x1\n"
1439 "add c_ptr7, %[c_ptr0], #0x0\n"
1440 "add a_ptr7, %[a_ptr0], #0x0\n"
1442 "subs %[oob_rows], %[oob_rows], #0x1\n"
1443 "add c_ptr6, %[c_ptr0], #0x0\n"
1444 "add a_ptr6, %[a_ptr0], #0x0\n"
1446 "subs %[oob_rows], %[oob_rows], #0x1\n"
1447 "add c_ptr5, %[c_ptr0], #0x0\n"
1448 "add a_ptr5, %[a_ptr0], #0x0\n"
1450 "subs %[oob_rows], %[oob_rows], #0x1\n"
1451 "add c_ptr4, %[c_ptr0], #0x0\n"
1452 "add a_ptr4, %[a_ptr0], #0x0\n"
1454 "subs %[oob_rows], %[oob_rows], #0x1\n"
1455 "add c_ptr3, %[c_ptr0], #0x0\n"
1456 "add a_ptr3, %[a_ptr0], #0x0\n"
1458 "subs %[oob_rows], %[oob_rows], #0x1\n"
1459 "add c_ptr2, %[c_ptr0], #0x0\n"
1460 "add a_ptr2, %[a_ptr0], #0x0\n"
1462 "subs %[oob_rows], %[oob_rows], #0x1\n"
1463 "add c_ptr1, %[c_ptr0], #0x0\n"
1464 "add a_ptr1, %[a_ptr0], #0x0\n"
1466 "ldr q0, [%[a_ptr0]], #0x10\n"
1467 "ldr q2, [a_ptr1], #0x10\n"
1468 "ldr q4, [a_ptr2], #0x10\n"
1469 "ldr q6, [a_ptr3], #0x10\n"
1470 "ldr s1, [%[a_ptr0]]\n"
1471 "ldr q8, [a_ptr4], #0x10\n"
1472 "ldr s3, [a_ptr1]\n"
1473 "ldr q10, [a_ptr5], #0x10\n"
1474 "ldr s5, [a_ptr2]\n"
1475 "ldr q12, [a_ptr6], #0x10\n"
1476 "ldr s7, [a_ptr3]\n"
1477 "ldr q14, [a_ptr7], #0x10\n"
1478 "ldr s9, [a_ptr4]\n"
1479 "ldr q16, [%[b_ptr0]]\n"
1480 "ldr s11, [a_ptr5]\n"
1481 "ldr q17, [%[b_ptr0], #0x10]\n"
1482 "ldr s13, [a_ptr6]\n"
1483 "ldr q18, [%[b_ptr0], #0x20]\n"
1484 "ldr s15, [a_ptr7]\n"
1485 "ldr q19, [%[b_ptr0], #0x30]\n"
1486 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1487 "ldr q20, [%[b_ptr0], #0x40]\n"
1488 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1489 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1490 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1491 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1492 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1493 "add %[b_ptr0], %[b_ptr0], #0x50\n"
1494 "cbz %[loops], 2f\n"
1495 "ldr q24, [%[biasptr]]\n"
1496 "add %[biasptr], %[biasptr], %[biasinc]\n"
1497 "subs %[loops], %[loops], #0x1\n"
1498 "mov v25.16b, v24.16b\n"
1499 "mov v26.16b, v24.16b\n"
1500 "mov v27.16b, v24.16b\n"
1501 "mov v28.16b, v24.16b\n"
1502 "mov v29.16b, v24.16b\n"
1503 "mov v30.16b, v24.16b\n"
1504 "mov v31.16b, v24.16b\n"
1505 "fmla v24.4s, v16.4s, v0.s[0]\n"
1506 "fmla v25.4s, v16.4s, v2.s[0]\n"
1507 "fmla v26.4s, v16.4s, v4.s[0]\n"
1508 "fmla v27.4s, v16.4s, v6.s[0]\n"
1509 "fmla v28.4s, v16.4s, v8.s[0]\n"
1510 "fmla v29.4s, v16.4s, v10.s[0]\n"
1511 "fmla v30.4s, v16.4s, v12.s[0]\n"
1512 "fmla v31.4s, v16.4s, v14.s[0]\n"
1513 "fmla v24.4s, v17.4s, v0.s[1]\n"
1514 "fmla v25.4s, v17.4s, v2.s[1]\n"
1515 "fmla v26.4s, v17.4s, v4.s[1]\n"
1516 "fmla v27.4s, v17.4s, v6.s[1]\n"
1517 "fmla v28.4s, v17.4s, v8.s[1]\n"
1518 "fmla v29.4s, v17.4s, v10.s[1]\n"
1519 "fmla v30.4s, v17.4s, v12.s[1]\n"
1520 "fmla v31.4s, v17.4s, v14.s[1]\n"
1521 "fmla v24.4s, v18.4s, v0.s[2]\n"
1522 "fmla v25.4s, v18.4s, v2.s[2]\n"
1523 "fmla v26.4s, v18.4s, v4.s[2]\n"
1524 "fmla v27.4s, v18.4s, v6.s[2]\n"
1525 "fmla v28.4s, v18.4s, v8.s[2]\n"
1526 "fmla v29.4s, v18.4s, v10.s[2]\n"
1527 "fmla v30.4s, v18.4s, v12.s[2]\n"
1528 "fmla v31.4s, v18.4s, v14.s[2]\n"
1529 "fmla v24.4s, v19.4s, v0.s[3]\n"
1530 "fmla v25.4s, v19.4s, v2.s[3]\n"
1531 "fmla v26.4s, v19.4s, v4.s[3]\n"
1532 "fmla v27.4s, v19.4s, v6.s[3]\n"
1533 "fmla v28.4s, v19.4s, v8.s[3]\n"
1534 "fmla v29.4s, v19.4s, v10.s[3]\n"
1535 "fmla v30.4s, v19.4s, v12.s[3]\n"
1536 "fmla v31.4s, v19.4s, v14.s[3]\n"
1537 "fmla v24.4s, v20.4s, v1.s[0]\n"
1538 "fmla v25.4s, v20.4s, v3.s[0]\n"
1539 "fmla v26.4s, v20.4s, v5.s[0]\n"
1540 "fmla v27.4s, v20.4s, v7.s[0]\n"
1541 "fmla v28.4s, v20.4s, v9.s[0]\n"
1542 "fmla v29.4s, v20.4s, v11.s[0]\n"
1543 "fmla v30.4s, v20.4s, v13.s[0]\n"
1544 "fmla v31.4s, v20.4s, v15.s[0]\n"
1547 "ld1r {v22.4s}, [%[minptr]]\n"
1548 "subs %[loops], %[loops], #0x1\n"
1549 "ld1r {v23.4s}, [%[maxptr]]\n"
1550 "ldr q16, [%[b_ptr0]]\n"
1551 "fmax v24.4s, v24.4s, v22.4s\n"
1552 "ldr q17, [%[b_ptr0], #0x10]\n"
1553 "fmax v25.4s, v25.4s, v22.4s\n"
1554 "ldr q18, [%[b_ptr0], #0x20]\n"
1555 "fmax v26.4s, v26.4s, v22.4s\n"
1556 "ldr q19, [%[b_ptr0], #0x30]\n"
1557 "fmax v27.4s, v27.4s, v22.4s\n"
1558 "ldr q20, [%[b_ptr0], #0x40]\n"
1559 "fmin v24.4s, v24.4s, v23.4s\n"
1560 "add %[b_ptr0], %[b_ptr0], #0x50\n"
1561 "fmin v25.4s, v25.4s, v23.4s\n"
1562 "fmin v26.4s, v26.4s, v23.4s\n"
1563 "str q24, [%[c_ptr0]]\n"
1564 "fmin v27.4s, v27.4s, v23.4s\n"
1565 "ldr q24, [%[biasptr]]\n"
1566 "fmax v28.4s, v28.4s, v22.4s\n"
1567 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1568 "fmax v29.4s, v29.4s, v22.4s\n"
1569 "str q25, [c_ptr1]\n"
1570 "fmax v30.4s, v30.4s, v22.4s\n"
1571 "add c_ptr1, c_ptr1, #0x10\n"
1572 "fmin v28.4s, v28.4s, v23.4s\n"
1573 "str q26, [c_ptr2]\n"
1574 "fmin v29.4s, v29.4s, v23.4s\n"
1575 "add c_ptr2, c_ptr2, #0x10\n"
1576 "fmin v30.4s, v30.4s, v23.4s\n"
1577 "str q27, [c_ptr3]\n"
1578 "fmax v31.4s, v31.4s, v22.4s\n"
1579 "add c_ptr3, c_ptr3, #0x10\n"
1580 "mov v25.16b, v24.16b\n"
1581 "str q28, [c_ptr4]\n"
1582 "mov v26.16b, v24.16b\n"
1583 "add c_ptr4, c_ptr4, #0x10\n"
1584 "fmin v31.4s, v31.4s, v23.4s\n"
1585 "str q29, [c_ptr5]\n"
1586 "mov v27.16b, v24.16b\n"
1587 "add c_ptr5, c_ptr5, #0x10\n"
1588 "mov v28.16b, v24.16b\n"
1589 "str q30, [c_ptr6]\n"
1590 "mov v29.16b, v24.16b\n"
1591 "add c_ptr6, c_ptr6, #0x10\n"
1592 "mov v30.16b, v24.16b\n"
1593 "str q31, [c_ptr7]\n"
1594 "mov v31.16b, v24.16b\n"
1595 "add c_ptr7, c_ptr7, #0x10\n"
1596 "fmla v24.4s, v16.4s, v0.s[0]\n"
1597 "add %[biasptr], %[biasptr], %[biasinc]\n"
1598 "fmla v25.4s, v16.4s, v2.s[0]\n"
1599 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1600 "fmla v26.4s, v16.4s, v4.s[0]\n"
1601 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1602 "fmla v27.4s, v16.4s, v6.s[0]\n"
1603 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1604 "fmla v28.4s, v16.4s, v8.s[0]\n"
1605 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1606 "fmla v29.4s, v16.4s, v10.s[0]\n"
1607 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1608 "fmla v30.4s, v16.4s, v12.s[0]\n"
1609 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1610 "fmla v31.4s, v16.4s, v14.s[0]\n"
1611 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1612 "fmla v24.4s, v17.4s, v0.s[1]\n"
1613 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1614 "fmla v25.4s, v17.4s, v2.s[1]\n"
1615 "fmla v26.4s, v17.4s, v4.s[1]\n"
1616 "fmla v27.4s, v17.4s, v6.s[1]\n"
1617 "fmla v28.4s, v17.4s, v8.s[1]\n"
1618 "fmla v29.4s, v17.4s, v10.s[1]\n"
1619 "fmla v30.4s, v17.4s, v12.s[1]\n"
1620 "fmla v31.4s, v17.4s, v14.s[1]\n"
1621 "fmla v24.4s, v18.4s, v0.s[2]\n"
1622 "fmla v25.4s, v18.4s, v2.s[2]\n"
1623 "fmla v26.4s, v18.4s, v4.s[2]\n"
1624 "fmla v27.4s, v18.4s, v6.s[2]\n"
1625 "fmla v28.4s, v18.4s, v8.s[2]\n"
1626 "fmla v29.4s, v18.4s, v10.s[2]\n"
1627 "fmla v30.4s, v18.4s, v12.s[2]\n"
1628 "fmla v31.4s, v18.4s, v14.s[2]\n"
1629 "fmla v24.4s, v19.4s, v0.s[3]\n"
1630 "fmla v25.4s, v19.4s, v2.s[3]\n"
1631 "fmla v26.4s, v19.4s, v4.s[3]\n"
1632 "fmla v27.4s, v19.4s, v6.s[3]\n"
1633 "fmla v28.4s, v19.4s, v8.s[3]\n"
1634 "fmla v29.4s, v19.4s, v10.s[3]\n"
1635 "fmla v30.4s, v19.4s, v12.s[3]\n"
1636 "fmla v31.4s, v19.4s, v14.s[3]\n"
1637 "fmla v24.4s, v20.4s, v1.s[0]\n"
1638 "fmla v25.4s, v20.4s, v3.s[0]\n"
1639 "fmla v26.4s, v20.4s, v5.s[0]\n"
1640 "fmla v27.4s, v20.4s, v7.s[0]\n"
1641 "fmla v28.4s, v20.4s, v9.s[0]\n"
1642 "fmla v29.4s, v20.4s, v11.s[0]\n"
1643 "fmla v30.4s, v20.4s, v13.s[0]\n"
1644 "fmla v31.4s, v20.4s, v15.s[0]\n"
1647 "ld1r {v22.4s}, [%[minptr]]\n"
1648 "ld1r {v23.4s}, [%[maxptr]]\n"
1649 "ldr q16, [%[b_ptr0]]\n"
1650 "ldr q17, [%[b_ptr0], #0x10]\n"
1651 "fmax v24.4s, v24.4s, v22.4s\n"
1652 "ldr q18, [%[b_ptr0], #0x20]\n"
1653 "fmax v25.4s, v25.4s, v22.4s\n"
1654 "ldr q19, [%[b_ptr0], #0x30]\n"
1655 "fmax v26.4s, v26.4s, v22.4s\n"
1656 "ldr q20, [%[b_ptr0], #0x40]\n"
1657 "fmax v27.4s, v27.4s, v22.4s\n"
1658 "add %[b_ptr0], %[b_ptr0], #0x50\n"
1659 "fmin v24.4s, v24.4s, v23.4s\n"
1660 "fmin v25.4s, v25.4s, v23.4s\n"
1661 "fmin v26.4s, v26.4s, v23.4s\n"
1662 "fmin v27.4s, v27.4s, v23.4s\n"
1663 "str q24, [%[c_ptr0]]\n"
1664 "fmax v28.4s, v28.4s, v22.4s\n"
1665 "ldr q24, [%[biasptr]]\n"
1666 "fmax v29.4s, v29.4s, v22.4s\n"
1667 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1668 "fmax v30.4s, v30.4s, v22.4s\n"
1669 "str q25, [c_ptr1]\n"
1670 "fmin v28.4s, v28.4s, v23.4s\n"
1671 "add c_ptr1, c_ptr1, #0x10\n"
1672 "fmin v29.4s, v29.4s, v23.4s\n"
1673 "str q26, [c_ptr2]\n"
1674 "fmin v30.4s, v30.4s, v23.4s\n"
1675 "add c_ptr2, c_ptr2, #0x10\n"
1676 "fmax v31.4s, v31.4s, v22.4s\n"
1677 "str q27, [c_ptr3]\n"
1678 "mov v25.16b, v24.16b\n"
1679 "add c_ptr3, c_ptr3, #0x10\n"
1680 "mov v26.16b, v24.16b\n"
1681 "str q28, [c_ptr4]\n"
1682 "fmin v31.4s, v31.4s, v23.4s\n"
1683 "add c_ptr4, c_ptr4, #0x10\n"
1684 "mov v27.16b, v24.16b\n"
1685 "str q29, [c_ptr5]\n"
1686 "mov v28.16b, v24.16b\n"
1687 "add c_ptr5, c_ptr5, #0x10\n"
1688 "mov v29.16b, v24.16b\n"
1689 "str q30, [c_ptr6]\n"
1690 "mov v30.16b, v24.16b\n"
1691 "add c_ptr6, c_ptr6, #0x10\n"
1692 "fmla v25.4s, v16.4s, v2.s[0]\n"
1693 "str q31, [c_ptr7]\n"
1694 "mov v31.16b, v24.16b\n"
1695 "add c_ptr7, c_ptr7, #0x10\n"
1696 "fmla v24.4s, v16.4s, v0.s[0]\n"
1697 "add %[biasptr], %[biasptr], %[biasinc]\n"
1698 "fmla v26.4s, v16.4s, v4.s[0]\n"
1699 "fmla v27.4s, v16.4s, v6.s[0]\n"
1700 "fmla v28.4s, v16.4s, v8.s[0]\n"
1701 "fmla v29.4s, v16.4s, v10.s[0]\n"
1702 "fmla v30.4s, v16.4s, v12.s[0]\n"
1703 "fmla v31.4s, v16.4s, v14.s[0]\n"
1704 "fmla v24.4s, v17.4s, v0.s[1]\n"
1705 "fmla v25.4s, v17.4s, v2.s[1]\n"
1706 "fmla v26.4s, v17.4s, v4.s[1]\n"
1707 "fmla v27.4s, v17.4s, v6.s[1]\n"
1708 "fmla v28.4s, v17.4s, v8.s[1]\n"
1709 "fmla v29.4s, v17.4s, v10.s[1]\n"
1710 "fmla v30.4s, v17.4s, v12.s[1]\n"
1711 "fmla v31.4s, v17.4s, v14.s[1]\n"
1712 "fmla v24.4s, v18.4s, v0.s[2]\n"
1713 "fmla v25.4s, v18.4s, v2.s[2]\n"
1714 "fmla v26.4s, v18.4s, v4.s[2]\n"
1715 "fmla v27.4s, v18.4s, v6.s[2]\n"
1716 "fmla v28.4s, v18.4s, v8.s[2]\n"
1717 "fmla v29.4s, v18.4s, v10.s[2]\n"
1718 "fmla v30.4s, v18.4s, v12.s[2]\n"
1719 "fmla v31.4s, v18.4s, v14.s[2]\n"
1720 "fmla v24.4s, v19.4s, v0.s[3]\n"
1721 "fmla v25.4s, v19.4s, v2.s[3]\n"
1722 "fmla v26.4s, v19.4s, v4.s[3]\n"
1723 "fmla v27.4s, v19.4s, v6.s[3]\n"
1724 "fmla v28.4s, v19.4s, v8.s[3]\n"
1725 "fmla v29.4s, v19.4s, v10.s[3]\n"
1726 "fmla v30.4s, v19.4s, v12.s[3]\n"
1727 "fmla v31.4s, v19.4s, v14.s[3]\n"
1728 "fmla v24.4s, v20.4s, v1.s[0]\n"
1729 "fmla v25.4s, v20.4s, v3.s[0]\n"
1730 "fmla v26.4s, v20.4s, v5.s[0]\n"
1731 "fmla v27.4s, v20.4s, v7.s[0]\n"
1732 "fmla v28.4s, v20.4s, v9.s[0]\n"
1733 "fmla v29.4s, v20.4s, v11.s[0]\n"
1734 "fmla v30.4s, v20.4s, v13.s[0]\n"
1735 "fmla v31.4s, v20.4s, v15.s[0]\n"
1738 "ldr q24, [%[biasptr]]\n"
1739 "add %[biasptr], %[biasptr], %[biasinc]\n"
1740 "mov v25.16b, v24.16b\n"
1741 "mov v26.16b, v24.16b\n"
1742 "mov v27.16b, v24.16b\n"
1743 "mov v28.16b, v24.16b\n"
1744 "mov v29.16b, v24.16b\n"
1745 "mov v30.16b, v24.16b\n"
1746 "mov v31.16b, v24.16b\n"
1747 "fmla v24.4s, v16.4s, v0.s[0]\n"
1748 "fmla v25.4s, v16.4s, v2.s[0]\n"
1749 "fmla v26.4s, v16.4s, v4.s[0]\n"
1750 "fmla v27.4s, v16.4s, v6.s[0]\n"
1751 "fmla v28.4s, v16.4s, v8.s[0]\n"
1752 "fmla v29.4s, v16.4s, v10.s[0]\n"
1753 "fmla v30.4s, v16.4s, v12.s[0]\n"
1754 "fmla v31.4s, v16.4s, v14.s[0]\n"
1755 "fmla v24.4s, v17.4s, v0.s[1]\n"
1756 "fmla v25.4s, v17.4s, v2.s[1]\n"
1757 "fmla v26.4s, v17.4s, v4.s[1]\n"
1758 "fmla v27.4s, v17.4s, v6.s[1]\n"
1759 "fmla v28.4s, v17.4s, v8.s[1]\n"
1760 "fmla v29.4s, v17.4s, v10.s[1]\n"
1761 "fmla v30.4s, v17.4s, v12.s[1]\n"
1762 "fmla v31.4s, v17.4s, v14.s[1]\n"
1763 "fmla v24.4s, v18.4s, v0.s[2]\n"
1764 "fmla v25.4s, v18.4s, v2.s[2]\n"
1765 "fmla v26.4s, v18.4s, v4.s[2]\n"
1766 "fmla v27.4s, v18.4s, v6.s[2]\n"
1767 "fmla v28.4s, v18.4s, v8.s[2]\n"
1768 "fmla v29.4s, v18.4s, v10.s[2]\n"
1769 "fmla v30.4s, v18.4s, v12.s[2]\n"
1770 "fmla v31.4s, v18.4s, v14.s[2]\n"
1771 "fmla v24.4s, v19.4s, v0.s[3]\n"
1772 "fmla v25.4s, v19.4s, v2.s[3]\n"
1773 "fmla v26.4s, v19.4s, v4.s[3]\n"
1774 "fmla v27.4s, v19.4s, v6.s[3]\n"
1775 "fmla v28.4s, v19.4s, v8.s[3]\n"
1776 "fmla v29.4s, v19.4s, v10.s[3]\n"
1777 "fmla v30.4s, v19.4s, v12.s[3]\n"
1778 "fmla v31.4s, v19.4s, v14.s[3]\n"
1779 "fmla v24.4s, v20.4s, v1.s[0]\n"
1780 "fmla v25.4s, v20.4s, v3.s[0]\n"
1781 "fmla v26.4s, v20.4s, v5.s[0]\n"
1782 "fmla v27.4s, v20.4s, v7.s[0]\n"
1783 "fmla v28.4s, v20.4s, v9.s[0]\n"
1784 "fmla v29.4s, v20.4s, v11.s[0]\n"
1785 "fmla v30.4s, v20.4s, v13.s[0]\n"
1786 "fmla v31.4s, v20.4s, v15.s[0]\n"
1788 "ld1r {v22.4s}, [%[minptr]]\n"
1789 "ld1r {v23.4s}, [%[maxptr]]\n"
1790 "fmax v24.4s, v24.4s, v22.4s\n"
1791 "fmax v25.4s, v25.4s, v22.4s\n"
1792 "fmax v26.4s, v26.4s, v22.4s\n"
1793 "fmax v27.4s, v27.4s, v22.4s\n"
1794 "fmin v24.4s, v24.4s, v23.4s\n"
1795 "fmin v25.4s, v25.4s, v23.4s\n"
1796 "fmin v26.4s, v26.4s, v23.4s\n"
1797 "fmin v27.4s, v27.4s, v23.4s\n"
1798 "str q24, [%[c_ptr0]]\n"
1799 "fmax v28.4s, v28.4s, v22.4s\n"
1800 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1801 "fmax v29.4s, v29.4s, v22.4s\n"
1802 "str q25, [c_ptr1]\n"
1803 "fmax v30.4s, v30.4s, v22.4s\n"
1804 "fmin v28.4s, v28.4s, v23.4s\n"
1805 "fmax v31.4s, v31.4s, v22.4s\n"
1806 "str q26, [c_ptr2]\n"
1807 "fmin v29.4s, v29.4s, v23.4s\n"
1808 "fmin v30.4s, v30.4s, v23.4s\n"
1809 "fmin v31.4s, v31.4s, v23.4s\n"
1810 "str q27, [c_ptr3]\n"
1811 "str q28, [c_ptr4]\n"
1812 "str q29, [c_ptr5]\n"
1813 "str q30, [c_ptr6]\n"
1814 "str q31, [c_ptr7]\n"
1829 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
1830 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
1831 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1850 "add a_ptr1, %[a_ptr0], %[lda]\n"
1851 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1852 "add a_ptr2, a_ptr1, %[lda]\n"
1853 "add c_ptr2, c_ptr1, %[ldc]\n"
1854 "add a_ptr3, a_ptr2, %[lda]\n"
1855 "add c_ptr3, c_ptr2, %[ldc]\n"
1856 "add a_ptr4, a_ptr3, %[lda]\n"
1857 "add c_ptr4, c_ptr3, %[ldc]\n"
1858 "add a_ptr5, a_ptr4, %[lda]\n"
1859 "add c_ptr5, c_ptr4, %[ldc]\n"
1860 "add a_ptr6, a_ptr5, %[lda]\n"
1861 "add c_ptr6, c_ptr5, %[ldc]\n"
1862 "add a_ptr7, a_ptr6, %[lda]\n"
1863 "add c_ptr7, c_ptr6, %[ldc]\n"
1864 "cbz %[oob_rows], 1f\n"
1865 "subs %[oob_rows], %[oob_rows], #0x1\n"
1866 "add c_ptr7, %[c_ptr0], #0x0\n"
1867 "add a_ptr7, %[a_ptr0], #0x0\n"
1869 "subs %[oob_rows], %[oob_rows], #0x1\n"
1870 "add c_ptr6, %[c_ptr0], #0x0\n"
1871 "add a_ptr6, %[a_ptr0], #0x0\n"
1873 "subs %[oob_rows], %[oob_rows], #0x1\n"
1874 "add c_ptr5, %[c_ptr0], #0x0\n"
1875 "add a_ptr5, %[a_ptr0], #0x0\n"
1877 "subs %[oob_rows], %[oob_rows], #0x1\n"
1878 "add c_ptr4, %[c_ptr0], #0x0\n"
1879 "add a_ptr4, %[a_ptr0], #0x0\n"
1881 "subs %[oob_rows], %[oob_rows], #0x1\n"
1882 "add c_ptr3, %[c_ptr0], #0x0\n"
1883 "add a_ptr3, %[a_ptr0], #0x0\n"
1885 "subs %[oob_rows], %[oob_rows], #0x1\n"
1886 "add c_ptr2, %[c_ptr0], #0x0\n"
1887 "add a_ptr2, %[a_ptr0], #0x0\n"
1889 "subs %[oob_rows], %[oob_rows], #0x1\n"
1890 "add c_ptr1, %[c_ptr0], #0x0\n"
1891 "add a_ptr1, %[a_ptr0], #0x0\n"
1893 "ldr q0, [%[a_ptr0]], #0x10\n"
1894 "ldr q2, [a_ptr1], #0x10\n"
1895 "ldr q4, [a_ptr2], #0x10\n"
1896 "ldr q6, [a_ptr3], #0x10\n"
1897 "ldr d1, [%[a_ptr0]]\n"
1898 "ldr q8, [a_ptr4], #0x10\n"
1899 "ldr d3, [a_ptr1]\n"
1900 "ldr q10, [a_ptr5], #0x10\n"
1901 "ldr d5, [a_ptr2]\n"
1902 "ldr q12, [a_ptr6], #0x10\n"
1903 "ldr d7, [a_ptr3]\n"
1904 "ldr q14, [a_ptr7], #0x10\n"
1905 "ldr d9, [a_ptr4]\n"
1906 "ldr q16, [%[b_ptr0]]\n"
1907 "ldr d11, [a_ptr5]\n"
1908 "ldr q17, [%[b_ptr0], #0x10]\n"
1909 "ldr d13, [a_ptr6]\n"
1910 "ldr q18, [%[b_ptr0], #0x20]\n"
1911 "ldr d15, [a_ptr7]\n"
1912 "ldr q19, [%[b_ptr0], #0x30]\n"
1913 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1914 "ldr q20, [%[b_ptr0], #0x40]\n"
1915 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1916 "ldr q21, [%[b_ptr0], #0x50]\n"
1917 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1918 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1919 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1920 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1921 "add %[b_ptr0], %[b_ptr0], #0x60\n"
1922 "cbz %[loops], 2f\n"
1923 "ldr q24, [%[biasptr]]\n"
1924 "add %[biasptr], %[biasptr], %[biasinc]\n"
1925 "subs %[loops], %[loops], #0x1\n"
1926 "mov v25.16b, v24.16b\n"
1927 "mov v26.16b, v24.16b\n"
1928 "mov v27.16b, v24.16b\n"
1929 "mov v28.16b, v24.16b\n"
1930 "mov v29.16b, v24.16b\n"
1931 "mov v30.16b, v24.16b\n"
1932 "mov v31.16b, v24.16b\n"
1933 "fmla v24.4s, v16.4s, v0.s[0]\n"
1934 "fmla v25.4s, v16.4s, v2.s[0]\n"
1935 "fmla v26.4s, v16.4s, v4.s[0]\n"
1936 "fmla v27.4s, v16.4s, v6.s[0]\n"
1937 "fmla v28.4s, v16.4s, v8.s[0]\n"
1938 "fmla v29.4s, v16.4s, v10.s[0]\n"
1939 "fmla v30.4s, v16.4s, v12.s[0]\n"
1940 "fmla v31.4s, v16.4s, v14.s[0]\n"
1941 "fmla v24.4s, v17.4s, v0.s[1]\n"
1942 "fmla v25.4s, v17.4s, v2.s[1]\n"
1943 "fmla v26.4s, v17.4s, v4.s[1]\n"
1944 "fmla v27.4s, v17.4s, v6.s[1]\n"
1945 "fmla v28.4s, v17.4s, v8.s[1]\n"
1946 "fmla v29.4s, v17.4s, v10.s[1]\n"
1947 "fmla v30.4s, v17.4s, v12.s[1]\n"
1948 "fmla v31.4s, v17.4s, v14.s[1]\n"
1949 "fmla v24.4s, v18.4s, v0.s[2]\n"
1950 "fmla v25.4s, v18.4s, v2.s[2]\n"
1951 "fmla v26.4s, v18.4s, v4.s[2]\n"
1952 "fmla v27.4s, v18.4s, v6.s[2]\n"
1953 "fmla v28.4s, v18.4s, v8.s[2]\n"
1954 "fmla v29.4s, v18.4s, v10.s[2]\n"
1955 "fmla v30.4s, v18.4s, v12.s[2]\n"
1956 "fmla v31.4s, v18.4s, v14.s[2]\n"
1957 "fmla v24.4s, v19.4s, v0.s[3]\n"
1958 "fmla v25.4s, v19.4s, v2.s[3]\n"
1959 "fmla v26.4s, v19.4s, v4.s[3]\n"
1960 "fmla v27.4s, v19.4s, v6.s[3]\n"
1961 "fmla v28.4s, v19.4s, v8.s[3]\n"
1962 "fmla v29.4s, v19.4s, v10.s[3]\n"
1963 "fmla v30.4s, v19.4s, v12.s[3]\n"
1964 "fmla v31.4s, v19.4s, v14.s[3]\n"
1965 "fmla v24.4s, v20.4s, v1.s[0]\n"
1966 "fmla v25.4s, v20.4s, v3.s[0]\n"
1967 "fmla v26.4s, v20.4s, v5.s[0]\n"
1968 "fmla v27.4s, v20.4s, v7.s[0]\n"
1969 "fmla v28.4s, v20.4s, v9.s[0]\n"
1970 "fmla v29.4s, v20.4s, v11.s[0]\n"
1971 "fmla v30.4s, v20.4s, v13.s[0]\n"
1972 "fmla v31.4s, v20.4s, v15.s[0]\n"
1973 "fmla v24.4s, v21.4s, v1.s[1]\n"
1974 "fmla v25.4s, v21.4s, v3.s[1]\n"
1975 "fmla v26.4s, v21.4s, v5.s[1]\n"
1976 "fmla v27.4s, v21.4s, v7.s[1]\n"
1977 "fmla v28.4s, v21.4s, v9.s[1]\n"
1978 "fmla v29.4s, v21.4s, v11.s[1]\n"
1979 "fmla v30.4s, v21.4s, v13.s[1]\n"
1980 "fmla v31.4s, v21.4s, v15.s[1]\n"
1983 "ld1r {v22.4s}, [%[minptr]]\n"
1984 "subs %[loops], %[loops], #0x1\n"
1985 "ld1r {v23.4s}, [%[maxptr]]\n"
1986 "ldr q16, [%[b_ptr0]]\n"
1987 "fmax v24.4s, v24.4s, v22.4s\n"
1988 "ldr q17, [%[b_ptr0], #0x10]\n"
1989 "fmax v25.4s, v25.4s, v22.4s\n"
1990 "ldr q18, [%[b_ptr0], #0x20]\n"
1991 "fmax v26.4s, v26.4s, v22.4s\n"
1992 "ldr q19, [%[b_ptr0], #0x30]\n"
1993 "fmax v27.4s, v27.4s, v22.4s\n"
1994 "ldr q20, [%[b_ptr0], #0x40]\n"
1995 "fmin v24.4s, v24.4s, v23.4s\n"
1996 "ldr q21, [%[b_ptr0], #0x50]\n"
1997 "fmin v25.4s, v25.4s, v23.4s\n"
1998 "add %[b_ptr0], %[b_ptr0], #0x60\n"
1999 "fmin v26.4s, v26.4s, v23.4s\n"
2000 "str q24, [%[c_ptr0]]\n"
2001 "fmin v27.4s, v27.4s, v23.4s\n"
2002 "ldr q24, [%[biasptr]]\n"
2003 "fmax v28.4s, v28.4s, v22.4s\n"
2004 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2005 "fmax v29.4s, v29.4s, v22.4s\n"
2006 "str q25, [c_ptr1]\n"
2007 "fmax v30.4s, v30.4s, v22.4s\n"
2008 "add c_ptr1, c_ptr1, #0x10\n"
2009 "fmin v28.4s, v28.4s, v23.4s\n"
2010 "str q26, [c_ptr2]\n"
2011 "fmin v29.4s, v29.4s, v23.4s\n"
2012 "add c_ptr2, c_ptr2, #0x10\n"
2013 "fmin v30.4s, v30.4s, v23.4s\n"
2014 "str q27, [c_ptr3]\n"
2015 "fmax v31.4s, v31.4s, v22.4s\n"
2016 "add c_ptr3, c_ptr3, #0x10\n"
2017 "mov v25.16b, v24.16b\n"
2018 "str q28, [c_ptr4]\n"
2019 "mov v26.16b, v24.16b\n"
2020 "add c_ptr4, c_ptr4, #0x10\n"
2021 "fmin v31.4s, v31.4s, v23.4s\n"
2022 "str q29, [c_ptr5]\n"
2023 "mov v27.16b, v24.16b\n"
2024 "add c_ptr5, c_ptr5, #0x10\n"
2025 "mov v28.16b, v24.16b\n"
2026 "str q30, [c_ptr6]\n"
2027 "mov v29.16b, v24.16b\n"
2028 "add c_ptr6, c_ptr6, #0x10\n"
2029 "mov v30.16b, v24.16b\n"
2030 "str q31, [c_ptr7]\n"
2031 "mov v31.16b, v24.16b\n"
2032 "add c_ptr7, c_ptr7, #0x10\n"
2033 "fmla v24.4s, v16.4s, v0.s[0]\n"
2034 "add %[biasptr], %[biasptr], %[biasinc]\n"
2035 "fmla v25.4s, v16.4s, v2.s[0]\n"
2036 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2037 "fmla v26.4s, v16.4s, v4.s[0]\n"
2038 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2039 "fmla v27.4s, v16.4s, v6.s[0]\n"
2040 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2041 "fmla v28.4s, v16.4s, v8.s[0]\n"
2042 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2043 "fmla v29.4s, v16.4s, v10.s[0]\n"
2044 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2045 "fmla v30.4s, v16.4s, v12.s[0]\n"
2046 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2047 "fmla v31.4s, v16.4s, v14.s[0]\n"
2048 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
2049 "fmla v24.4s, v17.4s, v0.s[1]\n"
2050 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
2051 "fmla v25.4s, v17.4s, v2.s[1]\n"
2052 "fmla v26.4s, v17.4s, v4.s[1]\n"
2053 "fmla v27.4s, v17.4s, v6.s[1]\n"
2054 "fmla v28.4s, v17.4s, v8.s[1]\n"
2055 "fmla v29.4s, v17.4s, v10.s[1]\n"
2056 "fmla v30.4s, v17.4s, v12.s[1]\n"
2057 "fmla v31.4s, v17.4s, v14.s[1]\n"
2058 "fmla v24.4s, v18.4s, v0.s[2]\n"
2059 "fmla v25.4s, v18.4s, v2.s[2]\n"
2060 "fmla v26.4s, v18.4s, v4.s[2]\n"
2061 "fmla v27.4s, v18.4s, v6.s[2]\n"
2062 "fmla v28.4s, v18.4s, v8.s[2]\n"
2063 "fmla v29.4s, v18.4s, v10.s[2]\n"
2064 "fmla v30.4s, v18.4s, v12.s[2]\n"
2065 "fmla v31.4s, v18.4s, v14.s[2]\n"
2066 "fmla v24.4s, v19.4s, v0.s[3]\n"
2067 "fmla v25.4s, v19.4s, v2.s[3]\n"
2068 "fmla v26.4s, v19.4s, v4.s[3]\n"
2069 "fmla v27.4s, v19.4s, v6.s[3]\n"
2070 "fmla v28.4s, v19.4s, v8.s[3]\n"
2071 "fmla v29.4s, v19.4s, v10.s[3]\n"
2072 "fmla v30.4s, v19.4s, v12.s[3]\n"
2073 "fmla v31.4s, v19.4s, v14.s[3]\n"
2074 "fmla v24.4s, v20.4s, v1.s[0]\n"
2075 "fmla v25.4s, v20.4s, v3.s[0]\n"
2076 "fmla v26.4s, v20.4s, v5.s[0]\n"
2077 "fmla v27.4s, v20.4s, v7.s[0]\n"
2078 "fmla v28.4s, v20.4s, v9.s[0]\n"
2079 "fmla v29.4s, v20.4s, v11.s[0]\n"
2080 "fmla v30.4s, v20.4s, v13.s[0]\n"
2081 "fmla v31.4s, v20.4s, v15.s[0]\n"
2082 "fmla v24.4s, v21.4s, v1.s[1]\n"
2083 "fmla v25.4s, v21.4s, v3.s[1]\n"
2084 "fmla v26.4s, v21.4s, v5.s[1]\n"
2085 "fmla v27.4s, v21.4s, v7.s[1]\n"
2086 "fmla v28.4s, v21.4s, v9.s[1]\n"
2087 "fmla v29.4s, v21.4s, v11.s[1]\n"
2088 "fmla v30.4s, v21.4s, v13.s[1]\n"
2089 "fmla v31.4s, v21.4s, v15.s[1]\n"
2092 "ld1r {v22.4s}, [%[minptr]]\n"
2093 "ld1r {v23.4s}, [%[maxptr]]\n"
2094 "ldr q16, [%[b_ptr0]]\n"
2095 "ldr q17, [%[b_ptr0], #0x10]\n"
2096 "fmax v24.4s, v24.4s, v22.4s\n"
2097 "ldr q18, [%[b_ptr0], #0x20]\n"
2098 "fmax v25.4s, v25.4s, v22.4s\n"
2099 "ldr q19, [%[b_ptr0], #0x30]\n"
2100 "fmax v26.4s, v26.4s, v22.4s\n"
2101 "ldr q20, [%[b_ptr0], #0x40]\n"
2102 "fmax v27.4s, v27.4s, v22.4s\n"
2103 "ldr q21, [%[b_ptr0], #0x50]\n"
2104 "fmin v24.4s, v24.4s, v23.4s\n"
2105 "add %[b_ptr0], %[b_ptr0], #0x60\n"
2106 "fmin v25.4s, v25.4s, v23.4s\n"
2107 "fmin v26.4s, v26.4s, v23.4s\n"
2108 "str q24, [%[c_ptr0]]\n"
2109 "fmin v27.4s, v27.4s, v23.4s\n"
2110 "ldr q24, [%[biasptr]]\n"
2111 "fmax v28.4s, v28.4s, v22.4s\n"
2112 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2113 "fmax v29.4s, v29.4s, v22.4s\n"
2114 "str q25, [c_ptr1]\n"
2115 "fmax v30.4s, v30.4s, v22.4s\n"
2116 "add c_ptr1, c_ptr1, #0x10\n"
2117 "fmin v28.4s, v28.4s, v23.4s\n"
2118 "str q26, [c_ptr2]\n"
2119 "fmin v29.4s, v29.4s, v23.4s\n"
2120 "add c_ptr2, c_ptr2, #0x10\n"
2121 "fmin v30.4s, v30.4s, v23.4s\n"
2122 "str q27, [c_ptr3]\n"
2123 "fmax v31.4s, v31.4s, v22.4s\n"
2124 "add c_ptr3, c_ptr3, #0x10\n"
2125 "mov v25.16b, v24.16b\n"
2126 "str q28, [c_ptr4]\n"
2127 "mov v26.16b, v24.16b\n"
2128 "add c_ptr4, c_ptr4, #0x10\n"
2129 "fmin v31.4s, v31.4s, v23.4s\n"
2130 "str q29, [c_ptr5]\n"
2131 "mov v27.16b, v24.16b\n"
2132 "add c_ptr5, c_ptr5, #0x10\n"
2133 "mov v28.16b, v24.16b\n"
2134 "str q30, [c_ptr6]\n"
2135 "mov v29.16b, v24.16b\n"
2136 "add c_ptr6, c_ptr6, #0x10\n"
2137 "mov v30.16b, v24.16b\n"
2138 "str q31, [c_ptr7]\n"
2139 "mov v31.16b, v24.16b\n"
2140 "add c_ptr7, c_ptr7, #0x10\n"
2141 "fmla v24.4s, v16.4s, v0.s[0]\n"
2142 "add %[biasptr], %[biasptr], %[biasinc]\n"
2143 "fmla v25.4s, v16.4s, v2.s[0]\n"
2144 "fmla v26.4s, v16.4s, v4.s[0]\n"
2145 "fmla v27.4s, v16.4s, v6.s[0]\n"
2146 "fmla v28.4s, v16.4s, v8.s[0]\n"
2147 "fmla v29.4s, v16.4s, v10.s[0]\n"
2148 "fmla v30.4s, v16.4s, v12.s[0]\n"
2149 "fmla v31.4s, v16.4s, v14.s[0]\n"
2150 "fmla v24.4s, v17.4s, v0.s[1]\n"
2151 "fmla v25.4s, v17.4s, v2.s[1]\n"
2152 "fmla v26.4s, v17.4s, v4.s[1]\n"
2153 "fmla v27.4s, v17.4s, v6.s[1]\n"
2154 "fmla v28.4s, v17.4s, v8.s[1]\n"
2155 "fmla v29.4s, v17.4s, v10.s[1]\n"
2156 "fmla v30.4s, v17.4s, v12.s[1]\n"
2157 "fmla v31.4s, v17.4s, v14.s[1]\n"
2158 "fmla v24.4s, v18.4s, v0.s[2]\n"
2159 "fmla v25.4s, v18.4s, v2.s[2]\n"
2160 "fmla v26.4s, v18.4s, v4.s[2]\n"
2161 "fmla v27.4s, v18.4s, v6.s[2]\n"
2162 "fmla v28.4s, v18.4s, v8.s[2]\n"
2163 "fmla v29.4s, v18.4s, v10.s[2]\n"
2164 "fmla v30.4s, v18.4s, v12.s[2]\n"
2165 "fmla v31.4s, v18.4s, v14.s[2]\n"
2166 "fmla v24.4s, v19.4s, v0.s[3]\n"
2167 "fmla v25.4s, v19.4s, v2.s[3]\n"
2168 "fmla v26.4s, v19.4s, v4.s[3]\n"
2169 "fmla v27.4s, v19.4s, v6.s[3]\n"
2170 "fmla v28.4s, v19.4s, v8.s[3]\n"
2171 "fmla v29.4s, v19.4s, v10.s[3]\n"
2172 "fmla v30.4s, v19.4s, v12.s[3]\n"
2173 "fmla v31.4s, v19.4s, v14.s[3]\n"
2174 "fmla v24.4s, v20.4s, v1.s[0]\n"
2175 "fmla v25.4s, v20.4s, v3.s[0]\n"
2176 "fmla v26.4s, v20.4s, v5.s[0]\n"
2177 "fmla v27.4s, v20.4s, v7.s[0]\n"
2178 "fmla v28.4s, v20.4s, v9.s[0]\n"
2179 "fmla v29.4s, v20.4s, v11.s[0]\n"
2180 "fmla v30.4s, v20.4s, v13.s[0]\n"
2181 "fmla v31.4s, v20.4s, v15.s[0]\n"
2182 "fmla v24.4s, v21.4s, v1.s[1]\n"
2183 "fmla v25.4s, v21.4s, v3.s[1]\n"
2184 "fmla v26.4s, v21.4s, v5.s[1]\n"
2185 "fmla v27.4s, v21.4s, v7.s[1]\n"
2186 "fmla v28.4s, v21.4s, v9.s[1]\n"
2187 "fmla v29.4s, v21.4s, v11.s[1]\n"
2188 "fmla v30.4s, v21.4s, v13.s[1]\n"
2189 "fmla v31.4s, v21.4s, v15.s[1]\n"
2192 "ldr q24, [%[biasptr]]\n"
2193 "add %[biasptr], %[biasptr], %[biasinc]\n"
2194 "mov v25.16b, v24.16b\n"
2195 "mov v26.16b, v24.16b\n"
2196 "mov v27.16b, v24.16b\n"
2197 "mov v28.16b, v24.16b\n"
2198 "mov v29.16b, v24.16b\n"
2199 "mov v30.16b, v24.16b\n"
2200 "mov v31.16b, v24.16b\n"
2201 "fmla v24.4s, v16.4s, v0.s[0]\n"
2202 "fmla v25.4s, v16.4s, v2.s[0]\n"
2203 "fmla v26.4s, v16.4s, v4.s[0]\n"
2204 "fmla v27.4s, v16.4s, v6.s[0]\n"
2205 "fmla v28.4s, v16.4s, v8.s[0]\n"
2206 "fmla v29.4s, v16.4s, v10.s[0]\n"
2207 "fmla v30.4s, v16.4s, v12.s[0]\n"
2208 "fmla v31.4s, v16.4s, v14.s[0]\n"
2209 "fmla v24.4s, v17.4s, v0.s[1]\n"
2210 "fmla v25.4s, v17.4s, v2.s[1]\n"
2211 "fmla v26.4s, v17.4s, v4.s[1]\n"
2212 "fmla v27.4s, v17.4s, v6.s[1]\n"
2213 "fmla v28.4s, v17.4s, v8.s[1]\n"
2214 "fmla v29.4s, v17.4s, v10.s[1]\n"
2215 "fmla v30.4s, v17.4s, v12.s[1]\n"
2216 "fmla v31.4s, v17.4s, v14.s[1]\n"
2217 "fmla v24.4s, v18.4s, v0.s[2]\n"
2218 "fmla v25.4s, v18.4s, v2.s[2]\n"
2219 "fmla v26.4s, v18.4s, v4.s[2]\n"
2220 "fmla v27.4s, v18.4s, v6.s[2]\n"
2221 "fmla v28.4s, v18.4s, v8.s[2]\n"
2222 "fmla v29.4s, v18.4s, v10.s[2]\n"
2223 "fmla v30.4s, v18.4s, v12.s[2]\n"
2224 "fmla v31.4s, v18.4s, v14.s[2]\n"
2225 "fmla v24.4s, v19.4s, v0.s[3]\n"
2226 "fmla v25.4s, v19.4s, v2.s[3]\n"
2227 "fmla v26.4s, v19.4s, v4.s[3]\n"
2228 "fmla v27.4s, v19.4s, v6.s[3]\n"
2229 "fmla v28.4s, v19.4s, v8.s[3]\n"
2230 "fmla v29.4s, v19.4s, v10.s[3]\n"
2231 "fmla v30.4s, v19.4s, v12.s[3]\n"
2232 "fmla v31.4s, v19.4s, v14.s[3]\n"
2233 "fmla v24.4s, v20.4s, v1.s[0]\n"
2234 "fmla v25.4s, v20.4s, v3.s[0]\n"
2235 "fmla v26.4s, v20.4s, v5.s[0]\n"
2236 "fmla v27.4s, v20.4s, v7.s[0]\n"
2237 "fmla v28.4s, v20.4s, v9.s[0]\n"
2238 "fmla v29.4s, v20.4s, v11.s[0]\n"
2239 "fmla v30.4s, v20.4s, v13.s[0]\n"
2240 "fmla v31.4s, v20.4s, v15.s[0]\n"
2241 "fmla v24.4s, v21.4s, v1.s[1]\n"
2242 "fmla v25.4s, v21.4s, v3.s[1]\n"
2243 "fmla v26.4s, v21.4s, v5.s[1]\n"
2244 "fmla v27.4s, v21.4s, v7.s[1]\n"
2245 "fmla v28.4s, v21.4s, v9.s[1]\n"
2246 "fmla v29.4s, v21.4s, v11.s[1]\n"
2247 "fmla v30.4s, v21.4s, v13.s[1]\n"
2248 "fmla v31.4s, v21.4s, v15.s[1]\n"
2250 "ld1r {v22.4s}, [%[minptr]]\n"
2251 "ld1r {v23.4s}, [%[maxptr]]\n"
2252 "fmax v24.4s, v24.4s, v22.4s\n"
2253 "fmax v25.4s, v25.4s, v22.4s\n"
2254 "fmax v26.4s, v26.4s, v22.4s\n"
2255 "fmax v27.4s, v27.4s, v22.4s\n"
2256 "fmin v24.4s, v24.4s, v23.4s\n"
2257 "fmin v25.4s, v25.4s, v23.4s\n"
2258 "fmin v26.4s, v26.4s, v23.4s\n"
2259 "fmin v27.4s, v27.4s, v23.4s\n"
2260 "str q24, [%[c_ptr0]]\n"
2261 "fmax v28.4s, v28.4s, v22.4s\n"
2262 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2263 "fmax v29.4s, v29.4s, v22.4s\n"
2264 "str q25, [c_ptr1]\n"
2265 "fmax v30.4s, v30.4s, v22.4s\n"
2266 "fmin v28.4s, v28.4s, v23.4s\n"
2267 "fmax v31.4s, v31.4s, v22.4s\n"
2268 "str q26, [c_ptr2]\n"
2269 "fmin v29.4s, v29.4s, v23.4s\n"
2270 "fmin v30.4s, v30.4s, v23.4s\n"
2271 "fmin v31.4s, v31.4s, v23.4s\n"
2272 "str q27, [c_ptr3]\n"
2273 "str q28, [c_ptr4]\n"
2274 "str q29, [c_ptr5]\n"
2275 "str q30, [c_ptr6]\n"
2276 "str q31, [c_ptr7]\n"
2291 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
2292 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
2293 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2312 "add a_ptr1, %[a_ptr0], %[lda]\n"
2313 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2314 "add a_ptr2, a_ptr1, %[lda]\n"
2315 "add c_ptr2, c_ptr1, %[ldc]\n"
2316 "add a_ptr3, a_ptr2, %[lda]\n"
2317 "add c_ptr3, c_ptr2, %[ldc]\n"
2318 "add a_ptr4, a_ptr3, %[lda]\n"
2319 "add c_ptr4, c_ptr3, %[ldc]\n"
2320 "add a_ptr5, a_ptr4, %[lda]\n"
2321 "add c_ptr5, c_ptr4, %[ldc]\n"
2322 "add a_ptr6, a_ptr5, %[lda]\n"
2323 "add c_ptr6, c_ptr5, %[ldc]\n"
2324 "add a_ptr7, a_ptr6, %[lda]\n"
2325 "add c_ptr7, c_ptr6, %[ldc]\n"
2326 "cbz %[oob_rows], 1f\n"
2327 "subs %[oob_rows], %[oob_rows], #0x1\n"
2328 "add c_ptr7, %[c_ptr0], #0x0\n"
2329 "add a_ptr7, %[a_ptr0], #0x0\n"
2331 "subs %[oob_rows], %[oob_rows], #0x1\n"
2332 "add c_ptr6, %[c_ptr0], #0x0\n"
2333 "add a_ptr6, %[a_ptr0], #0x0\n"
2335 "subs %[oob_rows], %[oob_rows], #0x1\n"
2336 "add c_ptr5, %[c_ptr0], #0x0\n"
2337 "add a_ptr5, %[a_ptr0], #0x0\n"
2339 "subs %[oob_rows], %[oob_rows], #0x1\n"
2340 "add c_ptr4, %[c_ptr0], #0x0\n"
2341 "add a_ptr4, %[a_ptr0], #0x0\n"
2343 "subs %[oob_rows], %[oob_rows], #0x1\n"
2344 "add c_ptr3, %[c_ptr0], #0x0\n"
2345 "add a_ptr3, %[a_ptr0], #0x0\n"
2347 "subs %[oob_rows], %[oob_rows], #0x1\n"
2348 "add c_ptr2, %[c_ptr0], #0x0\n"
2349 "add a_ptr2, %[a_ptr0], #0x0\n"
2351 "subs %[oob_rows], %[oob_rows], #0x1\n"
2352 "add c_ptr1, %[c_ptr0], #0x0\n"
2353 "add a_ptr1, %[a_ptr0], #0x0\n"
2355 "ldr q0, [%[a_ptr0]], #0x10\n"
2356 "ldr q2, [a_ptr1], #0x10\n"
2357 "ldr q4, [a_ptr2], #0x10\n"
2358 "ldr q6, [a_ptr3], #0x10\n"
2359 "ldr d1, [%[a_ptr0]], #0x8\n"
2360 "ldr q8, [a_ptr4], #0x10\n"
2361 "ldr d3, [a_ptr1], #0x8\n"
2362 "ldr q10, [a_ptr5], #0x10\n"
2363 "ldr d5, [a_ptr2], #0x8\n"
2364 "ldr q12, [a_ptr6], #0x10\n"
2365 "ldr d7, [a_ptr3], #0x8\n"
2366 "ldr q14, [a_ptr7], #0x10\n"
2367 "ldr d9, [a_ptr4], #0x8\n"
2368 "ldr q16, [%[b_ptr0]]\n"
2369 "ldr d11, [a_ptr5], #0x8\n"
2370 "ldr q17, [%[b_ptr0], #0x10]\n"
2371 "ldr d13, [a_ptr6], #0x8\n"
2372 "ldr q18, [%[b_ptr0], #0x20]\n"
2373 "ldr d15, [a_ptr7], #0x8\n"
2374 "ldr q19, [%[b_ptr0], #0x30]\n"
2375 "ld1 {v1.s}[2], [%[a_ptr0]]\n"
2376 "ldr q20, [%[b_ptr0], #0x40]\n"
2377 "ld1 {v3.s}[2], [a_ptr1]\n"
2378 "ldr q21, [%[b_ptr0], #0x50]\n"
2379 "ld1 {v5.s}[2], [a_ptr2]\n"
2380 "ldr q22, [%[b_ptr0], #0x60]\n"
2381 "ld1 {v7.s}[2], [a_ptr3]\n"
2382 "ld1 {v9.s}[2], [a_ptr4]\n"
2383 "ld1 {v11.s}[2], [a_ptr5]\n"
2384 "ld1 {v13.s}[2], [a_ptr6]\n"
2385 "ld1 {v15.s}[2], [a_ptr7]\n"
2386 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
2387 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
2388 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
2389 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
2390 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
2391 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
2392 "add %[b_ptr0], %[b_ptr0], #0x70\n"
2393 "cbz %[loops], 2f\n"
2394 "ldr q24, [%[biasptr]]\n"
2395 "add %[biasptr], %[biasptr], %[biasinc]\n"
2396 "subs %[loops], %[loops], #0x1\n"
2397 "mov v25.16b, v24.16b\n"
2398 "mov v26.16b, v24.16b\n"
2399 "mov v27.16b, v24.16b\n"
2400 "mov v28.16b, v24.16b\n"
2401 "mov v29.16b, v24.16b\n"
2402 "mov v30.16b, v24.16b\n"
2403 "mov v31.16b, v24.16b\n"
2404 "fmla v24.4s, v16.4s, v0.s[0]\n"
2405 "fmla v25.4s, v16.4s, v2.s[0]\n"
2406 "fmla v26.4s, v16.4s, v4.s[0]\n"
2407 "fmla v27.4s, v16.4s, v6.s[0]\n"
2408 "fmla v28.4s, v16.4s, v8.s[0]\n"
2409 "fmla v29.4s, v16.4s, v10.s[0]\n"
2410 "fmla v30.4s, v16.4s, v12.s[0]\n"
2411 "fmla v31.4s, v16.4s, v14.s[0]\n"
2412 "fmla v24.4s, v17.4s, v0.s[1]\n"
2413 "fmla v25.4s, v17.4s, v2.s[1]\n"
2414 "fmla v26.4s, v17.4s, v4.s[1]\n"
2415 "fmla v27.4s, v17.4s, v6.s[1]\n"
2416 "fmla v28.4s, v17.4s, v8.s[1]\n"
2417 "fmla v29.4s, v17.4s, v10.s[1]\n"
2418 "fmla v30.4s, v17.4s, v12.s[1]\n"
2419 "fmla v31.4s, v17.4s, v14.s[1]\n"
2420 "fmla v24.4s, v18.4s, v0.s[2]\n"
2421 "fmla v25.4s, v18.4s, v2.s[2]\n"
2422 "fmla v26.4s, v18.4s, v4.s[2]\n"
2423 "fmla v27.4s, v18.4s, v6.s[2]\n"
2424 "fmla v28.4s, v18.4s, v8.s[2]\n"
2425 "fmla v29.4s, v18.4s, v10.s[2]\n"
2426 "fmla v30.4s, v18.4s, v12.s[2]\n"
2427 "fmla v31.4s, v18.4s, v14.s[2]\n"
2428 "fmla v24.4s, v19.4s, v0.s[3]\n"
2429 "fmla v25.4s, v19.4s, v2.s[3]\n"
2430 "fmla v26.4s, v19.4s, v4.s[3]\n"
2431 "fmla v27.4s, v19.4s, v6.s[3]\n"
2432 "fmla v28.4s, v19.4s, v8.s[3]\n"
2433 "fmla v29.4s, v19.4s, v10.s[3]\n"
2434 "fmla v30.4s, v19.4s, v12.s[3]\n"
2435 "fmla v31.4s, v19.4s, v14.s[3]\n"
2436 "fmla v24.4s, v20.4s, v1.s[0]\n"
2437 "fmla v25.4s, v20.4s, v3.s[0]\n"
2438 "fmla v26.4s, v20.4s, v5.s[0]\n"
2439 "fmla v27.4s, v20.4s, v7.s[0]\n"
2440 "fmla v28.4s, v20.4s, v9.s[0]\n"
2441 "fmla v29.4s, v20.4s, v11.s[0]\n"
2442 "fmla v30.4s, v20.4s, v13.s[0]\n"
2443 "fmla v31.4s, v20.4s, v15.s[0]\n"
2444 "fmla v24.4s, v21.4s, v1.s[1]\n"
2445 "fmla v25.4s, v21.4s, v3.s[1]\n"
2446 "fmla v26.4s, v21.4s, v5.s[1]\n"
2447 "fmla v27.4s, v21.4s, v7.s[1]\n"
2448 "fmla v28.4s, v21.4s, v9.s[1]\n"
2449 "fmla v29.4s, v21.4s, v11.s[1]\n"
2450 "fmla v30.4s, v21.4s, v13.s[1]\n"
2451 "fmla v31.4s, v21.4s, v15.s[1]\n"
2452 "fmla v24.4s, v22.4s, v1.s[2]\n"
2453 "fmla v25.4s, v22.4s, v3.s[2]\n"
2454 "fmla v26.4s, v22.4s, v5.s[2]\n"
2455 "fmla v27.4s, v22.4s, v7.s[2]\n"
2456 "fmla v28.4s, v22.4s, v9.s[2]\n"
2457 "fmla v29.4s, v22.4s, v11.s[2]\n"
2458 "fmla v30.4s, v22.4s, v13.s[2]\n"
2459 "fmla v31.4s, v22.4s, v15.s[2]\n"
2462 "ld1r {v22.4s}, [%[minptr]]\n"
2463 "subs %[loops], %[loops], #0x1\n"
2464 "ld1r {v23.4s}, [%[maxptr]]\n"
2465 "ldr q16, [%[b_ptr0]]\n"
2466 "fmax v24.4s, v24.4s, v22.4s\n"
2467 "ldr q17, [%[b_ptr0], #0x10]\n"
2468 "fmax v25.4s, v25.4s, v22.4s\n"
2469 "ldr q18, [%[b_ptr0], #0x20]\n"
2470 "fmax v26.4s, v26.4s, v22.4s\n"
2471 "ldr q19, [%[b_ptr0], #0x30]\n"
2472 "fmax v27.4s, v27.4s, v22.4s\n"
2473 "ldr q20, [%[b_ptr0], #0x40]\n"
2474 "fmin v24.4s, v24.4s, v23.4s\n"
2475 "ldr q21, [%[b_ptr0], #0x50]\n"
2476 "fmin v25.4s, v25.4s, v23.4s\n"
2477 "fmin v26.4s, v26.4s, v23.4s\n"
2478 "fmin v27.4s, v27.4s, v23.4s\n"
2479 "str q24, [%[c_ptr0]]\n"
2480 "fmax v28.4s, v28.4s, v22.4s\n"
2481 "ldr q24, [%[biasptr]]\n"
2482 "fmax v29.4s, v29.4s, v22.4s\n"
2483 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2484 "fmax v30.4s, v30.4s, v22.4s\n"
2485 "str q25, [c_ptr1]\n"
2486 "fmin v28.4s, v28.4s, v23.4s\n"
2487 "add c_ptr1, c_ptr1, #0x10\n"
2488 "fmin v29.4s, v29.4s, v23.4s\n"
2489 "str q26, [c_ptr2]\n"
2490 "fmin v30.4s, v30.4s, v23.4s\n"
2491 "add c_ptr2, c_ptr2, #0x10\n"
2492 "fmax v31.4s, v31.4s, v22.4s\n"
2493 "str q27, [c_ptr3]\n"
2494 "mov v25.16b, v24.16b\n"
2495 "ldr q22, [%[b_ptr0], #0x60]\n"
2496 "mov v26.16b, v24.16b\n"
2497 "add c_ptr3, c_ptr3, #0x10\n"
2498 "fmin v31.4s, v31.4s, v23.4s\n"
2499 "str q28, [c_ptr4]\n"
2500 "mov v27.16b, v24.16b\n"
2501 "add c_ptr4, c_ptr4, #0x10\n"
2502 "mov v28.16b, v24.16b\n"
2503 "str q29, [c_ptr5]\n"
2504 "mov v29.16b, v24.16b\n"
2505 "add c_ptr5, c_ptr5, #0x10\n"
2506 "fmla v25.4s, v16.4s, v2.s[0]\n"
2507 "str q30, [c_ptr6]\n"
2508 "mov v30.16b, v24.16b\n"
2509 "add c_ptr6, c_ptr6, #0x10\n"
2510 "fmla v26.4s, v16.4s, v4.s[0]\n"
2511 "str q31, [c_ptr7]\n"
2512 "mov v31.16b, v24.16b\n"
2513 "add c_ptr7, c_ptr7, #0x10\n"
2514 "fmla v24.4s, v16.4s, v0.s[0]\n"
2515 "add %[b_ptr0], %[b_ptr0], #0x70\n"
2516 "fmla v27.4s, v16.4s, v6.s[0]\n"
2517 "add %[biasptr], %[biasptr], %[biasinc]\n"
2518 "fmla v28.4s, v16.4s, v8.s[0]\n"
2519 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2520 "fmla v29.4s, v16.4s, v10.s[0]\n"
2521 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2522 "fmla v30.4s, v16.4s, v12.s[0]\n"
2523 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2524 "fmla v31.4s, v16.4s, v14.s[0]\n"
2525 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2526 "fmla v24.4s, v17.4s, v0.s[1]\n"
2527 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2528 "fmla v25.4s, v17.4s, v2.s[1]\n"
2529 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2530 "fmla v26.4s, v17.4s, v4.s[1]\n"
2531 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
2532 "fmla v27.4s, v17.4s, v6.s[1]\n"
2533 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
2534 "fmla v28.4s, v17.4s, v8.s[1]\n"
2535 "fmla v29.4s, v17.4s, v10.s[1]\n"
2536 "fmla v30.4s, v17.4s, v12.s[1]\n"
2537 "fmla v31.4s, v17.4s, v14.s[1]\n"
2538 "fmla v24.4s, v18.4s, v0.s[2]\n"
2539 "fmla v25.4s, v18.4s, v2.s[2]\n"
2540 "fmla v26.4s, v18.4s, v4.s[2]\n"
2541 "fmla v27.4s, v18.4s, v6.s[2]\n"
2542 "fmla v28.4s, v18.4s, v8.s[2]\n"
2543 "fmla v29.4s, v18.4s, v10.s[2]\n"
2544 "fmla v30.4s, v18.4s, v12.s[2]\n"
2545 "fmla v31.4s, v18.4s, v14.s[2]\n"
2546 "fmla v24.4s, v19.4s, v0.s[3]\n"
2547 "fmla v25.4s, v19.4s, v2.s[3]\n"
2548 "fmla v26.4s, v19.4s, v4.s[3]\n"
2549 "fmla v27.4s, v19.4s, v6.s[3]\n"
2550 "fmla v28.4s, v19.4s, v8.s[3]\n"
2551 "fmla v29.4s, v19.4s, v10.s[3]\n"
2552 "fmla v30.4s, v19.4s, v12.s[3]\n"
2553 "fmla v31.4s, v19.4s, v14.s[3]\n"
2554 "fmla v24.4s, v20.4s, v1.s[0]\n"
2555 "fmla v25.4s, v20.4s, v3.s[0]\n"
2556 "fmla v26.4s, v20.4s, v5.s[0]\n"
2557 "fmla v27.4s, v20.4s, v7.s[0]\n"
2558 "fmla v28.4s, v20.4s, v9.s[0]\n"
2559 "fmla v29.4s, v20.4s, v11.s[0]\n"
2560 "fmla v30.4s, v20.4s, v13.s[0]\n"
2561 "fmla v31.4s, v20.4s, v15.s[0]\n"
2562 "fmla v24.4s, v21.4s, v1.s[1]\n"
2563 "fmla v25.4s, v21.4s, v3.s[1]\n"
2564 "fmla v26.4s, v21.4s, v5.s[1]\n"
2565 "fmla v27.4s, v21.4s, v7.s[1]\n"
2566 "fmla v28.4s, v21.4s, v9.s[1]\n"
2567 "fmla v29.4s, v21.4s, v11.s[1]\n"
2568 "fmla v30.4s, v21.4s, v13.s[1]\n"
2569 "fmla v31.4s, v21.4s, v15.s[1]\n"
2570 "fmla v24.4s, v22.4s, v1.s[2]\n"
2571 "fmla v25.4s, v22.4s, v3.s[2]\n"
2572 "fmla v26.4s, v22.4s, v5.s[2]\n"
2573 "fmla v27.4s, v22.4s, v7.s[2]\n"
2574 "fmla v28.4s, v22.4s, v9.s[2]\n"
2575 "fmla v29.4s, v22.4s, v11.s[2]\n"
2576 "fmla v30.4s, v22.4s, v13.s[2]\n"
2577 "fmla v31.4s, v22.4s, v15.s[2]\n"
2580 "ld1r {v22.4s}, [%[minptr]]\n"
2581 "ld1r {v23.4s}, [%[maxptr]]\n"
2582 "ldr q16, [%[b_ptr0]]\n"
2583 "ldr q17, [%[b_ptr0], #0x10]\n"
2584 "fmax v24.4s, v24.4s, v22.4s\n"
2585 "ldr q18, [%[b_ptr0], #0x20]\n"
2586 "fmax v25.4s, v25.4s, v22.4s\n"
2587 "ldr q19, [%[b_ptr0], #0x30]\n"
2588 "fmax v26.4s, v26.4s, v22.4s\n"
2589 "ldr q20, [%[b_ptr0], #0x40]\n"
2590 "fmax v27.4s, v27.4s, v22.4s\n"
2591 "ldr q21, [%[b_ptr0], #0x50]\n"
2592 "fmin v24.4s, v24.4s, v23.4s\n"
2593 "fmin v25.4s, v25.4s, v23.4s\n"
2594 "fmin v26.4s, v26.4s, v23.4s\n"
2595 "fmin v27.4s, v27.4s, v23.4s\n"
2596 "str q24, [%[c_ptr0]]\n"
2597 "fmax v28.4s, v28.4s, v22.4s\n"
2598 "ldr q24, [%[biasptr]]\n"
2599 "fmax v29.4s, v29.4s, v22.4s\n"
2600 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2601 "fmax v30.4s, v30.4s, v22.4s\n"
2602 "str q25, [c_ptr1]\n"
2603 "fmin v28.4s, v28.4s, v23.4s\n"
2604 "add c_ptr1, c_ptr1, #0x10\n"
2605 "fmin v29.4s, v29.4s, v23.4s\n"
2606 "str q26, [c_ptr2]\n"
2607 "fmin v30.4s, v30.4s, v23.4s\n"
2608 "add c_ptr2, c_ptr2, #0x10\n"
2609 "fmax v31.4s, v31.4s, v22.4s\n"
2610 "str q27, [c_ptr3]\n"
2611 "mov v25.16b, v24.16b\n"
2612 "ldr q22, [%[b_ptr0], #0x60]\n"
2613 "mov v26.16b, v24.16b\n"
2614 "add c_ptr3, c_ptr3, #0x10\n"
2615 "fmin v31.4s, v31.4s, v23.4s\n"
2616 "str q28, [c_ptr4]\n"
2617 "mov v27.16b, v24.16b\n"
2618 "add c_ptr4, c_ptr4, #0x10\n"
2619 "mov v28.16b, v24.16b\n"
2620 "str q29, [c_ptr5]\n"
2621 "mov v29.16b, v24.16b\n"
2622 "add c_ptr5, c_ptr5, #0x10\n"
2623 "fmla v25.4s, v16.4s, v2.s[0]\n"
2624 "str q30, [c_ptr6]\n"
2625 "mov v30.16b, v24.16b\n"
2626 "add c_ptr6, c_ptr6, #0x10\n"
2627 "fmla v26.4s, v16.4s, v4.s[0]\n"
2628 "str q31, [c_ptr7]\n"
2629 "mov v31.16b, v24.16b\n"
2630 "add c_ptr7, c_ptr7, #0x10\n"
2631 "fmla v24.4s, v16.4s, v0.s[0]\n"
2632 "add %[b_ptr0], %[b_ptr0], #0x70\n"
2633 "fmla v27.4s, v16.4s, v6.s[0]\n"
2634 "add %[biasptr], %[biasptr], %[biasinc]\n"
2635 "fmla v28.4s, v16.4s, v8.s[0]\n"
2636 "fmla v29.4s, v16.4s, v10.s[0]\n"
2637 "fmla v30.4s, v16.4s, v12.s[0]\n"
2638 "fmla v31.4s, v16.4s, v14.s[0]\n"
2639 "fmla v24.4s, v17.4s, v0.s[1]\n"
2640 "fmla v25.4s, v17.4s, v2.s[1]\n"
2641 "fmla v26.4s, v17.4s, v4.s[1]\n"
2642 "fmla v27.4s, v17.4s, v6.s[1]\n"
2643 "fmla v28.4s, v17.4s, v8.s[1]\n"
2644 "fmla v29.4s, v17.4s, v10.s[1]\n"
2645 "fmla v30.4s, v17.4s, v12.s[1]\n"
2646 "fmla v31.4s, v17.4s, v14.s[1]\n"
2647 "fmla v24.4s, v18.4s, v0.s[2]\n"
2648 "fmla v25.4s, v18.4s, v2.s[2]\n"
2649 "fmla v26.4s, v18.4s, v4.s[2]\n"
2650 "fmla v27.4s, v18.4s, v6.s[2]\n"
2651 "fmla v28.4s, v18.4s, v8.s[2]\n"
2652 "fmla v29.4s, v18.4s, v10.s[2]\n"
2653 "fmla v30.4s, v18.4s, v12.s[2]\n"
2654 "fmla v31.4s, v18.4s, v14.s[2]\n"
2655 "fmla v24.4s, v19.4s, v0.s[3]\n"
2656 "fmla v25.4s, v19.4s, v2.s[3]\n"
2657 "fmla v26.4s, v19.4s, v4.s[3]\n"
2658 "fmla v27.4s, v19.4s, v6.s[3]\n"
2659 "fmla v28.4s, v19.4s, v8.s[3]\n"
2660 "fmla v29.4s, v19.4s, v10.s[3]\n"
2661 "fmla v30.4s, v19.4s, v12.s[3]\n"
2662 "fmla v31.4s, v19.4s, v14.s[3]\n"
2663 "fmla v24.4s, v20.4s, v1.s[0]\n"
2664 "fmla v25.4s, v20.4s, v3.s[0]\n"
2665 "fmla v26.4s, v20.4s, v5.s[0]\n"
2666 "fmla v27.4s, v20.4s, v7.s[0]\n"
2667 "fmla v28.4s, v20.4s, v9.s[0]\n"
2668 "fmla v29.4s, v20.4s, v11.s[0]\n"
2669 "fmla v30.4s, v20.4s, v13.s[0]\n"
2670 "fmla v31.4s, v20.4s, v15.s[0]\n"
2671 "fmla v24.4s, v21.4s, v1.s[1]\n"
2672 "fmla v25.4s, v21.4s, v3.s[1]\n"
2673 "fmla v26.4s, v21.4s, v5.s[1]\n"
2674 "fmla v27.4s, v21.4s, v7.s[1]\n"
2675 "fmla v28.4s, v21.4s, v9.s[1]\n"
2676 "fmla v29.4s, v21.4s, v11.s[1]\n"
2677 "fmla v30.4s, v21.4s, v13.s[1]\n"
2678 "fmla v31.4s, v21.4s, v15.s[1]\n"
2679 "fmla v24.4s, v22.4s, v1.s[2]\n"
2680 "fmla v25.4s, v22.4s, v3.s[2]\n"
2681 "fmla v26.4s, v22.4s, v5.s[2]\n"
2682 "fmla v27.4s, v22.4s, v7.s[2]\n"
2683 "fmla v28.4s, v22.4s, v9.s[2]\n"
2684 "fmla v29.4s, v22.4s, v11.s[2]\n"
2685 "fmla v30.4s, v22.4s, v13.s[2]\n"
2686 "fmla v31.4s, v22.4s, v15.s[2]\n"
2689 "ldr q24, [%[biasptr]]\n"
2690 "add %[biasptr], %[biasptr], %[biasinc]\n"
2691 "mov v25.16b, v24.16b\n"
2692 "mov v26.16b, v24.16b\n"
2693 "mov v27.16b, v24.16b\n"
2694 "mov v28.16b, v24.16b\n"
2695 "mov v29.16b, v24.16b\n"
2696 "mov v30.16b, v24.16b\n"
2697 "mov v31.16b, v24.16b\n"
2698 "fmla v24.4s, v16.4s, v0.s[0]\n"
2699 "fmla v25.4s, v16.4s, v2.s[0]\n"
2700 "fmla v26.4s, v16.4s, v4.s[0]\n"
2701 "fmla v27.4s, v16.4s, v6.s[0]\n"
2702 "fmla v28.4s, v16.4s, v8.s[0]\n"
2703 "fmla v29.4s, v16.4s, v10.s[0]\n"
2704 "fmla v30.4s, v16.4s, v12.s[0]\n"
2705 "fmla v31.4s, v16.4s, v14.s[0]\n"
2706 "fmla v24.4s, v17.4s, v0.s[1]\n"
2707 "fmla v25.4s, v17.4s, v2.s[1]\n"
2708 "fmla v26.4s, v17.4s, v4.s[1]\n"
2709 "fmla v27.4s, v17.4s, v6.s[1]\n"
2710 "fmla v28.4s, v17.4s, v8.s[1]\n"
2711 "fmla v29.4s, v17.4s, v10.s[1]\n"
2712 "fmla v30.4s, v17.4s, v12.s[1]\n"
2713 "fmla v31.4s, v17.4s, v14.s[1]\n"
2714 "fmla v24.4s, v18.4s, v0.s[2]\n"
2715 "fmla v25.4s, v18.4s, v2.s[2]\n"
2716 "fmla v26.4s, v18.4s, v4.s[2]\n"
2717 "fmla v27.4s, v18.4s, v6.s[2]\n"
2718 "fmla v28.4s, v18.4s, v8.s[2]\n"
2719 "fmla v29.4s, v18.4s, v10.s[2]\n"
2720 "fmla v30.4s, v18.4s, v12.s[2]\n"
2721 "fmla v31.4s, v18.4s, v14.s[2]\n"
2722 "fmla v24.4s, v19.4s, v0.s[3]\n"
2723 "fmla v25.4s, v19.4s, v2.s[3]\n"
2724 "fmla v26.4s, v19.4s, v4.s[3]\n"
2725 "fmla v27.4s, v19.4s, v6.s[3]\n"
2726 "fmla v28.4s, v19.4s, v8.s[3]\n"
2727 "fmla v29.4s, v19.4s, v10.s[3]\n"
2728 "fmla v30.4s, v19.4s, v12.s[3]\n"
2729 "fmla v31.4s, v19.4s, v14.s[3]\n"
2730 "fmla v24.4s, v20.4s, v1.s[0]\n"
2731 "fmla v25.4s, v20.4s, v3.s[0]\n"
2732 "fmla v26.4s, v20.4s, v5.s[0]\n"
2733 "fmla v27.4s, v20.4s, v7.s[0]\n"
2734 "fmla v28.4s, v20.4s, v9.s[0]\n"
2735 "fmla v29.4s, v20.4s, v11.s[0]\n"
2736 "fmla v30.4s, v20.4s, v13.s[0]\n"
2737 "fmla v31.4s, v20.4s, v15.s[0]\n"
2738 "fmla v24.4s, v21.4s, v1.s[1]\n"
2739 "fmla v25.4s, v21.4s, v3.s[1]\n"
2740 "fmla v26.4s, v21.4s, v5.s[1]\n"
2741 "fmla v27.4s, v21.4s, v7.s[1]\n"
2742 "fmla v28.4s, v21.4s, v9.s[1]\n"
2743 "fmla v29.4s, v21.4s, v11.s[1]\n"
2744 "fmla v30.4s, v21.4s, v13.s[1]\n"
2745 "fmla v31.4s, v21.4s, v15.s[1]\n"
2746 "fmla v24.4s, v22.4s, v1.s[2]\n"
2747 "fmla v25.4s, v22.4s, v3.s[2]\n"
2748 "fmla v26.4s, v22.4s, v5.s[2]\n"
2749 "fmla v27.4s, v22.4s, v7.s[2]\n"
2750 "fmla v28.4s, v22.4s, v9.s[2]\n"
2751 "fmla v29.4s, v22.4s, v11.s[2]\n"
2752 "fmla v30.4s, v22.4s, v13.s[2]\n"
2753 "fmla v31.4s, v22.4s, v15.s[2]\n"
2755 "ld1r {v22.4s}, [%[minptr]]\n"
2756 "ld1r {v23.4s}, [%[maxptr]]\n"
2757 "fmax v24.4s, v24.4s, v22.4s\n"
2758 "fmax v25.4s, v25.4s, v22.4s\n"
2759 "fmax v26.4s, v26.4s, v22.4s\n"
2760 "fmax v27.4s, v27.4s, v22.4s\n"
2761 "fmin v24.4s, v24.4s, v23.4s\n"
2762 "fmin v25.4s, v25.4s, v23.4s\n"
2763 "fmin v26.4s, v26.4s, v23.4s\n"
2764 "fmin v27.4s, v27.4s, v23.4s\n"
2765 "str q24, [%[c_ptr0]]\n"
2766 "fmax v28.4s, v28.4s, v22.4s\n"
2767 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2768 "fmax v29.4s, v29.4s, v22.4s\n"
2769 "str q25, [c_ptr1]\n"
2770 "fmax v30.4s, v30.4s, v22.4s\n"
2771 "fmin v28.4s, v28.4s, v23.4s\n"
2772 "fmax v31.4s, v31.4s, v22.4s\n"
2773 "str q26, [c_ptr2]\n"
2774 "fmin v29.4s, v29.4s, v23.4s\n"
2775 "fmin v30.4s, v30.4s, v23.4s\n"
2776 "fmin v31.4s, v31.4s, v23.4s\n"
2777 "str q27, [c_ptr3]\n"
2778 "str q28, [c_ptr4]\n"
2779 "str q29, [c_ptr5]\n"
2780 "str q30, [c_ptr6]\n"
2781 "str q31, [c_ptr7]\n"
2796 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
2797 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
2798 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2818 "add a_ptr1, %[a_ptr0], %[lda]\n"
2819 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2820 "add a_ptr2, a_ptr1, %[lda]\n"
2821 "add c_ptr2, c_ptr1, %[ldc]\n"
2822 "add a_ptr3, a_ptr2, %[lda]\n"
2823 "add c_ptr3, c_ptr2, %[ldc]\n"
2824 "add a_ptr4, a_ptr3, %[lda]\n"
2825 "add c_ptr4, c_ptr3, %[ldc]\n"
2826 "add a_ptr5, a_ptr4, %[lda]\n"
2827 "add c_ptr5, c_ptr4, %[ldc]\n"
2828 "add a_ptr6, a_ptr5, %[lda]\n"
2829 "add c_ptr6, c_ptr5, %[ldc]\n"
2830 "add a_ptr7, a_ptr6, %[lda]\n"
2831 "add c_ptr7, c_ptr6, %[ldc]\n"
2832 "cbz %[oob_rows], 1f\n"
2833 "subs %[oob_rows], %[oob_rows], #0x1\n"
2834 "add c_ptr7, %[c_ptr0], #0x0\n"
2835 "add a_ptr7, %[a_ptr0], #0x0\n"
2837 "subs %[oob_rows], %[oob_rows], #0x1\n"
2838 "add c_ptr6, %[c_ptr0], #0x0\n"
2839 "add a_ptr6, %[a_ptr0], #0x0\n"
2841 "subs %[oob_rows], %[oob_rows], #0x1\n"
2842 "add c_ptr5, %[c_ptr0], #0x0\n"
2843 "add a_ptr5, %[a_ptr0], #0x0\n"
2845 "subs %[oob_rows], %[oob_rows], #0x1\n"
2846 "add c_ptr4, %[c_ptr0], #0x0\n"
2847 "add a_ptr4, %[a_ptr0], #0x0\n"
2849 "subs %[oob_rows], %[oob_rows], #0x1\n"
2850 "add c_ptr3, %[c_ptr0], #0x0\n"
2851 "add a_ptr3, %[a_ptr0], #0x0\n"
2853 "subs %[oob_rows], %[oob_rows], #0x1\n"
2854 "add c_ptr2, %[c_ptr0], #0x0\n"
2855 "add a_ptr2, %[a_ptr0], #0x0\n"
2857 "subs %[oob_rows], %[oob_rows], #0x1\n"
2858 "add c_ptr1, %[c_ptr0], #0x0\n"
2859 "add a_ptr1, %[a_ptr0], #0x0\n"
2861 "ldr q0, [%[a_ptr0]], #0x10\n"
2862 "ldr q2, [a_ptr1], #0x10\n"
2863 "ldr q4, [a_ptr2], #0x10\n"
2864 "ldr q6, [a_ptr3], #0x10\n"
2865 "ldr q8, [a_ptr4], #0x10\n"
2866 "ldr q10, [a_ptr5], #0x10\n"
2867 "ldr q12, [a_ptr6], #0x10\n"
2868 "ldr q14, [a_ptr7], #0x10\n"
2869 "ldr q1, [%[a_ptr0]]\n"
2870 "ldr q3, [a_ptr1]\n"
2871 "ldr q5, [a_ptr2]\n"
2872 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
2873 "ldr q7, [a_ptr3]\n"
2874 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
2875 "ldr q9, [a_ptr4]\n"
2876 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
2877 "ldr q11, [a_ptr5]\n"
2878 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
2879 "ldr q13, [a_ptr6]\n"
2880 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
2881 "ldr q15, [a_ptr7]\n"
2882 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
2883 "ldr q16, [%[b_ptr0]]\n"
2884 "ldr q17, [%[b_ptr0], #0x10]\n"
2885 "ldr q18, [%[b_ptr0], #0x20]\n"
2886 "ldr q19, [%[b_ptr0], #0x30]\n"
2887 "ldr q20, [%[b_ptr0], #0x40]\n"
2888 "ldr q21, [%[b_ptr0], #0x50]\n"
2889 "ldr q22, [%[b_ptr0], #0x60]\n"
2890 "ldr q23, [%[b_ptr0], #0x70]\n"
2891 "add %[b_ptr0], %[b_ptr0], #0x80\n"
2892 "cbz %[loops], 2f\n"
2893 "ldr q24, [%[biasptr]]\n"
2894 "add %[biasptr], %[biasptr], %[biasinc]\n"
2895 "subs %[loops], %[loops], #0x1\n"
2896 "mov v25.16b, v24.16b\n"
2897 "mov v26.16b, v24.16b\n"
2898 "mov v27.16b, v24.16b\n"
2899 "mov v28.16b, v24.16b\n"
2900 "mov v29.16b, v24.16b\n"
2901 "mov v30.16b, v24.16b\n"
2902 "mov v31.16b, v24.16b\n"
2903 "fmla v24.4s, v16.4s, v0.s[0]\n"
2904 "fmla v25.4s, v16.4s, v2.s[0]\n"
2905 "fmla v26.4s, v16.4s, v4.s[0]\n"
2906 "fmla v27.4s, v16.4s, v6.s[0]\n"
2907 "fmla v28.4s, v16.4s, v8.s[0]\n"
2908 "fmla v29.4s, v16.4s, v10.s[0]\n"
2909 "fmla v30.4s, v16.4s, v12.s[0]\n"
2910 "fmla v31.4s, v16.4s, v14.s[0]\n"
2911 "fmla v24.4s, v17.4s, v0.s[1]\n"
2912 "fmla v25.4s, v17.4s, v2.s[1]\n"
2913 "fmla v26.4s, v17.4s, v4.s[1]\n"
2914 "fmla v27.4s, v17.4s, v6.s[1]\n"
2915 "fmla v28.4s, v17.4s, v8.s[1]\n"
2916 "fmla v29.4s, v17.4s, v10.s[1]\n"
2917 "fmla v30.4s, v17.4s, v12.s[1]\n"
2918 "fmla v31.4s, v17.4s, v14.s[1]\n"
2919 "fmla v24.4s, v18.4s, v0.s[2]\n"
2920 "fmla v25.4s, v18.4s, v2.s[2]\n"
2921 "fmla v26.4s, v18.4s, v4.s[2]\n"
2922 "fmla v27.4s, v18.4s, v6.s[2]\n"
2923 "fmla v28.4s, v18.4s, v8.s[2]\n"
2924 "fmla v29.4s, v18.4s, v10.s[2]\n"
2925 "fmla v30.4s, v18.4s, v12.s[2]\n"
2926 "fmla v31.4s, v18.4s, v14.s[2]\n"
2927 "fmla v24.4s, v19.4s, v0.s[3]\n"
2928 "fmla v25.4s, v19.4s, v2.s[3]\n"
2929 "fmla v26.4s, v19.4s, v4.s[3]\n"
2930 "fmla v27.4s, v19.4s, v6.s[3]\n"
2931 "fmla v28.4s, v19.4s, v8.s[3]\n"
2932 "fmla v29.4s, v19.4s, v10.s[3]\n"
2933 "fmla v30.4s, v19.4s, v12.s[3]\n"
2934 "fmla v31.4s, v19.4s, v14.s[3]\n"
2935 "fmla v24.4s, v20.4s, v1.s[0]\n"
2936 "fmla v25.4s, v20.4s, v3.s[0]\n"
2937 "fmla v26.4s, v20.4s, v5.s[0]\n"
2938 "fmla v27.4s, v20.4s, v7.s[0]\n"
2939 "fmla v28.4s, v20.4s, v9.s[0]\n"
2940 "fmla v29.4s, v20.4s, v11.s[0]\n"
2941 "fmla v30.4s, v20.4s, v13.s[0]\n"
2942 "fmla v31.4s, v20.4s, v15.s[0]\n"
2943 "fmla v24.4s, v21.4s, v1.s[1]\n"
2944 "fmla v25.4s, v21.4s, v3.s[1]\n"
2945 "fmla v26.4s, v21.4s, v5.s[1]\n"
2946 "fmla v27.4s, v21.4s, v7.s[1]\n"
2947 "fmla v28.4s, v21.4s, v9.s[1]\n"
2948 "fmla v29.4s, v21.4s, v11.s[1]\n"
2949 "fmla v30.4s, v21.4s, v13.s[1]\n"
2950 "fmla v31.4s, v21.4s, v15.s[1]\n"
2951 "fmla v24.4s, v22.4s, v1.s[2]\n"
2952 "fmla v25.4s, v22.4s, v3.s[2]\n"
2953 "fmla v26.4s, v22.4s, v5.s[2]\n"
2954 "fmla v27.4s, v22.4s, v7.s[2]\n"
2955 "fmla v28.4s, v22.4s, v9.s[2]\n"
2956 "fmla v29.4s, v22.4s, v11.s[2]\n"
2957 "fmla v30.4s, v22.4s, v13.s[2]\n"
2958 "fmla v31.4s, v22.4s, v15.s[2]\n"
2959 "fmla v24.4s, v23.4s, v1.s[3]\n"
2960 "fmla v25.4s, v23.4s, v3.s[3]\n"
2961 "fmla v26.4s, v23.4s, v5.s[3]\n"
2962 "fmla v27.4s, v23.4s, v7.s[3]\n"
2963 "fmla v28.4s, v23.4s, v9.s[3]\n"
2964 "fmla v29.4s, v23.4s, v11.s[3]\n"
2965 "fmla v30.4s, v23.4s, v13.s[3]\n"
2966 "fmla v31.4s, v23.4s, v15.s[3]\n"
2969 "ld1r {v22.4s}, [%[minptr]]\n"
2970 "subs %[loops], %[loops], #0x1\n"
2971 "ld1r {v23.4s}, [%[maxptr]]\n"
2972 "ldr q16, [%[b_ptr0]]\n"
2973 "fmax v24.4s, v24.4s, v22.4s\n"
2974 "ldr q17, [%[b_ptr0], #0x10]\n"
2975 "fmax v25.4s, v25.4s, v22.4s\n"
2976 "ldr q18, [%[b_ptr0], #0x20]\n"
2977 "fmax v26.4s, v26.4s, v22.4s\n"
2978 "ldr q19, [%[b_ptr0], #0x30]\n"
2979 "fmax v27.4s, v27.4s, v22.4s\n"
2980 "ldr q20, [%[b_ptr0], #0x40]\n"
2981 "fmin v24.4s, v24.4s, v23.4s\n"
2982 "ldr q21, [%[b_ptr0], #0x50]\n"
2983 "fmin v25.4s, v25.4s, v23.4s\n"
2984 "fmin v26.4s, v26.4s, v23.4s\n"
2985 "fmin v27.4s, v27.4s, v23.4s\n"
2986 "str q24, [%[c_ptr0]]\n"
2987 "fmax v28.4s, v28.4s, v22.4s\n"
2988 "ldr q24, [%[biasptr]]\n"
2989 "fmax v29.4s, v29.4s, v22.4s\n"
2990 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2991 "fmax v30.4s, v30.4s, v22.4s\n"
2992 "str q25, [c_ptr1]\n"
2993 "fmin v28.4s, v28.4s, v23.4s\n"
2994 "add c_ptr1, c_ptr1, #0x10\n"
2995 "fmin v29.4s, v29.4s, v23.4s\n"
2996 "str q26, [c_ptr2]\n"
2997 "fmin v30.4s, v30.4s, v23.4s\n"
2998 "add c_ptr2, c_ptr2, #0x10\n"
2999 "fmax v31.4s, v31.4s, v22.4s\n"
3000 "str q27, [c_ptr3]\n"
3001 "mov v25.16b, v24.16b\n"
3002 "ldr q22, [%[b_ptr0], #0x60]\n"
3003 "mov v26.16b, v24.16b\n"
3004 "add c_ptr3, c_ptr3, #0x10\n"
3005 "fmin v31.4s, v31.4s, v23.4s\n"
3006 "str q28, [c_ptr4]\n"
3007 "mov v27.16b, v24.16b\n"
3008 "ldr q23, [%[b_ptr0], #0x70]\n"
3009 "mov v28.16b, v24.16b\n"
3010 "add c_ptr4, c_ptr4, #0x10\n"
3011 "fmla v25.4s, v16.4s, v2.s[0]\n"
3012 "str q29, [c_ptr5]\n"
3013 "mov v29.16b, v24.16b\n"
3014 "add c_ptr5, c_ptr5, #0x10\n"
3015 "fmla v26.4s, v16.4s, v4.s[0]\n"
3016 "str q30, [c_ptr6]\n"
3017 "mov v30.16b, v24.16b\n"
3018 "add c_ptr6, c_ptr6, #0x10\n"
3019 "fmla v27.4s, v16.4s, v6.s[0]\n"
3020 "str q31, [c_ptr7]\n"
3021 "mov v31.16b, v24.16b\n"
3022 "add c_ptr7, c_ptr7, #0x10\n"
3023 "fmla v24.4s, v16.4s, v0.s[0]\n"
3024 "add %[b_ptr0], %[b_ptr0], #0x80\n"
3025 "fmla v28.4s, v16.4s, v8.s[0]\n"
3026 "add %[biasptr], %[biasptr], %[biasinc]\n"
3027 "fmla v29.4s, v16.4s, v10.s[0]\n"
3028 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
3029 "fmla v30.4s, v16.4s, v12.s[0]\n"
3030 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
3031 "fmla v31.4s, v16.4s, v14.s[0]\n"
3032 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
3033 "fmla v24.4s, v17.4s, v0.s[1]\n"
3034 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
3035 "fmla v25.4s, v17.4s, v2.s[1]\n"
3036 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
3037 "fmla v26.4s, v17.4s, v4.s[1]\n"
3038 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
3039 "fmla v27.4s, v17.4s, v6.s[1]\n"
3040 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
3041 "fmla v28.4s, v17.4s, v8.s[1]\n"
3042 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
3043 "fmla v29.4s, v17.4s, v10.s[1]\n"
3044 "fmla v30.4s, v17.4s, v12.s[1]\n"
3045 "fmla v31.4s, v17.4s, v14.s[1]\n"
3046 "fmla v24.4s, v18.4s, v0.s[2]\n"
3047 "fmla v25.4s, v18.4s, v2.s[2]\n"
3048 "fmla v26.4s, v18.4s, v4.s[2]\n"
3049 "fmla v27.4s, v18.4s, v6.s[2]\n"
3050 "fmla v28.4s, v18.4s, v8.s[2]\n"
3051 "fmla v29.4s, v18.4s, v10.s[2]\n"
3052 "fmla v30.4s, v18.4s, v12.s[2]\n"
3053 "fmla v31.4s, v18.4s, v14.s[2]\n"
3054 "fmla v24.4s, v19.4s, v0.s[3]\n"
3055 "fmla v25.4s, v19.4s, v2.s[3]\n"
3056 "fmla v26.4s, v19.4s, v4.s[3]\n"
3057 "fmla v27.4s, v19.4s, v6.s[3]\n"
3058 "fmla v28.4s, v19.4s, v8.s[3]\n"
3059 "fmla v29.4s, v19.4s, v10.s[3]\n"
3060 "fmla v30.4s, v19.4s, v12.s[3]\n"
3061 "fmla v31.4s, v19.4s, v14.s[3]\n"
3062 "fmla v24.4s, v20.4s, v1.s[0]\n"
3063 "fmla v25.4s, v20.4s, v3.s[0]\n"
3064 "fmla v26.4s, v20.4s, v5.s[0]\n"
3065 "fmla v27.4s, v20.4s, v7.s[0]\n"
3066 "fmla v28.4s, v20.4s, v9.s[0]\n"
3067 "fmla v29.4s, v20.4s, v11.s[0]\n"
3068 "fmla v30.4s, v20.4s, v13.s[0]\n"
3069 "fmla v31.4s, v20.4s, v15.s[0]\n"
3070 "fmla v24.4s, v21.4s, v1.s[1]\n"
3071 "fmla v25.4s, v21.4s, v3.s[1]\n"
3072 "fmla v26.4s, v21.4s, v5.s[1]\n"
3073 "fmla v27.4s, v21.4s, v7.s[1]\n"
3074 "fmla v28.4s, v21.4s, v9.s[1]\n"
3075 "fmla v29.4s, v21.4s, v11.s[1]\n"
3076 "fmla v30.4s, v21.4s, v13.s[1]\n"
3077 "fmla v31.4s, v21.4s, v15.s[1]\n"
3078 "fmla v24.4s, v22.4s, v1.s[2]\n"
3079 "fmla v25.4s, v22.4s, v3.s[2]\n"
3080 "fmla v26.4s, v22.4s, v5.s[2]\n"
3081 "fmla v27.4s, v22.4s, v7.s[2]\n"
3082 "fmla v28.4s, v22.4s, v9.s[2]\n"
3083 "fmla v29.4s, v22.4s, v11.s[2]\n"
3084 "fmla v30.4s, v22.4s, v13.s[2]\n"
3085 "fmla v31.4s, v22.4s, v15.s[2]\n"
3086 "fmla v24.4s, v23.4s, v1.s[3]\n"
3087 "fmla v25.4s, v23.4s, v3.s[3]\n"
3088 "fmla v26.4s, v23.4s, v5.s[3]\n"
3089 "fmla v27.4s, v23.4s, v7.s[3]\n"
3090 "fmla v28.4s, v23.4s, v9.s[3]\n"
3091 "fmla v29.4s, v23.4s, v11.s[3]\n"
3092 "fmla v30.4s, v23.4s, v13.s[3]\n"
3093 "fmla v31.4s, v23.4s, v15.s[3]\n"
3096 "ld1r {v22.4s}, [%[minptr]]\n"
3097 "ld1r {v23.4s}, [%[maxptr]]\n"
3098 "ldr q16, [%[b_ptr0]]\n"
3099 "ldr q17, [%[b_ptr0], #0x10]\n"
3100 "fmax v24.4s, v24.4s, v22.4s\n"
3101 "ldr q18, [%[b_ptr0], #0x20]\n"
3102 "fmax v25.4s, v25.4s, v22.4s\n"
3103 "ldr q19, [%[b_ptr0], #0x30]\n"
3104 "fmax v26.4s, v26.4s, v22.4s\n"
3105 "ldr q20, [%[b_ptr0], #0x40]\n"
3106 "fmax v27.4s, v27.4s, v22.4s\n"
3107 "ldr q21, [%[b_ptr0], #0x50]\n"
3108 "fmin v24.4s, v24.4s, v23.4s\n"
3109 "fmin v25.4s, v25.4s, v23.4s\n"
3110 "fmin v26.4s, v26.4s, v23.4s\n"
3111 "fmin v27.4s, v27.4s, v23.4s\n"
3112 "str q24, [%[c_ptr0]]\n"
3113 "fmax v28.4s, v28.4s, v22.4s\n"
3114 "ldr q24, [%[biasptr]]\n"
3115 "fmax v29.4s, v29.4s, v22.4s\n"
3116 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3117 "fmax v30.4s, v30.4s, v22.4s\n"
3118 "str q25, [c_ptr1]\n"
3119 "fmin v28.4s, v28.4s, v23.4s\n"
3120 "add c_ptr1, c_ptr1, #0x10\n"
3121 "fmin v29.4s, v29.4s, v23.4s\n"
3122 "str q26, [c_ptr2]\n"
3123 "fmin v30.4s, v30.4s, v23.4s\n"
3124 "add c_ptr2, c_ptr2, #0x10\n"
3125 "fmax v31.4s, v31.4s, v22.4s\n"
3126 "str q27, [c_ptr3]\n"
3127 "mov v25.16b, v24.16b\n"
3128 "ldr q22, [%[b_ptr0], #0x60]\n"
3129 "mov v26.16b, v24.16b\n"
3130 "add c_ptr3, c_ptr3, #0x10\n"
3131 "fmin v31.4s, v31.4s, v23.4s\n"
3132 "str q28, [c_ptr4]\n"
3133 "mov v27.16b, v24.16b\n"
3134 "ldr q23, [%[b_ptr0], #0x70]\n"
3135 "mov v28.16b, v24.16b\n"
3136 "add c_ptr4, c_ptr4, #0x10\n"
3137 "fmla v25.4s, v16.4s, v2.s[0]\n"
3138 "str q29, [c_ptr5]\n"
3139 "mov v29.16b, v24.16b\n"
3140 "add c_ptr5, c_ptr5, #0x10\n"
3141 "fmla v26.4s, v16.4s, v4.s[0]\n"
3142 "str q30, [c_ptr6]\n"
3143 "mov v30.16b, v24.16b\n"
3144 "add c_ptr6, c_ptr6, #0x10\n"
3145 "fmla v27.4s, v16.4s, v6.s[0]\n"
3146 "str q31, [c_ptr7]\n"
3147 "mov v31.16b, v24.16b\n"
3148 "add c_ptr7, c_ptr7, #0x10\n"
3149 "fmla v24.4s, v16.4s, v0.s[0]\n"
3150 "add %[b_ptr0], %[b_ptr0], #0x80\n"
3151 "fmla v28.4s, v16.4s, v8.s[0]\n"
3152 "add %[biasptr], %[biasptr], %[biasinc]\n"
3153 "fmla v29.4s, v16.4s, v10.s[0]\n"
3154 "fmla v30.4s, v16.4s, v12.s[0]\n"
3155 "fmla v31.4s, v16.4s, v14.s[0]\n"
3156 "fmla v24.4s, v17.4s, v0.s[1]\n"
3157 "fmla v25.4s, v17.4s, v2.s[1]\n"
3158 "fmla v26.4s, v17.4s, v4.s[1]\n"
3159 "fmla v27.4s, v17.4s, v6.s[1]\n"
3160 "fmla v28.4s, v17.4s, v8.s[1]\n"
3161 "fmla v29.4s, v17.4s, v10.s[1]\n"
3162 "fmla v30.4s, v17.4s, v12.s[1]\n"
3163 "fmla v31.4s, v17.4s, v14.s[1]\n"
3164 "fmla v24.4s, v18.4s, v0.s[2]\n"
3165 "fmla v25.4s, v18.4s, v2.s[2]\n"
3166 "fmla v26.4s, v18.4s, v4.s[2]\n"
3167 "fmla v27.4s, v18.4s, v6.s[2]\n"
3168 "fmla v28.4s, v18.4s, v8.s[2]\n"
3169 "fmla v29.4s, v18.4s, v10.s[2]\n"
3170 "fmla v30.4s, v18.4s, v12.s[2]\n"
3171 "fmla v31.4s, v18.4s, v14.s[2]\n"
3172 "fmla v24.4s, v19.4s, v0.s[3]\n"
3173 "fmla v25.4s, v19.4s, v2.s[3]\n"
3174 "fmla v26.4s, v19.4s, v4.s[3]\n"
3175 "fmla v27.4s, v19.4s, v6.s[3]\n"
3176 "fmla v28.4s, v19.4s, v8.s[3]\n"
3177 "fmla v29.4s, v19.4s, v10.s[3]\n"
3178 "fmla v30.4s, v19.4s, v12.s[3]\n"
3179 "fmla v31.4s, v19.4s, v14.s[3]\n"
3180 "fmla v24.4s, v20.4s, v1.s[0]\n"
3181 "fmla v25.4s, v20.4s, v3.s[0]\n"
3182 "fmla v26.4s, v20.4s, v5.s[0]\n"
3183 "fmla v27.4s, v20.4s, v7.s[0]\n"
3184 "fmla v28.4s, v20.4s, v9.s[0]\n"
3185 "fmla v29.4s, v20.4s, v11.s[0]\n"
3186 "fmla v30.4s, v20.4s, v13.s[0]\n"
3187 "fmla v31.4s, v20.4s, v15.s[0]\n"
3188 "fmla v24.4s, v21.4s, v1.s[1]\n"
3189 "fmla v25.4s, v21.4s, v3.s[1]\n"
3190 "fmla v26.4s, v21.4s, v5.s[1]\n"
3191 "fmla v27.4s, v21.4s, v7.s[1]\n"
3192 "fmla v28.4s, v21.4s, v9.s[1]\n"
3193 "fmla v29.4s, v21.4s, v11.s[1]\n"
3194 "fmla v30.4s, v21.4s, v13.s[1]\n"
3195 "fmla v31.4s, v21.4s, v15.s[1]\n"
3196 "fmla v24.4s, v22.4s, v1.s[2]\n"
3197 "fmla v25.4s, v22.4s, v3.s[2]\n"
3198 "fmla v26.4s, v22.4s, v5.s[2]\n"
3199 "fmla v27.4s, v22.4s, v7.s[2]\n"
3200 "fmla v28.4s, v22.4s, v9.s[2]\n"
3201 "fmla v29.4s, v22.4s, v11.s[2]\n"
3202 "fmla v30.4s, v22.4s, v13.s[2]\n"
3203 "fmla v31.4s, v22.4s, v15.s[2]\n"
3204 "fmla v24.4s, v23.4s, v1.s[3]\n"
3205 "fmla v25.4s, v23.4s, v3.s[3]\n"
3206 "fmla v26.4s, v23.4s, v5.s[3]\n"
3207 "fmla v27.4s, v23.4s, v7.s[3]\n"
3208 "fmla v28.4s, v23.4s, v9.s[3]\n"
3209 "fmla v29.4s, v23.4s, v11.s[3]\n"
3210 "fmla v30.4s, v23.4s, v13.s[3]\n"
3211 "fmla v31.4s, v23.4s, v15.s[3]\n"
3214 "ldr q24, [%[biasptr]]\n"
3215 "add %[biasptr], %[biasptr], %[biasinc]\n"
3216 "mov v25.16b, v24.16b\n"
3217 "mov v26.16b, v24.16b\n"
3218 "mov v27.16b, v24.16b\n"
3219 "mov v28.16b, v24.16b\n"
3220 "mov v29.16b, v24.16b\n"
3221 "mov v30.16b, v24.16b\n"
3222 "mov v31.16b, v24.16b\n"
3223 "fmla v24.4s, v16.4s, v0.s[0]\n"
3224 "fmla v25.4s, v16.4s, v2.s[0]\n"
3225 "fmla v26.4s, v16.4s, v4.s[0]\n"
3226 "fmla v27.4s, v16.4s, v6.s[0]\n"
3227 "fmla v28.4s, v16.4s, v8.s[0]\n"
3228 "fmla v29.4s, v16.4s, v10.s[0]\n"
3229 "fmla v30.4s, v16.4s, v12.s[0]\n"
3230 "fmla v31.4s, v16.4s, v14.s[0]\n"
3231 "fmla v24.4s, v17.4s, v0.s[1]\n"
3232 "fmla v25.4s, v17.4s, v2.s[1]\n"
3233 "fmla v26.4s, v17.4s, v4.s[1]\n"
3234 "fmla v27.4s, v17.4s, v6.s[1]\n"
3235 "fmla v28.4s, v17.4s, v8.s[1]\n"
3236 "fmla v29.4s, v17.4s, v10.s[1]\n"
3237 "fmla v30.4s, v17.4s, v12.s[1]\n"
3238 "fmla v31.4s, v17.4s, v14.s[1]\n"
3239 "fmla v24.4s, v18.4s, v0.s[2]\n"
3240 "fmla v25.4s, v18.4s, v2.s[2]\n"
3241 "fmla v26.4s, v18.4s, v4.s[2]\n"
3242 "fmla v27.4s, v18.4s, v6.s[2]\n"
3243 "fmla v28.4s, v18.4s, v8.s[2]\n"
3244 "fmla v29.4s, v18.4s, v10.s[2]\n"
3245 "fmla v30.4s, v18.4s, v12.s[2]\n"
3246 "fmla v31.4s, v18.4s, v14.s[2]\n"
3247 "fmla v24.4s, v19.4s, v0.s[3]\n"
3248 "fmla v25.4s, v19.4s, v2.s[3]\n"
3249 "fmla v26.4s, v19.4s, v4.s[3]\n"
3250 "fmla v27.4s, v19.4s, v6.s[3]\n"
3251 "fmla v28.4s, v19.4s, v8.s[3]\n"
3252 "fmla v29.4s, v19.4s, v10.s[3]\n"
3253 "fmla v30.4s, v19.4s, v12.s[3]\n"
3254 "fmla v31.4s, v19.4s, v14.s[3]\n"
3255 "fmla v24.4s, v20.4s, v1.s[0]\n"
3256 "fmla v25.4s, v20.4s, v3.s[0]\n"
3257 "fmla v26.4s, v20.4s, v5.s[0]\n"
3258 "fmla v27.4s, v20.4s, v7.s[0]\n"
3259 "fmla v28.4s, v20.4s, v9.s[0]\n"
3260 "fmla v29.4s, v20.4s, v11.s[0]\n"
3261 "fmla v30.4s, v20.4s, v13.s[0]\n"
3262 "fmla v31.4s, v20.4s, v15.s[0]\n"
3263 "fmla v24.4s, v21.4s, v1.s[1]\n"
3264 "fmla v25.4s, v21.4s, v3.s[1]\n"
3265 "fmla v26.4s, v21.4s, v5.s[1]\n"
3266 "fmla v27.4s, v21.4s, v7.s[1]\n"
3267 "fmla v28.4s, v21.4s, v9.s[1]\n"
3268 "fmla v29.4s, v21.4s, v11.s[1]\n"
3269 "fmla v30.4s, v21.4s, v13.s[1]\n"
3270 "fmla v31.4s, v21.4s, v15.s[1]\n"
3271 "fmla v24.4s, v22.4s, v1.s[2]\n"
3272 "fmla v25.4s, v22.4s, v3.s[2]\n"
3273 "fmla v26.4s, v22.4s, v5.s[2]\n"
3274 "fmla v27.4s, v22.4s, v7.s[2]\n"
3275 "fmla v28.4s, v22.4s, v9.s[2]\n"
3276 "fmla v29.4s, v22.4s, v11.s[2]\n"
3277 "fmla v30.4s, v22.4s, v13.s[2]\n"
3278 "fmla v31.4s, v22.4s, v15.s[2]\n"
3279 "fmla v24.4s, v23.4s, v1.s[3]\n"
3280 "fmla v25.4s, v23.4s, v3.s[3]\n"
3281 "fmla v26.4s, v23.4s, v5.s[3]\n"
3282 "fmla v27.4s, v23.4s, v7.s[3]\n"
3283 "fmla v28.4s, v23.4s, v9.s[3]\n"
3284 "fmla v29.4s, v23.4s, v11.s[3]\n"
3285 "fmla v30.4s, v23.4s, v13.s[3]\n"
3286 "fmla v31.4s, v23.4s, v15.s[3]\n"
3288 "ld1r {v22.4s}, [%[minptr]]\n"
3289 "ld1r {v23.4s}, [%[maxptr]]\n"
3290 "fmax v24.4s, v24.4s, v22.4s\n"
3291 "fmax v25.4s, v25.4s, v22.4s\n"
3292 "fmax v26.4s, v26.4s, v22.4s\n"
3293 "fmax v27.4s, v27.4s, v22.4s\n"
3294 "fmin v24.4s, v24.4s, v23.4s\n"
3295 "fmin v25.4s, v25.4s, v23.4s\n"
3296 "fmin v26.4s, v26.4s, v23.4s\n"
3297 "fmin v27.4s, v27.4s, v23.4s\n"
3298 "str q24, [%[c_ptr0]]\n"
3299 "fmax v28.4s, v28.4s, v22.4s\n"
3300 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3301 "fmax v29.4s, v29.4s, v22.4s\n"
3302 "str q25, [c_ptr1]\n"
3303 "fmax v30.4s, v30.4s, v22.4s\n"
3304 "fmin v28.4s, v28.4s, v23.4s\n"
3305 "fmax v31.4s, v31.4s, v22.4s\n"
3306 "str q26, [c_ptr2]\n"
3307 "fmin v29.4s, v29.4s, v23.4s\n"
3308 "fmin v30.4s, v30.4s, v23.4s\n"
3309 "fmin v31.4s, v31.4s, v23.4s\n"
3310 "str q27, [c_ptr3]\n"
3311 "str q28, [c_ptr4]\n"
3312 "str q29, [c_ptr5]\n"
3313 "str q30, [c_ptr6]\n"
3314 "str q31, [c_ptr7]\n"
3329 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
3330 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
3331 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
3340 #endif // __aarch64__