31 #include "../../asmlib.hpp"
32 #include "../../utils.hpp"
36 void a64_smallK_hybrid_fp32_mla_6x4(
const float *A,
int lda,
const float *B,
float *C,
int ldc,
int M,
int N,
int K,
const float *
bias,
Activation act,
bool) {
37 const long loops_count =
iceildiv(
N, (
int)4) - 1;
38 const long ldab = lda *
sizeof(float);
39 const long ldcb = ldc *
sizeof(float);
42 memset(nullbias, 0, (4 *
sizeof(
float)));
44 float minval = -
static_cast<float>(std::numeric_limits<float>::infinity());
45 float maxval =
static_cast<float>(std::numeric_limits<float>::infinity());
46 const float *
const minptr = &minval;
47 const float *
const maxptr = &maxval;
55 maxval =
static_cast<float>(act.param1);
62 for (
int y0=0; y0<
M; y0+=6) {
63 long loops = loops_count;
64 long oob_rows = std::max(6 - (
M-y0), 0);
65 const float *b_ptr0 =
B;
66 const float *biasptr =
bias ?
bias : nullbias;
67 const uint64_t biasinc =
bias ? 4*
sizeof(float) : 0;
68 const float *a_ptr0 =
A + (y0 * lda);
70 float *c_ptr0 = C + (y0 * ldc);
85 "add a_ptr1, %[a_ptr0], %[lda]\n"
86 "add c_ptr1, %[c_ptr0], %[ldc]\n"
87 "add a_ptr2, a_ptr1, %[lda]\n"
88 "add c_ptr2, c_ptr1, %[ldc]\n"
89 "add a_ptr3, a_ptr2, %[lda]\n"
90 "add c_ptr3, c_ptr2, %[ldc]\n"
91 "add a_ptr4, a_ptr3, %[lda]\n"
92 "add c_ptr4, c_ptr3, %[ldc]\n"
93 "add a_ptr5, a_ptr4, %[lda]\n"
94 "add c_ptr5, c_ptr4, %[ldc]\n"
95 "cbz %[oob_rows], 1f\n"
96 "subs %[oob_rows], %[oob_rows], #0x1\n"
97 "add c_ptr5, %[c_ptr0], #0x0\n"
98 "add a_ptr5, %[a_ptr0], #0x0\n"
100 "subs %[oob_rows], %[oob_rows], #0x1\n"
101 "add c_ptr4, %[c_ptr0], #0x0\n"
102 "add a_ptr4, %[a_ptr0], #0x0\n"
104 "subs %[oob_rows], %[oob_rows], #0x1\n"
105 "add c_ptr3, %[c_ptr0], #0x0\n"
106 "add a_ptr3, %[a_ptr0], #0x0\n"
108 "subs %[oob_rows], %[oob_rows], #0x1\n"
109 "add c_ptr2, %[c_ptr0], #0x0\n"
110 "add a_ptr2, %[a_ptr0], #0x0\n"
112 "subs %[oob_rows], %[oob_rows], #0x1\n"
113 "add c_ptr1, %[c_ptr0], #0x0\n"
114 "add a_ptr1, %[a_ptr0], #0x0\n"
116 "ldr q0, [%[a_ptr0]], #0x10\n"
117 "ldr q3, [a_ptr1], #0x10\n"
118 "ldr q6, [a_ptr2], #0x10\n"
119 "ldr q9, [a_ptr3], #0x10\n"
120 "ldr q12, [a_ptr4], #0x10\n"
121 "ldr q15, [a_ptr5], #0x10\n"
122 "ldr q1, [%[a_ptr0]], #0x10\n"
123 "ldr q4, [a_ptr1], #0x10\n"
124 "ldr q7, [a_ptr2], #0x10\n"
125 "ldr q10, [a_ptr3], #0x10\n"
126 "ldr s2, [%[a_ptr0]]\n"
127 "ldr q13, [a_ptr4], #0x10\n"
129 "ldr q16, [a_ptr5], #0x10\n"
131 "ldr q18, [%[b_ptr0]]\n"
132 "ldr s11, [a_ptr3]\n"
133 "ldr q19, [%[b_ptr0], #0x10]\n"
134 "ldr s14, [a_ptr4]\n"
135 "ldr q20, [%[b_ptr0], #0x20]\n"
136 "ldr s17, [a_ptr5]\n"
137 "ldr q21, [%[b_ptr0], #0x30]\n"
138 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
139 "ldr q22, [%[b_ptr0], #0x40]\n"
140 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
141 "ldr q23, [%[b_ptr0], #0x50]\n"
142 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
143 "ldr q24, [%[b_ptr0], #0x60]\n"
144 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
145 "ldr q25, [%[b_ptr0], #0x70]\n"
146 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
147 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
148 "add %[b_ptr0], %[b_ptr0], #0x80\n"
150 "ldr q26, [%[biasptr]]\n"
151 "add %[biasptr], %[biasptr], %[biasinc]\n"
152 "subs %[loops], %[loops], #0x1\n"
153 "mov v27.16b, v26.16b\n"
154 "mov v28.16b, v26.16b\n"
155 "mov v29.16b, v26.16b\n"
156 "mov v30.16b, v26.16b\n"
157 "mov v31.16b, v26.16b\n"
158 "fmla v26.4s, v18.4s, v0.s[0]\n"
159 "fmla v27.4s, v18.4s, v3.s[0]\n"
160 "fmla v28.4s, v18.4s, v6.s[0]\n"
161 "fmla v29.4s, v18.4s, v9.s[0]\n"
162 "fmla v30.4s, v18.4s, v12.s[0]\n"
163 "fmla v31.4s, v18.4s, v15.s[0]\n"
164 "ldr q18, [%[b_ptr0]]\n"
165 "fmla v26.4s, v19.4s, v0.s[1]\n"
166 "add %[b_ptr0], %[b_ptr0], #0x10\n"
167 "fmla v27.4s, v19.4s, v3.s[1]\n"
168 "fmla v28.4s, v19.4s, v6.s[1]\n"
169 "fmla v29.4s, v19.4s, v9.s[1]\n"
170 "fmla v30.4s, v19.4s, v12.s[1]\n"
171 "fmla v31.4s, v19.4s, v15.s[1]\n"
172 "fmla v26.4s, v20.4s, v0.s[2]\n"
173 "fmla v27.4s, v20.4s, v3.s[2]\n"
174 "fmla v28.4s, v20.4s, v6.s[2]\n"
175 "fmla v29.4s, v20.4s, v9.s[2]\n"
176 "fmla v30.4s, v20.4s, v12.s[2]\n"
177 "fmla v31.4s, v20.4s, v15.s[2]\n"
178 "fmla v26.4s, v21.4s, v0.s[3]\n"
179 "fmla v27.4s, v21.4s, v3.s[3]\n"
180 "fmla v28.4s, v21.4s, v6.s[3]\n"
181 "fmla v29.4s, v21.4s, v9.s[3]\n"
182 "fmla v30.4s, v21.4s, v12.s[3]\n"
183 "fmla v31.4s, v21.4s, v15.s[3]\n"
184 "fmla v26.4s, v22.4s, v1.s[0]\n"
185 "fmla v27.4s, v22.4s, v4.s[0]\n"
186 "fmla v28.4s, v22.4s, v7.s[0]\n"
187 "fmla v29.4s, v22.4s, v10.s[0]\n"
188 "fmla v30.4s, v22.4s, v13.s[0]\n"
189 "fmla v31.4s, v22.4s, v16.s[0]\n"
190 "fmla v26.4s, v23.4s, v1.s[1]\n"
191 "fmla v27.4s, v23.4s, v4.s[1]\n"
192 "fmla v28.4s, v23.4s, v7.s[1]\n"
193 "fmla v29.4s, v23.4s, v10.s[1]\n"
194 "fmla v30.4s, v23.4s, v13.s[1]\n"
195 "fmla v31.4s, v23.4s, v16.s[1]\n"
196 "fmla v26.4s, v24.4s, v1.s[2]\n"
197 "fmla v27.4s, v24.4s, v4.s[2]\n"
198 "fmla v28.4s, v24.4s, v7.s[2]\n"
199 "fmla v29.4s, v24.4s, v10.s[2]\n"
200 "fmla v30.4s, v24.4s, v13.s[2]\n"
201 "fmla v31.4s, v24.4s, v16.s[2]\n"
202 "fmla v26.4s, v25.4s, v1.s[3]\n"
203 "fmla v27.4s, v25.4s, v4.s[3]\n"
204 "fmla v28.4s, v25.4s, v7.s[3]\n"
205 "fmla v29.4s, v25.4s, v10.s[3]\n"
206 "fmla v30.4s, v25.4s, v13.s[3]\n"
207 "fmla v31.4s, v25.4s, v16.s[3]\n"
208 "fmla v26.4s, v18.4s, v2.s[0]\n"
209 "fmla v27.4s, v18.4s, v5.s[0]\n"
210 "fmla v28.4s, v18.4s, v8.s[0]\n"
211 "fmla v29.4s, v18.4s, v11.s[0]\n"
212 "fmla v30.4s, v18.4s, v14.s[0]\n"
213 "fmla v31.4s, v18.4s, v17.s[0]\n"
216 "ld1r {v24.4s}, [%[minptr]]\n"
217 "subs %[loops], %[loops], #0x1\n"
218 "ld1r {v25.4s}, [%[maxptr]]\n"
219 "ldr q18, [%[b_ptr0]]\n"
220 "fmax v26.4s, v26.4s, v24.4s\n"
221 "ldr q19, [%[b_ptr0], #0x10]\n"
222 "fmax v27.4s, v27.4s, v24.4s\n"
223 "ldr q20, [%[b_ptr0], #0x20]\n"
224 "fmax v28.4s, v28.4s, v24.4s\n"
225 "ldr q21, [%[b_ptr0], #0x30]\n"
226 "fmax v29.4s, v29.4s, v24.4s\n"
227 "ldr q22, [%[b_ptr0], #0x40]\n"
228 "fmin v26.4s, v26.4s, v25.4s\n"
229 "ldr q23, [%[b_ptr0], #0x50]\n"
230 "fmin v27.4s, v27.4s, v25.4s\n"
231 "fmin v28.4s, v28.4s, v25.4s\n"
232 "fmin v29.4s, v29.4s, v25.4s\n"
233 "str q26, [%[c_ptr0]]\n"
234 "fmax v30.4s, v30.4s, v24.4s\n"
235 "ldr q26, [%[biasptr]]\n"
236 "fmax v31.4s, v31.4s, v24.4s\n"
237 "ldr q24, [%[b_ptr0], #0x60]\n"
238 "add %[c_ptr0], %[c_ptr0], #0x10\n"
239 "str q27, [c_ptr1]\n"
240 "add c_ptr1, c_ptr1, #0x10\n"
241 "fmin v30.4s, v30.4s, v25.4s\n"
242 "add %[biasptr], %[biasptr], %[biasinc]\n"
243 "fmin v31.4s, v31.4s, v25.4s\n"
244 "str q28, [c_ptr2]\n"
245 "mov v27.16b, v26.16b\n"
246 "ldr q25, [%[b_ptr0], #0x70]\n"
247 "mov v28.16b, v26.16b\n"
248 "add c_ptr2, c_ptr2, #0x10\n"
249 "str q29, [c_ptr3]\n"
250 "add c_ptr3, c_ptr3, #0x10\n"
251 "mov v29.16b, v26.16b\n"
252 "add %[b_ptr0], %[b_ptr0], #0x80\n"
253 "fmla v27.4s, v18.4s, v3.s[0]\n"
254 "str q30, [c_ptr4]\n"
255 "mov v30.16b, v26.16b\n"
256 "add c_ptr4, c_ptr4, #0x10\n"
257 "fmla v28.4s, v18.4s, v6.s[0]\n"
258 "str q31, [c_ptr5]\n"
259 "mov v31.16b, v26.16b\n"
260 "add c_ptr5, c_ptr5, #0x10\n"
261 "fmla v26.4s, v18.4s, v0.s[0]\n"
262 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
263 "fmla v29.4s, v18.4s, v9.s[0]\n"
264 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
265 "fmla v30.4s, v18.4s, v12.s[0]\n"
266 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
267 "fmla v31.4s, v18.4s, v15.s[0]\n"
268 "ldr q18, [%[b_ptr0]]\n"
269 "fmla v26.4s, v19.4s, v0.s[1]\n"
270 "add %[b_ptr0], %[b_ptr0], #0x10\n"
271 "fmla v27.4s, v19.4s, v3.s[1]\n"
272 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
273 "fmla v28.4s, v19.4s, v6.s[1]\n"
274 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
275 "fmla v29.4s, v19.4s, v9.s[1]\n"
276 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
277 "fmla v30.4s, v19.4s, v12.s[1]\n"
278 "fmla v31.4s, v19.4s, v15.s[1]\n"
279 "fmla v26.4s, v20.4s, v0.s[2]\n"
280 "fmla v27.4s, v20.4s, v3.s[2]\n"
281 "fmla v28.4s, v20.4s, v6.s[2]\n"
282 "fmla v29.4s, v20.4s, v9.s[2]\n"
283 "fmla v30.4s, v20.4s, v12.s[2]\n"
284 "fmla v31.4s, v20.4s, v15.s[2]\n"
285 "fmla v26.4s, v21.4s, v0.s[3]\n"
286 "fmla v27.4s, v21.4s, v3.s[3]\n"
287 "fmla v28.4s, v21.4s, v6.s[3]\n"
288 "fmla v29.4s, v21.4s, v9.s[3]\n"
289 "fmla v30.4s, v21.4s, v12.s[3]\n"
290 "fmla v31.4s, v21.4s, v15.s[3]\n"
291 "fmla v26.4s, v22.4s, v1.s[0]\n"
292 "fmla v27.4s, v22.4s, v4.s[0]\n"
293 "fmla v28.4s, v22.4s, v7.s[0]\n"
294 "fmla v29.4s, v22.4s, v10.s[0]\n"
295 "fmla v30.4s, v22.4s, v13.s[0]\n"
296 "fmla v31.4s, v22.4s, v16.s[0]\n"
297 "fmla v26.4s, v23.4s, v1.s[1]\n"
298 "fmla v27.4s, v23.4s, v4.s[1]\n"
299 "fmla v28.4s, v23.4s, v7.s[1]\n"
300 "fmla v29.4s, v23.4s, v10.s[1]\n"
301 "fmla v30.4s, v23.4s, v13.s[1]\n"
302 "fmla v31.4s, v23.4s, v16.s[1]\n"
303 "fmla v26.4s, v24.4s, v1.s[2]\n"
304 "fmla v27.4s, v24.4s, v4.s[2]\n"
305 "fmla v28.4s, v24.4s, v7.s[2]\n"
306 "fmla v29.4s, v24.4s, v10.s[2]\n"
307 "fmla v30.4s, v24.4s, v13.s[2]\n"
308 "fmla v31.4s, v24.4s, v16.s[2]\n"
309 "fmla v26.4s, v25.4s, v1.s[3]\n"
310 "fmla v27.4s, v25.4s, v4.s[3]\n"
311 "fmla v28.4s, v25.4s, v7.s[3]\n"
312 "fmla v29.4s, v25.4s, v10.s[3]\n"
313 "fmla v30.4s, v25.4s, v13.s[3]\n"
314 "fmla v31.4s, v25.4s, v16.s[3]\n"
315 "fmla v26.4s, v18.4s, v2.s[0]\n"
316 "fmla v27.4s, v18.4s, v5.s[0]\n"
317 "fmla v28.4s, v18.4s, v8.s[0]\n"
318 "fmla v29.4s, v18.4s, v11.s[0]\n"
319 "fmla v30.4s, v18.4s, v14.s[0]\n"
320 "fmla v31.4s, v18.4s, v17.s[0]\n"
323 "ld1r {v24.4s}, [%[minptr]]\n"
324 "ld1r {v25.4s}, [%[maxptr]]\n"
325 "ldr q18, [%[b_ptr0]]\n"
326 "ldr q19, [%[b_ptr0], #0x10]\n"
327 "fmax v26.4s, v26.4s, v24.4s\n"
328 "ldr q20, [%[b_ptr0], #0x20]\n"
329 "fmax v27.4s, v27.4s, v24.4s\n"
330 "ldr q21, [%[b_ptr0], #0x30]\n"
331 "fmax v28.4s, v28.4s, v24.4s\n"
332 "ldr q22, [%[b_ptr0], #0x40]\n"
333 "fmax v29.4s, v29.4s, v24.4s\n"
334 "ldr q23, [%[b_ptr0], #0x50]\n"
335 "fmin v26.4s, v26.4s, v25.4s\n"
336 "fmin v27.4s, v27.4s, v25.4s\n"
337 "fmin v28.4s, v28.4s, v25.4s\n"
338 "fmin v29.4s, v29.4s, v25.4s\n"
339 "str q26, [%[c_ptr0]]\n"
340 "fmax v30.4s, v30.4s, v24.4s\n"
341 "ldr q26, [%[biasptr]]\n"
342 "fmax v31.4s, v31.4s, v24.4s\n"
343 "ldr q24, [%[b_ptr0], #0x60]\n"
344 "add %[c_ptr0], %[c_ptr0], #0x10\n"
345 "str q27, [c_ptr1]\n"
346 "add c_ptr1, c_ptr1, #0x10\n"
347 "fmin v30.4s, v30.4s, v25.4s\n"
348 "add %[biasptr], %[biasptr], %[biasinc]\n"
349 "fmin v31.4s, v31.4s, v25.4s\n"
350 "str q28, [c_ptr2]\n"
351 "mov v27.16b, v26.16b\n"
352 "ldr q25, [%[b_ptr0], #0x70]\n"
353 "mov v28.16b, v26.16b\n"
354 "add c_ptr2, c_ptr2, #0x10\n"
355 "str q29, [c_ptr3]\n"
356 "add c_ptr3, c_ptr3, #0x10\n"
357 "mov v29.16b, v26.16b\n"
358 "add %[b_ptr0], %[b_ptr0], #0x80\n"
359 "fmla v27.4s, v18.4s, v3.s[0]\n"
360 "str q30, [c_ptr4]\n"
361 "mov v30.16b, v26.16b\n"
362 "add c_ptr4, c_ptr4, #0x10\n"
363 "fmla v28.4s, v18.4s, v6.s[0]\n"
364 "str q31, [c_ptr5]\n"
365 "mov v31.16b, v26.16b\n"
366 "add c_ptr5, c_ptr5, #0x10\n"
367 "fmla v26.4s, v18.4s, v0.s[0]\n"
368 "fmla v29.4s, v18.4s, v9.s[0]\n"
369 "fmla v30.4s, v18.4s, v12.s[0]\n"
370 "fmla v31.4s, v18.4s, v15.s[0]\n"
371 "ldr q18, [%[b_ptr0]]\n"
372 "fmla v26.4s, v19.4s, v0.s[1]\n"
373 "add %[b_ptr0], %[b_ptr0], #0x10\n"
374 "fmla v27.4s, v19.4s, v3.s[1]\n"
375 "fmla v28.4s, v19.4s, v6.s[1]\n"
376 "fmla v29.4s, v19.4s, v9.s[1]\n"
377 "fmla v30.4s, v19.4s, v12.s[1]\n"
378 "fmla v31.4s, v19.4s, v15.s[1]\n"
379 "fmla v26.4s, v20.4s, v0.s[2]\n"
380 "fmla v27.4s, v20.4s, v3.s[2]\n"
381 "fmla v28.4s, v20.4s, v6.s[2]\n"
382 "fmla v29.4s, v20.4s, v9.s[2]\n"
383 "fmla v30.4s, v20.4s, v12.s[2]\n"
384 "fmla v31.4s, v20.4s, v15.s[2]\n"
385 "fmla v26.4s, v21.4s, v0.s[3]\n"
386 "fmla v27.4s, v21.4s, v3.s[3]\n"
387 "fmla v28.4s, v21.4s, v6.s[3]\n"
388 "fmla v29.4s, v21.4s, v9.s[3]\n"
389 "fmla v30.4s, v21.4s, v12.s[3]\n"
390 "fmla v31.4s, v21.4s, v15.s[3]\n"
391 "fmla v26.4s, v22.4s, v1.s[0]\n"
392 "fmla v27.4s, v22.4s, v4.s[0]\n"
393 "fmla v28.4s, v22.4s, v7.s[0]\n"
394 "fmla v29.4s, v22.4s, v10.s[0]\n"
395 "fmla v30.4s, v22.4s, v13.s[0]\n"
396 "fmla v31.4s, v22.4s, v16.s[0]\n"
397 "fmla v26.4s, v23.4s, v1.s[1]\n"
398 "fmla v27.4s, v23.4s, v4.s[1]\n"
399 "fmla v28.4s, v23.4s, v7.s[1]\n"
400 "fmla v29.4s, v23.4s, v10.s[1]\n"
401 "fmla v30.4s, v23.4s, v13.s[1]\n"
402 "fmla v31.4s, v23.4s, v16.s[1]\n"
403 "fmla v26.4s, v24.4s, v1.s[2]\n"
404 "fmla v27.4s, v24.4s, v4.s[2]\n"
405 "fmla v28.4s, v24.4s, v7.s[2]\n"
406 "fmla v29.4s, v24.4s, v10.s[2]\n"
407 "fmla v30.4s, v24.4s, v13.s[2]\n"
408 "fmla v31.4s, v24.4s, v16.s[2]\n"
409 "fmla v26.4s, v25.4s, v1.s[3]\n"
410 "fmla v27.4s, v25.4s, v4.s[3]\n"
411 "fmla v28.4s, v25.4s, v7.s[3]\n"
412 "fmla v29.4s, v25.4s, v10.s[3]\n"
413 "fmla v30.4s, v25.4s, v13.s[3]\n"
414 "fmla v31.4s, v25.4s, v16.s[3]\n"
415 "fmla v26.4s, v18.4s, v2.s[0]\n"
416 "fmla v27.4s, v18.4s, v5.s[0]\n"
417 "fmla v28.4s, v18.4s, v8.s[0]\n"
418 "fmla v29.4s, v18.4s, v11.s[0]\n"
419 "fmla v30.4s, v18.4s, v14.s[0]\n"
420 "fmla v31.4s, v18.4s, v17.s[0]\n"
423 "ldr q26, [%[biasptr]]\n"
424 "add %[biasptr], %[biasptr], %[biasinc]\n"
425 "mov v27.16b, v26.16b\n"
426 "mov v28.16b, v26.16b\n"
427 "mov v29.16b, v26.16b\n"
428 "mov v30.16b, v26.16b\n"
429 "mov v31.16b, v26.16b\n"
430 "fmla v26.4s, v18.4s, v0.s[0]\n"
431 "fmla v27.4s, v18.4s, v3.s[0]\n"
432 "fmla v28.4s, v18.4s, v6.s[0]\n"
433 "fmla v29.4s, v18.4s, v9.s[0]\n"
434 "fmla v30.4s, v18.4s, v12.s[0]\n"
435 "fmla v31.4s, v18.4s, v15.s[0]\n"
436 "ldr q18, [%[b_ptr0]]\n"
437 "fmla v26.4s, v19.4s, v0.s[1]\n"
438 "add %[b_ptr0], %[b_ptr0], #0x10\n"
439 "fmla v27.4s, v19.4s, v3.s[1]\n"
440 "fmla v28.4s, v19.4s, v6.s[1]\n"
441 "fmla v29.4s, v19.4s, v9.s[1]\n"
442 "fmla v30.4s, v19.4s, v12.s[1]\n"
443 "fmla v31.4s, v19.4s, v15.s[1]\n"
444 "fmla v26.4s, v20.4s, v0.s[2]\n"
445 "fmla v27.4s, v20.4s, v3.s[2]\n"
446 "fmla v28.4s, v20.4s, v6.s[2]\n"
447 "fmla v29.4s, v20.4s, v9.s[2]\n"
448 "fmla v30.4s, v20.4s, v12.s[2]\n"
449 "fmla v31.4s, v20.4s, v15.s[2]\n"
450 "fmla v26.4s, v21.4s, v0.s[3]\n"
451 "fmla v27.4s, v21.4s, v3.s[3]\n"
452 "fmla v28.4s, v21.4s, v6.s[3]\n"
453 "fmla v29.4s, v21.4s, v9.s[3]\n"
454 "fmla v30.4s, v21.4s, v12.s[3]\n"
455 "fmla v31.4s, v21.4s, v15.s[3]\n"
456 "fmla v26.4s, v22.4s, v1.s[0]\n"
457 "fmla v27.4s, v22.4s, v4.s[0]\n"
458 "fmla v28.4s, v22.4s, v7.s[0]\n"
459 "fmla v29.4s, v22.4s, v10.s[0]\n"
460 "fmla v30.4s, v22.4s, v13.s[0]\n"
461 "fmla v31.4s, v22.4s, v16.s[0]\n"
462 "fmla v26.4s, v23.4s, v1.s[1]\n"
463 "fmla v27.4s, v23.4s, v4.s[1]\n"
464 "fmla v28.4s, v23.4s, v7.s[1]\n"
465 "fmla v29.4s, v23.4s, v10.s[1]\n"
466 "fmla v30.4s, v23.4s, v13.s[1]\n"
467 "fmla v31.4s, v23.4s, v16.s[1]\n"
468 "fmla v26.4s, v24.4s, v1.s[2]\n"
469 "fmla v27.4s, v24.4s, v4.s[2]\n"
470 "fmla v28.4s, v24.4s, v7.s[2]\n"
471 "fmla v29.4s, v24.4s, v10.s[2]\n"
472 "fmla v30.4s, v24.4s, v13.s[2]\n"
473 "fmla v31.4s, v24.4s, v16.s[2]\n"
474 "fmla v26.4s, v25.4s, v1.s[3]\n"
475 "fmla v27.4s, v25.4s, v4.s[3]\n"
476 "fmla v28.4s, v25.4s, v7.s[3]\n"
477 "fmla v29.4s, v25.4s, v10.s[3]\n"
478 "fmla v30.4s, v25.4s, v13.s[3]\n"
479 "fmla v31.4s, v25.4s, v16.s[3]\n"
480 "fmla v26.4s, v18.4s, v2.s[0]\n"
481 "fmla v27.4s, v18.4s, v5.s[0]\n"
482 "fmla v28.4s, v18.4s, v8.s[0]\n"
483 "fmla v29.4s, v18.4s, v11.s[0]\n"
484 "fmla v30.4s, v18.4s, v14.s[0]\n"
485 "fmla v31.4s, v18.4s, v17.s[0]\n"
487 "ld1r {v24.4s}, [%[minptr]]\n"
488 "ld1r {v25.4s}, [%[maxptr]]\n"
489 "fmax v26.4s, v26.4s, v24.4s\n"
490 "fmax v27.4s, v27.4s, v24.4s\n"
491 "fmax v28.4s, v28.4s, v24.4s\n"
492 "fmax v29.4s, v29.4s, v24.4s\n"
493 "fmin v26.4s, v26.4s, v25.4s\n"
494 "fmin v27.4s, v27.4s, v25.4s\n"
495 "fmin v28.4s, v28.4s, v25.4s\n"
496 "fmin v29.4s, v29.4s, v25.4s\n"
497 "str q26, [%[c_ptr0]]\n"
498 "fmax v30.4s, v30.4s, v24.4s\n"
499 "add %[c_ptr0], %[c_ptr0], #0x10\n"
500 "fmax v31.4s, v31.4s, v24.4s\n"
501 "str q27, [c_ptr1]\n"
502 "fmin v30.4s, v30.4s, v25.4s\n"
503 "fmin v31.4s, v31.4s, v25.4s\n"
504 "str q28, [c_ptr2]\n"
505 "str q29, [c_ptr3]\n"
506 "str q30, [c_ptr4]\n"
507 "str q31, [c_ptr5]\n"
518 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
519 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
520 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
535 "add a_ptr1, %[a_ptr0], %[lda]\n"
536 "add c_ptr1, %[c_ptr0], %[ldc]\n"
537 "add a_ptr2, a_ptr1, %[lda]\n"
538 "add c_ptr2, c_ptr1, %[ldc]\n"
539 "add a_ptr3, a_ptr2, %[lda]\n"
540 "add c_ptr3, c_ptr2, %[ldc]\n"
541 "add a_ptr4, a_ptr3, %[lda]\n"
542 "add c_ptr4, c_ptr3, %[ldc]\n"
543 "add a_ptr5, a_ptr4, %[lda]\n"
544 "add c_ptr5, c_ptr4, %[ldc]\n"
545 "cbz %[oob_rows], 1f\n"
546 "subs %[oob_rows], %[oob_rows], #0x1\n"
547 "add c_ptr5, %[c_ptr0], #0x0\n"
548 "add a_ptr5, %[a_ptr0], #0x0\n"
550 "subs %[oob_rows], %[oob_rows], #0x1\n"
551 "add c_ptr4, %[c_ptr0], #0x0\n"
552 "add a_ptr4, %[a_ptr0], #0x0\n"
554 "subs %[oob_rows], %[oob_rows], #0x1\n"
555 "add c_ptr3, %[c_ptr0], #0x0\n"
556 "add a_ptr3, %[a_ptr0], #0x0\n"
558 "subs %[oob_rows], %[oob_rows], #0x1\n"
559 "add c_ptr2, %[c_ptr0], #0x0\n"
560 "add a_ptr2, %[a_ptr0], #0x0\n"
562 "subs %[oob_rows], %[oob_rows], #0x1\n"
563 "add c_ptr1, %[c_ptr0], #0x0\n"
564 "add a_ptr1, %[a_ptr0], #0x0\n"
566 "ldr q0, [%[a_ptr0]], #0x10\n"
567 "ldr q3, [a_ptr1], #0x10\n"
568 "ldr q6, [a_ptr2], #0x10\n"
569 "ldr q9, [a_ptr3], #0x10\n"
570 "ldr q12, [a_ptr4], #0x10\n"
571 "ldr q15, [a_ptr5], #0x10\n"
572 "ldr q1, [%[a_ptr0]], #0x10\n"
573 "ldr q4, [a_ptr1], #0x10\n"
574 "ldr q7, [a_ptr2], #0x10\n"
575 "ldr q10, [a_ptr3], #0x10\n"
576 "ldr d2, [%[a_ptr0]]\n"
577 "ldr q13, [a_ptr4], #0x10\n"
579 "ldr q16, [a_ptr5], #0x10\n"
581 "ldr q18, [%[b_ptr0]]\n"
582 "ldr d11, [a_ptr3]\n"
583 "ldr q19, [%[b_ptr0], #0x10]\n"
584 "ldr d14, [a_ptr4]\n"
585 "ldr q20, [%[b_ptr0], #0x20]\n"
586 "ldr d17, [a_ptr5]\n"
587 "ldr q21, [%[b_ptr0], #0x30]\n"
588 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
589 "ldr q22, [%[b_ptr0], #0x40]\n"
590 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
591 "ldr q23, [%[b_ptr0], #0x50]\n"
592 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
593 "ldr q24, [%[b_ptr0], #0x60]\n"
594 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
595 "ldr q25, [%[b_ptr0], #0x70]\n"
596 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
597 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
598 "add %[b_ptr0], %[b_ptr0], #0x80\n"
600 "ldr q26, [%[biasptr]]\n"
601 "add %[biasptr], %[biasptr], %[biasinc]\n"
602 "subs %[loops], %[loops], #0x1\n"
603 "mov v27.16b, v26.16b\n"
604 "mov v28.16b, v26.16b\n"
605 "mov v29.16b, v26.16b\n"
606 "mov v30.16b, v26.16b\n"
607 "mov v31.16b, v26.16b\n"
608 "fmla v26.4s, v18.4s, v0.s[0]\n"
609 "fmla v27.4s, v18.4s, v3.s[0]\n"
610 "fmla v28.4s, v18.4s, v6.s[0]\n"
611 "fmla v29.4s, v18.4s, v9.s[0]\n"
612 "fmla v30.4s, v18.4s, v12.s[0]\n"
613 "fmla v31.4s, v18.4s, v15.s[0]\n"
614 "ldr q18, [%[b_ptr0]]\n"
615 "fmla v26.4s, v19.4s, v0.s[1]\n"
616 "fmla v27.4s, v19.4s, v3.s[1]\n"
617 "fmla v28.4s, v19.4s, v6.s[1]\n"
618 "fmla v29.4s, v19.4s, v9.s[1]\n"
619 "fmla v30.4s, v19.4s, v12.s[1]\n"
620 "fmla v31.4s, v19.4s, v15.s[1]\n"
621 "ldr q19, [%[b_ptr0], #0x10]\n"
622 "fmla v26.4s, v20.4s, v0.s[2]\n"
623 "add %[b_ptr0], %[b_ptr0], #0x20\n"
624 "fmla v27.4s, v20.4s, v3.s[2]\n"
625 "fmla v28.4s, v20.4s, v6.s[2]\n"
626 "fmla v29.4s, v20.4s, v9.s[2]\n"
627 "fmla v30.4s, v20.4s, v12.s[2]\n"
628 "fmla v31.4s, v20.4s, v15.s[2]\n"
629 "fmla v26.4s, v21.4s, v0.s[3]\n"
630 "fmla v27.4s, v21.4s, v3.s[3]\n"
631 "fmla v28.4s, v21.4s, v6.s[3]\n"
632 "fmla v29.4s, v21.4s, v9.s[3]\n"
633 "fmla v30.4s, v21.4s, v12.s[3]\n"
634 "fmla v31.4s, v21.4s, v15.s[3]\n"
635 "fmla v26.4s, v22.4s, v1.s[0]\n"
636 "fmla v27.4s, v22.4s, v4.s[0]\n"
637 "fmla v28.4s, v22.4s, v7.s[0]\n"
638 "fmla v29.4s, v22.4s, v10.s[0]\n"
639 "fmla v30.4s, v22.4s, v13.s[0]\n"
640 "fmla v31.4s, v22.4s, v16.s[0]\n"
641 "fmla v26.4s, v23.4s, v1.s[1]\n"
642 "fmla v27.4s, v23.4s, v4.s[1]\n"
643 "fmla v28.4s, v23.4s, v7.s[1]\n"
644 "fmla v29.4s, v23.4s, v10.s[1]\n"
645 "fmla v30.4s, v23.4s, v13.s[1]\n"
646 "fmla v31.4s, v23.4s, v16.s[1]\n"
647 "fmla v26.4s, v24.4s, v1.s[2]\n"
648 "fmla v27.4s, v24.4s, v4.s[2]\n"
649 "fmla v28.4s, v24.4s, v7.s[2]\n"
650 "fmla v29.4s, v24.4s, v10.s[2]\n"
651 "fmla v30.4s, v24.4s, v13.s[2]\n"
652 "fmla v31.4s, v24.4s, v16.s[2]\n"
653 "fmla v26.4s, v25.4s, v1.s[3]\n"
654 "fmla v27.4s, v25.4s, v4.s[3]\n"
655 "fmla v28.4s, v25.4s, v7.s[3]\n"
656 "fmla v29.4s, v25.4s, v10.s[3]\n"
657 "fmla v30.4s, v25.4s, v13.s[3]\n"
658 "fmla v31.4s, v25.4s, v16.s[3]\n"
659 "fmla v26.4s, v18.4s, v2.s[0]\n"
660 "fmla v27.4s, v18.4s, v5.s[0]\n"
661 "fmla v28.4s, v18.4s, v8.s[0]\n"
662 "fmla v29.4s, v18.4s, v11.s[0]\n"
663 "fmla v30.4s, v18.4s, v14.s[0]\n"
664 "fmla v31.4s, v18.4s, v17.s[0]\n"
665 "fmla v26.4s, v19.4s, v2.s[1]\n"
666 "fmla v27.4s, v19.4s, v5.s[1]\n"
667 "fmla v28.4s, v19.4s, v8.s[1]\n"
668 "fmla v29.4s, v19.4s, v11.s[1]\n"
669 "fmla v30.4s, v19.4s, v14.s[1]\n"
670 "fmla v31.4s, v19.4s, v17.s[1]\n"
673 "ld1r {v24.4s}, [%[minptr]]\n"
674 "subs %[loops], %[loops], #0x1\n"
675 "ld1r {v25.4s}, [%[maxptr]]\n"
676 "ldr q18, [%[b_ptr0]]\n"
677 "fmax v26.4s, v26.4s, v24.4s\n"
678 "ldr q19, [%[b_ptr0], #0x10]\n"
679 "fmax v27.4s, v27.4s, v24.4s\n"
680 "ldr q20, [%[b_ptr0], #0x20]\n"
681 "fmax v28.4s, v28.4s, v24.4s\n"
682 "ldr q21, [%[b_ptr0], #0x30]\n"
683 "fmax v29.4s, v29.4s, v24.4s\n"
684 "ldr q22, [%[b_ptr0], #0x40]\n"
685 "fmin v26.4s, v26.4s, v25.4s\n"
686 "ldr q23, [%[b_ptr0], #0x50]\n"
687 "fmin v27.4s, v27.4s, v25.4s\n"
688 "fmin v28.4s, v28.4s, v25.4s\n"
689 "fmin v29.4s, v29.4s, v25.4s\n"
690 "str q26, [%[c_ptr0]]\n"
691 "fmax v30.4s, v30.4s, v24.4s\n"
692 "ldr q26, [%[biasptr]]\n"
693 "fmax v31.4s, v31.4s, v24.4s\n"
694 "ldr q24, [%[b_ptr0], #0x60]\n"
695 "add %[c_ptr0], %[c_ptr0], #0x10\n"
696 "str q27, [c_ptr1]\n"
697 "add c_ptr1, c_ptr1, #0x10\n"
698 "fmin v30.4s, v30.4s, v25.4s\n"
699 "add %[biasptr], %[biasptr], %[biasinc]\n"
700 "fmin v31.4s, v31.4s, v25.4s\n"
701 "str q28, [c_ptr2]\n"
702 "mov v27.16b, v26.16b\n"
703 "ldr q25, [%[b_ptr0], #0x70]\n"
704 "mov v28.16b, v26.16b\n"
705 "add c_ptr2, c_ptr2, #0x10\n"
706 "str q29, [c_ptr3]\n"
707 "add c_ptr3, c_ptr3, #0x10\n"
708 "mov v29.16b, v26.16b\n"
709 "add %[b_ptr0], %[b_ptr0], #0x80\n"
710 "fmla v27.4s, v18.4s, v3.s[0]\n"
711 "str q30, [c_ptr4]\n"
712 "mov v30.16b, v26.16b\n"
713 "add c_ptr4, c_ptr4, #0x10\n"
714 "fmla v28.4s, v18.4s, v6.s[0]\n"
715 "str q31, [c_ptr5]\n"
716 "mov v31.16b, v26.16b\n"
717 "add c_ptr5, c_ptr5, #0x10\n"
718 "fmla v26.4s, v18.4s, v0.s[0]\n"
719 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
720 "fmla v29.4s, v18.4s, v9.s[0]\n"
721 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
722 "fmla v30.4s, v18.4s, v12.s[0]\n"
723 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
724 "fmla v31.4s, v18.4s, v15.s[0]\n"
725 "ldr q18, [%[b_ptr0]]\n"
726 "fmla v26.4s, v19.4s, v0.s[1]\n"
727 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
728 "fmla v27.4s, v19.4s, v3.s[1]\n"
729 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
730 "fmla v28.4s, v19.4s, v6.s[1]\n"
731 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
732 "fmla v29.4s, v19.4s, v9.s[1]\n"
733 "fmla v30.4s, v19.4s, v12.s[1]\n"
734 "fmla v31.4s, v19.4s, v15.s[1]\n"
735 "ldr q19, [%[b_ptr0], #0x10]\n"
736 "fmla v26.4s, v20.4s, v0.s[2]\n"
737 "add %[b_ptr0], %[b_ptr0], #0x20\n"
738 "fmla v27.4s, v20.4s, v3.s[2]\n"
739 "fmla v28.4s, v20.4s, v6.s[2]\n"
740 "fmla v29.4s, v20.4s, v9.s[2]\n"
741 "fmla v30.4s, v20.4s, v12.s[2]\n"
742 "fmla v31.4s, v20.4s, v15.s[2]\n"
743 "fmla v26.4s, v21.4s, v0.s[3]\n"
744 "fmla v27.4s, v21.4s, v3.s[3]\n"
745 "fmla v28.4s, v21.4s, v6.s[3]\n"
746 "fmla v29.4s, v21.4s, v9.s[3]\n"
747 "fmla v30.4s, v21.4s, v12.s[3]\n"
748 "fmla v31.4s, v21.4s, v15.s[3]\n"
749 "fmla v26.4s, v22.4s, v1.s[0]\n"
750 "fmla v27.4s, v22.4s, v4.s[0]\n"
751 "fmla v28.4s, v22.4s, v7.s[0]\n"
752 "fmla v29.4s, v22.4s, v10.s[0]\n"
753 "fmla v30.4s, v22.4s, v13.s[0]\n"
754 "fmla v31.4s, v22.4s, v16.s[0]\n"
755 "fmla v26.4s, v23.4s, v1.s[1]\n"
756 "fmla v27.4s, v23.4s, v4.s[1]\n"
757 "fmla v28.4s, v23.4s, v7.s[1]\n"
758 "fmla v29.4s, v23.4s, v10.s[1]\n"
759 "fmla v30.4s, v23.4s, v13.s[1]\n"
760 "fmla v31.4s, v23.4s, v16.s[1]\n"
761 "fmla v26.4s, v24.4s, v1.s[2]\n"
762 "fmla v27.4s, v24.4s, v4.s[2]\n"
763 "fmla v28.4s, v24.4s, v7.s[2]\n"
764 "fmla v29.4s, v24.4s, v10.s[2]\n"
765 "fmla v30.4s, v24.4s, v13.s[2]\n"
766 "fmla v31.4s, v24.4s, v16.s[2]\n"
767 "fmla v26.4s, v25.4s, v1.s[3]\n"
768 "fmla v27.4s, v25.4s, v4.s[3]\n"
769 "fmla v28.4s, v25.4s, v7.s[3]\n"
770 "fmla v29.4s, v25.4s, v10.s[3]\n"
771 "fmla v30.4s, v25.4s, v13.s[3]\n"
772 "fmla v31.4s, v25.4s, v16.s[3]\n"
773 "fmla v26.4s, v18.4s, v2.s[0]\n"
774 "fmla v27.4s, v18.4s, v5.s[0]\n"
775 "fmla v28.4s, v18.4s, v8.s[0]\n"
776 "fmla v29.4s, v18.4s, v11.s[0]\n"
777 "fmla v30.4s, v18.4s, v14.s[0]\n"
778 "fmla v31.4s, v18.4s, v17.s[0]\n"
779 "fmla v26.4s, v19.4s, v2.s[1]\n"
780 "fmla v27.4s, v19.4s, v5.s[1]\n"
781 "fmla v28.4s, v19.4s, v8.s[1]\n"
782 "fmla v29.4s, v19.4s, v11.s[1]\n"
783 "fmla v30.4s, v19.4s, v14.s[1]\n"
784 "fmla v31.4s, v19.4s, v17.s[1]\n"
787 "ld1r {v24.4s}, [%[minptr]]\n"
788 "ld1r {v25.4s}, [%[maxptr]]\n"
789 "ldr q18, [%[b_ptr0]]\n"
790 "ldr q19, [%[b_ptr0], #0x10]\n"
791 "fmax v26.4s, v26.4s, v24.4s\n"
792 "ldr q20, [%[b_ptr0], #0x20]\n"
793 "fmax v27.4s, v27.4s, v24.4s\n"
794 "ldr q21, [%[b_ptr0], #0x30]\n"
795 "fmax v28.4s, v28.4s, v24.4s\n"
796 "ldr q22, [%[b_ptr0], #0x40]\n"
797 "fmax v29.4s, v29.4s, v24.4s\n"
798 "ldr q23, [%[b_ptr0], #0x50]\n"
799 "fmin v26.4s, v26.4s, v25.4s\n"
800 "fmin v27.4s, v27.4s, v25.4s\n"
801 "fmin v28.4s, v28.4s, v25.4s\n"
802 "fmin v29.4s, v29.4s, v25.4s\n"
803 "str q26, [%[c_ptr0]]\n"
804 "fmax v30.4s, v30.4s, v24.4s\n"
805 "ldr q26, [%[biasptr]]\n"
806 "fmax v31.4s, v31.4s, v24.4s\n"
807 "ldr q24, [%[b_ptr0], #0x60]\n"
808 "add %[c_ptr0], %[c_ptr0], #0x10\n"
809 "str q27, [c_ptr1]\n"
810 "add c_ptr1, c_ptr1, #0x10\n"
811 "fmin v30.4s, v30.4s, v25.4s\n"
812 "add %[biasptr], %[biasptr], %[biasinc]\n"
813 "fmin v31.4s, v31.4s, v25.4s\n"
814 "str q28, [c_ptr2]\n"
815 "mov v27.16b, v26.16b\n"
816 "ldr q25, [%[b_ptr0], #0x70]\n"
817 "mov v28.16b, v26.16b\n"
818 "add c_ptr2, c_ptr2, #0x10\n"
819 "str q29, [c_ptr3]\n"
820 "add c_ptr3, c_ptr3, #0x10\n"
821 "mov v29.16b, v26.16b\n"
822 "add %[b_ptr0], %[b_ptr0], #0x80\n"
823 "fmla v27.4s, v18.4s, v3.s[0]\n"
824 "str q30, [c_ptr4]\n"
825 "mov v30.16b, v26.16b\n"
826 "add c_ptr4, c_ptr4, #0x10\n"
827 "fmla v28.4s, v18.4s, v6.s[0]\n"
828 "str q31, [c_ptr5]\n"
829 "mov v31.16b, v26.16b\n"
830 "add c_ptr5, c_ptr5, #0x10\n"
831 "fmla v26.4s, v18.4s, v0.s[0]\n"
832 "fmla v29.4s, v18.4s, v9.s[0]\n"
833 "fmla v30.4s, v18.4s, v12.s[0]\n"
834 "fmla v31.4s, v18.4s, v15.s[0]\n"
835 "ldr q18, [%[b_ptr0]]\n"
836 "fmla v26.4s, v19.4s, v0.s[1]\n"
837 "fmla v27.4s, v19.4s, v3.s[1]\n"
838 "fmla v28.4s, v19.4s, v6.s[1]\n"
839 "fmla v29.4s, v19.4s, v9.s[1]\n"
840 "fmla v30.4s, v19.4s, v12.s[1]\n"
841 "fmla v31.4s, v19.4s, v15.s[1]\n"
842 "ldr q19, [%[b_ptr0], #0x10]\n"
843 "fmla v26.4s, v20.4s, v0.s[2]\n"
844 "add %[b_ptr0], %[b_ptr0], #0x20\n"
845 "fmla v27.4s, v20.4s, v3.s[2]\n"
846 "fmla v28.4s, v20.4s, v6.s[2]\n"
847 "fmla v29.4s, v20.4s, v9.s[2]\n"
848 "fmla v30.4s, v20.4s, v12.s[2]\n"
849 "fmla v31.4s, v20.4s, v15.s[2]\n"
850 "fmla v26.4s, v21.4s, v0.s[3]\n"
851 "fmla v27.4s, v21.4s, v3.s[3]\n"
852 "fmla v28.4s, v21.4s, v6.s[3]\n"
853 "fmla v29.4s, v21.4s, v9.s[3]\n"
854 "fmla v30.4s, v21.4s, v12.s[3]\n"
855 "fmla v31.4s, v21.4s, v15.s[3]\n"
856 "fmla v26.4s, v22.4s, v1.s[0]\n"
857 "fmla v27.4s, v22.4s, v4.s[0]\n"
858 "fmla v28.4s, v22.4s, v7.s[0]\n"
859 "fmla v29.4s, v22.4s, v10.s[0]\n"
860 "fmla v30.4s, v22.4s, v13.s[0]\n"
861 "fmla v31.4s, v22.4s, v16.s[0]\n"
862 "fmla v26.4s, v23.4s, v1.s[1]\n"
863 "fmla v27.4s, v23.4s, v4.s[1]\n"
864 "fmla v28.4s, v23.4s, v7.s[1]\n"
865 "fmla v29.4s, v23.4s, v10.s[1]\n"
866 "fmla v30.4s, v23.4s, v13.s[1]\n"
867 "fmla v31.4s, v23.4s, v16.s[1]\n"
868 "fmla v26.4s, v24.4s, v1.s[2]\n"
869 "fmla v27.4s, v24.4s, v4.s[2]\n"
870 "fmla v28.4s, v24.4s, v7.s[2]\n"
871 "fmla v29.4s, v24.4s, v10.s[2]\n"
872 "fmla v30.4s, v24.4s, v13.s[2]\n"
873 "fmla v31.4s, v24.4s, v16.s[2]\n"
874 "fmla v26.4s, v25.4s, v1.s[3]\n"
875 "fmla v27.4s, v25.4s, v4.s[3]\n"
876 "fmla v28.4s, v25.4s, v7.s[3]\n"
877 "fmla v29.4s, v25.4s, v10.s[3]\n"
878 "fmla v30.4s, v25.4s, v13.s[3]\n"
879 "fmla v31.4s, v25.4s, v16.s[3]\n"
880 "fmla v26.4s, v18.4s, v2.s[0]\n"
881 "fmla v27.4s, v18.4s, v5.s[0]\n"
882 "fmla v28.4s, v18.4s, v8.s[0]\n"
883 "fmla v29.4s, v18.4s, v11.s[0]\n"
884 "fmla v30.4s, v18.4s, v14.s[0]\n"
885 "fmla v31.4s, v18.4s, v17.s[0]\n"
886 "fmla v26.4s, v19.4s, v2.s[1]\n"
887 "fmla v27.4s, v19.4s, v5.s[1]\n"
888 "fmla v28.4s, v19.4s, v8.s[1]\n"
889 "fmla v29.4s, v19.4s, v11.s[1]\n"
890 "fmla v30.4s, v19.4s, v14.s[1]\n"
891 "fmla v31.4s, v19.4s, v17.s[1]\n"
894 "ldr q26, [%[biasptr]]\n"
895 "add %[biasptr], %[biasptr], %[biasinc]\n"
896 "mov v27.16b, v26.16b\n"
897 "mov v28.16b, v26.16b\n"
898 "mov v29.16b, v26.16b\n"
899 "mov v30.16b, v26.16b\n"
900 "mov v31.16b, v26.16b\n"
901 "fmla v26.4s, v18.4s, v0.s[0]\n"
902 "fmla v27.4s, v18.4s, v3.s[0]\n"
903 "fmla v28.4s, v18.4s, v6.s[0]\n"
904 "fmla v29.4s, v18.4s, v9.s[0]\n"
905 "fmla v30.4s, v18.4s, v12.s[0]\n"
906 "fmla v31.4s, v18.4s, v15.s[0]\n"
907 "ldr q18, [%[b_ptr0]]\n"
908 "fmla v26.4s, v19.4s, v0.s[1]\n"
909 "fmla v27.4s, v19.4s, v3.s[1]\n"
910 "fmla v28.4s, v19.4s, v6.s[1]\n"
911 "fmla v29.4s, v19.4s, v9.s[1]\n"
912 "fmla v30.4s, v19.4s, v12.s[1]\n"
913 "fmla v31.4s, v19.4s, v15.s[1]\n"
914 "ldr q19, [%[b_ptr0], #0x10]\n"
915 "fmla v26.4s, v20.4s, v0.s[2]\n"
916 "add %[b_ptr0], %[b_ptr0], #0x20\n"
917 "fmla v27.4s, v20.4s, v3.s[2]\n"
918 "fmla v28.4s, v20.4s, v6.s[2]\n"
919 "fmla v29.4s, v20.4s, v9.s[2]\n"
920 "fmla v30.4s, v20.4s, v12.s[2]\n"
921 "fmla v31.4s, v20.4s, v15.s[2]\n"
922 "fmla v26.4s, v21.4s, v0.s[3]\n"
923 "fmla v27.4s, v21.4s, v3.s[3]\n"
924 "fmla v28.4s, v21.4s, v6.s[3]\n"
925 "fmla v29.4s, v21.4s, v9.s[3]\n"
926 "fmla v30.4s, v21.4s, v12.s[3]\n"
927 "fmla v31.4s, v21.4s, v15.s[3]\n"
928 "fmla v26.4s, v22.4s, v1.s[0]\n"
929 "fmla v27.4s, v22.4s, v4.s[0]\n"
930 "fmla v28.4s, v22.4s, v7.s[0]\n"
931 "fmla v29.4s, v22.4s, v10.s[0]\n"
932 "fmla v30.4s, v22.4s, v13.s[0]\n"
933 "fmla v31.4s, v22.4s, v16.s[0]\n"
934 "fmla v26.4s, v23.4s, v1.s[1]\n"
935 "fmla v27.4s, v23.4s, v4.s[1]\n"
936 "fmla v28.4s, v23.4s, v7.s[1]\n"
937 "fmla v29.4s, v23.4s, v10.s[1]\n"
938 "fmla v30.4s, v23.4s, v13.s[1]\n"
939 "fmla v31.4s, v23.4s, v16.s[1]\n"
940 "fmla v26.4s, v24.4s, v1.s[2]\n"
941 "fmla v27.4s, v24.4s, v4.s[2]\n"
942 "fmla v28.4s, v24.4s, v7.s[2]\n"
943 "fmla v29.4s, v24.4s, v10.s[2]\n"
944 "fmla v30.4s, v24.4s, v13.s[2]\n"
945 "fmla v31.4s, v24.4s, v16.s[2]\n"
946 "fmla v26.4s, v25.4s, v1.s[3]\n"
947 "fmla v27.4s, v25.4s, v4.s[3]\n"
948 "fmla v28.4s, v25.4s, v7.s[3]\n"
949 "fmla v29.4s, v25.4s, v10.s[3]\n"
950 "fmla v30.4s, v25.4s, v13.s[3]\n"
951 "fmla v31.4s, v25.4s, v16.s[3]\n"
952 "fmla v26.4s, v18.4s, v2.s[0]\n"
953 "fmla v27.4s, v18.4s, v5.s[0]\n"
954 "fmla v28.4s, v18.4s, v8.s[0]\n"
955 "fmla v29.4s, v18.4s, v11.s[0]\n"
956 "fmla v30.4s, v18.4s, v14.s[0]\n"
957 "fmla v31.4s, v18.4s, v17.s[0]\n"
958 "fmla v26.4s, v19.4s, v2.s[1]\n"
959 "fmla v27.4s, v19.4s, v5.s[1]\n"
960 "fmla v28.4s, v19.4s, v8.s[1]\n"
961 "fmla v29.4s, v19.4s, v11.s[1]\n"
962 "fmla v30.4s, v19.4s, v14.s[1]\n"
963 "fmla v31.4s, v19.4s, v17.s[1]\n"
965 "ld1r {v24.4s}, [%[minptr]]\n"
966 "ld1r {v25.4s}, [%[maxptr]]\n"
967 "fmax v26.4s, v26.4s, v24.4s\n"
968 "fmax v27.4s, v27.4s, v24.4s\n"
969 "fmax v28.4s, v28.4s, v24.4s\n"
970 "fmax v29.4s, v29.4s, v24.4s\n"
971 "fmin v26.4s, v26.4s, v25.4s\n"
972 "fmin v27.4s, v27.4s, v25.4s\n"
973 "fmin v28.4s, v28.4s, v25.4s\n"
974 "fmin v29.4s, v29.4s, v25.4s\n"
975 "str q26, [%[c_ptr0]]\n"
976 "fmax v30.4s, v30.4s, v24.4s\n"
977 "add %[c_ptr0], %[c_ptr0], #0x10\n"
978 "fmax v31.4s, v31.4s, v24.4s\n"
979 "str q27, [c_ptr1]\n"
980 "fmin v30.4s, v30.4s, v25.4s\n"
981 "fmin v31.4s, v31.4s, v25.4s\n"
982 "str q28, [c_ptr2]\n"
983 "str q29, [c_ptr3]\n"
984 "str q30, [c_ptr4]\n"
985 "str q31, [c_ptr5]\n"
996 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
997 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
998 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1013 "add a_ptr1, %[a_ptr0], %[lda]\n"
1014 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1015 "add a_ptr2, a_ptr1, %[lda]\n"
1016 "add c_ptr2, c_ptr1, %[ldc]\n"
1017 "add a_ptr3, a_ptr2, %[lda]\n"
1018 "add c_ptr3, c_ptr2, %[ldc]\n"
1019 "add a_ptr4, a_ptr3, %[lda]\n"
1020 "add c_ptr4, c_ptr3, %[ldc]\n"
1021 "add a_ptr5, a_ptr4, %[lda]\n"
1022 "add c_ptr5, c_ptr4, %[ldc]\n"
1023 "cbz %[oob_rows], 1f\n"
1024 "subs %[oob_rows], %[oob_rows], #0x1\n"
1025 "add c_ptr5, %[c_ptr0], #0x0\n"
1026 "add a_ptr5, %[a_ptr0], #0x0\n"
1028 "subs %[oob_rows], %[oob_rows], #0x1\n"
1029 "add c_ptr4, %[c_ptr0], #0x0\n"
1030 "add a_ptr4, %[a_ptr0], #0x0\n"
1032 "subs %[oob_rows], %[oob_rows], #0x1\n"
1033 "add c_ptr3, %[c_ptr0], #0x0\n"
1034 "add a_ptr3, %[a_ptr0], #0x0\n"
1036 "subs %[oob_rows], %[oob_rows], #0x1\n"
1037 "add c_ptr2, %[c_ptr0], #0x0\n"
1038 "add a_ptr2, %[a_ptr0], #0x0\n"
1040 "subs %[oob_rows], %[oob_rows], #0x1\n"
1041 "add c_ptr1, %[c_ptr0], #0x0\n"
1042 "add a_ptr1, %[a_ptr0], #0x0\n"
1044 "ldr q0, [%[a_ptr0]], #0x10\n"
1045 "ldr q3, [a_ptr1], #0x10\n"
1046 "ldr q6, [a_ptr2], #0x10\n"
1047 "ldr q9, [a_ptr3], #0x10\n"
1048 "ldr q12, [a_ptr4], #0x10\n"
1049 "ldr q15, [a_ptr5], #0x10\n"
1050 "ldr q1, [%[a_ptr0]], #0x10\n"
1051 "ldr q4, [a_ptr1], #0x10\n"
1052 "ldr q7, [a_ptr2], #0x10\n"
1053 "ldr q10, [a_ptr3], #0x10\n"
1054 "ldr d2, [%[a_ptr0]], #0x8\n"
1055 "ldr q13, [a_ptr4], #0x10\n"
1056 "ldr d5, [a_ptr1], #0x8\n"
1057 "ldr q16, [a_ptr5], #0x10\n"
1058 "ldr d8, [a_ptr2], #0x8\n"
1059 "ldr q18, [%[b_ptr0]]\n"
1060 "ldr d11, [a_ptr3], #0x8\n"
1061 "ldr q19, [%[b_ptr0], #0x10]\n"
1062 "ldr d14, [a_ptr4], #0x8\n"
1063 "ldr q20, [%[b_ptr0], #0x20]\n"
1064 "ldr d17, [a_ptr5], #0x8\n"
1065 "ldr q21, [%[b_ptr0], #0x30]\n"
1066 "ld1 {v2.s}[2], [%[a_ptr0]]\n"
1067 "ldr q22, [%[b_ptr0], #0x40]\n"
1068 "ld1 {v5.s}[2], [a_ptr1]\n"
1069 "ldr q23, [%[b_ptr0], #0x50]\n"
1070 "ld1 {v8.s}[2], [a_ptr2]\n"
1071 "ldr q24, [%[b_ptr0], #0x60]\n"
1072 "ld1 {v11.s}[2], [a_ptr3]\n"
1073 "ldr q25, [%[b_ptr0], #0x70]\n"
1074 "ld1 {v14.s}[2], [a_ptr4]\n"
1075 "ld1 {v17.s}[2], [a_ptr5]\n"
1076 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
1077 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
1078 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
1079 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
1080 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
1081 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
1082 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1083 "cbz %[loops], 2f\n"
1084 "ldr q26, [%[biasptr]]\n"
1085 "add %[biasptr], %[biasptr], %[biasinc]\n"
1086 "subs %[loops], %[loops], #0x1\n"
1087 "mov v27.16b, v26.16b\n"
1088 "mov v28.16b, v26.16b\n"
1089 "mov v29.16b, v26.16b\n"
1090 "mov v30.16b, v26.16b\n"
1091 "mov v31.16b, v26.16b\n"
1092 "fmla v26.4s, v18.4s, v0.s[0]\n"
1093 "fmla v27.4s, v18.4s, v3.s[0]\n"
1094 "fmla v28.4s, v18.4s, v6.s[0]\n"
1095 "fmla v29.4s, v18.4s, v9.s[0]\n"
1096 "fmla v30.4s, v18.4s, v12.s[0]\n"
1097 "fmla v31.4s, v18.4s, v15.s[0]\n"
1098 "ldr q18, [%[b_ptr0]]\n"
1099 "fmla v26.4s, v19.4s, v0.s[1]\n"
1100 "fmla v27.4s, v19.4s, v3.s[1]\n"
1101 "fmla v28.4s, v19.4s, v6.s[1]\n"
1102 "fmla v29.4s, v19.4s, v9.s[1]\n"
1103 "fmla v30.4s, v19.4s, v12.s[1]\n"
1104 "fmla v31.4s, v19.4s, v15.s[1]\n"
1105 "ldr q19, [%[b_ptr0], #0x10]\n"
1106 "fmla v26.4s, v20.4s, v0.s[2]\n"
1107 "fmla v27.4s, v20.4s, v3.s[2]\n"
1108 "fmla v28.4s, v20.4s, v6.s[2]\n"
1109 "fmla v29.4s, v20.4s, v9.s[2]\n"
1110 "fmla v30.4s, v20.4s, v12.s[2]\n"
1111 "fmla v31.4s, v20.4s, v15.s[2]\n"
1112 "ldr q20, [%[b_ptr0], #0x20]\n"
1113 "fmla v26.4s, v21.4s, v0.s[3]\n"
1114 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1115 "fmla v27.4s, v21.4s, v3.s[3]\n"
1116 "fmla v28.4s, v21.4s, v6.s[3]\n"
1117 "fmla v29.4s, v21.4s, v9.s[3]\n"
1118 "fmla v30.4s, v21.4s, v12.s[3]\n"
1119 "fmla v31.4s, v21.4s, v15.s[3]\n"
1120 "fmla v26.4s, v22.4s, v1.s[0]\n"
1121 "fmla v27.4s, v22.4s, v4.s[0]\n"
1122 "fmla v28.4s, v22.4s, v7.s[0]\n"
1123 "fmla v29.4s, v22.4s, v10.s[0]\n"
1124 "fmla v30.4s, v22.4s, v13.s[0]\n"
1125 "fmla v31.4s, v22.4s, v16.s[0]\n"
1126 "fmla v26.4s, v23.4s, v1.s[1]\n"
1127 "fmla v27.4s, v23.4s, v4.s[1]\n"
1128 "fmla v28.4s, v23.4s, v7.s[1]\n"
1129 "fmla v29.4s, v23.4s, v10.s[1]\n"
1130 "fmla v30.4s, v23.4s, v13.s[1]\n"
1131 "fmla v31.4s, v23.4s, v16.s[1]\n"
1132 "fmla v26.4s, v24.4s, v1.s[2]\n"
1133 "fmla v27.4s, v24.4s, v4.s[2]\n"
1134 "fmla v28.4s, v24.4s, v7.s[2]\n"
1135 "fmla v29.4s, v24.4s, v10.s[2]\n"
1136 "fmla v30.4s, v24.4s, v13.s[2]\n"
1137 "fmla v31.4s, v24.4s, v16.s[2]\n"
1138 "fmla v26.4s, v25.4s, v1.s[3]\n"
1139 "fmla v27.4s, v25.4s, v4.s[3]\n"
1140 "fmla v28.4s, v25.4s, v7.s[3]\n"
1141 "fmla v29.4s, v25.4s, v10.s[3]\n"
1142 "fmla v30.4s, v25.4s, v13.s[3]\n"
1143 "fmla v31.4s, v25.4s, v16.s[3]\n"
1144 "fmla v26.4s, v18.4s, v2.s[0]\n"
1145 "fmla v27.4s, v18.4s, v5.s[0]\n"
1146 "fmla v28.4s, v18.4s, v8.s[0]\n"
1147 "fmla v29.4s, v18.4s, v11.s[0]\n"
1148 "fmla v30.4s, v18.4s, v14.s[0]\n"
1149 "fmla v31.4s, v18.4s, v17.s[0]\n"
1150 "fmla v26.4s, v19.4s, v2.s[1]\n"
1151 "fmla v27.4s, v19.4s, v5.s[1]\n"
1152 "fmla v28.4s, v19.4s, v8.s[1]\n"
1153 "fmla v29.4s, v19.4s, v11.s[1]\n"
1154 "fmla v30.4s, v19.4s, v14.s[1]\n"
1155 "fmla v31.4s, v19.4s, v17.s[1]\n"
1156 "fmla v26.4s, v20.4s, v2.s[2]\n"
1157 "fmla v27.4s, v20.4s, v5.s[2]\n"
1158 "fmla v28.4s, v20.4s, v8.s[2]\n"
1159 "fmla v29.4s, v20.4s, v11.s[2]\n"
1160 "fmla v30.4s, v20.4s, v14.s[2]\n"
1161 "fmla v31.4s, v20.4s, v17.s[2]\n"
1164 "ld1r {v24.4s}, [%[minptr]]\n"
1165 "subs %[loops], %[loops], #0x1\n"
1166 "ld1r {v25.4s}, [%[maxptr]]\n"
1167 "ldr q18, [%[b_ptr0]]\n"
1168 "fmax v26.4s, v26.4s, v24.4s\n"
1169 "ldr q19, [%[b_ptr0], #0x10]\n"
1170 "fmax v27.4s, v27.4s, v24.4s\n"
1171 "ldr q20, [%[b_ptr0], #0x20]\n"
1172 "fmax v28.4s, v28.4s, v24.4s\n"
1173 "ldr q21, [%[b_ptr0], #0x30]\n"
1174 "fmax v29.4s, v29.4s, v24.4s\n"
1175 "ldr q22, [%[b_ptr0], #0x40]\n"
1176 "fmin v26.4s, v26.4s, v25.4s\n"
1177 "ldr q23, [%[b_ptr0], #0x50]\n"
1178 "fmin v27.4s, v27.4s, v25.4s\n"
1179 "fmin v28.4s, v28.4s, v25.4s\n"
1180 "fmin v29.4s, v29.4s, v25.4s\n"
1181 "str q26, [%[c_ptr0]]\n"
1182 "fmax v30.4s, v30.4s, v24.4s\n"
1183 "ldr q26, [%[biasptr]]\n"
1184 "fmax v31.4s, v31.4s, v24.4s\n"
1185 "ldr q24, [%[b_ptr0], #0x60]\n"
1186 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1187 "str q27, [c_ptr1]\n"
1188 "add c_ptr1, c_ptr1, #0x10\n"
1189 "fmin v30.4s, v30.4s, v25.4s\n"
1190 "add %[biasptr], %[biasptr], %[biasinc]\n"
1191 "fmin v31.4s, v31.4s, v25.4s\n"
1192 "str q28, [c_ptr2]\n"
1193 "mov v27.16b, v26.16b\n"
1194 "ldr q25, [%[b_ptr0], #0x70]\n"
1195 "mov v28.16b, v26.16b\n"
1196 "add c_ptr2, c_ptr2, #0x10\n"
1197 "str q29, [c_ptr3]\n"
1198 "add c_ptr3, c_ptr3, #0x10\n"
1199 "mov v29.16b, v26.16b\n"
1200 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1201 "fmla v27.4s, v18.4s, v3.s[0]\n"
1202 "str q30, [c_ptr4]\n"
1203 "mov v30.16b, v26.16b\n"
1204 "add c_ptr4, c_ptr4, #0x10\n"
1205 "fmla v28.4s, v18.4s, v6.s[0]\n"
1206 "str q31, [c_ptr5]\n"
1207 "mov v31.16b, v26.16b\n"
1208 "add c_ptr5, c_ptr5, #0x10\n"
1209 "fmla v26.4s, v18.4s, v0.s[0]\n"
1210 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1211 "fmla v29.4s, v18.4s, v9.s[0]\n"
1212 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1213 "fmla v30.4s, v18.4s, v12.s[0]\n"
1214 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1215 "fmla v31.4s, v18.4s, v15.s[0]\n"
1216 "ldr q18, [%[b_ptr0]]\n"
1217 "fmla v26.4s, v19.4s, v0.s[1]\n"
1218 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1219 "fmla v27.4s, v19.4s, v3.s[1]\n"
1220 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1221 "fmla v28.4s, v19.4s, v6.s[1]\n"
1222 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1223 "fmla v29.4s, v19.4s, v9.s[1]\n"
1224 "fmla v30.4s, v19.4s, v12.s[1]\n"
1225 "fmla v31.4s, v19.4s, v15.s[1]\n"
1226 "ldr q19, [%[b_ptr0], #0x10]\n"
1227 "fmla v26.4s, v20.4s, v0.s[2]\n"
1228 "fmla v27.4s, v20.4s, v3.s[2]\n"
1229 "fmla v28.4s, v20.4s, v6.s[2]\n"
1230 "fmla v29.4s, v20.4s, v9.s[2]\n"
1231 "fmla v30.4s, v20.4s, v12.s[2]\n"
1232 "fmla v31.4s, v20.4s, v15.s[2]\n"
1233 "ldr q20, [%[b_ptr0], #0x20]\n"
1234 "fmla v26.4s, v21.4s, v0.s[3]\n"
1235 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1236 "fmla v27.4s, v21.4s, v3.s[3]\n"
1237 "fmla v28.4s, v21.4s, v6.s[3]\n"
1238 "fmla v29.4s, v21.4s, v9.s[3]\n"
1239 "fmla v30.4s, v21.4s, v12.s[3]\n"
1240 "fmla v31.4s, v21.4s, v15.s[3]\n"
1241 "fmla v26.4s, v22.4s, v1.s[0]\n"
1242 "fmla v27.4s, v22.4s, v4.s[0]\n"
1243 "fmla v28.4s, v22.4s, v7.s[0]\n"
1244 "fmla v29.4s, v22.4s, v10.s[0]\n"
1245 "fmla v30.4s, v22.4s, v13.s[0]\n"
1246 "fmla v31.4s, v22.4s, v16.s[0]\n"
1247 "fmla v26.4s, v23.4s, v1.s[1]\n"
1248 "fmla v27.4s, v23.4s, v4.s[1]\n"
1249 "fmla v28.4s, v23.4s, v7.s[1]\n"
1250 "fmla v29.4s, v23.4s, v10.s[1]\n"
1251 "fmla v30.4s, v23.4s, v13.s[1]\n"
1252 "fmla v31.4s, v23.4s, v16.s[1]\n"
1253 "fmla v26.4s, v24.4s, v1.s[2]\n"
1254 "fmla v27.4s, v24.4s, v4.s[2]\n"
1255 "fmla v28.4s, v24.4s, v7.s[2]\n"
1256 "fmla v29.4s, v24.4s, v10.s[2]\n"
1257 "fmla v30.4s, v24.4s, v13.s[2]\n"
1258 "fmla v31.4s, v24.4s, v16.s[2]\n"
1259 "fmla v26.4s, v25.4s, v1.s[3]\n"
1260 "fmla v27.4s, v25.4s, v4.s[3]\n"
1261 "fmla v28.4s, v25.4s, v7.s[3]\n"
1262 "fmla v29.4s, v25.4s, v10.s[3]\n"
1263 "fmla v30.4s, v25.4s, v13.s[3]\n"
1264 "fmla v31.4s, v25.4s, v16.s[3]\n"
1265 "fmla v26.4s, v18.4s, v2.s[0]\n"
1266 "fmla v27.4s, v18.4s, v5.s[0]\n"
1267 "fmla v28.4s, v18.4s, v8.s[0]\n"
1268 "fmla v29.4s, v18.4s, v11.s[0]\n"
1269 "fmla v30.4s, v18.4s, v14.s[0]\n"
1270 "fmla v31.4s, v18.4s, v17.s[0]\n"
1271 "fmla v26.4s, v19.4s, v2.s[1]\n"
1272 "fmla v27.4s, v19.4s, v5.s[1]\n"
1273 "fmla v28.4s, v19.4s, v8.s[1]\n"
1274 "fmla v29.4s, v19.4s, v11.s[1]\n"
1275 "fmla v30.4s, v19.4s, v14.s[1]\n"
1276 "fmla v31.4s, v19.4s, v17.s[1]\n"
1277 "fmla v26.4s, v20.4s, v2.s[2]\n"
1278 "fmla v27.4s, v20.4s, v5.s[2]\n"
1279 "fmla v28.4s, v20.4s, v8.s[2]\n"
1280 "fmla v29.4s, v20.4s, v11.s[2]\n"
1281 "fmla v30.4s, v20.4s, v14.s[2]\n"
1282 "fmla v31.4s, v20.4s, v17.s[2]\n"
1285 "ld1r {v24.4s}, [%[minptr]]\n"
1286 "ld1r {v25.4s}, [%[maxptr]]\n"
1287 "ldr q18, [%[b_ptr0]]\n"
1288 "ldr q19, [%[b_ptr0], #0x10]\n"
1289 "fmax v26.4s, v26.4s, v24.4s\n"
1290 "ldr q20, [%[b_ptr0], #0x20]\n"
1291 "fmax v27.4s, v27.4s, v24.4s\n"
1292 "ldr q21, [%[b_ptr0], #0x30]\n"
1293 "fmax v28.4s, v28.4s, v24.4s\n"
1294 "ldr q22, [%[b_ptr0], #0x40]\n"
1295 "fmax v29.4s, v29.4s, v24.4s\n"
1296 "ldr q23, [%[b_ptr0], #0x50]\n"
1297 "fmin v26.4s, v26.4s, v25.4s\n"
1298 "fmin v27.4s, v27.4s, v25.4s\n"
1299 "fmin v28.4s, v28.4s, v25.4s\n"
1300 "fmin v29.4s, v29.4s, v25.4s\n"
1301 "str q26, [%[c_ptr0]]\n"
1302 "fmax v30.4s, v30.4s, v24.4s\n"
1303 "ldr q26, [%[biasptr]]\n"
1304 "fmax v31.4s, v31.4s, v24.4s\n"
1305 "ldr q24, [%[b_ptr0], #0x60]\n"
1306 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1307 "str q27, [c_ptr1]\n"
1308 "add c_ptr1, c_ptr1, #0x10\n"
1309 "fmin v30.4s, v30.4s, v25.4s\n"
1310 "add %[biasptr], %[biasptr], %[biasinc]\n"
1311 "fmin v31.4s, v31.4s, v25.4s\n"
1312 "str q28, [c_ptr2]\n"
1313 "mov v27.16b, v26.16b\n"
1314 "ldr q25, [%[b_ptr0], #0x70]\n"
1315 "mov v28.16b, v26.16b\n"
1316 "add c_ptr2, c_ptr2, #0x10\n"
1317 "str q29, [c_ptr3]\n"
1318 "add c_ptr3, c_ptr3, #0x10\n"
1319 "mov v29.16b, v26.16b\n"
1320 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1321 "fmla v27.4s, v18.4s, v3.s[0]\n"
1322 "str q30, [c_ptr4]\n"
1323 "mov v30.16b, v26.16b\n"
1324 "add c_ptr4, c_ptr4, #0x10\n"
1325 "fmla v28.4s, v18.4s, v6.s[0]\n"
1326 "str q31, [c_ptr5]\n"
1327 "mov v31.16b, v26.16b\n"
1328 "add c_ptr5, c_ptr5, #0x10\n"
1329 "fmla v26.4s, v18.4s, v0.s[0]\n"
1330 "fmla v29.4s, v18.4s, v9.s[0]\n"
1331 "fmla v30.4s, v18.4s, v12.s[0]\n"
1332 "fmla v31.4s, v18.4s, v15.s[0]\n"
1333 "ldr q18, [%[b_ptr0]]\n"
1334 "fmla v26.4s, v19.4s, v0.s[1]\n"
1335 "fmla v27.4s, v19.4s, v3.s[1]\n"
1336 "fmla v28.4s, v19.4s, v6.s[1]\n"
1337 "fmla v29.4s, v19.4s, v9.s[1]\n"
1338 "fmla v30.4s, v19.4s, v12.s[1]\n"
1339 "fmla v31.4s, v19.4s, v15.s[1]\n"
1340 "ldr q19, [%[b_ptr0], #0x10]\n"
1341 "fmla v26.4s, v20.4s, v0.s[2]\n"
1342 "fmla v27.4s, v20.4s, v3.s[2]\n"
1343 "fmla v28.4s, v20.4s, v6.s[2]\n"
1344 "fmla v29.4s, v20.4s, v9.s[2]\n"
1345 "fmla v30.4s, v20.4s, v12.s[2]\n"
1346 "fmla v31.4s, v20.4s, v15.s[2]\n"
1347 "ldr q20, [%[b_ptr0], #0x20]\n"
1348 "fmla v26.4s, v21.4s, v0.s[3]\n"
1349 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1350 "fmla v27.4s, v21.4s, v3.s[3]\n"
1351 "fmla v28.4s, v21.4s, v6.s[3]\n"
1352 "fmla v29.4s, v21.4s, v9.s[3]\n"
1353 "fmla v30.4s, v21.4s, v12.s[3]\n"
1354 "fmla v31.4s, v21.4s, v15.s[3]\n"
1355 "fmla v26.4s, v22.4s, v1.s[0]\n"
1356 "fmla v27.4s, v22.4s, v4.s[0]\n"
1357 "fmla v28.4s, v22.4s, v7.s[0]\n"
1358 "fmla v29.4s, v22.4s, v10.s[0]\n"
1359 "fmla v30.4s, v22.4s, v13.s[0]\n"
1360 "fmla v31.4s, v22.4s, v16.s[0]\n"
1361 "fmla v26.4s, v23.4s, v1.s[1]\n"
1362 "fmla v27.4s, v23.4s, v4.s[1]\n"
1363 "fmla v28.4s, v23.4s, v7.s[1]\n"
1364 "fmla v29.4s, v23.4s, v10.s[1]\n"
1365 "fmla v30.4s, v23.4s, v13.s[1]\n"
1366 "fmla v31.4s, v23.4s, v16.s[1]\n"
1367 "fmla v26.4s, v24.4s, v1.s[2]\n"
1368 "fmla v27.4s, v24.4s, v4.s[2]\n"
1369 "fmla v28.4s, v24.4s, v7.s[2]\n"
1370 "fmla v29.4s, v24.4s, v10.s[2]\n"
1371 "fmla v30.4s, v24.4s, v13.s[2]\n"
1372 "fmla v31.4s, v24.4s, v16.s[2]\n"
1373 "fmla v26.4s, v25.4s, v1.s[3]\n"
1374 "fmla v27.4s, v25.4s, v4.s[3]\n"
1375 "fmla v28.4s, v25.4s, v7.s[3]\n"
1376 "fmla v29.4s, v25.4s, v10.s[3]\n"
1377 "fmla v30.4s, v25.4s, v13.s[3]\n"
1378 "fmla v31.4s, v25.4s, v16.s[3]\n"
1379 "fmla v26.4s, v18.4s, v2.s[0]\n"
1380 "fmla v27.4s, v18.4s, v5.s[0]\n"
1381 "fmla v28.4s, v18.4s, v8.s[0]\n"
1382 "fmla v29.4s, v18.4s, v11.s[0]\n"
1383 "fmla v30.4s, v18.4s, v14.s[0]\n"
1384 "fmla v31.4s, v18.4s, v17.s[0]\n"
1385 "fmla v26.4s, v19.4s, v2.s[1]\n"
1386 "fmla v27.4s, v19.4s, v5.s[1]\n"
1387 "fmla v28.4s, v19.4s, v8.s[1]\n"
1388 "fmla v29.4s, v19.4s, v11.s[1]\n"
1389 "fmla v30.4s, v19.4s, v14.s[1]\n"
1390 "fmla v31.4s, v19.4s, v17.s[1]\n"
1391 "fmla v26.4s, v20.4s, v2.s[2]\n"
1392 "fmla v27.4s, v20.4s, v5.s[2]\n"
1393 "fmla v28.4s, v20.4s, v8.s[2]\n"
1394 "fmla v29.4s, v20.4s, v11.s[2]\n"
1395 "fmla v30.4s, v20.4s, v14.s[2]\n"
1396 "fmla v31.4s, v20.4s, v17.s[2]\n"
1399 "ldr q26, [%[biasptr]]\n"
1400 "add %[biasptr], %[biasptr], %[biasinc]\n"
1401 "mov v27.16b, v26.16b\n"
1402 "mov v28.16b, v26.16b\n"
1403 "mov v29.16b, v26.16b\n"
1404 "mov v30.16b, v26.16b\n"
1405 "mov v31.16b, v26.16b\n"
1406 "fmla v26.4s, v18.4s, v0.s[0]\n"
1407 "fmla v27.4s, v18.4s, v3.s[0]\n"
1408 "fmla v28.4s, v18.4s, v6.s[0]\n"
1409 "fmla v29.4s, v18.4s, v9.s[0]\n"
1410 "fmla v30.4s, v18.4s, v12.s[0]\n"
1411 "fmla v31.4s, v18.4s, v15.s[0]\n"
1412 "ldr q18, [%[b_ptr0]]\n"
1413 "fmla v26.4s, v19.4s, v0.s[1]\n"
1414 "fmla v27.4s, v19.4s, v3.s[1]\n"
1415 "fmla v28.4s, v19.4s, v6.s[1]\n"
1416 "fmla v29.4s, v19.4s, v9.s[1]\n"
1417 "fmla v30.4s, v19.4s, v12.s[1]\n"
1418 "fmla v31.4s, v19.4s, v15.s[1]\n"
1419 "ldr q19, [%[b_ptr0], #0x10]\n"
1420 "fmla v26.4s, v20.4s, v0.s[2]\n"
1421 "fmla v27.4s, v20.4s, v3.s[2]\n"
1422 "fmla v28.4s, v20.4s, v6.s[2]\n"
1423 "fmla v29.4s, v20.4s, v9.s[2]\n"
1424 "fmla v30.4s, v20.4s, v12.s[2]\n"
1425 "fmla v31.4s, v20.4s, v15.s[2]\n"
1426 "ldr q20, [%[b_ptr0], #0x20]\n"
1427 "fmla v26.4s, v21.4s, v0.s[3]\n"
1428 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1429 "fmla v27.4s, v21.4s, v3.s[3]\n"
1430 "fmla v28.4s, v21.4s, v6.s[3]\n"
1431 "fmla v29.4s, v21.4s, v9.s[3]\n"
1432 "fmla v30.4s, v21.4s, v12.s[3]\n"
1433 "fmla v31.4s, v21.4s, v15.s[3]\n"
1434 "fmla v26.4s, v22.4s, v1.s[0]\n"
1435 "fmla v27.4s, v22.4s, v4.s[0]\n"
1436 "fmla v28.4s, v22.4s, v7.s[0]\n"
1437 "fmla v29.4s, v22.4s, v10.s[0]\n"
1438 "fmla v30.4s, v22.4s, v13.s[0]\n"
1439 "fmla v31.4s, v22.4s, v16.s[0]\n"
1440 "fmla v26.4s, v23.4s, v1.s[1]\n"
1441 "fmla v27.4s, v23.4s, v4.s[1]\n"
1442 "fmla v28.4s, v23.4s, v7.s[1]\n"
1443 "fmla v29.4s, v23.4s, v10.s[1]\n"
1444 "fmla v30.4s, v23.4s, v13.s[1]\n"
1445 "fmla v31.4s, v23.4s, v16.s[1]\n"
1446 "fmla v26.4s, v24.4s, v1.s[2]\n"
1447 "fmla v27.4s, v24.4s, v4.s[2]\n"
1448 "fmla v28.4s, v24.4s, v7.s[2]\n"
1449 "fmla v29.4s, v24.4s, v10.s[2]\n"
1450 "fmla v30.4s, v24.4s, v13.s[2]\n"
1451 "fmla v31.4s, v24.4s, v16.s[2]\n"
1452 "fmla v26.4s, v25.4s, v1.s[3]\n"
1453 "fmla v27.4s, v25.4s, v4.s[3]\n"
1454 "fmla v28.4s, v25.4s, v7.s[3]\n"
1455 "fmla v29.4s, v25.4s, v10.s[3]\n"
1456 "fmla v30.4s, v25.4s, v13.s[3]\n"
1457 "fmla v31.4s, v25.4s, v16.s[3]\n"
1458 "fmla v26.4s, v18.4s, v2.s[0]\n"
1459 "fmla v27.4s, v18.4s, v5.s[0]\n"
1460 "fmla v28.4s, v18.4s, v8.s[0]\n"
1461 "fmla v29.4s, v18.4s, v11.s[0]\n"
1462 "fmla v30.4s, v18.4s, v14.s[0]\n"
1463 "fmla v31.4s, v18.4s, v17.s[0]\n"
1464 "fmla v26.4s, v19.4s, v2.s[1]\n"
1465 "fmla v27.4s, v19.4s, v5.s[1]\n"
1466 "fmla v28.4s, v19.4s, v8.s[1]\n"
1467 "fmla v29.4s, v19.4s, v11.s[1]\n"
1468 "fmla v30.4s, v19.4s, v14.s[1]\n"
1469 "fmla v31.4s, v19.4s, v17.s[1]\n"
1470 "fmla v26.4s, v20.4s, v2.s[2]\n"
1471 "fmla v27.4s, v20.4s, v5.s[2]\n"
1472 "fmla v28.4s, v20.4s, v8.s[2]\n"
1473 "fmla v29.4s, v20.4s, v11.s[2]\n"
1474 "fmla v30.4s, v20.4s, v14.s[2]\n"
1475 "fmla v31.4s, v20.4s, v17.s[2]\n"
1477 "ld1r {v24.4s}, [%[minptr]]\n"
1478 "ld1r {v25.4s}, [%[maxptr]]\n"
1479 "fmax v26.4s, v26.4s, v24.4s\n"
1480 "fmax v27.4s, v27.4s, v24.4s\n"
1481 "fmax v28.4s, v28.4s, v24.4s\n"
1482 "fmax v29.4s, v29.4s, v24.4s\n"
1483 "fmin v26.4s, v26.4s, v25.4s\n"
1484 "fmin v27.4s, v27.4s, v25.4s\n"
1485 "fmin v28.4s, v28.4s, v25.4s\n"
1486 "fmin v29.4s, v29.4s, v25.4s\n"
1487 "str q26, [%[c_ptr0]]\n"
1488 "fmax v30.4s, v30.4s, v24.4s\n"
1489 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1490 "fmax v31.4s, v31.4s, v24.4s\n"
1491 "str q27, [c_ptr1]\n"
1492 "fmin v30.4s, v30.4s, v25.4s\n"
1493 "fmin v31.4s, v31.4s, v25.4s\n"
1494 "str q28, [c_ptr2]\n"
1495 "str q29, [c_ptr3]\n"
1496 "str q30, [c_ptr4]\n"
1497 "str q31, [c_ptr5]\n"
1508 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
1509 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
1510 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1525 "add a_ptr1, %[a_ptr0], %[lda]\n"
1526 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1527 "add a_ptr2, a_ptr1, %[lda]\n"
1528 "add c_ptr2, c_ptr1, %[ldc]\n"
1529 "add a_ptr3, a_ptr2, %[lda]\n"
1530 "add c_ptr3, c_ptr2, %[ldc]\n"
1531 "add a_ptr4, a_ptr3, %[lda]\n"
1532 "add c_ptr4, c_ptr3, %[ldc]\n"
1533 "add a_ptr5, a_ptr4, %[lda]\n"
1534 "add c_ptr5, c_ptr4, %[ldc]\n"
1535 "cbz %[oob_rows], 1f\n"
1536 "subs %[oob_rows], %[oob_rows], #0x1\n"
1537 "add c_ptr5, %[c_ptr0], #0x0\n"
1538 "add a_ptr5, %[a_ptr0], #0x0\n"
1540 "subs %[oob_rows], %[oob_rows], #0x1\n"
1541 "add c_ptr4, %[c_ptr0], #0x0\n"
1542 "add a_ptr4, %[a_ptr0], #0x0\n"
1544 "subs %[oob_rows], %[oob_rows], #0x1\n"
1545 "add c_ptr3, %[c_ptr0], #0x0\n"
1546 "add a_ptr3, %[a_ptr0], #0x0\n"
1548 "subs %[oob_rows], %[oob_rows], #0x1\n"
1549 "add c_ptr2, %[c_ptr0], #0x0\n"
1550 "add a_ptr2, %[a_ptr0], #0x0\n"
1552 "subs %[oob_rows], %[oob_rows], #0x1\n"
1553 "add c_ptr1, %[c_ptr0], #0x0\n"
1554 "add a_ptr1, %[a_ptr0], #0x0\n"
1556 "ldr q0, [%[a_ptr0]], #0x10\n"
1557 "ldr q3, [a_ptr1], #0x10\n"
1558 "ldr q6, [a_ptr2], #0x10\n"
1559 "ldr q9, [a_ptr3], #0x10\n"
1560 "ldr q12, [a_ptr4], #0x10\n"
1561 "ldr q15, [a_ptr5], #0x10\n"
1562 "ldr q1, [%[a_ptr0]], #0x10\n"
1563 "ldr q4, [a_ptr1], #0x10\n"
1564 "ldr q7, [a_ptr2], #0x10\n"
1565 "ldr q10, [a_ptr3], #0x10\n"
1566 "ldr q13, [a_ptr4], #0x10\n"
1567 "ldr q16, [a_ptr5], #0x10\n"
1568 "ldr q2, [%[a_ptr0]]\n"
1569 "ldr q5, [a_ptr1]\n"
1570 "ldr q8, [a_ptr2]\n"
1571 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
1572 "ldr q11, [a_ptr3]\n"
1573 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
1574 "ldr q14, [a_ptr4]\n"
1575 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
1576 "ldr q17, [a_ptr5]\n"
1577 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
1578 "ldr q18, [%[b_ptr0]]\n"
1579 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
1580 "ldr q19, [%[b_ptr0], #0x10]\n"
1581 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
1582 "ldr q20, [%[b_ptr0], #0x20]\n"
1583 "ldr q21, [%[b_ptr0], #0x30]\n"
1584 "ldr q22, [%[b_ptr0], #0x40]\n"
1585 "ldr q23, [%[b_ptr0], #0x50]\n"
1586 "ldr q24, [%[b_ptr0], #0x60]\n"
1587 "ldr q25, [%[b_ptr0], #0x70]\n"
1588 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1589 "cbz %[loops], 2f\n"
1590 "ldr q26, [%[biasptr]]\n"
1591 "add %[biasptr], %[biasptr], %[biasinc]\n"
1592 "subs %[loops], %[loops], #0x1\n"
1593 "mov v27.16b, v26.16b\n"
1594 "mov v28.16b, v26.16b\n"
1595 "mov v29.16b, v26.16b\n"
1596 "mov v30.16b, v26.16b\n"
1597 "mov v31.16b, v26.16b\n"
1598 "fmla v26.4s, v18.4s, v0.s[0]\n"
1599 "fmla v27.4s, v18.4s, v3.s[0]\n"
1600 "fmla v28.4s, v18.4s, v6.s[0]\n"
1601 "fmla v29.4s, v18.4s, v9.s[0]\n"
1602 "fmla v30.4s, v18.4s, v12.s[0]\n"
1603 "fmla v31.4s, v18.4s, v15.s[0]\n"
1604 "ldr q18, [%[b_ptr0]]\n"
1605 "fmla v26.4s, v19.4s, v0.s[1]\n"
1606 "fmla v27.4s, v19.4s, v3.s[1]\n"
1607 "fmla v28.4s, v19.4s, v6.s[1]\n"
1608 "fmla v29.4s, v19.4s, v9.s[1]\n"
1609 "fmla v30.4s, v19.4s, v12.s[1]\n"
1610 "fmla v31.4s, v19.4s, v15.s[1]\n"
1611 "ldr q19, [%[b_ptr0], #0x10]\n"
1612 "fmla v26.4s, v20.4s, v0.s[2]\n"
1613 "fmla v27.4s, v20.4s, v3.s[2]\n"
1614 "fmla v28.4s, v20.4s, v6.s[2]\n"
1615 "fmla v29.4s, v20.4s, v9.s[2]\n"
1616 "fmla v30.4s, v20.4s, v12.s[2]\n"
1617 "fmla v31.4s, v20.4s, v15.s[2]\n"
1618 "ldr q20, [%[b_ptr0], #0x20]\n"
1619 "fmla v26.4s, v21.4s, v0.s[3]\n"
1620 "fmla v27.4s, v21.4s, v3.s[3]\n"
1621 "fmla v28.4s, v21.4s, v6.s[3]\n"
1622 "fmla v29.4s, v21.4s, v9.s[3]\n"
1623 "fmla v30.4s, v21.4s, v12.s[3]\n"
1624 "fmla v31.4s, v21.4s, v15.s[3]\n"
1625 "ldr q21, [%[b_ptr0], #0x30]\n"
1626 "fmla v26.4s, v22.4s, v1.s[0]\n"
1627 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1628 "fmla v27.4s, v22.4s, v4.s[0]\n"
1629 "fmla v28.4s, v22.4s, v7.s[0]\n"
1630 "fmla v29.4s, v22.4s, v10.s[0]\n"
1631 "fmla v30.4s, v22.4s, v13.s[0]\n"
1632 "fmla v31.4s, v22.4s, v16.s[0]\n"
1633 "fmla v26.4s, v23.4s, v1.s[1]\n"
1634 "fmla v27.4s, v23.4s, v4.s[1]\n"
1635 "fmla v28.4s, v23.4s, v7.s[1]\n"
1636 "fmla v29.4s, v23.4s, v10.s[1]\n"
1637 "fmla v30.4s, v23.4s, v13.s[1]\n"
1638 "fmla v31.4s, v23.4s, v16.s[1]\n"
1639 "fmla v26.4s, v24.4s, v1.s[2]\n"
1640 "fmla v27.4s, v24.4s, v4.s[2]\n"
1641 "fmla v28.4s, v24.4s, v7.s[2]\n"
1642 "fmla v29.4s, v24.4s, v10.s[2]\n"
1643 "fmla v30.4s, v24.4s, v13.s[2]\n"
1644 "fmla v31.4s, v24.4s, v16.s[2]\n"
1645 "fmla v26.4s, v25.4s, v1.s[3]\n"
1646 "fmla v27.4s, v25.4s, v4.s[3]\n"
1647 "fmla v28.4s, v25.4s, v7.s[3]\n"
1648 "fmla v29.4s, v25.4s, v10.s[3]\n"
1649 "fmla v30.4s, v25.4s, v13.s[3]\n"
1650 "fmla v31.4s, v25.4s, v16.s[3]\n"
1651 "fmla v26.4s, v18.4s, v2.s[0]\n"
1652 "fmla v27.4s, v18.4s, v5.s[0]\n"
1653 "fmla v28.4s, v18.4s, v8.s[0]\n"
1654 "fmla v29.4s, v18.4s, v11.s[0]\n"
1655 "fmla v30.4s, v18.4s, v14.s[0]\n"
1656 "fmla v31.4s, v18.4s, v17.s[0]\n"
1657 "fmla v26.4s, v19.4s, v2.s[1]\n"
1658 "fmla v27.4s, v19.4s, v5.s[1]\n"
1659 "fmla v28.4s, v19.4s, v8.s[1]\n"
1660 "fmla v29.4s, v19.4s, v11.s[1]\n"
1661 "fmla v30.4s, v19.4s, v14.s[1]\n"
1662 "fmla v31.4s, v19.4s, v17.s[1]\n"
1663 "fmla v26.4s, v20.4s, v2.s[2]\n"
1664 "fmla v27.4s, v20.4s, v5.s[2]\n"
1665 "fmla v28.4s, v20.4s, v8.s[2]\n"
1666 "fmla v29.4s, v20.4s, v11.s[2]\n"
1667 "fmla v30.4s, v20.4s, v14.s[2]\n"
1668 "fmla v31.4s, v20.4s, v17.s[2]\n"
1669 "fmla v26.4s, v21.4s, v2.s[3]\n"
1670 "fmla v27.4s, v21.4s, v5.s[3]\n"
1671 "fmla v28.4s, v21.4s, v8.s[3]\n"
1672 "fmla v29.4s, v21.4s, v11.s[3]\n"
1673 "fmla v30.4s, v21.4s, v14.s[3]\n"
1674 "fmla v31.4s, v21.4s, v17.s[3]\n"
1677 "ld1r {v24.4s}, [%[minptr]]\n"
1678 "subs %[loops], %[loops], #0x1\n"
1679 "ld1r {v25.4s}, [%[maxptr]]\n"
1680 "ldr q18, [%[b_ptr0]]\n"
1681 "fmax v26.4s, v26.4s, v24.4s\n"
1682 "ldr q19, [%[b_ptr0], #0x10]\n"
1683 "fmax v27.4s, v27.4s, v24.4s\n"
1684 "ldr q20, [%[b_ptr0], #0x20]\n"
1685 "fmax v28.4s, v28.4s, v24.4s\n"
1686 "ldr q21, [%[b_ptr0], #0x30]\n"
1687 "fmax v29.4s, v29.4s, v24.4s\n"
1688 "ldr q22, [%[b_ptr0], #0x40]\n"
1689 "fmin v26.4s, v26.4s, v25.4s\n"
1690 "ldr q23, [%[b_ptr0], #0x50]\n"
1691 "fmin v27.4s, v27.4s, v25.4s\n"
1692 "fmin v28.4s, v28.4s, v25.4s\n"
1693 "fmin v29.4s, v29.4s, v25.4s\n"
1694 "str q26, [%[c_ptr0]]\n"
1695 "fmax v30.4s, v30.4s, v24.4s\n"
1696 "ldr q26, [%[biasptr]]\n"
1697 "fmax v31.4s, v31.4s, v24.4s\n"
1698 "ldr q24, [%[b_ptr0], #0x60]\n"
1699 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1700 "str q27, [c_ptr1]\n"
1701 "add c_ptr1, c_ptr1, #0x10\n"
1702 "fmin v30.4s, v30.4s, v25.4s\n"
1703 "add %[biasptr], %[biasptr], %[biasinc]\n"
1704 "fmin v31.4s, v31.4s, v25.4s\n"
1705 "str q28, [c_ptr2]\n"
1706 "mov v27.16b, v26.16b\n"
1707 "ldr q25, [%[b_ptr0], #0x70]\n"
1708 "mov v28.16b, v26.16b\n"
1709 "add c_ptr2, c_ptr2, #0x10\n"
1710 "str q29, [c_ptr3]\n"
1711 "add c_ptr3, c_ptr3, #0x10\n"
1712 "mov v29.16b, v26.16b\n"
1713 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1714 "fmla v27.4s, v18.4s, v3.s[0]\n"
1715 "str q30, [c_ptr4]\n"
1716 "mov v30.16b, v26.16b\n"
1717 "add c_ptr4, c_ptr4, #0x10\n"
1718 "fmla v28.4s, v18.4s, v6.s[0]\n"
1719 "str q31, [c_ptr5]\n"
1720 "mov v31.16b, v26.16b\n"
1721 "add c_ptr5, c_ptr5, #0x10\n"
1722 "fmla v26.4s, v18.4s, v0.s[0]\n"
1723 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1724 "fmla v29.4s, v18.4s, v9.s[0]\n"
1725 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1726 "fmla v30.4s, v18.4s, v12.s[0]\n"
1727 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1728 "fmla v31.4s, v18.4s, v15.s[0]\n"
1729 "ldr q18, [%[b_ptr0]]\n"
1730 "fmla v26.4s, v19.4s, v0.s[1]\n"
1731 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1732 "fmla v27.4s, v19.4s, v3.s[1]\n"
1733 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1734 "fmla v28.4s, v19.4s, v6.s[1]\n"
1735 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1736 "fmla v29.4s, v19.4s, v9.s[1]\n"
1737 "fmla v30.4s, v19.4s, v12.s[1]\n"
1738 "fmla v31.4s, v19.4s, v15.s[1]\n"
1739 "ldr q19, [%[b_ptr0], #0x10]\n"
1740 "fmla v26.4s, v20.4s, v0.s[2]\n"
1741 "fmla v27.4s, v20.4s, v3.s[2]\n"
1742 "fmla v28.4s, v20.4s, v6.s[2]\n"
1743 "fmla v29.4s, v20.4s, v9.s[2]\n"
1744 "fmla v30.4s, v20.4s, v12.s[2]\n"
1745 "fmla v31.4s, v20.4s, v15.s[2]\n"
1746 "ldr q20, [%[b_ptr0], #0x20]\n"
1747 "fmla v26.4s, v21.4s, v0.s[3]\n"
1748 "fmla v27.4s, v21.4s, v3.s[3]\n"
1749 "fmla v28.4s, v21.4s, v6.s[3]\n"
1750 "fmla v29.4s, v21.4s, v9.s[3]\n"
1751 "fmla v30.4s, v21.4s, v12.s[3]\n"
1752 "fmla v31.4s, v21.4s, v15.s[3]\n"
1753 "ldr q21, [%[b_ptr0], #0x30]\n"
1754 "fmla v26.4s, v22.4s, v1.s[0]\n"
1755 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1756 "fmla v27.4s, v22.4s, v4.s[0]\n"
1757 "fmla v28.4s, v22.4s, v7.s[0]\n"
1758 "fmla v29.4s, v22.4s, v10.s[0]\n"
1759 "fmla v30.4s, v22.4s, v13.s[0]\n"
1760 "fmla v31.4s, v22.4s, v16.s[0]\n"
1761 "fmla v26.4s, v23.4s, v1.s[1]\n"
1762 "fmla v27.4s, v23.4s, v4.s[1]\n"
1763 "fmla v28.4s, v23.4s, v7.s[1]\n"
1764 "fmla v29.4s, v23.4s, v10.s[1]\n"
1765 "fmla v30.4s, v23.4s, v13.s[1]\n"
1766 "fmla v31.4s, v23.4s, v16.s[1]\n"
1767 "fmla v26.4s, v24.4s, v1.s[2]\n"
1768 "fmla v27.4s, v24.4s, v4.s[2]\n"
1769 "fmla v28.4s, v24.4s, v7.s[2]\n"
1770 "fmla v29.4s, v24.4s, v10.s[2]\n"
1771 "fmla v30.4s, v24.4s, v13.s[2]\n"
1772 "fmla v31.4s, v24.4s, v16.s[2]\n"
1773 "fmla v26.4s, v25.4s, v1.s[3]\n"
1774 "fmla v27.4s, v25.4s, v4.s[3]\n"
1775 "fmla v28.4s, v25.4s, v7.s[3]\n"
1776 "fmla v29.4s, v25.4s, v10.s[3]\n"
1777 "fmla v30.4s, v25.4s, v13.s[3]\n"
1778 "fmla v31.4s, v25.4s, v16.s[3]\n"
1779 "fmla v26.4s, v18.4s, v2.s[0]\n"
1780 "fmla v27.4s, v18.4s, v5.s[0]\n"
1781 "fmla v28.4s, v18.4s, v8.s[0]\n"
1782 "fmla v29.4s, v18.4s, v11.s[0]\n"
1783 "fmla v30.4s, v18.4s, v14.s[0]\n"
1784 "fmla v31.4s, v18.4s, v17.s[0]\n"
1785 "fmla v26.4s, v19.4s, v2.s[1]\n"
1786 "fmla v27.4s, v19.4s, v5.s[1]\n"
1787 "fmla v28.4s, v19.4s, v8.s[1]\n"
1788 "fmla v29.4s, v19.4s, v11.s[1]\n"
1789 "fmla v30.4s, v19.4s, v14.s[1]\n"
1790 "fmla v31.4s, v19.4s, v17.s[1]\n"
1791 "fmla v26.4s, v20.4s, v2.s[2]\n"
1792 "fmla v27.4s, v20.4s, v5.s[2]\n"
1793 "fmla v28.4s, v20.4s, v8.s[2]\n"
1794 "fmla v29.4s, v20.4s, v11.s[2]\n"
1795 "fmla v30.4s, v20.4s, v14.s[2]\n"
1796 "fmla v31.4s, v20.4s, v17.s[2]\n"
1797 "fmla v26.4s, v21.4s, v2.s[3]\n"
1798 "fmla v27.4s, v21.4s, v5.s[3]\n"
1799 "fmla v28.4s, v21.4s, v8.s[3]\n"
1800 "fmla v29.4s, v21.4s, v11.s[3]\n"
1801 "fmla v30.4s, v21.4s, v14.s[3]\n"
1802 "fmla v31.4s, v21.4s, v17.s[3]\n"
1805 "ld1r {v24.4s}, [%[minptr]]\n"
1806 "ld1r {v25.4s}, [%[maxptr]]\n"
1807 "ldr q18, [%[b_ptr0]]\n"
1808 "ldr q19, [%[b_ptr0], #0x10]\n"
1809 "fmax v26.4s, v26.4s, v24.4s\n"
1810 "ldr q20, [%[b_ptr0], #0x20]\n"
1811 "fmax v27.4s, v27.4s, v24.4s\n"
1812 "ldr q21, [%[b_ptr0], #0x30]\n"
1813 "fmax v28.4s, v28.4s, v24.4s\n"
1814 "ldr q22, [%[b_ptr0], #0x40]\n"
1815 "fmax v29.4s, v29.4s, v24.4s\n"
1816 "ldr q23, [%[b_ptr0], #0x50]\n"
1817 "fmin v26.4s, v26.4s, v25.4s\n"
1818 "fmin v27.4s, v27.4s, v25.4s\n"
1819 "fmin v28.4s, v28.4s, v25.4s\n"
1820 "fmin v29.4s, v29.4s, v25.4s\n"
1821 "str q26, [%[c_ptr0]]\n"
1822 "fmax v30.4s, v30.4s, v24.4s\n"
1823 "ldr q26, [%[biasptr]]\n"
1824 "fmax v31.4s, v31.4s, v24.4s\n"
1825 "ldr q24, [%[b_ptr0], #0x60]\n"
1826 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1827 "str q27, [c_ptr1]\n"
1828 "add c_ptr1, c_ptr1, #0x10\n"
1829 "fmin v30.4s, v30.4s, v25.4s\n"
1830 "add %[biasptr], %[biasptr], %[biasinc]\n"
1831 "fmin v31.4s, v31.4s, v25.4s\n"
1832 "str q28, [c_ptr2]\n"
1833 "mov v27.16b, v26.16b\n"
1834 "ldr q25, [%[b_ptr0], #0x70]\n"
1835 "mov v28.16b, v26.16b\n"
1836 "add c_ptr2, c_ptr2, #0x10\n"
1837 "str q29, [c_ptr3]\n"
1838 "add c_ptr3, c_ptr3, #0x10\n"
1839 "mov v29.16b, v26.16b\n"
1840 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1841 "fmla v27.4s, v18.4s, v3.s[0]\n"
1842 "str q30, [c_ptr4]\n"
1843 "mov v30.16b, v26.16b\n"
1844 "add c_ptr4, c_ptr4, #0x10\n"
1845 "fmla v28.4s, v18.4s, v6.s[0]\n"
1846 "str q31, [c_ptr5]\n"
1847 "mov v31.16b, v26.16b\n"
1848 "add c_ptr5, c_ptr5, #0x10\n"
1849 "fmla v26.4s, v18.4s, v0.s[0]\n"
1850 "fmla v29.4s, v18.4s, v9.s[0]\n"
1851 "fmla v30.4s, v18.4s, v12.s[0]\n"
1852 "fmla v31.4s, v18.4s, v15.s[0]\n"
1853 "ldr q18, [%[b_ptr0]]\n"
1854 "fmla v26.4s, v19.4s, v0.s[1]\n"
1855 "fmla v27.4s, v19.4s, v3.s[1]\n"
1856 "fmla v28.4s, v19.4s, v6.s[1]\n"
1857 "fmla v29.4s, v19.4s, v9.s[1]\n"
1858 "fmla v30.4s, v19.4s, v12.s[1]\n"
1859 "fmla v31.4s, v19.4s, v15.s[1]\n"
1860 "ldr q19, [%[b_ptr0], #0x10]\n"
1861 "fmla v26.4s, v20.4s, v0.s[2]\n"
1862 "fmla v27.4s, v20.4s, v3.s[2]\n"
1863 "fmla v28.4s, v20.4s, v6.s[2]\n"
1864 "fmla v29.4s, v20.4s, v9.s[2]\n"
1865 "fmla v30.4s, v20.4s, v12.s[2]\n"
1866 "fmla v31.4s, v20.4s, v15.s[2]\n"
1867 "ldr q20, [%[b_ptr0], #0x20]\n"
1868 "fmla v26.4s, v21.4s, v0.s[3]\n"
1869 "fmla v27.4s, v21.4s, v3.s[3]\n"
1870 "fmla v28.4s, v21.4s, v6.s[3]\n"
1871 "fmla v29.4s, v21.4s, v9.s[3]\n"
1872 "fmla v30.4s, v21.4s, v12.s[3]\n"
1873 "fmla v31.4s, v21.4s, v15.s[3]\n"
1874 "ldr q21, [%[b_ptr0], #0x30]\n"
1875 "fmla v26.4s, v22.4s, v1.s[0]\n"
1876 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1877 "fmla v27.4s, v22.4s, v4.s[0]\n"
1878 "fmla v28.4s, v22.4s, v7.s[0]\n"
1879 "fmla v29.4s, v22.4s, v10.s[0]\n"
1880 "fmla v30.4s, v22.4s, v13.s[0]\n"
1881 "fmla v31.4s, v22.4s, v16.s[0]\n"
1882 "fmla v26.4s, v23.4s, v1.s[1]\n"
1883 "fmla v27.4s, v23.4s, v4.s[1]\n"
1884 "fmla v28.4s, v23.4s, v7.s[1]\n"
1885 "fmla v29.4s, v23.4s, v10.s[1]\n"
1886 "fmla v30.4s, v23.4s, v13.s[1]\n"
1887 "fmla v31.4s, v23.4s, v16.s[1]\n"
1888 "fmla v26.4s, v24.4s, v1.s[2]\n"
1889 "fmla v27.4s, v24.4s, v4.s[2]\n"
1890 "fmla v28.4s, v24.4s, v7.s[2]\n"
1891 "fmla v29.4s, v24.4s, v10.s[2]\n"
1892 "fmla v30.4s, v24.4s, v13.s[2]\n"
1893 "fmla v31.4s, v24.4s, v16.s[2]\n"
1894 "fmla v26.4s, v25.4s, v1.s[3]\n"
1895 "fmla v27.4s, v25.4s, v4.s[3]\n"
1896 "fmla v28.4s, v25.4s, v7.s[3]\n"
1897 "fmla v29.4s, v25.4s, v10.s[3]\n"
1898 "fmla v30.4s, v25.4s, v13.s[3]\n"
1899 "fmla v31.4s, v25.4s, v16.s[3]\n"
1900 "fmla v26.4s, v18.4s, v2.s[0]\n"
1901 "fmla v27.4s, v18.4s, v5.s[0]\n"
1902 "fmla v28.4s, v18.4s, v8.s[0]\n"
1903 "fmla v29.4s, v18.4s, v11.s[0]\n"
1904 "fmla v30.4s, v18.4s, v14.s[0]\n"
1905 "fmla v31.4s, v18.4s, v17.s[0]\n"
1906 "fmla v26.4s, v19.4s, v2.s[1]\n"
1907 "fmla v27.4s, v19.4s, v5.s[1]\n"
1908 "fmla v28.4s, v19.4s, v8.s[1]\n"
1909 "fmla v29.4s, v19.4s, v11.s[1]\n"
1910 "fmla v30.4s, v19.4s, v14.s[1]\n"
1911 "fmla v31.4s, v19.4s, v17.s[1]\n"
1912 "fmla v26.4s, v20.4s, v2.s[2]\n"
1913 "fmla v27.4s, v20.4s, v5.s[2]\n"
1914 "fmla v28.4s, v20.4s, v8.s[2]\n"
1915 "fmla v29.4s, v20.4s, v11.s[2]\n"
1916 "fmla v30.4s, v20.4s, v14.s[2]\n"
1917 "fmla v31.4s, v20.4s, v17.s[2]\n"
1918 "fmla v26.4s, v21.4s, v2.s[3]\n"
1919 "fmla v27.4s, v21.4s, v5.s[3]\n"
1920 "fmla v28.4s, v21.4s, v8.s[3]\n"
1921 "fmla v29.4s, v21.4s, v11.s[3]\n"
1922 "fmla v30.4s, v21.4s, v14.s[3]\n"
1923 "fmla v31.4s, v21.4s, v17.s[3]\n"
1926 "ldr q26, [%[biasptr]]\n"
1927 "add %[biasptr], %[biasptr], %[biasinc]\n"
1928 "mov v27.16b, v26.16b\n"
1929 "mov v28.16b, v26.16b\n"
1930 "mov v29.16b, v26.16b\n"
1931 "mov v30.16b, v26.16b\n"
1932 "mov v31.16b, v26.16b\n"
1933 "fmla v26.4s, v18.4s, v0.s[0]\n"
1934 "fmla v27.4s, v18.4s, v3.s[0]\n"
1935 "fmla v28.4s, v18.4s, v6.s[0]\n"
1936 "fmla v29.4s, v18.4s, v9.s[0]\n"
1937 "fmla v30.4s, v18.4s, v12.s[0]\n"
1938 "fmla v31.4s, v18.4s, v15.s[0]\n"
1939 "ldr q18, [%[b_ptr0]]\n"
1940 "fmla v26.4s, v19.4s, v0.s[1]\n"
1941 "fmla v27.4s, v19.4s, v3.s[1]\n"
1942 "fmla v28.4s, v19.4s, v6.s[1]\n"
1943 "fmla v29.4s, v19.4s, v9.s[1]\n"
1944 "fmla v30.4s, v19.4s, v12.s[1]\n"
1945 "fmla v31.4s, v19.4s, v15.s[1]\n"
1946 "ldr q19, [%[b_ptr0], #0x10]\n"
1947 "fmla v26.4s, v20.4s, v0.s[2]\n"
1948 "fmla v27.4s, v20.4s, v3.s[2]\n"
1949 "fmla v28.4s, v20.4s, v6.s[2]\n"
1950 "fmla v29.4s, v20.4s, v9.s[2]\n"
1951 "fmla v30.4s, v20.4s, v12.s[2]\n"
1952 "fmla v31.4s, v20.4s, v15.s[2]\n"
1953 "ldr q20, [%[b_ptr0], #0x20]\n"
1954 "fmla v26.4s, v21.4s, v0.s[3]\n"
1955 "fmla v27.4s, v21.4s, v3.s[3]\n"
1956 "fmla v28.4s, v21.4s, v6.s[3]\n"
1957 "fmla v29.4s, v21.4s, v9.s[3]\n"
1958 "fmla v30.4s, v21.4s, v12.s[3]\n"
1959 "fmla v31.4s, v21.4s, v15.s[3]\n"
1960 "ldr q21, [%[b_ptr0], #0x30]\n"
1961 "fmla v26.4s, v22.4s, v1.s[0]\n"
1962 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1963 "fmla v27.4s, v22.4s, v4.s[0]\n"
1964 "fmla v28.4s, v22.4s, v7.s[0]\n"
1965 "fmla v29.4s, v22.4s, v10.s[0]\n"
1966 "fmla v30.4s, v22.4s, v13.s[0]\n"
1967 "fmla v31.4s, v22.4s, v16.s[0]\n"
1968 "fmla v26.4s, v23.4s, v1.s[1]\n"
1969 "fmla v27.4s, v23.4s, v4.s[1]\n"
1970 "fmla v28.4s, v23.4s, v7.s[1]\n"
1971 "fmla v29.4s, v23.4s, v10.s[1]\n"
1972 "fmla v30.4s, v23.4s, v13.s[1]\n"
1973 "fmla v31.4s, v23.4s, v16.s[1]\n"
1974 "fmla v26.4s, v24.4s, v1.s[2]\n"
1975 "fmla v27.4s, v24.4s, v4.s[2]\n"
1976 "fmla v28.4s, v24.4s, v7.s[2]\n"
1977 "fmla v29.4s, v24.4s, v10.s[2]\n"
1978 "fmla v30.4s, v24.4s, v13.s[2]\n"
1979 "fmla v31.4s, v24.4s, v16.s[2]\n"
1980 "fmla v26.4s, v25.4s, v1.s[3]\n"
1981 "fmla v27.4s, v25.4s, v4.s[3]\n"
1982 "fmla v28.4s, v25.4s, v7.s[3]\n"
1983 "fmla v29.4s, v25.4s, v10.s[3]\n"
1984 "fmla v30.4s, v25.4s, v13.s[3]\n"
1985 "fmla v31.4s, v25.4s, v16.s[3]\n"
1986 "fmla v26.4s, v18.4s, v2.s[0]\n"
1987 "fmla v27.4s, v18.4s, v5.s[0]\n"
1988 "fmla v28.4s, v18.4s, v8.s[0]\n"
1989 "fmla v29.4s, v18.4s, v11.s[0]\n"
1990 "fmla v30.4s, v18.4s, v14.s[0]\n"
1991 "fmla v31.4s, v18.4s, v17.s[0]\n"
1992 "fmla v26.4s, v19.4s, v2.s[1]\n"
1993 "fmla v27.4s, v19.4s, v5.s[1]\n"
1994 "fmla v28.4s, v19.4s, v8.s[1]\n"
1995 "fmla v29.4s, v19.4s, v11.s[1]\n"
1996 "fmla v30.4s, v19.4s, v14.s[1]\n"
1997 "fmla v31.4s, v19.4s, v17.s[1]\n"
1998 "fmla v26.4s, v20.4s, v2.s[2]\n"
1999 "fmla v27.4s, v20.4s, v5.s[2]\n"
2000 "fmla v28.4s, v20.4s, v8.s[2]\n"
2001 "fmla v29.4s, v20.4s, v11.s[2]\n"
2002 "fmla v30.4s, v20.4s, v14.s[2]\n"
2003 "fmla v31.4s, v20.4s, v17.s[2]\n"
2004 "fmla v26.4s, v21.4s, v2.s[3]\n"
2005 "fmla v27.4s, v21.4s, v5.s[3]\n"
2006 "fmla v28.4s, v21.4s, v8.s[3]\n"
2007 "fmla v29.4s, v21.4s, v11.s[3]\n"
2008 "fmla v30.4s, v21.4s, v14.s[3]\n"
2009 "fmla v31.4s, v21.4s, v17.s[3]\n"
2011 "ld1r {v24.4s}, [%[minptr]]\n"
2012 "ld1r {v25.4s}, [%[maxptr]]\n"
2013 "fmax v26.4s, v26.4s, v24.4s\n"
2014 "fmax v27.4s, v27.4s, v24.4s\n"
2015 "fmax v28.4s, v28.4s, v24.4s\n"
2016 "fmax v29.4s, v29.4s, v24.4s\n"
2017 "fmin v26.4s, v26.4s, v25.4s\n"
2018 "fmin v27.4s, v27.4s, v25.4s\n"
2019 "fmin v28.4s, v28.4s, v25.4s\n"
2020 "fmin v29.4s, v29.4s, v25.4s\n"
2021 "str q26, [%[c_ptr0]]\n"
2022 "fmax v30.4s, v30.4s, v24.4s\n"
2023 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2024 "fmax v31.4s, v31.4s, v24.4s\n"
2025 "str q27, [c_ptr1]\n"
2026 "fmin v30.4s, v30.4s, v25.4s\n"
2027 "fmin v31.4s, v31.4s, v25.4s\n"
2028 "str q28, [c_ptr2]\n"
2029 "str q29, [c_ptr3]\n"
2030 "str q30, [c_ptr4]\n"
2031 "str q31, [c_ptr5]\n"
2042 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
2043 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
2044 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2059 "add a_ptr1, %[a_ptr0], %[lda]\n"
2060 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2061 "add a_ptr2, a_ptr1, %[lda]\n"
2062 "add c_ptr2, c_ptr1, %[ldc]\n"
2063 "add a_ptr3, a_ptr2, %[lda]\n"
2064 "add c_ptr3, c_ptr2, %[ldc]\n"
2065 "add a_ptr4, a_ptr3, %[lda]\n"
2066 "add c_ptr4, c_ptr3, %[ldc]\n"
2067 "add a_ptr5, a_ptr4, %[lda]\n"
2068 "add c_ptr5, c_ptr4, %[ldc]\n"
2069 "cbz %[oob_rows], 1f\n"
2070 "subs %[oob_rows], %[oob_rows], #0x1\n"
2071 "add c_ptr5, %[c_ptr0], #0x0\n"
2072 "add a_ptr5, %[a_ptr0], #0x0\n"
2074 "subs %[oob_rows], %[oob_rows], #0x1\n"
2075 "add c_ptr4, %[c_ptr0], #0x0\n"
2076 "add a_ptr4, %[a_ptr0], #0x0\n"
2078 "subs %[oob_rows], %[oob_rows], #0x1\n"
2079 "add c_ptr3, %[c_ptr0], #0x0\n"
2080 "add a_ptr3, %[a_ptr0], #0x0\n"
2082 "subs %[oob_rows], %[oob_rows], #0x1\n"
2083 "add c_ptr2, %[c_ptr0], #0x0\n"
2084 "add a_ptr2, %[a_ptr0], #0x0\n"
2086 "subs %[oob_rows], %[oob_rows], #0x1\n"
2087 "add c_ptr1, %[c_ptr0], #0x0\n"
2088 "add a_ptr1, %[a_ptr0], #0x0\n"
2090 "ldr q0, [%[a_ptr0]], #0x10\n"
2091 "ldr q4, [a_ptr1], #0x10\n"
2092 "ldr q8, [a_ptr2], #0x10\n"
2093 "ldr q12, [a_ptr3], #0x10\n"
2094 "ldr q16, [a_ptr4], #0x10\n"
2095 "ldr q20, [a_ptr5], #0x10\n"
2096 "ldr q1, [%[a_ptr0]], #0x10\n"
2097 "ldr q5, [a_ptr1], #0x10\n"
2098 "ldr q9, [a_ptr2], #0x10\n"
2099 "ldr q13, [a_ptr3], #0x10\n"
2100 "ldr q17, [a_ptr4], #0x10\n"
2101 "ldr q21, [a_ptr5], #0x10\n"
2102 "ldr q2, [%[a_ptr0]], #0x10\n"
2103 "ldr q6, [a_ptr1], #0x10\n"
2104 "ldr q10, [a_ptr2], #0x10\n"
2105 "ldr q14, [a_ptr3], #0x10\n"
2106 "ldr s3, [%[a_ptr0]]\n"
2107 "ldr q18, [a_ptr4], #0x10\n"
2108 "ldr s7, [a_ptr1]\n"
2109 "ldr q22, [a_ptr5], #0x10\n"
2110 "ldr s11, [a_ptr2]\n"
2111 "ldr q24, [%[b_ptr0]]\n"
2112 "ldr s15, [a_ptr3]\n"
2113 "ldr q25, [%[b_ptr0], #0x10]\n"
2114 "ldr s19, [a_ptr4]\n"
2115 "ldr s23, [a_ptr5]\n"
2116 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
2117 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
2118 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
2119 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
2120 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
2121 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
2122 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2123 "cbz %[loops], 2f\n"
2124 "ldr q26, [%[biasptr]]\n"
2125 "add %[biasptr], %[biasptr], %[biasinc]\n"
2126 "subs %[loops], %[loops], #0x1\n"
2127 "mov v27.16b, v26.16b\n"
2128 "mov v28.16b, v26.16b\n"
2129 "mov v29.16b, v26.16b\n"
2130 "mov v30.16b, v26.16b\n"
2131 "mov v31.16b, v26.16b\n"
2132 "fmla v26.4s, v24.4s, v0.s[0]\n"
2133 "fmla v27.4s, v24.4s, v4.s[0]\n"
2134 "fmla v28.4s, v24.4s, v8.s[0]\n"
2135 "fmla v29.4s, v24.4s, v12.s[0]\n"
2136 "fmla v30.4s, v24.4s, v16.s[0]\n"
2137 "fmla v31.4s, v24.4s, v20.s[0]\n"
2138 "ldr q24, [%[b_ptr0]]\n"
2139 "fmla v26.4s, v25.4s, v0.s[1]\n"
2140 "fmla v27.4s, v25.4s, v4.s[1]\n"
2141 "fmla v28.4s, v25.4s, v8.s[1]\n"
2142 "fmla v29.4s, v25.4s, v12.s[1]\n"
2143 "fmla v30.4s, v25.4s, v16.s[1]\n"
2144 "fmla v31.4s, v25.4s, v20.s[1]\n"
2145 "ldr q25, [%[b_ptr0], #0x10]\n"
2146 "fmla v26.4s, v24.4s, v0.s[2]\n"
2147 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2148 "fmla v27.4s, v24.4s, v4.s[2]\n"
2149 "fmla v28.4s, v24.4s, v8.s[2]\n"
2150 "fmla v29.4s, v24.4s, v12.s[2]\n"
2151 "fmla v30.4s, v24.4s, v16.s[2]\n"
2152 "fmla v31.4s, v24.4s, v20.s[2]\n"
2153 "ldr q24, [%[b_ptr0]]\n"
2154 "fmla v26.4s, v25.4s, v0.s[3]\n"
2155 "fmla v27.4s, v25.4s, v4.s[3]\n"
2156 "fmla v28.4s, v25.4s, v8.s[3]\n"
2157 "fmla v29.4s, v25.4s, v12.s[3]\n"
2158 "fmla v30.4s, v25.4s, v16.s[3]\n"
2159 "fmla v31.4s, v25.4s, v20.s[3]\n"
2160 "ldr q25, [%[b_ptr0], #0x10]\n"
2161 "fmla v26.4s, v24.4s, v1.s[0]\n"
2162 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2163 "fmla v27.4s, v24.4s, v5.s[0]\n"
2164 "fmla v28.4s, v24.4s, v9.s[0]\n"
2165 "fmla v29.4s, v24.4s, v13.s[0]\n"
2166 "fmla v30.4s, v24.4s, v17.s[0]\n"
2167 "fmla v31.4s, v24.4s, v21.s[0]\n"
2168 "ldr q24, [%[b_ptr0]]\n"
2169 "fmla v26.4s, v25.4s, v1.s[1]\n"
2170 "fmla v27.4s, v25.4s, v5.s[1]\n"
2171 "fmla v28.4s, v25.4s, v9.s[1]\n"
2172 "fmla v29.4s, v25.4s, v13.s[1]\n"
2173 "fmla v30.4s, v25.4s, v17.s[1]\n"
2174 "fmla v31.4s, v25.4s, v21.s[1]\n"
2175 "ldr q25, [%[b_ptr0], #0x10]\n"
2176 "fmla v26.4s, v24.4s, v1.s[2]\n"
2177 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2178 "fmla v27.4s, v24.4s, v5.s[2]\n"
2179 "fmla v28.4s, v24.4s, v9.s[2]\n"
2180 "fmla v29.4s, v24.4s, v13.s[2]\n"
2181 "fmla v30.4s, v24.4s, v17.s[2]\n"
2182 "fmla v31.4s, v24.4s, v21.s[2]\n"
2183 "ldr q24, [%[b_ptr0]]\n"
2184 "fmla v26.4s, v25.4s, v1.s[3]\n"
2185 "fmla v27.4s, v25.4s, v5.s[3]\n"
2186 "fmla v28.4s, v25.4s, v9.s[3]\n"
2187 "fmla v29.4s, v25.4s, v13.s[3]\n"
2188 "fmla v30.4s, v25.4s, v17.s[3]\n"
2189 "fmla v31.4s, v25.4s, v21.s[3]\n"
2190 "ldr q25, [%[b_ptr0], #0x10]\n"
2191 "fmla v26.4s, v24.4s, v2.s[0]\n"
2192 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2193 "fmla v27.4s, v24.4s, v6.s[0]\n"
2194 "fmla v28.4s, v24.4s, v10.s[0]\n"
2195 "fmla v29.4s, v24.4s, v14.s[0]\n"
2196 "fmla v30.4s, v24.4s, v18.s[0]\n"
2197 "fmla v31.4s, v24.4s, v22.s[0]\n"
2198 "ldr q24, [%[b_ptr0]]\n"
2199 "fmla v26.4s, v25.4s, v2.s[1]\n"
2200 "fmla v27.4s, v25.4s, v6.s[1]\n"
2201 "fmla v28.4s, v25.4s, v10.s[1]\n"
2202 "fmla v29.4s, v25.4s, v14.s[1]\n"
2203 "fmla v30.4s, v25.4s, v18.s[1]\n"
2204 "fmla v31.4s, v25.4s, v22.s[1]\n"
2205 "ldr q25, [%[b_ptr0], #0x10]\n"
2206 "fmla v26.4s, v24.4s, v2.s[2]\n"
2207 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2208 "fmla v27.4s, v24.4s, v6.s[2]\n"
2209 "fmla v28.4s, v24.4s, v10.s[2]\n"
2210 "fmla v29.4s, v24.4s, v14.s[2]\n"
2211 "fmla v30.4s, v24.4s, v18.s[2]\n"
2212 "fmla v31.4s, v24.4s, v22.s[2]\n"
2213 "ldr q24, [%[b_ptr0]]\n"
2214 "fmla v26.4s, v25.4s, v2.s[3]\n"
2215 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2216 "fmla v27.4s, v25.4s, v6.s[3]\n"
2217 "fmla v28.4s, v25.4s, v10.s[3]\n"
2218 "fmla v29.4s, v25.4s, v14.s[3]\n"
2219 "fmla v30.4s, v25.4s, v18.s[3]\n"
2220 "fmla v31.4s, v25.4s, v22.s[3]\n"
2221 "fmla v26.4s, v24.4s, v3.s[0]\n"
2222 "fmla v27.4s, v24.4s, v7.s[0]\n"
2223 "fmla v28.4s, v24.4s, v11.s[0]\n"
2224 "fmla v29.4s, v24.4s, v15.s[0]\n"
2225 "fmla v30.4s, v24.4s, v19.s[0]\n"
2226 "fmla v31.4s, v24.4s, v23.s[0]\n"
2229 "ld1r {v24.4s}, [%[minptr]]\n"
2230 "subs %[loops], %[loops], #0x1\n"
2231 "ld1r {v25.4s}, [%[maxptr]]\n"
2232 "fmax v26.4s, v26.4s, v24.4s\n"
2233 "fmax v27.4s, v27.4s, v24.4s\n"
2234 "fmax v28.4s, v28.4s, v24.4s\n"
2235 "fmax v29.4s, v29.4s, v24.4s\n"
2236 "fmin v26.4s, v26.4s, v25.4s\n"
2237 "fmin v27.4s, v27.4s, v25.4s\n"
2238 "fmin v28.4s, v28.4s, v25.4s\n"
2239 "fmin v29.4s, v29.4s, v25.4s\n"
2240 "str q26, [%[c_ptr0]]\n"
2241 "fmax v30.4s, v30.4s, v24.4s\n"
2242 "ldr q26, [%[biasptr]]\n"
2243 "fmax v31.4s, v31.4s, v24.4s\n"
2244 "ldr q24, [%[b_ptr0]]\n"
2245 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2246 "str q27, [c_ptr1]\n"
2247 "add c_ptr1, c_ptr1, #0x10\n"
2248 "fmin v30.4s, v30.4s, v25.4s\n"
2249 "add %[biasptr], %[biasptr], %[biasinc]\n"
2250 "fmin v31.4s, v31.4s, v25.4s\n"
2251 "str q28, [c_ptr2]\n"
2252 "mov v27.16b, v26.16b\n"
2253 "ldr q25, [%[b_ptr0], #0x10]\n"
2254 "mov v28.16b, v26.16b\n"
2255 "add c_ptr2, c_ptr2, #0x10\n"
2256 "str q29, [c_ptr3]\n"
2257 "add c_ptr3, c_ptr3, #0x10\n"
2258 "mov v29.16b, v26.16b\n"
2259 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2260 "fmla v27.4s, v24.4s, v4.s[0]\n"
2261 "str q30, [c_ptr4]\n"
2262 "mov v30.16b, v26.16b\n"
2263 "add c_ptr4, c_ptr4, #0x10\n"
2264 "fmla v28.4s, v24.4s, v8.s[0]\n"
2265 "str q31, [c_ptr5]\n"
2266 "mov v31.16b, v26.16b\n"
2267 "add c_ptr5, c_ptr5, #0x10\n"
2268 "fmla v26.4s, v24.4s, v0.s[0]\n"
2269 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2270 "fmla v29.4s, v24.4s, v12.s[0]\n"
2271 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2272 "fmla v30.4s, v24.4s, v16.s[0]\n"
2273 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2274 "fmla v31.4s, v24.4s, v20.s[0]\n"
2275 "ldr q24, [%[b_ptr0]]\n"
2276 "fmla v26.4s, v25.4s, v0.s[1]\n"
2277 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2278 "fmla v27.4s, v25.4s, v4.s[1]\n"
2279 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2280 "fmla v28.4s, v25.4s, v8.s[1]\n"
2281 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2282 "fmla v29.4s, v25.4s, v12.s[1]\n"
2283 "fmla v30.4s, v25.4s, v16.s[1]\n"
2284 "fmla v31.4s, v25.4s, v20.s[1]\n"
2285 "ldr q25, [%[b_ptr0], #0x10]\n"
2286 "fmla v26.4s, v24.4s, v0.s[2]\n"
2287 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2288 "fmla v27.4s, v24.4s, v4.s[2]\n"
2289 "fmla v28.4s, v24.4s, v8.s[2]\n"
2290 "fmla v29.4s, v24.4s, v12.s[2]\n"
2291 "fmla v30.4s, v24.4s, v16.s[2]\n"
2292 "fmla v31.4s, v24.4s, v20.s[2]\n"
2293 "ldr q24, [%[b_ptr0]]\n"
2294 "fmla v26.4s, v25.4s, v0.s[3]\n"
2295 "fmla v27.4s, v25.4s, v4.s[3]\n"
2296 "fmla v28.4s, v25.4s, v8.s[3]\n"
2297 "fmla v29.4s, v25.4s, v12.s[3]\n"
2298 "fmla v30.4s, v25.4s, v16.s[3]\n"
2299 "fmla v31.4s, v25.4s, v20.s[3]\n"
2300 "ldr q25, [%[b_ptr0], #0x10]\n"
2301 "fmla v26.4s, v24.4s, v1.s[0]\n"
2302 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2303 "fmla v27.4s, v24.4s, v5.s[0]\n"
2304 "fmla v28.4s, v24.4s, v9.s[0]\n"
2305 "fmla v29.4s, v24.4s, v13.s[0]\n"
2306 "fmla v30.4s, v24.4s, v17.s[0]\n"
2307 "fmla v31.4s, v24.4s, v21.s[0]\n"
2308 "ldr q24, [%[b_ptr0]]\n"
2309 "fmla v26.4s, v25.4s, v1.s[1]\n"
2310 "fmla v27.4s, v25.4s, v5.s[1]\n"
2311 "fmla v28.4s, v25.4s, v9.s[1]\n"
2312 "fmla v29.4s, v25.4s, v13.s[1]\n"
2313 "fmla v30.4s, v25.4s, v17.s[1]\n"
2314 "fmla v31.4s, v25.4s, v21.s[1]\n"
2315 "ldr q25, [%[b_ptr0], #0x10]\n"
2316 "fmla v26.4s, v24.4s, v1.s[2]\n"
2317 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2318 "fmla v27.4s, v24.4s, v5.s[2]\n"
2319 "fmla v28.4s, v24.4s, v9.s[2]\n"
2320 "fmla v29.4s, v24.4s, v13.s[2]\n"
2321 "fmla v30.4s, v24.4s, v17.s[2]\n"
2322 "fmla v31.4s, v24.4s, v21.s[2]\n"
2323 "ldr q24, [%[b_ptr0]]\n"
2324 "fmla v26.4s, v25.4s, v1.s[3]\n"
2325 "fmla v27.4s, v25.4s, v5.s[3]\n"
2326 "fmla v28.4s, v25.4s, v9.s[3]\n"
2327 "fmla v29.4s, v25.4s, v13.s[3]\n"
2328 "fmla v30.4s, v25.4s, v17.s[3]\n"
2329 "fmla v31.4s, v25.4s, v21.s[3]\n"
2330 "ldr q25, [%[b_ptr0], #0x10]\n"
2331 "fmla v26.4s, v24.4s, v2.s[0]\n"
2332 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2333 "fmla v27.4s, v24.4s, v6.s[0]\n"
2334 "fmla v28.4s, v24.4s, v10.s[0]\n"
2335 "fmla v29.4s, v24.4s, v14.s[0]\n"
2336 "fmla v30.4s, v24.4s, v18.s[0]\n"
2337 "fmla v31.4s, v24.4s, v22.s[0]\n"
2338 "ldr q24, [%[b_ptr0]]\n"
2339 "fmla v26.4s, v25.4s, v2.s[1]\n"
2340 "fmla v27.4s, v25.4s, v6.s[1]\n"
2341 "fmla v28.4s, v25.4s, v10.s[1]\n"
2342 "fmla v29.4s, v25.4s, v14.s[1]\n"
2343 "fmla v30.4s, v25.4s, v18.s[1]\n"
2344 "fmla v31.4s, v25.4s, v22.s[1]\n"
2345 "ldr q25, [%[b_ptr0], #0x10]\n"
2346 "fmla v26.4s, v24.4s, v2.s[2]\n"
2347 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2348 "fmla v27.4s, v24.4s, v6.s[2]\n"
2349 "fmla v28.4s, v24.4s, v10.s[2]\n"
2350 "fmla v29.4s, v24.4s, v14.s[2]\n"
2351 "fmla v30.4s, v24.4s, v18.s[2]\n"
2352 "fmla v31.4s, v24.4s, v22.s[2]\n"
2353 "ldr q24, [%[b_ptr0]]\n"
2354 "fmla v26.4s, v25.4s, v2.s[3]\n"
2355 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2356 "fmla v27.4s, v25.4s, v6.s[3]\n"
2357 "fmla v28.4s, v25.4s, v10.s[3]\n"
2358 "fmla v29.4s, v25.4s, v14.s[3]\n"
2359 "fmla v30.4s, v25.4s, v18.s[3]\n"
2360 "fmla v31.4s, v25.4s, v22.s[3]\n"
2361 "fmla v26.4s, v24.4s, v3.s[0]\n"
2362 "fmla v27.4s, v24.4s, v7.s[0]\n"
2363 "fmla v28.4s, v24.4s, v11.s[0]\n"
2364 "fmla v29.4s, v24.4s, v15.s[0]\n"
2365 "fmla v30.4s, v24.4s, v19.s[0]\n"
2366 "fmla v31.4s, v24.4s, v23.s[0]\n"
2369 "ld1r {v24.4s}, [%[minptr]]\n"
2370 "ld1r {v25.4s}, [%[maxptr]]\n"
2371 "fmax v26.4s, v26.4s, v24.4s\n"
2372 "fmax v27.4s, v27.4s, v24.4s\n"
2373 "fmax v28.4s, v28.4s, v24.4s\n"
2374 "fmax v29.4s, v29.4s, v24.4s\n"
2375 "fmin v26.4s, v26.4s, v25.4s\n"
2376 "fmin v27.4s, v27.4s, v25.4s\n"
2377 "fmin v28.4s, v28.4s, v25.4s\n"
2378 "fmin v29.4s, v29.4s, v25.4s\n"
2379 "str q26, [%[c_ptr0]]\n"
2380 "fmax v30.4s, v30.4s, v24.4s\n"
2381 "ldr q26, [%[biasptr]]\n"
2382 "fmax v31.4s, v31.4s, v24.4s\n"
2383 "ldr q24, [%[b_ptr0]]\n"
2384 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2385 "str q27, [c_ptr1]\n"
2386 "add c_ptr1, c_ptr1, #0x10\n"
2387 "fmin v30.4s, v30.4s, v25.4s\n"
2388 "add %[biasptr], %[biasptr], %[biasinc]\n"
2389 "fmin v31.4s, v31.4s, v25.4s\n"
2390 "str q28, [c_ptr2]\n"
2391 "mov v27.16b, v26.16b\n"
2392 "ldr q25, [%[b_ptr0], #0x10]\n"
2393 "mov v28.16b, v26.16b\n"
2394 "add c_ptr2, c_ptr2, #0x10\n"
2395 "str q29, [c_ptr3]\n"
2396 "add c_ptr3, c_ptr3, #0x10\n"
2397 "mov v29.16b, v26.16b\n"
2398 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2399 "fmla v27.4s, v24.4s, v4.s[0]\n"
2400 "str q30, [c_ptr4]\n"
2401 "mov v30.16b, v26.16b\n"
2402 "add c_ptr4, c_ptr4, #0x10\n"
2403 "fmla v28.4s, v24.4s, v8.s[0]\n"
2404 "str q31, [c_ptr5]\n"
2405 "mov v31.16b, v26.16b\n"
2406 "add c_ptr5, c_ptr5, #0x10\n"
2407 "fmla v26.4s, v24.4s, v0.s[0]\n"
2408 "fmla v29.4s, v24.4s, v12.s[0]\n"
2409 "fmla v30.4s, v24.4s, v16.s[0]\n"
2410 "fmla v31.4s, v24.4s, v20.s[0]\n"
2411 "ldr q24, [%[b_ptr0]]\n"
2412 "fmla v26.4s, v25.4s, v0.s[1]\n"
2413 "fmla v27.4s, v25.4s, v4.s[1]\n"
2414 "fmla v28.4s, v25.4s, v8.s[1]\n"
2415 "fmla v29.4s, v25.4s, v12.s[1]\n"
2416 "fmla v30.4s, v25.4s, v16.s[1]\n"
2417 "fmla v31.4s, v25.4s, v20.s[1]\n"
2418 "ldr q25, [%[b_ptr0], #0x10]\n"
2419 "fmla v26.4s, v24.4s, v0.s[2]\n"
2420 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2421 "fmla v27.4s, v24.4s, v4.s[2]\n"
2422 "fmla v28.4s, v24.4s, v8.s[2]\n"
2423 "fmla v29.4s, v24.4s, v12.s[2]\n"
2424 "fmla v30.4s, v24.4s, v16.s[2]\n"
2425 "fmla v31.4s, v24.4s, v20.s[2]\n"
2426 "ldr q24, [%[b_ptr0]]\n"
2427 "fmla v26.4s, v25.4s, v0.s[3]\n"
2428 "fmla v27.4s, v25.4s, v4.s[3]\n"
2429 "fmla v28.4s, v25.4s, v8.s[3]\n"
2430 "fmla v29.4s, v25.4s, v12.s[3]\n"
2431 "fmla v30.4s, v25.4s, v16.s[3]\n"
2432 "fmla v31.4s, v25.4s, v20.s[3]\n"
2433 "ldr q25, [%[b_ptr0], #0x10]\n"
2434 "fmla v26.4s, v24.4s, v1.s[0]\n"
2435 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2436 "fmla v27.4s, v24.4s, v5.s[0]\n"
2437 "fmla v28.4s, v24.4s, v9.s[0]\n"
2438 "fmla v29.4s, v24.4s, v13.s[0]\n"
2439 "fmla v30.4s, v24.4s, v17.s[0]\n"
2440 "fmla v31.4s, v24.4s, v21.s[0]\n"
2441 "ldr q24, [%[b_ptr0]]\n"
2442 "fmla v26.4s, v25.4s, v1.s[1]\n"
2443 "fmla v27.4s, v25.4s, v5.s[1]\n"
2444 "fmla v28.4s, v25.4s, v9.s[1]\n"
2445 "fmla v29.4s, v25.4s, v13.s[1]\n"
2446 "fmla v30.4s, v25.4s, v17.s[1]\n"
2447 "fmla v31.4s, v25.4s, v21.s[1]\n"
2448 "ldr q25, [%[b_ptr0], #0x10]\n"
2449 "fmla v26.4s, v24.4s, v1.s[2]\n"
2450 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2451 "fmla v27.4s, v24.4s, v5.s[2]\n"
2452 "fmla v28.4s, v24.4s, v9.s[2]\n"
2453 "fmla v29.4s, v24.4s, v13.s[2]\n"
2454 "fmla v30.4s, v24.4s, v17.s[2]\n"
2455 "fmla v31.4s, v24.4s, v21.s[2]\n"
2456 "ldr q24, [%[b_ptr0]]\n"
2457 "fmla v26.4s, v25.4s, v1.s[3]\n"
2458 "fmla v27.4s, v25.4s, v5.s[3]\n"
2459 "fmla v28.4s, v25.4s, v9.s[3]\n"
2460 "fmla v29.4s, v25.4s, v13.s[3]\n"
2461 "fmla v30.4s, v25.4s, v17.s[3]\n"
2462 "fmla v31.4s, v25.4s, v21.s[3]\n"
2463 "ldr q25, [%[b_ptr0], #0x10]\n"
2464 "fmla v26.4s, v24.4s, v2.s[0]\n"
2465 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2466 "fmla v27.4s, v24.4s, v6.s[0]\n"
2467 "fmla v28.4s, v24.4s, v10.s[0]\n"
2468 "fmla v29.4s, v24.4s, v14.s[0]\n"
2469 "fmla v30.4s, v24.4s, v18.s[0]\n"
2470 "fmla v31.4s, v24.4s, v22.s[0]\n"
2471 "ldr q24, [%[b_ptr0]]\n"
2472 "fmla v26.4s, v25.4s, v2.s[1]\n"
2473 "fmla v27.4s, v25.4s, v6.s[1]\n"
2474 "fmla v28.4s, v25.4s, v10.s[1]\n"
2475 "fmla v29.4s, v25.4s, v14.s[1]\n"
2476 "fmla v30.4s, v25.4s, v18.s[1]\n"
2477 "fmla v31.4s, v25.4s, v22.s[1]\n"
2478 "ldr q25, [%[b_ptr0], #0x10]\n"
2479 "fmla v26.4s, v24.4s, v2.s[2]\n"
2480 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2481 "fmla v27.4s, v24.4s, v6.s[2]\n"
2482 "fmla v28.4s, v24.4s, v10.s[2]\n"
2483 "fmla v29.4s, v24.4s, v14.s[2]\n"
2484 "fmla v30.4s, v24.4s, v18.s[2]\n"
2485 "fmla v31.4s, v24.4s, v22.s[2]\n"
2486 "ldr q24, [%[b_ptr0]]\n"
2487 "fmla v26.4s, v25.4s, v2.s[3]\n"
2488 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2489 "fmla v27.4s, v25.4s, v6.s[3]\n"
2490 "fmla v28.4s, v25.4s, v10.s[3]\n"
2491 "fmla v29.4s, v25.4s, v14.s[3]\n"
2492 "fmla v30.4s, v25.4s, v18.s[3]\n"
2493 "fmla v31.4s, v25.4s, v22.s[3]\n"
2494 "fmla v26.4s, v24.4s, v3.s[0]\n"
2495 "fmla v27.4s, v24.4s, v7.s[0]\n"
2496 "fmla v28.4s, v24.4s, v11.s[0]\n"
2497 "fmla v29.4s, v24.4s, v15.s[0]\n"
2498 "fmla v30.4s, v24.4s, v19.s[0]\n"
2499 "fmla v31.4s, v24.4s, v23.s[0]\n"
2502 "ldr q26, [%[biasptr]]\n"
2503 "add %[biasptr], %[biasptr], %[biasinc]\n"
2504 "mov v27.16b, v26.16b\n"
2505 "mov v28.16b, v26.16b\n"
2506 "mov v29.16b, v26.16b\n"
2507 "mov v30.16b, v26.16b\n"
2508 "mov v31.16b, v26.16b\n"
2509 "fmla v26.4s, v24.4s, v0.s[0]\n"
2510 "fmla v27.4s, v24.4s, v4.s[0]\n"
2511 "fmla v28.4s, v24.4s, v8.s[0]\n"
2512 "fmla v29.4s, v24.4s, v12.s[0]\n"
2513 "fmla v30.4s, v24.4s, v16.s[0]\n"
2514 "fmla v31.4s, v24.4s, v20.s[0]\n"
2515 "ldr q24, [%[b_ptr0]]\n"
2516 "fmla v26.4s, v25.4s, v0.s[1]\n"
2517 "fmla v27.4s, v25.4s, v4.s[1]\n"
2518 "fmla v28.4s, v25.4s, v8.s[1]\n"
2519 "fmla v29.4s, v25.4s, v12.s[1]\n"
2520 "fmla v30.4s, v25.4s, v16.s[1]\n"
2521 "fmla v31.4s, v25.4s, v20.s[1]\n"
2522 "ldr q25, [%[b_ptr0], #0x10]\n"
2523 "fmla v26.4s, v24.4s, v0.s[2]\n"
2524 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2525 "fmla v27.4s, v24.4s, v4.s[2]\n"
2526 "fmla v28.4s, v24.4s, v8.s[2]\n"
2527 "fmla v29.4s, v24.4s, v12.s[2]\n"
2528 "fmla v30.4s, v24.4s, v16.s[2]\n"
2529 "fmla v31.4s, v24.4s, v20.s[2]\n"
2530 "ldr q24, [%[b_ptr0]]\n"
2531 "fmla v26.4s, v25.4s, v0.s[3]\n"
2532 "fmla v27.4s, v25.4s, v4.s[3]\n"
2533 "fmla v28.4s, v25.4s, v8.s[3]\n"
2534 "fmla v29.4s, v25.4s, v12.s[3]\n"
2535 "fmla v30.4s, v25.4s, v16.s[3]\n"
2536 "fmla v31.4s, v25.4s, v20.s[3]\n"
2537 "ldr q25, [%[b_ptr0], #0x10]\n"
2538 "fmla v26.4s, v24.4s, v1.s[0]\n"
2539 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2540 "fmla v27.4s, v24.4s, v5.s[0]\n"
2541 "fmla v28.4s, v24.4s, v9.s[0]\n"
2542 "fmla v29.4s, v24.4s, v13.s[0]\n"
2543 "fmla v30.4s, v24.4s, v17.s[0]\n"
2544 "fmla v31.4s, v24.4s, v21.s[0]\n"
2545 "ldr q24, [%[b_ptr0]]\n"
2546 "fmla v26.4s, v25.4s, v1.s[1]\n"
2547 "fmla v27.4s, v25.4s, v5.s[1]\n"
2548 "fmla v28.4s, v25.4s, v9.s[1]\n"
2549 "fmla v29.4s, v25.4s, v13.s[1]\n"
2550 "fmla v30.4s, v25.4s, v17.s[1]\n"
2551 "fmla v31.4s, v25.4s, v21.s[1]\n"
2552 "ldr q25, [%[b_ptr0], #0x10]\n"
2553 "fmla v26.4s, v24.4s, v1.s[2]\n"
2554 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2555 "fmla v27.4s, v24.4s, v5.s[2]\n"
2556 "fmla v28.4s, v24.4s, v9.s[2]\n"
2557 "fmla v29.4s, v24.4s, v13.s[2]\n"
2558 "fmla v30.4s, v24.4s, v17.s[2]\n"
2559 "fmla v31.4s, v24.4s, v21.s[2]\n"
2560 "ldr q24, [%[b_ptr0]]\n"
2561 "fmla v26.4s, v25.4s, v1.s[3]\n"
2562 "fmla v27.4s, v25.4s, v5.s[3]\n"
2563 "fmla v28.4s, v25.4s, v9.s[3]\n"
2564 "fmla v29.4s, v25.4s, v13.s[3]\n"
2565 "fmla v30.4s, v25.4s, v17.s[3]\n"
2566 "fmla v31.4s, v25.4s, v21.s[3]\n"
2567 "ldr q25, [%[b_ptr0], #0x10]\n"
2568 "fmla v26.4s, v24.4s, v2.s[0]\n"
2569 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2570 "fmla v27.4s, v24.4s, v6.s[0]\n"
2571 "fmla v28.4s, v24.4s, v10.s[0]\n"
2572 "fmla v29.4s, v24.4s, v14.s[0]\n"
2573 "fmla v30.4s, v24.4s, v18.s[0]\n"
2574 "fmla v31.4s, v24.4s, v22.s[0]\n"
2575 "ldr q24, [%[b_ptr0]]\n"
2576 "fmla v26.4s, v25.4s, v2.s[1]\n"
2577 "fmla v27.4s, v25.4s, v6.s[1]\n"
2578 "fmla v28.4s, v25.4s, v10.s[1]\n"
2579 "fmla v29.4s, v25.4s, v14.s[1]\n"
2580 "fmla v30.4s, v25.4s, v18.s[1]\n"
2581 "fmla v31.4s, v25.4s, v22.s[1]\n"
2582 "ldr q25, [%[b_ptr0], #0x10]\n"
2583 "fmla v26.4s, v24.4s, v2.s[2]\n"
2584 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2585 "fmla v27.4s, v24.4s, v6.s[2]\n"
2586 "fmla v28.4s, v24.4s, v10.s[2]\n"
2587 "fmla v29.4s, v24.4s, v14.s[2]\n"
2588 "fmla v30.4s, v24.4s, v18.s[2]\n"
2589 "fmla v31.4s, v24.4s, v22.s[2]\n"
2590 "ldr q24, [%[b_ptr0]]\n"
2591 "fmla v26.4s, v25.4s, v2.s[3]\n"
2592 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2593 "fmla v27.4s, v25.4s, v6.s[3]\n"
2594 "fmla v28.4s, v25.4s, v10.s[3]\n"
2595 "fmla v29.4s, v25.4s, v14.s[3]\n"
2596 "fmla v30.4s, v25.4s, v18.s[3]\n"
2597 "fmla v31.4s, v25.4s, v22.s[3]\n"
2598 "fmla v26.4s, v24.4s, v3.s[0]\n"
2599 "fmla v27.4s, v24.4s, v7.s[0]\n"
2600 "fmla v28.4s, v24.4s, v11.s[0]\n"
2601 "fmla v29.4s, v24.4s, v15.s[0]\n"
2602 "fmla v30.4s, v24.4s, v19.s[0]\n"
2603 "fmla v31.4s, v24.4s, v23.s[0]\n"
2605 "ld1r {v24.4s}, [%[minptr]]\n"
2606 "ld1r {v25.4s}, [%[maxptr]]\n"
2607 "fmax v26.4s, v26.4s, v24.4s\n"
2608 "fmax v27.4s, v27.4s, v24.4s\n"
2609 "fmax v28.4s, v28.4s, v24.4s\n"
2610 "fmax v29.4s, v29.4s, v24.4s\n"
2611 "fmin v26.4s, v26.4s, v25.4s\n"
2612 "fmin v27.4s, v27.4s, v25.4s\n"
2613 "fmin v28.4s, v28.4s, v25.4s\n"
2614 "fmin v29.4s, v29.4s, v25.4s\n"
2615 "str q26, [%[c_ptr0]]\n"
2616 "fmax v30.4s, v30.4s, v24.4s\n"
2617 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2618 "fmax v31.4s, v31.4s, v24.4s\n"
2619 "str q27, [c_ptr1]\n"
2620 "fmin v30.4s, v30.4s, v25.4s\n"
2621 "fmin v31.4s, v31.4s, v25.4s\n"
2622 "str q28, [c_ptr2]\n"
2623 "str q29, [c_ptr3]\n"
2624 "str q30, [c_ptr4]\n"
2625 "str q31, [c_ptr5]\n"
2636 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
2637 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
2638 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2653 "add a_ptr1, %[a_ptr0], %[lda]\n"
2654 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2655 "add a_ptr2, a_ptr1, %[lda]\n"
2656 "add c_ptr2, c_ptr1, %[ldc]\n"
2657 "add a_ptr3, a_ptr2, %[lda]\n"
2658 "add c_ptr3, c_ptr2, %[ldc]\n"
2659 "add a_ptr4, a_ptr3, %[lda]\n"
2660 "add c_ptr4, c_ptr3, %[ldc]\n"
2661 "add a_ptr5, a_ptr4, %[lda]\n"
2662 "add c_ptr5, c_ptr4, %[ldc]\n"
2663 "cbz %[oob_rows], 1f\n"
2664 "subs %[oob_rows], %[oob_rows], #0x1\n"
2665 "add c_ptr5, %[c_ptr0], #0x0\n"
2666 "add a_ptr5, %[a_ptr0], #0x0\n"
2668 "subs %[oob_rows], %[oob_rows], #0x1\n"
2669 "add c_ptr4, %[c_ptr0], #0x0\n"
2670 "add a_ptr4, %[a_ptr0], #0x0\n"
2672 "subs %[oob_rows], %[oob_rows], #0x1\n"
2673 "add c_ptr3, %[c_ptr0], #0x0\n"
2674 "add a_ptr3, %[a_ptr0], #0x0\n"
2676 "subs %[oob_rows], %[oob_rows], #0x1\n"
2677 "add c_ptr2, %[c_ptr0], #0x0\n"
2678 "add a_ptr2, %[a_ptr0], #0x0\n"
2680 "subs %[oob_rows], %[oob_rows], #0x1\n"
2681 "add c_ptr1, %[c_ptr0], #0x0\n"
2682 "add a_ptr1, %[a_ptr0], #0x0\n"
2684 "ldr q0, [%[a_ptr0]], #0x10\n"
2685 "ldr q4, [a_ptr1], #0x10\n"
2686 "ldr q8, [a_ptr2], #0x10\n"
2687 "ldr q12, [a_ptr3], #0x10\n"
2688 "ldr q16, [a_ptr4], #0x10\n"
2689 "ldr q20, [a_ptr5], #0x10\n"
2690 "ldr q1, [%[a_ptr0]], #0x10\n"
2691 "ldr q5, [a_ptr1], #0x10\n"
2692 "ldr q9, [a_ptr2], #0x10\n"
2693 "ldr q13, [a_ptr3], #0x10\n"
2694 "ldr q17, [a_ptr4], #0x10\n"
2695 "ldr q21, [a_ptr5], #0x10\n"
2696 "ldr q2, [%[a_ptr0]], #0x10\n"
2697 "ldr q6, [a_ptr1], #0x10\n"
2698 "ldr q10, [a_ptr2], #0x10\n"
2699 "ldr q14, [a_ptr3], #0x10\n"
2700 "ldr d3, [%[a_ptr0]]\n"
2701 "ldr q18, [a_ptr4], #0x10\n"
2702 "ldr d7, [a_ptr1]\n"
2703 "ldr q22, [a_ptr5], #0x10\n"
2704 "ldr d11, [a_ptr2]\n"
2705 "ldr q24, [%[b_ptr0]]\n"
2706 "ldr d15, [a_ptr3]\n"
2707 "ldr q25, [%[b_ptr0], #0x10]\n"
2708 "ldr d19, [a_ptr4]\n"
2709 "ldr d23, [a_ptr5]\n"
2710 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
2711 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
2712 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
2713 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
2714 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
2715 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
2716 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2717 "cbz %[loops], 2f\n"
2718 "ldr q26, [%[biasptr]]\n"
2719 "add %[biasptr], %[biasptr], %[biasinc]\n"
2720 "subs %[loops], %[loops], #0x1\n"
2721 "mov v27.16b, v26.16b\n"
2722 "mov v28.16b, v26.16b\n"
2723 "mov v29.16b, v26.16b\n"
2724 "mov v30.16b, v26.16b\n"
2725 "mov v31.16b, v26.16b\n"
2726 "fmla v26.4s, v24.4s, v0.s[0]\n"
2727 "fmla v27.4s, v24.4s, v4.s[0]\n"
2728 "fmla v28.4s, v24.4s, v8.s[0]\n"
2729 "fmla v29.4s, v24.4s, v12.s[0]\n"
2730 "fmla v30.4s, v24.4s, v16.s[0]\n"
2731 "fmla v31.4s, v24.4s, v20.s[0]\n"
2732 "ldr q24, [%[b_ptr0]]\n"
2733 "fmla v26.4s, v25.4s, v0.s[1]\n"
2734 "fmla v27.4s, v25.4s, v4.s[1]\n"
2735 "fmla v28.4s, v25.4s, v8.s[1]\n"
2736 "fmla v29.4s, v25.4s, v12.s[1]\n"
2737 "fmla v30.4s, v25.4s, v16.s[1]\n"
2738 "fmla v31.4s, v25.4s, v20.s[1]\n"
2739 "ldr q25, [%[b_ptr0], #0x10]\n"
2740 "fmla v26.4s, v24.4s, v0.s[2]\n"
2741 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2742 "fmla v27.4s, v24.4s, v4.s[2]\n"
2743 "fmla v28.4s, v24.4s, v8.s[2]\n"
2744 "fmla v29.4s, v24.4s, v12.s[2]\n"
2745 "fmla v30.4s, v24.4s, v16.s[2]\n"
2746 "fmla v31.4s, v24.4s, v20.s[2]\n"
2747 "ldr q24, [%[b_ptr0]]\n"
2748 "fmla v26.4s, v25.4s, v0.s[3]\n"
2749 "fmla v27.4s, v25.4s, v4.s[3]\n"
2750 "fmla v28.4s, v25.4s, v8.s[3]\n"
2751 "fmla v29.4s, v25.4s, v12.s[3]\n"
2752 "fmla v30.4s, v25.4s, v16.s[3]\n"
2753 "fmla v31.4s, v25.4s, v20.s[3]\n"
2754 "ldr q25, [%[b_ptr0], #0x10]\n"
2755 "fmla v26.4s, v24.4s, v1.s[0]\n"
2756 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2757 "fmla v27.4s, v24.4s, v5.s[0]\n"
2758 "fmla v28.4s, v24.4s, v9.s[0]\n"
2759 "fmla v29.4s, v24.4s, v13.s[0]\n"
2760 "fmla v30.4s, v24.4s, v17.s[0]\n"
2761 "fmla v31.4s, v24.4s, v21.s[0]\n"
2762 "ldr q24, [%[b_ptr0]]\n"
2763 "fmla v26.4s, v25.4s, v1.s[1]\n"
2764 "fmla v27.4s, v25.4s, v5.s[1]\n"
2765 "fmla v28.4s, v25.4s, v9.s[1]\n"
2766 "fmla v29.4s, v25.4s, v13.s[1]\n"
2767 "fmla v30.4s, v25.4s, v17.s[1]\n"
2768 "fmla v31.4s, v25.4s, v21.s[1]\n"
2769 "ldr q25, [%[b_ptr0], #0x10]\n"
2770 "fmla v26.4s, v24.4s, v1.s[2]\n"
2771 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2772 "fmla v27.4s, v24.4s, v5.s[2]\n"
2773 "fmla v28.4s, v24.4s, v9.s[2]\n"
2774 "fmla v29.4s, v24.4s, v13.s[2]\n"
2775 "fmla v30.4s, v24.4s, v17.s[2]\n"
2776 "fmla v31.4s, v24.4s, v21.s[2]\n"
2777 "ldr q24, [%[b_ptr0]]\n"
2778 "fmla v26.4s, v25.4s, v1.s[3]\n"
2779 "fmla v27.4s, v25.4s, v5.s[3]\n"
2780 "fmla v28.4s, v25.4s, v9.s[3]\n"
2781 "fmla v29.4s, v25.4s, v13.s[3]\n"
2782 "fmla v30.4s, v25.4s, v17.s[3]\n"
2783 "fmla v31.4s, v25.4s, v21.s[3]\n"
2784 "ldr q25, [%[b_ptr0], #0x10]\n"
2785 "fmla v26.4s, v24.4s, v2.s[0]\n"
2786 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2787 "fmla v27.4s, v24.4s, v6.s[0]\n"
2788 "fmla v28.4s, v24.4s, v10.s[0]\n"
2789 "fmla v29.4s, v24.4s, v14.s[0]\n"
2790 "fmla v30.4s, v24.4s, v18.s[0]\n"
2791 "fmla v31.4s, v24.4s, v22.s[0]\n"
2792 "ldr q24, [%[b_ptr0]]\n"
2793 "fmla v26.4s, v25.4s, v2.s[1]\n"
2794 "fmla v27.4s, v25.4s, v6.s[1]\n"
2795 "fmla v28.4s, v25.4s, v10.s[1]\n"
2796 "fmla v29.4s, v25.4s, v14.s[1]\n"
2797 "fmla v30.4s, v25.4s, v18.s[1]\n"
2798 "fmla v31.4s, v25.4s, v22.s[1]\n"
2799 "ldr q25, [%[b_ptr0], #0x10]\n"
2800 "fmla v26.4s, v24.4s, v2.s[2]\n"
2801 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2802 "fmla v27.4s, v24.4s, v6.s[2]\n"
2803 "fmla v28.4s, v24.4s, v10.s[2]\n"
2804 "fmla v29.4s, v24.4s, v14.s[2]\n"
2805 "fmla v30.4s, v24.4s, v18.s[2]\n"
2806 "fmla v31.4s, v24.4s, v22.s[2]\n"
2807 "ldr q24, [%[b_ptr0]]\n"
2808 "fmla v26.4s, v25.4s, v2.s[3]\n"
2809 "fmla v27.4s, v25.4s, v6.s[3]\n"
2810 "fmla v28.4s, v25.4s, v10.s[3]\n"
2811 "fmla v29.4s, v25.4s, v14.s[3]\n"
2812 "fmla v30.4s, v25.4s, v18.s[3]\n"
2813 "fmla v31.4s, v25.4s, v22.s[3]\n"
2814 "ldr q25, [%[b_ptr0], #0x10]\n"
2815 "fmla v26.4s, v24.4s, v3.s[0]\n"
2816 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2817 "fmla v27.4s, v24.4s, v7.s[0]\n"
2818 "fmla v28.4s, v24.4s, v11.s[0]\n"
2819 "fmla v29.4s, v24.4s, v15.s[0]\n"
2820 "fmla v30.4s, v24.4s, v19.s[0]\n"
2821 "fmla v31.4s, v24.4s, v23.s[0]\n"
2822 "fmla v26.4s, v25.4s, v3.s[1]\n"
2823 "fmla v27.4s, v25.4s, v7.s[1]\n"
2824 "fmla v28.4s, v25.4s, v11.s[1]\n"
2825 "fmla v29.4s, v25.4s, v15.s[1]\n"
2826 "fmla v30.4s, v25.4s, v19.s[1]\n"
2827 "fmla v31.4s, v25.4s, v23.s[1]\n"
2830 "ld1r {v24.4s}, [%[minptr]]\n"
2831 "subs %[loops], %[loops], #0x1\n"
2832 "ld1r {v25.4s}, [%[maxptr]]\n"
2833 "fmax v26.4s, v26.4s, v24.4s\n"
2834 "fmax v27.4s, v27.4s, v24.4s\n"
2835 "fmax v28.4s, v28.4s, v24.4s\n"
2836 "fmax v29.4s, v29.4s, v24.4s\n"
2837 "fmin v26.4s, v26.4s, v25.4s\n"
2838 "fmin v27.4s, v27.4s, v25.4s\n"
2839 "fmin v28.4s, v28.4s, v25.4s\n"
2840 "fmin v29.4s, v29.4s, v25.4s\n"
2841 "str q26, [%[c_ptr0]]\n"
2842 "fmax v30.4s, v30.4s, v24.4s\n"
2843 "ldr q26, [%[biasptr]]\n"
2844 "fmax v31.4s, v31.4s, v24.4s\n"
2845 "ldr q24, [%[b_ptr0]]\n"
2846 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2847 "str q27, [c_ptr1]\n"
2848 "add c_ptr1, c_ptr1, #0x10\n"
2849 "fmin v30.4s, v30.4s, v25.4s\n"
2850 "add %[biasptr], %[biasptr], %[biasinc]\n"
2851 "fmin v31.4s, v31.4s, v25.4s\n"
2852 "str q28, [c_ptr2]\n"
2853 "mov v27.16b, v26.16b\n"
2854 "ldr q25, [%[b_ptr0], #0x10]\n"
2855 "mov v28.16b, v26.16b\n"
2856 "add c_ptr2, c_ptr2, #0x10\n"
2857 "str q29, [c_ptr3]\n"
2858 "add c_ptr3, c_ptr3, #0x10\n"
2859 "mov v29.16b, v26.16b\n"
2860 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2861 "fmla v27.4s, v24.4s, v4.s[0]\n"
2862 "str q30, [c_ptr4]\n"
2863 "mov v30.16b, v26.16b\n"
2864 "add c_ptr4, c_ptr4, #0x10\n"
2865 "fmla v28.4s, v24.4s, v8.s[0]\n"
2866 "str q31, [c_ptr5]\n"
2867 "mov v31.16b, v26.16b\n"
2868 "add c_ptr5, c_ptr5, #0x10\n"
2869 "fmla v26.4s, v24.4s, v0.s[0]\n"
2870 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2871 "fmla v29.4s, v24.4s, v12.s[0]\n"
2872 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2873 "fmla v30.4s, v24.4s, v16.s[0]\n"
2874 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2875 "fmla v31.4s, v24.4s, v20.s[0]\n"
2876 "ldr q24, [%[b_ptr0]]\n"
2877 "fmla v26.4s, v25.4s, v0.s[1]\n"
2878 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2879 "fmla v27.4s, v25.4s, v4.s[1]\n"
2880 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2881 "fmla v28.4s, v25.4s, v8.s[1]\n"
2882 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2883 "fmla v29.4s, v25.4s, v12.s[1]\n"
2884 "fmla v30.4s, v25.4s, v16.s[1]\n"
2885 "fmla v31.4s, v25.4s, v20.s[1]\n"
2886 "ldr q25, [%[b_ptr0], #0x10]\n"
2887 "fmla v26.4s, v24.4s, v0.s[2]\n"
2888 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2889 "fmla v27.4s, v24.4s, v4.s[2]\n"
2890 "fmla v28.4s, v24.4s, v8.s[2]\n"
2891 "fmla v29.4s, v24.4s, v12.s[2]\n"
2892 "fmla v30.4s, v24.4s, v16.s[2]\n"
2893 "fmla v31.4s, v24.4s, v20.s[2]\n"
2894 "ldr q24, [%[b_ptr0]]\n"
2895 "fmla v26.4s, v25.4s, v0.s[3]\n"
2896 "fmla v27.4s, v25.4s, v4.s[3]\n"
2897 "fmla v28.4s, v25.4s, v8.s[3]\n"
2898 "fmla v29.4s, v25.4s, v12.s[3]\n"
2899 "fmla v30.4s, v25.4s, v16.s[3]\n"
2900 "fmla v31.4s, v25.4s, v20.s[3]\n"
2901 "ldr q25, [%[b_ptr0], #0x10]\n"
2902 "fmla v26.4s, v24.4s, v1.s[0]\n"
2903 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2904 "fmla v27.4s, v24.4s, v5.s[0]\n"
2905 "fmla v28.4s, v24.4s, v9.s[0]\n"
2906 "fmla v29.4s, v24.4s, v13.s[0]\n"
2907 "fmla v30.4s, v24.4s, v17.s[0]\n"
2908 "fmla v31.4s, v24.4s, v21.s[0]\n"
2909 "ldr q24, [%[b_ptr0]]\n"
2910 "fmla v26.4s, v25.4s, v1.s[1]\n"
2911 "fmla v27.4s, v25.4s, v5.s[1]\n"
2912 "fmla v28.4s, v25.4s, v9.s[1]\n"
2913 "fmla v29.4s, v25.4s, v13.s[1]\n"
2914 "fmla v30.4s, v25.4s, v17.s[1]\n"
2915 "fmla v31.4s, v25.4s, v21.s[1]\n"
2916 "ldr q25, [%[b_ptr0], #0x10]\n"
2917 "fmla v26.4s, v24.4s, v1.s[2]\n"
2918 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2919 "fmla v27.4s, v24.4s, v5.s[2]\n"
2920 "fmla v28.4s, v24.4s, v9.s[2]\n"
2921 "fmla v29.4s, v24.4s, v13.s[2]\n"
2922 "fmla v30.4s, v24.4s, v17.s[2]\n"
2923 "fmla v31.4s, v24.4s, v21.s[2]\n"
2924 "ldr q24, [%[b_ptr0]]\n"
2925 "fmla v26.4s, v25.4s, v1.s[3]\n"
2926 "fmla v27.4s, v25.4s, v5.s[3]\n"
2927 "fmla v28.4s, v25.4s, v9.s[3]\n"
2928 "fmla v29.4s, v25.4s, v13.s[3]\n"
2929 "fmla v30.4s, v25.4s, v17.s[3]\n"
2930 "fmla v31.4s, v25.4s, v21.s[3]\n"
2931 "ldr q25, [%[b_ptr0], #0x10]\n"
2932 "fmla v26.4s, v24.4s, v2.s[0]\n"
2933 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2934 "fmla v27.4s, v24.4s, v6.s[0]\n"
2935 "fmla v28.4s, v24.4s, v10.s[0]\n"
2936 "fmla v29.4s, v24.4s, v14.s[0]\n"
2937 "fmla v30.4s, v24.4s, v18.s[0]\n"
2938 "fmla v31.4s, v24.4s, v22.s[0]\n"
2939 "ldr q24, [%[b_ptr0]]\n"
2940 "fmla v26.4s, v25.4s, v2.s[1]\n"
2941 "fmla v27.4s, v25.4s, v6.s[1]\n"
2942 "fmla v28.4s, v25.4s, v10.s[1]\n"
2943 "fmla v29.4s, v25.4s, v14.s[1]\n"
2944 "fmla v30.4s, v25.4s, v18.s[1]\n"
2945 "fmla v31.4s, v25.4s, v22.s[1]\n"
2946 "ldr q25, [%[b_ptr0], #0x10]\n"
2947 "fmla v26.4s, v24.4s, v2.s[2]\n"
2948 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2949 "fmla v27.4s, v24.4s, v6.s[2]\n"
2950 "fmla v28.4s, v24.4s, v10.s[2]\n"
2951 "fmla v29.4s, v24.4s, v14.s[2]\n"
2952 "fmla v30.4s, v24.4s, v18.s[2]\n"
2953 "fmla v31.4s, v24.4s, v22.s[2]\n"
2954 "ldr q24, [%[b_ptr0]]\n"
2955 "fmla v26.4s, v25.4s, v2.s[3]\n"
2956 "fmla v27.4s, v25.4s, v6.s[3]\n"
2957 "fmla v28.4s, v25.4s, v10.s[3]\n"
2958 "fmla v29.4s, v25.4s, v14.s[3]\n"
2959 "fmla v30.4s, v25.4s, v18.s[3]\n"
2960 "fmla v31.4s, v25.4s, v22.s[3]\n"
2961 "ldr q25, [%[b_ptr0], #0x10]\n"
2962 "fmla v26.4s, v24.4s, v3.s[0]\n"
2963 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2964 "fmla v27.4s, v24.4s, v7.s[0]\n"
2965 "fmla v28.4s, v24.4s, v11.s[0]\n"
2966 "fmla v29.4s, v24.4s, v15.s[0]\n"
2967 "fmla v30.4s, v24.4s, v19.s[0]\n"
2968 "fmla v31.4s, v24.4s, v23.s[0]\n"
2969 "fmla v26.4s, v25.4s, v3.s[1]\n"
2970 "fmla v27.4s, v25.4s, v7.s[1]\n"
2971 "fmla v28.4s, v25.4s, v11.s[1]\n"
2972 "fmla v29.4s, v25.4s, v15.s[1]\n"
2973 "fmla v30.4s, v25.4s, v19.s[1]\n"
2974 "fmla v31.4s, v25.4s, v23.s[1]\n"
2977 "ld1r {v24.4s}, [%[minptr]]\n"
2978 "ld1r {v25.4s}, [%[maxptr]]\n"
2979 "fmax v26.4s, v26.4s, v24.4s\n"
2980 "fmax v27.4s, v27.4s, v24.4s\n"
2981 "fmax v28.4s, v28.4s, v24.4s\n"
2982 "fmax v29.4s, v29.4s, v24.4s\n"
2983 "fmin v26.4s, v26.4s, v25.4s\n"
2984 "fmin v27.4s, v27.4s, v25.4s\n"
2985 "fmin v28.4s, v28.4s, v25.4s\n"
2986 "fmin v29.4s, v29.4s, v25.4s\n"
2987 "str q26, [%[c_ptr0]]\n"
2988 "fmax v30.4s, v30.4s, v24.4s\n"
2989 "ldr q26, [%[biasptr]]\n"
2990 "fmax v31.4s, v31.4s, v24.4s\n"
2991 "ldr q24, [%[b_ptr0]]\n"
2992 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2993 "str q27, [c_ptr1]\n"
2994 "add c_ptr1, c_ptr1, #0x10\n"
2995 "fmin v30.4s, v30.4s, v25.4s\n"
2996 "add %[biasptr], %[biasptr], %[biasinc]\n"
2997 "fmin v31.4s, v31.4s, v25.4s\n"
2998 "str q28, [c_ptr2]\n"
2999 "mov v27.16b, v26.16b\n"
3000 "ldr q25, [%[b_ptr0], #0x10]\n"
3001 "mov v28.16b, v26.16b\n"
3002 "add c_ptr2, c_ptr2, #0x10\n"
3003 "str q29, [c_ptr3]\n"
3004 "add c_ptr3, c_ptr3, #0x10\n"
3005 "mov v29.16b, v26.16b\n"
3006 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3007 "fmla v27.4s, v24.4s, v4.s[0]\n"
3008 "str q30, [c_ptr4]\n"
3009 "mov v30.16b, v26.16b\n"
3010 "add c_ptr4, c_ptr4, #0x10\n"
3011 "fmla v28.4s, v24.4s, v8.s[0]\n"
3012 "str q31, [c_ptr5]\n"
3013 "mov v31.16b, v26.16b\n"
3014 "add c_ptr5, c_ptr5, #0x10\n"
3015 "fmla v26.4s, v24.4s, v0.s[0]\n"
3016 "fmla v29.4s, v24.4s, v12.s[0]\n"
3017 "fmla v30.4s, v24.4s, v16.s[0]\n"
3018 "fmla v31.4s, v24.4s, v20.s[0]\n"
3019 "ldr q24, [%[b_ptr0]]\n"
3020 "fmla v26.4s, v25.4s, v0.s[1]\n"
3021 "fmla v27.4s, v25.4s, v4.s[1]\n"
3022 "fmla v28.4s, v25.4s, v8.s[1]\n"
3023 "fmla v29.4s, v25.4s, v12.s[1]\n"
3024 "fmla v30.4s, v25.4s, v16.s[1]\n"
3025 "fmla v31.4s, v25.4s, v20.s[1]\n"
3026 "ldr q25, [%[b_ptr0], #0x10]\n"
3027 "fmla v26.4s, v24.4s, v0.s[2]\n"
3028 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3029 "fmla v27.4s, v24.4s, v4.s[2]\n"
3030 "fmla v28.4s, v24.4s, v8.s[2]\n"
3031 "fmla v29.4s, v24.4s, v12.s[2]\n"
3032 "fmla v30.4s, v24.4s, v16.s[2]\n"
3033 "fmla v31.4s, v24.4s, v20.s[2]\n"
3034 "ldr q24, [%[b_ptr0]]\n"
3035 "fmla v26.4s, v25.4s, v0.s[3]\n"
3036 "fmla v27.4s, v25.4s, v4.s[3]\n"
3037 "fmla v28.4s, v25.4s, v8.s[3]\n"
3038 "fmla v29.4s, v25.4s, v12.s[3]\n"
3039 "fmla v30.4s, v25.4s, v16.s[3]\n"
3040 "fmla v31.4s, v25.4s, v20.s[3]\n"
3041 "ldr q25, [%[b_ptr0], #0x10]\n"
3042 "fmla v26.4s, v24.4s, v1.s[0]\n"
3043 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3044 "fmla v27.4s, v24.4s, v5.s[0]\n"
3045 "fmla v28.4s, v24.4s, v9.s[0]\n"
3046 "fmla v29.4s, v24.4s, v13.s[0]\n"
3047 "fmla v30.4s, v24.4s, v17.s[0]\n"
3048 "fmla v31.4s, v24.4s, v21.s[0]\n"
3049 "ldr q24, [%[b_ptr0]]\n"
3050 "fmla v26.4s, v25.4s, v1.s[1]\n"
3051 "fmla v27.4s, v25.4s, v5.s[1]\n"
3052 "fmla v28.4s, v25.4s, v9.s[1]\n"
3053 "fmla v29.4s, v25.4s, v13.s[1]\n"
3054 "fmla v30.4s, v25.4s, v17.s[1]\n"
3055 "fmla v31.4s, v25.4s, v21.s[1]\n"
3056 "ldr q25, [%[b_ptr0], #0x10]\n"
3057 "fmla v26.4s, v24.4s, v1.s[2]\n"
3058 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3059 "fmla v27.4s, v24.4s, v5.s[2]\n"
3060 "fmla v28.4s, v24.4s, v9.s[2]\n"
3061 "fmla v29.4s, v24.4s, v13.s[2]\n"
3062 "fmla v30.4s, v24.4s, v17.s[2]\n"
3063 "fmla v31.4s, v24.4s, v21.s[2]\n"
3064 "ldr q24, [%[b_ptr0]]\n"
3065 "fmla v26.4s, v25.4s, v1.s[3]\n"
3066 "fmla v27.4s, v25.4s, v5.s[3]\n"
3067 "fmla v28.4s, v25.4s, v9.s[3]\n"
3068 "fmla v29.4s, v25.4s, v13.s[3]\n"
3069 "fmla v30.4s, v25.4s, v17.s[3]\n"
3070 "fmla v31.4s, v25.4s, v21.s[3]\n"
3071 "ldr q25, [%[b_ptr0], #0x10]\n"
3072 "fmla v26.4s, v24.4s, v2.s[0]\n"
3073 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3074 "fmla v27.4s, v24.4s, v6.s[0]\n"
3075 "fmla v28.4s, v24.4s, v10.s[0]\n"
3076 "fmla v29.4s, v24.4s, v14.s[0]\n"
3077 "fmla v30.4s, v24.4s, v18.s[0]\n"
3078 "fmla v31.4s, v24.4s, v22.s[0]\n"
3079 "ldr q24, [%[b_ptr0]]\n"
3080 "fmla v26.4s, v25.4s, v2.s[1]\n"
3081 "fmla v27.4s, v25.4s, v6.s[1]\n"
3082 "fmla v28.4s, v25.4s, v10.s[1]\n"
3083 "fmla v29.4s, v25.4s, v14.s[1]\n"
3084 "fmla v30.4s, v25.4s, v18.s[1]\n"
3085 "fmla v31.4s, v25.4s, v22.s[1]\n"
3086 "ldr q25, [%[b_ptr0], #0x10]\n"
3087 "fmla v26.4s, v24.4s, v2.s[2]\n"
3088 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3089 "fmla v27.4s, v24.4s, v6.s[2]\n"
3090 "fmla v28.4s, v24.4s, v10.s[2]\n"
3091 "fmla v29.4s, v24.4s, v14.s[2]\n"
3092 "fmla v30.4s, v24.4s, v18.s[2]\n"
3093 "fmla v31.4s, v24.4s, v22.s[2]\n"
3094 "ldr q24, [%[b_ptr0]]\n"
3095 "fmla v26.4s, v25.4s, v2.s[3]\n"
3096 "fmla v27.4s, v25.4s, v6.s[3]\n"
3097 "fmla v28.4s, v25.4s, v10.s[3]\n"
3098 "fmla v29.4s, v25.4s, v14.s[3]\n"
3099 "fmla v30.4s, v25.4s, v18.s[3]\n"
3100 "fmla v31.4s, v25.4s, v22.s[3]\n"
3101 "ldr q25, [%[b_ptr0], #0x10]\n"
3102 "fmla v26.4s, v24.4s, v3.s[0]\n"
3103 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3104 "fmla v27.4s, v24.4s, v7.s[0]\n"
3105 "fmla v28.4s, v24.4s, v11.s[0]\n"
3106 "fmla v29.4s, v24.4s, v15.s[0]\n"
3107 "fmla v30.4s, v24.4s, v19.s[0]\n"
3108 "fmla v31.4s, v24.4s, v23.s[0]\n"
3109 "fmla v26.4s, v25.4s, v3.s[1]\n"
3110 "fmla v27.4s, v25.4s, v7.s[1]\n"
3111 "fmla v28.4s, v25.4s, v11.s[1]\n"
3112 "fmla v29.4s, v25.4s, v15.s[1]\n"
3113 "fmla v30.4s, v25.4s, v19.s[1]\n"
3114 "fmla v31.4s, v25.4s, v23.s[1]\n"
3117 "ldr q26, [%[biasptr]]\n"
3118 "add %[biasptr], %[biasptr], %[biasinc]\n"
3119 "mov v27.16b, v26.16b\n"
3120 "mov v28.16b, v26.16b\n"
3121 "mov v29.16b, v26.16b\n"
3122 "mov v30.16b, v26.16b\n"
3123 "mov v31.16b, v26.16b\n"
3124 "fmla v26.4s, v24.4s, v0.s[0]\n"
3125 "fmla v27.4s, v24.4s, v4.s[0]\n"
3126 "fmla v28.4s, v24.4s, v8.s[0]\n"
3127 "fmla v29.4s, v24.4s, v12.s[0]\n"
3128 "fmla v30.4s, v24.4s, v16.s[0]\n"
3129 "fmla v31.4s, v24.4s, v20.s[0]\n"
3130 "ldr q24, [%[b_ptr0]]\n"
3131 "fmla v26.4s, v25.4s, v0.s[1]\n"
3132 "fmla v27.4s, v25.4s, v4.s[1]\n"
3133 "fmla v28.4s, v25.4s, v8.s[1]\n"
3134 "fmla v29.4s, v25.4s, v12.s[1]\n"
3135 "fmla v30.4s, v25.4s, v16.s[1]\n"
3136 "fmla v31.4s, v25.4s, v20.s[1]\n"
3137 "ldr q25, [%[b_ptr0], #0x10]\n"
3138 "fmla v26.4s, v24.4s, v0.s[2]\n"
3139 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3140 "fmla v27.4s, v24.4s, v4.s[2]\n"
3141 "fmla v28.4s, v24.4s, v8.s[2]\n"
3142 "fmla v29.4s, v24.4s, v12.s[2]\n"
3143 "fmla v30.4s, v24.4s, v16.s[2]\n"
3144 "fmla v31.4s, v24.4s, v20.s[2]\n"
3145 "ldr q24, [%[b_ptr0]]\n"
3146 "fmla v26.4s, v25.4s, v0.s[3]\n"
3147 "fmla v27.4s, v25.4s, v4.s[3]\n"
3148 "fmla v28.4s, v25.4s, v8.s[3]\n"
3149 "fmla v29.4s, v25.4s, v12.s[3]\n"
3150 "fmla v30.4s, v25.4s, v16.s[3]\n"
3151 "fmla v31.4s, v25.4s, v20.s[3]\n"
3152 "ldr q25, [%[b_ptr0], #0x10]\n"
3153 "fmla v26.4s, v24.4s, v1.s[0]\n"
3154 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3155 "fmla v27.4s, v24.4s, v5.s[0]\n"
3156 "fmla v28.4s, v24.4s, v9.s[0]\n"
3157 "fmla v29.4s, v24.4s, v13.s[0]\n"
3158 "fmla v30.4s, v24.4s, v17.s[0]\n"
3159 "fmla v31.4s, v24.4s, v21.s[0]\n"
3160 "ldr q24, [%[b_ptr0]]\n"
3161 "fmla v26.4s, v25.4s, v1.s[1]\n"
3162 "fmla v27.4s, v25.4s, v5.s[1]\n"
3163 "fmla v28.4s, v25.4s, v9.s[1]\n"
3164 "fmla v29.4s, v25.4s, v13.s[1]\n"
3165 "fmla v30.4s, v25.4s, v17.s[1]\n"
3166 "fmla v31.4s, v25.4s, v21.s[1]\n"
3167 "ldr q25, [%[b_ptr0], #0x10]\n"
3168 "fmla v26.4s, v24.4s, v1.s[2]\n"
3169 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3170 "fmla v27.4s, v24.4s, v5.s[2]\n"
3171 "fmla v28.4s, v24.4s, v9.s[2]\n"
3172 "fmla v29.4s, v24.4s, v13.s[2]\n"
3173 "fmla v30.4s, v24.4s, v17.s[2]\n"
3174 "fmla v31.4s, v24.4s, v21.s[2]\n"
3175 "ldr q24, [%[b_ptr0]]\n"
3176 "fmla v26.4s, v25.4s, v1.s[3]\n"
3177 "fmla v27.4s, v25.4s, v5.s[3]\n"
3178 "fmla v28.4s, v25.4s, v9.s[3]\n"
3179 "fmla v29.4s, v25.4s, v13.s[3]\n"
3180 "fmla v30.4s, v25.4s, v17.s[3]\n"
3181 "fmla v31.4s, v25.4s, v21.s[3]\n"
3182 "ldr q25, [%[b_ptr0], #0x10]\n"
3183 "fmla v26.4s, v24.4s, v2.s[0]\n"
3184 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3185 "fmla v27.4s, v24.4s, v6.s[0]\n"
3186 "fmla v28.4s, v24.4s, v10.s[0]\n"
3187 "fmla v29.4s, v24.4s, v14.s[0]\n"
3188 "fmla v30.4s, v24.4s, v18.s[0]\n"
3189 "fmla v31.4s, v24.4s, v22.s[0]\n"
3190 "ldr q24, [%[b_ptr0]]\n"
3191 "fmla v26.4s, v25.4s, v2.s[1]\n"
3192 "fmla v27.4s, v25.4s, v6.s[1]\n"
3193 "fmla v28.4s, v25.4s, v10.s[1]\n"
3194 "fmla v29.4s, v25.4s, v14.s[1]\n"
3195 "fmla v30.4s, v25.4s, v18.s[1]\n"
3196 "fmla v31.4s, v25.4s, v22.s[1]\n"
3197 "ldr q25, [%[b_ptr0], #0x10]\n"
3198 "fmla v26.4s, v24.4s, v2.s[2]\n"
3199 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3200 "fmla v27.4s, v24.4s, v6.s[2]\n"
3201 "fmla v28.4s, v24.4s, v10.s[2]\n"
3202 "fmla v29.4s, v24.4s, v14.s[2]\n"
3203 "fmla v30.4s, v24.4s, v18.s[2]\n"
3204 "fmla v31.4s, v24.4s, v22.s[2]\n"
3205 "ldr q24, [%[b_ptr0]]\n"
3206 "fmla v26.4s, v25.4s, v2.s[3]\n"
3207 "fmla v27.4s, v25.4s, v6.s[3]\n"
3208 "fmla v28.4s, v25.4s, v10.s[3]\n"
3209 "fmla v29.4s, v25.4s, v14.s[3]\n"
3210 "fmla v30.4s, v25.4s, v18.s[3]\n"
3211 "fmla v31.4s, v25.4s, v22.s[3]\n"
3212 "ldr q25, [%[b_ptr0], #0x10]\n"
3213 "fmla v26.4s, v24.4s, v3.s[0]\n"
3214 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3215 "fmla v27.4s, v24.4s, v7.s[0]\n"
3216 "fmla v28.4s, v24.4s, v11.s[0]\n"
3217 "fmla v29.4s, v24.4s, v15.s[0]\n"
3218 "fmla v30.4s, v24.4s, v19.s[0]\n"
3219 "fmla v31.4s, v24.4s, v23.s[0]\n"
3220 "fmla v26.4s, v25.4s, v3.s[1]\n"
3221 "fmla v27.4s, v25.4s, v7.s[1]\n"
3222 "fmla v28.4s, v25.4s, v11.s[1]\n"
3223 "fmla v29.4s, v25.4s, v15.s[1]\n"
3224 "fmla v30.4s, v25.4s, v19.s[1]\n"
3225 "fmla v31.4s, v25.4s, v23.s[1]\n"
3227 "ld1r {v24.4s}, [%[minptr]]\n"
3228 "ld1r {v25.4s}, [%[maxptr]]\n"
3229 "fmax v26.4s, v26.4s, v24.4s\n"
3230 "fmax v27.4s, v27.4s, v24.4s\n"
3231 "fmax v28.4s, v28.4s, v24.4s\n"
3232 "fmax v29.4s, v29.4s, v24.4s\n"
3233 "fmin v26.4s, v26.4s, v25.4s\n"
3234 "fmin v27.4s, v27.4s, v25.4s\n"
3235 "fmin v28.4s, v28.4s, v25.4s\n"
3236 "fmin v29.4s, v29.4s, v25.4s\n"
3237 "str q26, [%[c_ptr0]]\n"
3238 "fmax v30.4s, v30.4s, v24.4s\n"
3239 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3240 "fmax v31.4s, v31.4s, v24.4s\n"
3241 "str q27, [c_ptr1]\n"
3242 "fmin v30.4s, v30.4s, v25.4s\n"
3243 "fmin v31.4s, v31.4s, v25.4s\n"
3244 "str q28, [c_ptr2]\n"
3245 "str q29, [c_ptr3]\n"
3246 "str q30, [c_ptr4]\n"
3247 "str q31, [c_ptr5]\n"
3258 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
3259 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
3260 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
3275 "add a_ptr1, %[a_ptr0], %[lda]\n"
3276 "add c_ptr1, %[c_ptr0], %[ldc]\n"
3277 "add a_ptr2, a_ptr1, %[lda]\n"
3278 "add c_ptr2, c_ptr1, %[ldc]\n"
3279 "add a_ptr3, a_ptr2, %[lda]\n"
3280 "add c_ptr3, c_ptr2, %[ldc]\n"
3281 "add a_ptr4, a_ptr3, %[lda]\n"
3282 "add c_ptr4, c_ptr3, %[ldc]\n"
3283 "add a_ptr5, a_ptr4, %[lda]\n"
3284 "add c_ptr5, c_ptr4, %[ldc]\n"
3285 "cbz %[oob_rows], 1f\n"
3286 "subs %[oob_rows], %[oob_rows], #0x1\n"
3287 "add c_ptr5, %[c_ptr0], #0x0\n"
3288 "add a_ptr5, %[a_ptr0], #0x0\n"
3290 "subs %[oob_rows], %[oob_rows], #0x1\n"
3291 "add c_ptr4, %[c_ptr0], #0x0\n"
3292 "add a_ptr4, %[a_ptr0], #0x0\n"
3294 "subs %[oob_rows], %[oob_rows], #0x1\n"
3295 "add c_ptr3, %[c_ptr0], #0x0\n"
3296 "add a_ptr3, %[a_ptr0], #0x0\n"
3298 "subs %[oob_rows], %[oob_rows], #0x1\n"
3299 "add c_ptr2, %[c_ptr0], #0x0\n"
3300 "add a_ptr2, %[a_ptr0], #0x0\n"
3302 "subs %[oob_rows], %[oob_rows], #0x1\n"
3303 "add c_ptr1, %[c_ptr0], #0x0\n"
3304 "add a_ptr1, %[a_ptr0], #0x0\n"
3306 "ldr q0, [%[a_ptr0]], #0x10\n"
3307 "ldr q4, [a_ptr1], #0x10\n"
3308 "ldr q8, [a_ptr2], #0x10\n"
3309 "ldr q12, [a_ptr3], #0x10\n"
3310 "ldr q16, [a_ptr4], #0x10\n"
3311 "ldr q20, [a_ptr5], #0x10\n"
3312 "ldr q1, [%[a_ptr0]], #0x10\n"
3313 "ldr q5, [a_ptr1], #0x10\n"
3314 "ldr q9, [a_ptr2], #0x10\n"
3315 "ldr q13, [a_ptr3], #0x10\n"
3316 "ldr q17, [a_ptr4], #0x10\n"
3317 "ldr q21, [a_ptr5], #0x10\n"
3318 "ldr q2, [%[a_ptr0]], #0x10\n"
3319 "ldr q6, [a_ptr1], #0x10\n"
3320 "ldr q10, [a_ptr2], #0x10\n"
3321 "ldr q14, [a_ptr3], #0x10\n"
3322 "ldr d3, [%[a_ptr0]], #0x8\n"
3323 "ldr q18, [a_ptr4], #0x10\n"
3324 "ldr d7, [a_ptr1], #0x8\n"
3325 "ldr q22, [a_ptr5], #0x10\n"
3326 "ldr d11, [a_ptr2], #0x8\n"
3327 "ldr q24, [%[b_ptr0]]\n"
3328 "ldr d15, [a_ptr3], #0x8\n"
3329 "ldr q25, [%[b_ptr0], #0x10]\n"
3330 "ldr d19, [a_ptr4], #0x8\n"
3331 "ldr d23, [a_ptr5], #0x8\n"
3332 "ld1 {v3.s}[2], [%[a_ptr0]]\n"
3333 "ld1 {v7.s}[2], [a_ptr1]\n"
3334 "ld1 {v11.s}[2], [a_ptr2]\n"
3335 "ld1 {v15.s}[2], [a_ptr3]\n"
3336 "ld1 {v19.s}[2], [a_ptr4]\n"
3337 "ld1 {v23.s}[2], [a_ptr5]\n"
3338 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
3339 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
3340 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
3341 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
3342 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
3343 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
3344 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3345 "cbz %[loops], 2f\n"
3346 "ldr q26, [%[biasptr]]\n"
3347 "add %[biasptr], %[biasptr], %[biasinc]\n"
3348 "subs %[loops], %[loops], #0x1\n"
3349 "mov v27.16b, v26.16b\n"
3350 "mov v28.16b, v26.16b\n"
3351 "mov v29.16b, v26.16b\n"
3352 "mov v30.16b, v26.16b\n"
3353 "mov v31.16b, v26.16b\n"
3354 "fmla v26.4s, v24.4s, v0.s[0]\n"
3355 "fmla v27.4s, v24.4s, v4.s[0]\n"
3356 "fmla v28.4s, v24.4s, v8.s[0]\n"
3357 "fmla v29.4s, v24.4s, v12.s[0]\n"
3358 "fmla v30.4s, v24.4s, v16.s[0]\n"
3359 "fmla v31.4s, v24.4s, v20.s[0]\n"
3360 "ldr q24, [%[b_ptr0]]\n"
3361 "fmla v26.4s, v25.4s, v0.s[1]\n"
3362 "fmla v27.4s, v25.4s, v4.s[1]\n"
3363 "fmla v28.4s, v25.4s, v8.s[1]\n"
3364 "fmla v29.4s, v25.4s, v12.s[1]\n"
3365 "fmla v30.4s, v25.4s, v16.s[1]\n"
3366 "fmla v31.4s, v25.4s, v20.s[1]\n"
3367 "ldr q25, [%[b_ptr0], #0x10]\n"
3368 "fmla v26.4s, v24.4s, v0.s[2]\n"
3369 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3370 "fmla v27.4s, v24.4s, v4.s[2]\n"
3371 "fmla v28.4s, v24.4s, v8.s[2]\n"
3372 "fmla v29.4s, v24.4s, v12.s[2]\n"
3373 "fmla v30.4s, v24.4s, v16.s[2]\n"
3374 "fmla v31.4s, v24.4s, v20.s[2]\n"
3375 "ldr q24, [%[b_ptr0]]\n"
3376 "fmla v26.4s, v25.4s, v0.s[3]\n"
3377 "fmla v27.4s, v25.4s, v4.s[3]\n"
3378 "fmla v28.4s, v25.4s, v8.s[3]\n"
3379 "fmla v29.4s, v25.4s, v12.s[3]\n"
3380 "fmla v30.4s, v25.4s, v16.s[3]\n"
3381 "fmla v31.4s, v25.4s, v20.s[3]\n"
3382 "ldr q25, [%[b_ptr0], #0x10]\n"
3383 "fmla v26.4s, v24.4s, v1.s[0]\n"
3384 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3385 "fmla v27.4s, v24.4s, v5.s[0]\n"
3386 "fmla v28.4s, v24.4s, v9.s[0]\n"
3387 "fmla v29.4s, v24.4s, v13.s[0]\n"
3388 "fmla v30.4s, v24.4s, v17.s[0]\n"
3389 "fmla v31.4s, v24.4s, v21.s[0]\n"
3390 "ldr q24, [%[b_ptr0]]\n"
3391 "fmla v26.4s, v25.4s, v1.s[1]\n"
3392 "fmla v27.4s, v25.4s, v5.s[1]\n"
3393 "fmla v28.4s, v25.4s, v9.s[1]\n"
3394 "fmla v29.4s, v25.4s, v13.s[1]\n"
3395 "fmla v30.4s, v25.4s, v17.s[1]\n"
3396 "fmla v31.4s, v25.4s, v21.s[1]\n"
3397 "ldr q25, [%[b_ptr0], #0x10]\n"
3398 "fmla v26.4s, v24.4s, v1.s[2]\n"
3399 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3400 "fmla v27.4s, v24.4s, v5.s[2]\n"
3401 "fmla v28.4s, v24.4s, v9.s[2]\n"
3402 "fmla v29.4s, v24.4s, v13.s[2]\n"
3403 "fmla v30.4s, v24.4s, v17.s[2]\n"
3404 "fmla v31.4s, v24.4s, v21.s[2]\n"
3405 "ldr q24, [%[b_ptr0]]\n"
3406 "fmla v26.4s, v25.4s, v1.s[3]\n"
3407 "fmla v27.4s, v25.4s, v5.s[3]\n"
3408 "fmla v28.4s, v25.4s, v9.s[3]\n"
3409 "fmla v29.4s, v25.4s, v13.s[3]\n"
3410 "fmla v30.4s, v25.4s, v17.s[3]\n"
3411 "fmla v31.4s, v25.4s, v21.s[3]\n"
3412 "ldr q25, [%[b_ptr0], #0x10]\n"
3413 "fmla v26.4s, v24.4s, v2.s[0]\n"
3414 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3415 "fmla v27.4s, v24.4s, v6.s[0]\n"
3416 "fmla v28.4s, v24.4s, v10.s[0]\n"
3417 "fmla v29.4s, v24.4s, v14.s[0]\n"
3418 "fmla v30.4s, v24.4s, v18.s[0]\n"
3419 "fmla v31.4s, v24.4s, v22.s[0]\n"
3420 "ldr q24, [%[b_ptr0]]\n"
3421 "fmla v26.4s, v25.4s, v2.s[1]\n"
3422 "fmla v27.4s, v25.4s, v6.s[1]\n"
3423 "fmla v28.4s, v25.4s, v10.s[1]\n"
3424 "fmla v29.4s, v25.4s, v14.s[1]\n"
3425 "fmla v30.4s, v25.4s, v18.s[1]\n"
3426 "fmla v31.4s, v25.4s, v22.s[1]\n"
3427 "ldr q25, [%[b_ptr0], #0x10]\n"
3428 "fmla v26.4s, v24.4s, v2.s[2]\n"
3429 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3430 "fmla v27.4s, v24.4s, v6.s[2]\n"
3431 "fmla v28.4s, v24.4s, v10.s[2]\n"
3432 "fmla v29.4s, v24.4s, v14.s[2]\n"
3433 "fmla v30.4s, v24.4s, v18.s[2]\n"
3434 "fmla v31.4s, v24.4s, v22.s[2]\n"
3435 "ldr q24, [%[b_ptr0]]\n"
3436 "fmla v26.4s, v25.4s, v2.s[3]\n"
3437 "fmla v27.4s, v25.4s, v6.s[3]\n"
3438 "fmla v28.4s, v25.4s, v10.s[3]\n"
3439 "fmla v29.4s, v25.4s, v14.s[3]\n"
3440 "fmla v30.4s, v25.4s, v18.s[3]\n"
3441 "fmla v31.4s, v25.4s, v22.s[3]\n"
3442 "ldr q25, [%[b_ptr0], #0x10]\n"
3443 "fmla v26.4s, v24.4s, v3.s[0]\n"
3444 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3445 "fmla v27.4s, v24.4s, v7.s[0]\n"
3446 "fmla v28.4s, v24.4s, v11.s[0]\n"
3447 "fmla v29.4s, v24.4s, v15.s[0]\n"
3448 "fmla v30.4s, v24.4s, v19.s[0]\n"
3449 "fmla v31.4s, v24.4s, v23.s[0]\n"
3450 "ldr q24, [%[b_ptr0]]\n"
3451 "fmla v26.4s, v25.4s, v3.s[1]\n"
3452 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3453 "fmla v27.4s, v25.4s, v7.s[1]\n"
3454 "fmla v28.4s, v25.4s, v11.s[1]\n"
3455 "fmla v29.4s, v25.4s, v15.s[1]\n"
3456 "fmla v30.4s, v25.4s, v19.s[1]\n"
3457 "fmla v31.4s, v25.4s, v23.s[1]\n"
3458 "fmla v26.4s, v24.4s, v3.s[2]\n"
3459 "fmla v27.4s, v24.4s, v7.s[2]\n"
3460 "fmla v28.4s, v24.4s, v11.s[2]\n"
3461 "fmla v29.4s, v24.4s, v15.s[2]\n"
3462 "fmla v30.4s, v24.4s, v19.s[2]\n"
3463 "fmla v31.4s, v24.4s, v23.s[2]\n"
3466 "ld1r {v24.4s}, [%[minptr]]\n"
3467 "subs %[loops], %[loops], #0x1\n"
3468 "ld1r {v25.4s}, [%[maxptr]]\n"
3469 "fmax v26.4s, v26.4s, v24.4s\n"
3470 "fmax v27.4s, v27.4s, v24.4s\n"
3471 "fmax v28.4s, v28.4s, v24.4s\n"
3472 "fmax v29.4s, v29.4s, v24.4s\n"
3473 "fmin v26.4s, v26.4s, v25.4s\n"
3474 "fmin v27.4s, v27.4s, v25.4s\n"
3475 "fmin v28.4s, v28.4s, v25.4s\n"
3476 "fmin v29.4s, v29.4s, v25.4s\n"
3477 "str q26, [%[c_ptr0]]\n"
3478 "fmax v30.4s, v30.4s, v24.4s\n"
3479 "ldr q26, [%[biasptr]]\n"
3480 "fmax v31.4s, v31.4s, v24.4s\n"
3481 "ldr q24, [%[b_ptr0]]\n"
3482 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3483 "str q27, [c_ptr1]\n"
3484 "add c_ptr1, c_ptr1, #0x10\n"
3485 "fmin v30.4s, v30.4s, v25.4s\n"
3486 "add %[biasptr], %[biasptr], %[biasinc]\n"
3487 "fmin v31.4s, v31.4s, v25.4s\n"
3488 "str q28, [c_ptr2]\n"
3489 "mov v27.16b, v26.16b\n"
3490 "ldr q25, [%[b_ptr0], #0x10]\n"
3491 "mov v28.16b, v26.16b\n"
3492 "add c_ptr2, c_ptr2, #0x10\n"
3493 "str q29, [c_ptr3]\n"
3494 "add c_ptr3, c_ptr3, #0x10\n"
3495 "mov v29.16b, v26.16b\n"
3496 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3497 "fmla v27.4s, v24.4s, v4.s[0]\n"
3498 "str q30, [c_ptr4]\n"
3499 "mov v30.16b, v26.16b\n"
3500 "add c_ptr4, c_ptr4, #0x10\n"
3501 "fmla v28.4s, v24.4s, v8.s[0]\n"
3502 "str q31, [c_ptr5]\n"
3503 "mov v31.16b, v26.16b\n"
3504 "add c_ptr5, c_ptr5, #0x10\n"
3505 "fmla v26.4s, v24.4s, v0.s[0]\n"
3506 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
3507 "fmla v29.4s, v24.4s, v12.s[0]\n"
3508 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
3509 "fmla v30.4s, v24.4s, v16.s[0]\n"
3510 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
3511 "fmla v31.4s, v24.4s, v20.s[0]\n"
3512 "ldr q24, [%[b_ptr0]]\n"
3513 "fmla v26.4s, v25.4s, v0.s[1]\n"
3514 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
3515 "fmla v27.4s, v25.4s, v4.s[1]\n"
3516 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
3517 "fmla v28.4s, v25.4s, v8.s[1]\n"
3518 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
3519 "fmla v29.4s, v25.4s, v12.s[1]\n"
3520 "fmla v30.4s, v25.4s, v16.s[1]\n"
3521 "fmla v31.4s, v25.4s, v20.s[1]\n"
3522 "ldr q25, [%[b_ptr0], #0x10]\n"
3523 "fmla v26.4s, v24.4s, v0.s[2]\n"
3524 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3525 "fmla v27.4s, v24.4s, v4.s[2]\n"
3526 "fmla v28.4s, v24.4s, v8.s[2]\n"
3527 "fmla v29.4s, v24.4s, v12.s[2]\n"
3528 "fmla v30.4s, v24.4s, v16.s[2]\n"
3529 "fmla v31.4s, v24.4s, v20.s[2]\n"
3530 "ldr q24, [%[b_ptr0]]\n"
3531 "fmla v26.4s, v25.4s, v0.s[3]\n"
3532 "fmla v27.4s, v25.4s, v4.s[3]\n"
3533 "fmla v28.4s, v25.4s, v8.s[3]\n"
3534 "fmla v29.4s, v25.4s, v12.s[3]\n"
3535 "fmla v30.4s, v25.4s, v16.s[3]\n"
3536 "fmla v31.4s, v25.4s, v20.s[3]\n"
3537 "ldr q25, [%[b_ptr0], #0x10]\n"
3538 "fmla v26.4s, v24.4s, v1.s[0]\n"
3539 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3540 "fmla v27.4s, v24.4s, v5.s[0]\n"
3541 "fmla v28.4s, v24.4s, v9.s[0]\n"
3542 "fmla v29.4s, v24.4s, v13.s[0]\n"
3543 "fmla v30.4s, v24.4s, v17.s[0]\n"
3544 "fmla v31.4s, v24.4s, v21.s[0]\n"
3545 "ldr q24, [%[b_ptr0]]\n"
3546 "fmla v26.4s, v25.4s, v1.s[1]\n"
3547 "fmla v27.4s, v25.4s, v5.s[1]\n"
3548 "fmla v28.4s, v25.4s, v9.s[1]\n"
3549 "fmla v29.4s, v25.4s, v13.s[1]\n"
3550 "fmla v30.4s, v25.4s, v17.s[1]\n"
3551 "fmla v31.4s, v25.4s, v21.s[1]\n"
3552 "ldr q25, [%[b_ptr0], #0x10]\n"
3553 "fmla v26.4s, v24.4s, v1.s[2]\n"
3554 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3555 "fmla v27.4s, v24.4s, v5.s[2]\n"
3556 "fmla v28.4s, v24.4s, v9.s[2]\n"
3557 "fmla v29.4s, v24.4s, v13.s[2]\n"
3558 "fmla v30.4s, v24.4s, v17.s[2]\n"
3559 "fmla v31.4s, v24.4s, v21.s[2]\n"
3560 "ldr q24, [%[b_ptr0]]\n"
3561 "fmla v26.4s, v25.4s, v1.s[3]\n"
3562 "fmla v27.4s, v25.4s, v5.s[3]\n"
3563 "fmla v28.4s, v25.4s, v9.s[3]\n"
3564 "fmla v29.4s, v25.4s, v13.s[3]\n"
3565 "fmla v30.4s, v25.4s, v17.s[3]\n"
3566 "fmla v31.4s, v25.4s, v21.s[3]\n"
3567 "ldr q25, [%[b_ptr0], #0x10]\n"
3568 "fmla v26.4s, v24.4s, v2.s[0]\n"
3569 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3570 "fmla v27.4s, v24.4s, v6.s[0]\n"
3571 "fmla v28.4s, v24.4s, v10.s[0]\n"
3572 "fmla v29.4s, v24.4s, v14.s[0]\n"
3573 "fmla v30.4s, v24.4s, v18.s[0]\n"
3574 "fmla v31.4s, v24.4s, v22.s[0]\n"
3575 "ldr q24, [%[b_ptr0]]\n"
3576 "fmla v26.4s, v25.4s, v2.s[1]\n"
3577 "fmla v27.4s, v25.4s, v6.s[1]\n"
3578 "fmla v28.4s, v25.4s, v10.s[1]\n"
3579 "fmla v29.4s, v25.4s, v14.s[1]\n"
3580 "fmla v30.4s, v25.4s, v18.s[1]\n"
3581 "fmla v31.4s, v25.4s, v22.s[1]\n"
3582 "ldr q25, [%[b_ptr0], #0x10]\n"
3583 "fmla v26.4s, v24.4s, v2.s[2]\n"
3584 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3585 "fmla v27.4s, v24.4s, v6.s[2]\n"
3586 "fmla v28.4s, v24.4s, v10.s[2]\n"
3587 "fmla v29.4s, v24.4s, v14.s[2]\n"
3588 "fmla v30.4s, v24.4s, v18.s[2]\n"
3589 "fmla v31.4s, v24.4s, v22.s[2]\n"
3590 "ldr q24, [%[b_ptr0]]\n"
3591 "fmla v26.4s, v25.4s, v2.s[3]\n"
3592 "fmla v27.4s, v25.4s, v6.s[3]\n"
3593 "fmla v28.4s, v25.4s, v10.s[3]\n"
3594 "fmla v29.4s, v25.4s, v14.s[3]\n"
3595 "fmla v30.4s, v25.4s, v18.s[3]\n"
3596 "fmla v31.4s, v25.4s, v22.s[3]\n"
3597 "ldr q25, [%[b_ptr0], #0x10]\n"
3598 "fmla v26.4s, v24.4s, v3.s[0]\n"
3599 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3600 "fmla v27.4s, v24.4s, v7.s[0]\n"
3601 "fmla v28.4s, v24.4s, v11.s[0]\n"
3602 "fmla v29.4s, v24.4s, v15.s[0]\n"
3603 "fmla v30.4s, v24.4s, v19.s[0]\n"
3604 "fmla v31.4s, v24.4s, v23.s[0]\n"
3605 "ldr q24, [%[b_ptr0]]\n"
3606 "fmla v26.4s, v25.4s, v3.s[1]\n"
3607 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3608 "fmla v27.4s, v25.4s, v7.s[1]\n"
3609 "fmla v28.4s, v25.4s, v11.s[1]\n"
3610 "fmla v29.4s, v25.4s, v15.s[1]\n"
3611 "fmla v30.4s, v25.4s, v19.s[1]\n"
3612 "fmla v31.4s, v25.4s, v23.s[1]\n"
3613 "fmla v26.4s, v24.4s, v3.s[2]\n"
3614 "fmla v27.4s, v24.4s, v7.s[2]\n"
3615 "fmla v28.4s, v24.4s, v11.s[2]\n"
3616 "fmla v29.4s, v24.4s, v15.s[2]\n"
3617 "fmla v30.4s, v24.4s, v19.s[2]\n"
3618 "fmla v31.4s, v24.4s, v23.s[2]\n"
3621 "ld1r {v24.4s}, [%[minptr]]\n"
3622 "ld1r {v25.4s}, [%[maxptr]]\n"
3623 "fmax v26.4s, v26.4s, v24.4s\n"
3624 "fmax v27.4s, v27.4s, v24.4s\n"
3625 "fmax v28.4s, v28.4s, v24.4s\n"
3626 "fmax v29.4s, v29.4s, v24.4s\n"
3627 "fmin v26.4s, v26.4s, v25.4s\n"
3628 "fmin v27.4s, v27.4s, v25.4s\n"
3629 "fmin v28.4s, v28.4s, v25.4s\n"
3630 "fmin v29.4s, v29.4s, v25.4s\n"
3631 "str q26, [%[c_ptr0]]\n"
3632 "fmax v30.4s, v30.4s, v24.4s\n"
3633 "ldr q26, [%[biasptr]]\n"
3634 "fmax v31.4s, v31.4s, v24.4s\n"
3635 "ldr q24, [%[b_ptr0]]\n"
3636 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3637 "str q27, [c_ptr1]\n"
3638 "add c_ptr1, c_ptr1, #0x10\n"
3639 "fmin v30.4s, v30.4s, v25.4s\n"
3640 "add %[biasptr], %[biasptr], %[biasinc]\n"
3641 "fmin v31.4s, v31.4s, v25.4s\n"
3642 "str q28, [c_ptr2]\n"
3643 "mov v27.16b, v26.16b\n"
3644 "ldr q25, [%[b_ptr0], #0x10]\n"
3645 "mov v28.16b, v26.16b\n"
3646 "add c_ptr2, c_ptr2, #0x10\n"
3647 "str q29, [c_ptr3]\n"
3648 "add c_ptr3, c_ptr3, #0x10\n"
3649 "mov v29.16b, v26.16b\n"
3650 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3651 "fmla v27.4s, v24.4s, v4.s[0]\n"
3652 "str q30, [c_ptr4]\n"
3653 "mov v30.16b, v26.16b\n"
3654 "add c_ptr4, c_ptr4, #0x10\n"
3655 "fmla v28.4s, v24.4s, v8.s[0]\n"
3656 "str q31, [c_ptr5]\n"
3657 "mov v31.16b, v26.16b\n"
3658 "add c_ptr5, c_ptr5, #0x10\n"
3659 "fmla v26.4s, v24.4s, v0.s[0]\n"
3660 "fmla v29.4s, v24.4s, v12.s[0]\n"
3661 "fmla v30.4s, v24.4s, v16.s[0]\n"
3662 "fmla v31.4s, v24.4s, v20.s[0]\n"
3663 "ldr q24, [%[b_ptr0]]\n"
3664 "fmla v26.4s, v25.4s, v0.s[1]\n"
3665 "fmla v27.4s, v25.4s, v4.s[1]\n"
3666 "fmla v28.4s, v25.4s, v8.s[1]\n"
3667 "fmla v29.4s, v25.4s, v12.s[1]\n"
3668 "fmla v30.4s, v25.4s, v16.s[1]\n"
3669 "fmla v31.4s, v25.4s, v20.s[1]\n"
3670 "ldr q25, [%[b_ptr0], #0x10]\n"
3671 "fmla v26.4s, v24.4s, v0.s[2]\n"
3672 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3673 "fmla v27.4s, v24.4s, v4.s[2]\n"
3674 "fmla v28.4s, v24.4s, v8.s[2]\n"
3675 "fmla v29.4s, v24.4s, v12.s[2]\n"
3676 "fmla v30.4s, v24.4s, v16.s[2]\n"
3677 "fmla v31.4s, v24.4s, v20.s[2]\n"
3678 "ldr q24, [%[b_ptr0]]\n"
3679 "fmla v26.4s, v25.4s, v0.s[3]\n"
3680 "fmla v27.4s, v25.4s, v4.s[3]\n"
3681 "fmla v28.4s, v25.4s, v8.s[3]\n"
3682 "fmla v29.4s, v25.4s, v12.s[3]\n"
3683 "fmla v30.4s, v25.4s, v16.s[3]\n"
3684 "fmla v31.4s, v25.4s, v20.s[3]\n"
3685 "ldr q25, [%[b_ptr0], #0x10]\n"
3686 "fmla v26.4s, v24.4s, v1.s[0]\n"
3687 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3688 "fmla v27.4s, v24.4s, v5.s[0]\n"
3689 "fmla v28.4s, v24.4s, v9.s[0]\n"
3690 "fmla v29.4s, v24.4s, v13.s[0]\n"
3691 "fmla v30.4s, v24.4s, v17.s[0]\n"
3692 "fmla v31.4s, v24.4s, v21.s[0]\n"
3693 "ldr q24, [%[b_ptr0]]\n"
3694 "fmla v26.4s, v25.4s, v1.s[1]\n"
3695 "fmla v27.4s, v25.4s, v5.s[1]\n"
3696 "fmla v28.4s, v25.4s, v9.s[1]\n"
3697 "fmla v29.4s, v25.4s, v13.s[1]\n"
3698 "fmla v30.4s, v25.4s, v17.s[1]\n"
3699 "fmla v31.4s, v25.4s, v21.s[1]\n"
3700 "ldr q25, [%[b_ptr0], #0x10]\n"
3701 "fmla v26.4s, v24.4s, v1.s[2]\n"
3702 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3703 "fmla v27.4s, v24.4s, v5.s[2]\n"
3704 "fmla v28.4s, v24.4s, v9.s[2]\n"
3705 "fmla v29.4s, v24.4s, v13.s[2]\n"
3706 "fmla v30.4s, v24.4s, v17.s[2]\n"
3707 "fmla v31.4s, v24.4s, v21.s[2]\n"
3708 "ldr q24, [%[b_ptr0]]\n"
3709 "fmla v26.4s, v25.4s, v1.s[3]\n"
3710 "fmla v27.4s, v25.4s, v5.s[3]\n"
3711 "fmla v28.4s, v25.4s, v9.s[3]\n"
3712 "fmla v29.4s, v25.4s, v13.s[3]\n"
3713 "fmla v30.4s, v25.4s, v17.s[3]\n"
3714 "fmla v31.4s, v25.4s, v21.s[3]\n"
3715 "ldr q25, [%[b_ptr0], #0x10]\n"
3716 "fmla v26.4s, v24.4s, v2.s[0]\n"
3717 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3718 "fmla v27.4s, v24.4s, v6.s[0]\n"
3719 "fmla v28.4s, v24.4s, v10.s[0]\n"
3720 "fmla v29.4s, v24.4s, v14.s[0]\n"
3721 "fmla v30.4s, v24.4s, v18.s[0]\n"
3722 "fmla v31.4s, v24.4s, v22.s[0]\n"
3723 "ldr q24, [%[b_ptr0]]\n"
3724 "fmla v26.4s, v25.4s, v2.s[1]\n"
3725 "fmla v27.4s, v25.4s, v6.s[1]\n"
3726 "fmla v28.4s, v25.4s, v10.s[1]\n"
3727 "fmla v29.4s, v25.4s, v14.s[1]\n"
3728 "fmla v30.4s, v25.4s, v18.s[1]\n"
3729 "fmla v31.4s, v25.4s, v22.s[1]\n"
3730 "ldr q25, [%[b_ptr0], #0x10]\n"
3731 "fmla v26.4s, v24.4s, v2.s[2]\n"
3732 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3733 "fmla v27.4s, v24.4s, v6.s[2]\n"
3734 "fmla v28.4s, v24.4s, v10.s[2]\n"
3735 "fmla v29.4s, v24.4s, v14.s[2]\n"
3736 "fmla v30.4s, v24.4s, v18.s[2]\n"
3737 "fmla v31.4s, v24.4s, v22.s[2]\n"
3738 "ldr q24, [%[b_ptr0]]\n"
3739 "fmla v26.4s, v25.4s, v2.s[3]\n"
3740 "fmla v27.4s, v25.4s, v6.s[3]\n"
3741 "fmla v28.4s, v25.4s, v10.s[3]\n"
3742 "fmla v29.4s, v25.4s, v14.s[3]\n"
3743 "fmla v30.4s, v25.4s, v18.s[3]\n"
3744 "fmla v31.4s, v25.4s, v22.s[3]\n"
3745 "ldr q25, [%[b_ptr0], #0x10]\n"
3746 "fmla v26.4s, v24.4s, v3.s[0]\n"
3747 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3748 "fmla v27.4s, v24.4s, v7.s[0]\n"
3749 "fmla v28.4s, v24.4s, v11.s[0]\n"
3750 "fmla v29.4s, v24.4s, v15.s[0]\n"
3751 "fmla v30.4s, v24.4s, v19.s[0]\n"
3752 "fmla v31.4s, v24.4s, v23.s[0]\n"
3753 "ldr q24, [%[b_ptr0]]\n"
3754 "fmla v26.4s, v25.4s, v3.s[1]\n"
3755 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3756 "fmla v27.4s, v25.4s, v7.s[1]\n"
3757 "fmla v28.4s, v25.4s, v11.s[1]\n"
3758 "fmla v29.4s, v25.4s, v15.s[1]\n"
3759 "fmla v30.4s, v25.4s, v19.s[1]\n"
3760 "fmla v31.4s, v25.4s, v23.s[1]\n"
3761 "fmla v26.4s, v24.4s, v3.s[2]\n"
3762 "fmla v27.4s, v24.4s, v7.s[2]\n"
3763 "fmla v28.4s, v24.4s, v11.s[2]\n"
3764 "fmla v29.4s, v24.4s, v15.s[2]\n"
3765 "fmla v30.4s, v24.4s, v19.s[2]\n"
3766 "fmla v31.4s, v24.4s, v23.s[2]\n"
3769 "ldr q26, [%[biasptr]]\n"
3770 "add %[biasptr], %[biasptr], %[biasinc]\n"
3771 "mov v27.16b, v26.16b\n"
3772 "mov v28.16b, v26.16b\n"
3773 "mov v29.16b, v26.16b\n"
3774 "mov v30.16b, v26.16b\n"
3775 "mov v31.16b, v26.16b\n"
3776 "fmla v26.4s, v24.4s, v0.s[0]\n"
3777 "fmla v27.4s, v24.4s, v4.s[0]\n"
3778 "fmla v28.4s, v24.4s, v8.s[0]\n"
3779 "fmla v29.4s, v24.4s, v12.s[0]\n"
3780 "fmla v30.4s, v24.4s, v16.s[0]\n"
3781 "fmla v31.4s, v24.4s, v20.s[0]\n"
3782 "ldr q24, [%[b_ptr0]]\n"
3783 "fmla v26.4s, v25.4s, v0.s[1]\n"
3784 "fmla v27.4s, v25.4s, v4.s[1]\n"
3785 "fmla v28.4s, v25.4s, v8.s[1]\n"
3786 "fmla v29.4s, v25.4s, v12.s[1]\n"
3787 "fmla v30.4s, v25.4s, v16.s[1]\n"
3788 "fmla v31.4s, v25.4s, v20.s[1]\n"
3789 "ldr q25, [%[b_ptr0], #0x10]\n"
3790 "fmla v26.4s, v24.4s, v0.s[2]\n"
3791 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3792 "fmla v27.4s, v24.4s, v4.s[2]\n"
3793 "fmla v28.4s, v24.4s, v8.s[2]\n"
3794 "fmla v29.4s, v24.4s, v12.s[2]\n"
3795 "fmla v30.4s, v24.4s, v16.s[2]\n"
3796 "fmla v31.4s, v24.4s, v20.s[2]\n"
3797 "ldr q24, [%[b_ptr0]]\n"
3798 "fmla v26.4s, v25.4s, v0.s[3]\n"
3799 "fmla v27.4s, v25.4s, v4.s[3]\n"
3800 "fmla v28.4s, v25.4s, v8.s[3]\n"
3801 "fmla v29.4s, v25.4s, v12.s[3]\n"
3802 "fmla v30.4s, v25.4s, v16.s[3]\n"
3803 "fmla v31.4s, v25.4s, v20.s[3]\n"
3804 "ldr q25, [%[b_ptr0], #0x10]\n"
3805 "fmla v26.4s, v24.4s, v1.s[0]\n"
3806 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3807 "fmla v27.4s, v24.4s, v5.s[0]\n"
3808 "fmla v28.4s, v24.4s, v9.s[0]\n"
3809 "fmla v29.4s, v24.4s, v13.s[0]\n"
3810 "fmla v30.4s, v24.4s, v17.s[0]\n"
3811 "fmla v31.4s, v24.4s, v21.s[0]\n"
3812 "ldr q24, [%[b_ptr0]]\n"
3813 "fmla v26.4s, v25.4s, v1.s[1]\n"
3814 "fmla v27.4s, v25.4s, v5.s[1]\n"
3815 "fmla v28.4s, v25.4s, v9.s[1]\n"
3816 "fmla v29.4s, v25.4s, v13.s[1]\n"
3817 "fmla v30.4s, v25.4s, v17.s[1]\n"
3818 "fmla v31.4s, v25.4s, v21.s[1]\n"
3819 "ldr q25, [%[b_ptr0], #0x10]\n"
3820 "fmla v26.4s, v24.4s, v1.s[2]\n"
3821 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3822 "fmla v27.4s, v24.4s, v5.s[2]\n"
3823 "fmla v28.4s, v24.4s, v9.s[2]\n"
3824 "fmla v29.4s, v24.4s, v13.s[2]\n"
3825 "fmla v30.4s, v24.4s, v17.s[2]\n"
3826 "fmla v31.4s, v24.4s, v21.s[2]\n"
3827 "ldr q24, [%[b_ptr0]]\n"
3828 "fmla v26.4s, v25.4s, v1.s[3]\n"
3829 "fmla v27.4s, v25.4s, v5.s[3]\n"
3830 "fmla v28.4s, v25.4s, v9.s[3]\n"
3831 "fmla v29.4s, v25.4s, v13.s[3]\n"
3832 "fmla v30.4s, v25.4s, v17.s[3]\n"
3833 "fmla v31.4s, v25.4s, v21.s[3]\n"
3834 "ldr q25, [%[b_ptr0], #0x10]\n"
3835 "fmla v26.4s, v24.4s, v2.s[0]\n"
3836 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3837 "fmla v27.4s, v24.4s, v6.s[0]\n"
3838 "fmla v28.4s, v24.4s, v10.s[0]\n"
3839 "fmla v29.4s, v24.4s, v14.s[0]\n"
3840 "fmla v30.4s, v24.4s, v18.s[0]\n"
3841 "fmla v31.4s, v24.4s, v22.s[0]\n"
3842 "ldr q24, [%[b_ptr0]]\n"
3843 "fmla v26.4s, v25.4s, v2.s[1]\n"
3844 "fmla v27.4s, v25.4s, v6.s[1]\n"
3845 "fmla v28.4s, v25.4s, v10.s[1]\n"
3846 "fmla v29.4s, v25.4s, v14.s[1]\n"
3847 "fmla v30.4s, v25.4s, v18.s[1]\n"
3848 "fmla v31.4s, v25.4s, v22.s[1]\n"
3849 "ldr q25, [%[b_ptr0], #0x10]\n"
3850 "fmla v26.4s, v24.4s, v2.s[2]\n"
3851 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3852 "fmla v27.4s, v24.4s, v6.s[2]\n"
3853 "fmla v28.4s, v24.4s, v10.s[2]\n"
3854 "fmla v29.4s, v24.4s, v14.s[2]\n"
3855 "fmla v30.4s, v24.4s, v18.s[2]\n"
3856 "fmla v31.4s, v24.4s, v22.s[2]\n"
3857 "ldr q24, [%[b_ptr0]]\n"
3858 "fmla v26.4s, v25.4s, v2.s[3]\n"
3859 "fmla v27.4s, v25.4s, v6.s[3]\n"
3860 "fmla v28.4s, v25.4s, v10.s[3]\n"
3861 "fmla v29.4s, v25.4s, v14.s[3]\n"
3862 "fmla v30.4s, v25.4s, v18.s[3]\n"
3863 "fmla v31.4s, v25.4s, v22.s[3]\n"
3864 "ldr q25, [%[b_ptr0], #0x10]\n"
3865 "fmla v26.4s, v24.4s, v3.s[0]\n"
3866 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3867 "fmla v27.4s, v24.4s, v7.s[0]\n"
3868 "fmla v28.4s, v24.4s, v11.s[0]\n"
3869 "fmla v29.4s, v24.4s, v15.s[0]\n"
3870 "fmla v30.4s, v24.4s, v19.s[0]\n"
3871 "fmla v31.4s, v24.4s, v23.s[0]\n"
3872 "ldr q24, [%[b_ptr0]]\n"
3873 "fmla v26.4s, v25.4s, v3.s[1]\n"
3874 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3875 "fmla v27.4s, v25.4s, v7.s[1]\n"
3876 "fmla v28.4s, v25.4s, v11.s[1]\n"
3877 "fmla v29.4s, v25.4s, v15.s[1]\n"
3878 "fmla v30.4s, v25.4s, v19.s[1]\n"
3879 "fmla v31.4s, v25.4s, v23.s[1]\n"
3880 "fmla v26.4s, v24.4s, v3.s[2]\n"
3881 "fmla v27.4s, v24.4s, v7.s[2]\n"
3882 "fmla v28.4s, v24.4s, v11.s[2]\n"
3883 "fmla v29.4s, v24.4s, v15.s[2]\n"
3884 "fmla v30.4s, v24.4s, v19.s[2]\n"
3885 "fmla v31.4s, v24.4s, v23.s[2]\n"
3887 "ld1r {v24.4s}, [%[minptr]]\n"
3888 "ld1r {v25.4s}, [%[maxptr]]\n"
3889 "fmax v26.4s, v26.4s, v24.4s\n"
3890 "fmax v27.4s, v27.4s, v24.4s\n"
3891 "fmax v28.4s, v28.4s, v24.4s\n"
3892 "fmax v29.4s, v29.4s, v24.4s\n"
3893 "fmin v26.4s, v26.4s, v25.4s\n"
3894 "fmin v27.4s, v27.4s, v25.4s\n"
3895 "fmin v28.4s, v28.4s, v25.4s\n"
3896 "fmin v29.4s, v29.4s, v25.4s\n"
3897 "str q26, [%[c_ptr0]]\n"
3898 "fmax v30.4s, v30.4s, v24.4s\n"
3899 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3900 "fmax v31.4s, v31.4s, v24.4s\n"
3901 "str q27, [c_ptr1]\n"
3902 "fmin v30.4s, v30.4s, v25.4s\n"
3903 "fmin v31.4s, v31.4s, v25.4s\n"
3904 "str q28, [c_ptr2]\n"
3905 "str q29, [c_ptr3]\n"
3906 "str q30, [c_ptr4]\n"
3907 "str q31, [c_ptr5]\n"
3918 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
3919 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
3920 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
3936 "add a_ptr1, %[a_ptr0], %[lda]\n"
3937 "add c_ptr1, %[c_ptr0], %[ldc]\n"
3938 "add a_ptr2, a_ptr1, %[lda]\n"
3939 "add c_ptr2, c_ptr1, %[ldc]\n"
3940 "add a_ptr3, a_ptr2, %[lda]\n"
3941 "add c_ptr3, c_ptr2, %[ldc]\n"
3942 "add a_ptr4, a_ptr3, %[lda]\n"
3943 "add c_ptr4, c_ptr3, %[ldc]\n"
3944 "add a_ptr5, a_ptr4, %[lda]\n"
3945 "add c_ptr5, c_ptr4, %[ldc]\n"
3946 "cbz %[oob_rows], 1f\n"
3947 "subs %[oob_rows], %[oob_rows], #0x1\n"
3948 "add c_ptr5, %[c_ptr0], #0x0\n"
3949 "add a_ptr5, %[a_ptr0], #0x0\n"
3951 "subs %[oob_rows], %[oob_rows], #0x1\n"
3952 "add c_ptr4, %[c_ptr0], #0x0\n"
3953 "add a_ptr4, %[a_ptr0], #0x0\n"
3955 "subs %[oob_rows], %[oob_rows], #0x1\n"
3956 "add c_ptr3, %[c_ptr0], #0x0\n"
3957 "add a_ptr3, %[a_ptr0], #0x0\n"
3959 "subs %[oob_rows], %[oob_rows], #0x1\n"
3960 "add c_ptr2, %[c_ptr0], #0x0\n"
3961 "add a_ptr2, %[a_ptr0], #0x0\n"
3963 "subs %[oob_rows], %[oob_rows], #0x1\n"
3964 "add c_ptr1, %[c_ptr0], #0x0\n"
3965 "add a_ptr1, %[a_ptr0], #0x0\n"
3967 "ldr q0, [%[a_ptr0]], #0x10\n"
3968 "ldr q4, [a_ptr1], #0x10\n"
3969 "ldr q8, [a_ptr2], #0x10\n"
3970 "ldr q12, [a_ptr3], #0x10\n"
3971 "ldr q16, [a_ptr4], #0x10\n"
3972 "ldr q20, [a_ptr5], #0x10\n"
3973 "ldr q1, [%[a_ptr0]], #0x10\n"
3974 "ldr q5, [a_ptr1], #0x10\n"
3975 "ldr q9, [a_ptr2], #0x10\n"
3976 "ldr q13, [a_ptr3], #0x10\n"
3977 "ldr q17, [a_ptr4], #0x10\n"
3978 "ldr q21, [a_ptr5], #0x10\n"
3979 "ldr q2, [%[a_ptr0]], #0x10\n"
3980 "ldr q6, [a_ptr1], #0x10\n"
3981 "ldr q10, [a_ptr2], #0x10\n"
3982 "ldr q14, [a_ptr3], #0x10\n"
3983 "ldr q18, [a_ptr4], #0x10\n"
3984 "ldr q22, [a_ptr5], #0x10\n"
3985 "ldr q3, [%[a_ptr0]]\n"
3986 "ldr q7, [a_ptr1]\n"
3987 "ldr q11, [a_ptr2]\n"
3988 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
3989 "ldr q15, [a_ptr3]\n"
3990 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
3991 "ldr q19, [a_ptr4]\n"
3992 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
3993 "ldr q23, [a_ptr5]\n"
3994 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
3995 "ldr q24, [%[b_ptr0]]\n"
3996 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
3997 "ldr q25, [%[b_ptr0], #0x10]\n"
3998 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
3999 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4000 "cbz %[loops], 2f\n"
4001 "ldr q26, [%[biasptr]]\n"
4002 "add %[biasptr], %[biasptr], %[biasinc]\n"
4003 "subs %[loops], %[loops], #0x1\n"
4004 "mov v27.16b, v26.16b\n"
4005 "mov v28.16b, v26.16b\n"
4006 "mov v29.16b, v26.16b\n"
4007 "mov v30.16b, v26.16b\n"
4008 "mov v31.16b, v26.16b\n"
4009 "fmla v26.4s, v24.4s, v0.s[0]\n"
4010 "fmla v27.4s, v24.4s, v4.s[0]\n"
4011 "fmla v28.4s, v24.4s, v8.s[0]\n"
4012 "fmla v29.4s, v24.4s, v12.s[0]\n"
4013 "fmla v30.4s, v24.4s, v16.s[0]\n"
4014 "fmla v31.4s, v24.4s, v20.s[0]\n"
4015 "ldr q24, [%[b_ptr0]]\n"
4016 "fmla v26.4s, v25.4s, v0.s[1]\n"
4017 "fmla v27.4s, v25.4s, v4.s[1]\n"
4018 "fmla v28.4s, v25.4s, v8.s[1]\n"
4019 "fmla v29.4s, v25.4s, v12.s[1]\n"
4020 "fmla v30.4s, v25.4s, v16.s[1]\n"
4021 "fmla v31.4s, v25.4s, v20.s[1]\n"
4022 "ldr q25, [%[b_ptr0], #0x10]\n"
4023 "fmla v26.4s, v24.4s, v0.s[2]\n"
4024 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4025 "fmla v27.4s, v24.4s, v4.s[2]\n"
4026 "fmla v28.4s, v24.4s, v8.s[2]\n"
4027 "fmla v29.4s, v24.4s, v12.s[2]\n"
4028 "fmla v30.4s, v24.4s, v16.s[2]\n"
4029 "fmla v31.4s, v24.4s, v20.s[2]\n"
4030 "ldr q24, [%[b_ptr0]]\n"
4031 "fmla v26.4s, v25.4s, v0.s[3]\n"
4032 "fmla v27.4s, v25.4s, v4.s[3]\n"
4033 "fmla v28.4s, v25.4s, v8.s[3]\n"
4034 "fmla v29.4s, v25.4s, v12.s[3]\n"
4035 "fmla v30.4s, v25.4s, v16.s[3]\n"
4036 "fmla v31.4s, v25.4s, v20.s[3]\n"
4037 "ldr q25, [%[b_ptr0], #0x10]\n"
4038 "fmla v26.4s, v24.4s, v1.s[0]\n"
4039 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4040 "fmla v27.4s, v24.4s, v5.s[0]\n"
4041 "fmla v28.4s, v24.4s, v9.s[0]\n"
4042 "fmla v29.4s, v24.4s, v13.s[0]\n"
4043 "fmla v30.4s, v24.4s, v17.s[0]\n"
4044 "fmla v31.4s, v24.4s, v21.s[0]\n"
4045 "ldr q24, [%[b_ptr0]]\n"
4046 "fmla v26.4s, v25.4s, v1.s[1]\n"
4047 "fmla v27.4s, v25.4s, v5.s[1]\n"
4048 "fmla v28.4s, v25.4s, v9.s[1]\n"
4049 "fmla v29.4s, v25.4s, v13.s[1]\n"
4050 "fmla v30.4s, v25.4s, v17.s[1]\n"
4051 "fmla v31.4s, v25.4s, v21.s[1]\n"
4052 "ldr q25, [%[b_ptr0], #0x10]\n"
4053 "fmla v26.4s, v24.4s, v1.s[2]\n"
4054 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4055 "fmla v27.4s, v24.4s, v5.s[2]\n"
4056 "fmla v28.4s, v24.4s, v9.s[2]\n"
4057 "fmla v29.4s, v24.4s, v13.s[2]\n"
4058 "fmla v30.4s, v24.4s, v17.s[2]\n"
4059 "fmla v31.4s, v24.4s, v21.s[2]\n"
4060 "ldr q24, [%[b_ptr0]]\n"
4061 "fmla v26.4s, v25.4s, v1.s[3]\n"
4062 "fmla v27.4s, v25.4s, v5.s[3]\n"
4063 "fmla v28.4s, v25.4s, v9.s[3]\n"
4064 "fmla v29.4s, v25.4s, v13.s[3]\n"
4065 "fmla v30.4s, v25.4s, v17.s[3]\n"
4066 "fmla v31.4s, v25.4s, v21.s[3]\n"
4067 "ldr q25, [%[b_ptr0], #0x10]\n"
4068 "fmla v26.4s, v24.4s, v2.s[0]\n"
4069 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4070 "fmla v27.4s, v24.4s, v6.s[0]\n"
4071 "fmla v28.4s, v24.4s, v10.s[0]\n"
4072 "fmla v29.4s, v24.4s, v14.s[0]\n"
4073 "fmla v30.4s, v24.4s, v18.s[0]\n"
4074 "fmla v31.4s, v24.4s, v22.s[0]\n"
4075 "ldr q24, [%[b_ptr0]]\n"
4076 "fmla v26.4s, v25.4s, v2.s[1]\n"
4077 "fmla v27.4s, v25.4s, v6.s[1]\n"
4078 "fmla v28.4s, v25.4s, v10.s[1]\n"
4079 "fmla v29.4s, v25.4s, v14.s[1]\n"
4080 "fmla v30.4s, v25.4s, v18.s[1]\n"
4081 "fmla v31.4s, v25.4s, v22.s[1]\n"
4082 "ldr q25, [%[b_ptr0], #0x10]\n"
4083 "fmla v26.4s, v24.4s, v2.s[2]\n"
4084 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4085 "fmla v27.4s, v24.4s, v6.s[2]\n"
4086 "fmla v28.4s, v24.4s, v10.s[2]\n"
4087 "fmla v29.4s, v24.4s, v14.s[2]\n"
4088 "fmla v30.4s, v24.4s, v18.s[2]\n"
4089 "fmla v31.4s, v24.4s, v22.s[2]\n"
4090 "ldr q24, [%[b_ptr0]]\n"
4091 "fmla v26.4s, v25.4s, v2.s[3]\n"
4092 "fmla v27.4s, v25.4s, v6.s[3]\n"
4093 "fmla v28.4s, v25.4s, v10.s[3]\n"
4094 "fmla v29.4s, v25.4s, v14.s[3]\n"
4095 "fmla v30.4s, v25.4s, v18.s[3]\n"
4096 "fmla v31.4s, v25.4s, v22.s[3]\n"
4097 "ldr q25, [%[b_ptr0], #0x10]\n"
4098 "fmla v26.4s, v24.4s, v3.s[0]\n"
4099 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4100 "fmla v27.4s, v24.4s, v7.s[0]\n"
4101 "fmla v28.4s, v24.4s, v11.s[0]\n"
4102 "fmla v29.4s, v24.4s, v15.s[0]\n"
4103 "fmla v30.4s, v24.4s, v19.s[0]\n"
4104 "fmla v31.4s, v24.4s, v23.s[0]\n"
4105 "ldr q24, [%[b_ptr0]]\n"
4106 "fmla v26.4s, v25.4s, v3.s[1]\n"
4107 "fmla v27.4s, v25.4s, v7.s[1]\n"
4108 "fmla v28.4s, v25.4s, v11.s[1]\n"
4109 "fmla v29.4s, v25.4s, v15.s[1]\n"
4110 "fmla v30.4s, v25.4s, v19.s[1]\n"
4111 "fmla v31.4s, v25.4s, v23.s[1]\n"
4112 "ldr q25, [%[b_ptr0], #0x10]\n"
4113 "fmla v26.4s, v24.4s, v3.s[2]\n"
4114 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4115 "fmla v27.4s, v24.4s, v7.s[2]\n"
4116 "fmla v28.4s, v24.4s, v11.s[2]\n"
4117 "fmla v29.4s, v24.4s, v15.s[2]\n"
4118 "fmla v30.4s, v24.4s, v19.s[2]\n"
4119 "fmla v31.4s, v24.4s, v23.s[2]\n"
4120 "fmla v26.4s, v25.4s, v3.s[3]\n"
4121 "fmla v27.4s, v25.4s, v7.s[3]\n"
4122 "fmla v28.4s, v25.4s, v11.s[3]\n"
4123 "fmla v29.4s, v25.4s, v15.s[3]\n"
4124 "fmla v30.4s, v25.4s, v19.s[3]\n"
4125 "fmla v31.4s, v25.4s, v23.s[3]\n"
4128 "ld1r {v24.4s}, [%[minptr]]\n"
4129 "subs %[loops], %[loops], #0x1\n"
4130 "ld1r {v25.4s}, [%[maxptr]]\n"
4131 "fmax v26.4s, v26.4s, v24.4s\n"
4132 "fmax v27.4s, v27.4s, v24.4s\n"
4133 "fmax v28.4s, v28.4s, v24.4s\n"
4134 "fmax v29.4s, v29.4s, v24.4s\n"
4135 "fmin v26.4s, v26.4s, v25.4s\n"
4136 "fmin v27.4s, v27.4s, v25.4s\n"
4137 "fmin v28.4s, v28.4s, v25.4s\n"
4138 "fmin v29.4s, v29.4s, v25.4s\n"
4139 "str q26, [%[c_ptr0]]\n"
4140 "fmax v30.4s, v30.4s, v24.4s\n"
4141 "ldr q26, [%[biasptr]]\n"
4142 "fmax v31.4s, v31.4s, v24.4s\n"
4143 "ldr q24, [%[b_ptr0]]\n"
4144 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4145 "str q27, [c_ptr1]\n"
4146 "add c_ptr1, c_ptr1, #0x10\n"
4147 "fmin v30.4s, v30.4s, v25.4s\n"
4148 "add %[biasptr], %[biasptr], %[biasinc]\n"
4149 "fmin v31.4s, v31.4s, v25.4s\n"
4150 "str q28, [c_ptr2]\n"
4151 "mov v27.16b, v26.16b\n"
4152 "ldr q25, [%[b_ptr0], #0x10]\n"
4153 "mov v28.16b, v26.16b\n"
4154 "add c_ptr2, c_ptr2, #0x10\n"
4155 "str q29, [c_ptr3]\n"
4156 "add c_ptr3, c_ptr3, #0x10\n"
4157 "mov v29.16b, v26.16b\n"
4158 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4159 "fmla v27.4s, v24.4s, v4.s[0]\n"
4160 "str q30, [c_ptr4]\n"
4161 "mov v30.16b, v26.16b\n"
4162 "add c_ptr4, c_ptr4, #0x10\n"
4163 "fmla v28.4s, v24.4s, v8.s[0]\n"
4164 "str q31, [c_ptr5]\n"
4165 "mov v31.16b, v26.16b\n"
4166 "add c_ptr5, c_ptr5, #0x10\n"
4167 "fmla v26.4s, v24.4s, v0.s[0]\n"
4168 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
4169 "fmla v29.4s, v24.4s, v12.s[0]\n"
4170 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
4171 "fmla v30.4s, v24.4s, v16.s[0]\n"
4172 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
4173 "fmla v31.4s, v24.4s, v20.s[0]\n"
4174 "ldr q24, [%[b_ptr0]]\n"
4175 "fmla v26.4s, v25.4s, v0.s[1]\n"
4176 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
4177 "fmla v27.4s, v25.4s, v4.s[1]\n"
4178 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
4179 "fmla v28.4s, v25.4s, v8.s[1]\n"
4180 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
4181 "fmla v29.4s, v25.4s, v12.s[1]\n"
4182 "fmla v30.4s, v25.4s, v16.s[1]\n"
4183 "fmla v31.4s, v25.4s, v20.s[1]\n"
4184 "ldr q25, [%[b_ptr0], #0x10]\n"
4185 "fmla v26.4s, v24.4s, v0.s[2]\n"
4186 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4187 "fmla v27.4s, v24.4s, v4.s[2]\n"
4188 "fmla v28.4s, v24.4s, v8.s[2]\n"
4189 "fmla v29.4s, v24.4s, v12.s[2]\n"
4190 "fmla v30.4s, v24.4s, v16.s[2]\n"
4191 "fmla v31.4s, v24.4s, v20.s[2]\n"
4192 "ldr q24, [%[b_ptr0]]\n"
4193 "fmla v26.4s, v25.4s, v0.s[3]\n"
4194 "fmla v27.4s, v25.4s, v4.s[3]\n"
4195 "fmla v28.4s, v25.4s, v8.s[3]\n"
4196 "fmla v29.4s, v25.4s, v12.s[3]\n"
4197 "fmla v30.4s, v25.4s, v16.s[3]\n"
4198 "fmla v31.4s, v25.4s, v20.s[3]\n"
4199 "ldr q25, [%[b_ptr0], #0x10]\n"
4200 "fmla v26.4s, v24.4s, v1.s[0]\n"
4201 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4202 "fmla v27.4s, v24.4s, v5.s[0]\n"
4203 "fmla v28.4s, v24.4s, v9.s[0]\n"
4204 "fmla v29.4s, v24.4s, v13.s[0]\n"
4205 "fmla v30.4s, v24.4s, v17.s[0]\n"
4206 "fmla v31.4s, v24.4s, v21.s[0]\n"
4207 "ldr q24, [%[b_ptr0]]\n"
4208 "fmla v26.4s, v25.4s, v1.s[1]\n"
4209 "fmla v27.4s, v25.4s, v5.s[1]\n"
4210 "fmla v28.4s, v25.4s, v9.s[1]\n"
4211 "fmla v29.4s, v25.4s, v13.s[1]\n"
4212 "fmla v30.4s, v25.4s, v17.s[1]\n"
4213 "fmla v31.4s, v25.4s, v21.s[1]\n"
4214 "ldr q25, [%[b_ptr0], #0x10]\n"
4215 "fmla v26.4s, v24.4s, v1.s[2]\n"
4216 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4217 "fmla v27.4s, v24.4s, v5.s[2]\n"
4218 "fmla v28.4s, v24.4s, v9.s[2]\n"
4219 "fmla v29.4s, v24.4s, v13.s[2]\n"
4220 "fmla v30.4s, v24.4s, v17.s[2]\n"
4221 "fmla v31.4s, v24.4s, v21.s[2]\n"
4222 "ldr q24, [%[b_ptr0]]\n"
4223 "fmla v26.4s, v25.4s, v1.s[3]\n"
4224 "fmla v27.4s, v25.4s, v5.s[3]\n"
4225 "fmla v28.4s, v25.4s, v9.s[3]\n"
4226 "fmla v29.4s, v25.4s, v13.s[3]\n"
4227 "fmla v30.4s, v25.4s, v17.s[3]\n"
4228 "fmla v31.4s, v25.4s, v21.s[3]\n"
4229 "ldr q25, [%[b_ptr0], #0x10]\n"
4230 "fmla v26.4s, v24.4s, v2.s[0]\n"
4231 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4232 "fmla v27.4s, v24.4s, v6.s[0]\n"
4233 "fmla v28.4s, v24.4s, v10.s[0]\n"
4234 "fmla v29.4s, v24.4s, v14.s[0]\n"
4235 "fmla v30.4s, v24.4s, v18.s[0]\n"
4236 "fmla v31.4s, v24.4s, v22.s[0]\n"
4237 "ldr q24, [%[b_ptr0]]\n"
4238 "fmla v26.4s, v25.4s, v2.s[1]\n"
4239 "fmla v27.4s, v25.4s, v6.s[1]\n"
4240 "fmla v28.4s, v25.4s, v10.s[1]\n"
4241 "fmla v29.4s, v25.4s, v14.s[1]\n"
4242 "fmla v30.4s, v25.4s, v18.s[1]\n"
4243 "fmla v31.4s, v25.4s, v22.s[1]\n"
4244 "ldr q25, [%[b_ptr0], #0x10]\n"
4245 "fmla v26.4s, v24.4s, v2.s[2]\n"
4246 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4247 "fmla v27.4s, v24.4s, v6.s[2]\n"
4248 "fmla v28.4s, v24.4s, v10.s[2]\n"
4249 "fmla v29.4s, v24.4s, v14.s[2]\n"
4250 "fmla v30.4s, v24.4s, v18.s[2]\n"
4251 "fmla v31.4s, v24.4s, v22.s[2]\n"
4252 "ldr q24, [%[b_ptr0]]\n"
4253 "fmla v26.4s, v25.4s, v2.s[3]\n"
4254 "fmla v27.4s, v25.4s, v6.s[3]\n"
4255 "fmla v28.4s, v25.4s, v10.s[3]\n"
4256 "fmla v29.4s, v25.4s, v14.s[3]\n"
4257 "fmla v30.4s, v25.4s, v18.s[3]\n"
4258 "fmla v31.4s, v25.4s, v22.s[3]\n"
4259 "ldr q25, [%[b_ptr0], #0x10]\n"
4260 "fmla v26.4s, v24.4s, v3.s[0]\n"
4261 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4262 "fmla v27.4s, v24.4s, v7.s[0]\n"
4263 "fmla v28.4s, v24.4s, v11.s[0]\n"
4264 "fmla v29.4s, v24.4s, v15.s[0]\n"
4265 "fmla v30.4s, v24.4s, v19.s[0]\n"
4266 "fmla v31.4s, v24.4s, v23.s[0]\n"
4267 "ldr q24, [%[b_ptr0]]\n"
4268 "fmla v26.4s, v25.4s, v3.s[1]\n"
4269 "fmla v27.4s, v25.4s, v7.s[1]\n"
4270 "fmla v28.4s, v25.4s, v11.s[1]\n"
4271 "fmla v29.4s, v25.4s, v15.s[1]\n"
4272 "fmla v30.4s, v25.4s, v19.s[1]\n"
4273 "fmla v31.4s, v25.4s, v23.s[1]\n"
4274 "ldr q25, [%[b_ptr0], #0x10]\n"
4275 "fmla v26.4s, v24.4s, v3.s[2]\n"
4276 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4277 "fmla v27.4s, v24.4s, v7.s[2]\n"
4278 "fmla v28.4s, v24.4s, v11.s[2]\n"
4279 "fmla v29.4s, v24.4s, v15.s[2]\n"
4280 "fmla v30.4s, v24.4s, v19.s[2]\n"
4281 "fmla v31.4s, v24.4s, v23.s[2]\n"
4282 "fmla v26.4s, v25.4s, v3.s[3]\n"
4283 "fmla v27.4s, v25.4s, v7.s[3]\n"
4284 "fmla v28.4s, v25.4s, v11.s[3]\n"
4285 "fmla v29.4s, v25.4s, v15.s[3]\n"
4286 "fmla v30.4s, v25.4s, v19.s[3]\n"
4287 "fmla v31.4s, v25.4s, v23.s[3]\n"
4290 "ld1r {v24.4s}, [%[minptr]]\n"
4291 "ld1r {v25.4s}, [%[maxptr]]\n"
4292 "fmax v26.4s, v26.4s, v24.4s\n"
4293 "fmax v27.4s, v27.4s, v24.4s\n"
4294 "fmax v28.4s, v28.4s, v24.4s\n"
4295 "fmax v29.4s, v29.4s, v24.4s\n"
4296 "fmin v26.4s, v26.4s, v25.4s\n"
4297 "fmin v27.4s, v27.4s, v25.4s\n"
4298 "fmin v28.4s, v28.4s, v25.4s\n"
4299 "fmin v29.4s, v29.4s, v25.4s\n"
4300 "str q26, [%[c_ptr0]]\n"
4301 "fmax v30.4s, v30.4s, v24.4s\n"
4302 "ldr q26, [%[biasptr]]\n"
4303 "fmax v31.4s, v31.4s, v24.4s\n"
4304 "ldr q24, [%[b_ptr0]]\n"
4305 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4306 "str q27, [c_ptr1]\n"
4307 "add c_ptr1, c_ptr1, #0x10\n"
4308 "fmin v30.4s, v30.4s, v25.4s\n"
4309 "add %[biasptr], %[biasptr], %[biasinc]\n"
4310 "fmin v31.4s, v31.4s, v25.4s\n"
4311 "str q28, [c_ptr2]\n"
4312 "mov v27.16b, v26.16b\n"
4313 "ldr q25, [%[b_ptr0], #0x10]\n"
4314 "mov v28.16b, v26.16b\n"
4315 "add c_ptr2, c_ptr2, #0x10\n"
4316 "str q29, [c_ptr3]\n"
4317 "add c_ptr3, c_ptr3, #0x10\n"
4318 "mov v29.16b, v26.16b\n"
4319 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4320 "fmla v27.4s, v24.4s, v4.s[0]\n"
4321 "str q30, [c_ptr4]\n"
4322 "mov v30.16b, v26.16b\n"
4323 "add c_ptr4, c_ptr4, #0x10\n"
4324 "fmla v28.4s, v24.4s, v8.s[0]\n"
4325 "str q31, [c_ptr5]\n"
4326 "mov v31.16b, v26.16b\n"
4327 "add c_ptr5, c_ptr5, #0x10\n"
4328 "fmla v26.4s, v24.4s, v0.s[0]\n"
4329 "fmla v29.4s, v24.4s, v12.s[0]\n"
4330 "fmla v30.4s, v24.4s, v16.s[0]\n"
4331 "fmla v31.4s, v24.4s, v20.s[0]\n"
4332 "ldr q24, [%[b_ptr0]]\n"
4333 "fmla v26.4s, v25.4s, v0.s[1]\n"
4334 "fmla v27.4s, v25.4s, v4.s[1]\n"
4335 "fmla v28.4s, v25.4s, v8.s[1]\n"
4336 "fmla v29.4s, v25.4s, v12.s[1]\n"
4337 "fmla v30.4s, v25.4s, v16.s[1]\n"
4338 "fmla v31.4s, v25.4s, v20.s[1]\n"
4339 "ldr q25, [%[b_ptr0], #0x10]\n"
4340 "fmla v26.4s, v24.4s, v0.s[2]\n"
4341 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4342 "fmla v27.4s, v24.4s, v4.s[2]\n"
4343 "fmla v28.4s, v24.4s, v8.s[2]\n"
4344 "fmla v29.4s, v24.4s, v12.s[2]\n"
4345 "fmla v30.4s, v24.4s, v16.s[2]\n"
4346 "fmla v31.4s, v24.4s, v20.s[2]\n"
4347 "ldr q24, [%[b_ptr0]]\n"
4348 "fmla v26.4s, v25.4s, v0.s[3]\n"
4349 "fmla v27.4s, v25.4s, v4.s[3]\n"
4350 "fmla v28.4s, v25.4s, v8.s[3]\n"
4351 "fmla v29.4s, v25.4s, v12.s[3]\n"
4352 "fmla v30.4s, v25.4s, v16.s[3]\n"
4353 "fmla v31.4s, v25.4s, v20.s[3]\n"
4354 "ldr q25, [%[b_ptr0], #0x10]\n"
4355 "fmla v26.4s, v24.4s, v1.s[0]\n"
4356 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4357 "fmla v27.4s, v24.4s, v5.s[0]\n"
4358 "fmla v28.4s, v24.4s, v9.s[0]\n"
4359 "fmla v29.4s, v24.4s, v13.s[0]\n"
4360 "fmla v30.4s, v24.4s, v17.s[0]\n"
4361 "fmla v31.4s, v24.4s, v21.s[0]\n"
4362 "ldr q24, [%[b_ptr0]]\n"
4363 "fmla v26.4s, v25.4s, v1.s[1]\n"
4364 "fmla v27.4s, v25.4s, v5.s[1]\n"
4365 "fmla v28.4s, v25.4s, v9.s[1]\n"
4366 "fmla v29.4s, v25.4s, v13.s[1]\n"
4367 "fmla v30.4s, v25.4s, v17.s[1]\n"
4368 "fmla v31.4s, v25.4s, v21.s[1]\n"
4369 "ldr q25, [%[b_ptr0], #0x10]\n"
4370 "fmla v26.4s, v24.4s, v1.s[2]\n"
4371 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4372 "fmla v27.4s, v24.4s, v5.s[2]\n"
4373 "fmla v28.4s, v24.4s, v9.s[2]\n"
4374 "fmla v29.4s, v24.4s, v13.s[2]\n"
4375 "fmla v30.4s, v24.4s, v17.s[2]\n"
4376 "fmla v31.4s, v24.4s, v21.s[2]\n"
4377 "ldr q24, [%[b_ptr0]]\n"
4378 "fmla v26.4s, v25.4s, v1.s[3]\n"
4379 "fmla v27.4s, v25.4s, v5.s[3]\n"
4380 "fmla v28.4s, v25.4s, v9.s[3]\n"
4381 "fmla v29.4s, v25.4s, v13.s[3]\n"
4382 "fmla v30.4s, v25.4s, v17.s[3]\n"
4383 "fmla v31.4s, v25.4s, v21.s[3]\n"
4384 "ldr q25, [%[b_ptr0], #0x10]\n"
4385 "fmla v26.4s, v24.4s, v2.s[0]\n"
4386 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4387 "fmla v27.4s, v24.4s, v6.s[0]\n"
4388 "fmla v28.4s, v24.4s, v10.s[0]\n"
4389 "fmla v29.4s, v24.4s, v14.s[0]\n"
4390 "fmla v30.4s, v24.4s, v18.s[0]\n"
4391 "fmla v31.4s, v24.4s, v22.s[0]\n"
4392 "ldr q24, [%[b_ptr0]]\n"
4393 "fmla v26.4s, v25.4s, v2.s[1]\n"
4394 "fmla v27.4s, v25.4s, v6.s[1]\n"
4395 "fmla v28.4s, v25.4s, v10.s[1]\n"
4396 "fmla v29.4s, v25.4s, v14.s[1]\n"
4397 "fmla v30.4s, v25.4s, v18.s[1]\n"
4398 "fmla v31.4s, v25.4s, v22.s[1]\n"
4399 "ldr q25, [%[b_ptr0], #0x10]\n"
4400 "fmla v26.4s, v24.4s, v2.s[2]\n"
4401 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4402 "fmla v27.4s, v24.4s, v6.s[2]\n"
4403 "fmla v28.4s, v24.4s, v10.s[2]\n"
4404 "fmla v29.4s, v24.4s, v14.s[2]\n"
4405 "fmla v30.4s, v24.4s, v18.s[2]\n"
4406 "fmla v31.4s, v24.4s, v22.s[2]\n"
4407 "ldr q24, [%[b_ptr0]]\n"
4408 "fmla v26.4s, v25.4s, v2.s[3]\n"
4409 "fmla v27.4s, v25.4s, v6.s[3]\n"
4410 "fmla v28.4s, v25.4s, v10.s[3]\n"
4411 "fmla v29.4s, v25.4s, v14.s[3]\n"
4412 "fmla v30.4s, v25.4s, v18.s[3]\n"
4413 "fmla v31.4s, v25.4s, v22.s[3]\n"
4414 "ldr q25, [%[b_ptr0], #0x10]\n"
4415 "fmla v26.4s, v24.4s, v3.s[0]\n"
4416 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4417 "fmla v27.4s, v24.4s, v7.s[0]\n"
4418 "fmla v28.4s, v24.4s, v11.s[0]\n"
4419 "fmla v29.4s, v24.4s, v15.s[0]\n"
4420 "fmla v30.4s, v24.4s, v19.s[0]\n"
4421 "fmla v31.4s, v24.4s, v23.s[0]\n"
4422 "ldr q24, [%[b_ptr0]]\n"
4423 "fmla v26.4s, v25.4s, v3.s[1]\n"
4424 "fmla v27.4s, v25.4s, v7.s[1]\n"
4425 "fmla v28.4s, v25.4s, v11.s[1]\n"
4426 "fmla v29.4s, v25.4s, v15.s[1]\n"
4427 "fmla v30.4s, v25.4s, v19.s[1]\n"
4428 "fmla v31.4s, v25.4s, v23.s[1]\n"
4429 "ldr q25, [%[b_ptr0], #0x10]\n"
4430 "fmla v26.4s, v24.4s, v3.s[2]\n"
4431 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4432 "fmla v27.4s, v24.4s, v7.s[2]\n"
4433 "fmla v28.4s, v24.4s, v11.s[2]\n"
4434 "fmla v29.4s, v24.4s, v15.s[2]\n"
4435 "fmla v30.4s, v24.4s, v19.s[2]\n"
4436 "fmla v31.4s, v24.4s, v23.s[2]\n"
4437 "fmla v26.4s, v25.4s, v3.s[3]\n"
4438 "fmla v27.4s, v25.4s, v7.s[3]\n"
4439 "fmla v28.4s, v25.4s, v11.s[3]\n"
4440 "fmla v29.4s, v25.4s, v15.s[3]\n"
4441 "fmla v30.4s, v25.4s, v19.s[3]\n"
4442 "fmla v31.4s, v25.4s, v23.s[3]\n"
4445 "ldr q26, [%[biasptr]]\n"
4446 "add %[biasptr], %[biasptr], %[biasinc]\n"
4447 "mov v27.16b, v26.16b\n"
4448 "mov v28.16b, v26.16b\n"
4449 "mov v29.16b, v26.16b\n"
4450 "mov v30.16b, v26.16b\n"
4451 "mov v31.16b, v26.16b\n"
4452 "fmla v26.4s, v24.4s, v0.s[0]\n"
4453 "fmla v27.4s, v24.4s, v4.s[0]\n"
4454 "fmla v28.4s, v24.4s, v8.s[0]\n"
4455 "fmla v29.4s, v24.4s, v12.s[0]\n"
4456 "fmla v30.4s, v24.4s, v16.s[0]\n"
4457 "fmla v31.4s, v24.4s, v20.s[0]\n"
4458 "ldr q24, [%[b_ptr0]]\n"
4459 "fmla v26.4s, v25.4s, v0.s[1]\n"
4460 "fmla v27.4s, v25.4s, v4.s[1]\n"
4461 "fmla v28.4s, v25.4s, v8.s[1]\n"
4462 "fmla v29.4s, v25.4s, v12.s[1]\n"
4463 "fmla v30.4s, v25.4s, v16.s[1]\n"
4464 "fmla v31.4s, v25.4s, v20.s[1]\n"
4465 "ldr q25, [%[b_ptr0], #0x10]\n"
4466 "fmla v26.4s, v24.4s, v0.s[2]\n"
4467 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4468 "fmla v27.4s, v24.4s, v4.s[2]\n"
4469 "fmla v28.4s, v24.4s, v8.s[2]\n"
4470 "fmla v29.4s, v24.4s, v12.s[2]\n"
4471 "fmla v30.4s, v24.4s, v16.s[2]\n"
4472 "fmla v31.4s, v24.4s, v20.s[2]\n"
4473 "ldr q24, [%[b_ptr0]]\n"
4474 "fmla v26.4s, v25.4s, v0.s[3]\n"
4475 "fmla v27.4s, v25.4s, v4.s[3]\n"
4476 "fmla v28.4s, v25.4s, v8.s[3]\n"
4477 "fmla v29.4s, v25.4s, v12.s[3]\n"
4478 "fmla v30.4s, v25.4s, v16.s[3]\n"
4479 "fmla v31.4s, v25.4s, v20.s[3]\n"
4480 "ldr q25, [%[b_ptr0], #0x10]\n"
4481 "fmla v26.4s, v24.4s, v1.s[0]\n"
4482 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4483 "fmla v27.4s, v24.4s, v5.s[0]\n"
4484 "fmla v28.4s, v24.4s, v9.s[0]\n"
4485 "fmla v29.4s, v24.4s, v13.s[0]\n"
4486 "fmla v30.4s, v24.4s, v17.s[0]\n"
4487 "fmla v31.4s, v24.4s, v21.s[0]\n"
4488 "ldr q24, [%[b_ptr0]]\n"
4489 "fmla v26.4s, v25.4s, v1.s[1]\n"
4490 "fmla v27.4s, v25.4s, v5.s[1]\n"
4491 "fmla v28.4s, v25.4s, v9.s[1]\n"
4492 "fmla v29.4s, v25.4s, v13.s[1]\n"
4493 "fmla v30.4s, v25.4s, v17.s[1]\n"
4494 "fmla v31.4s, v25.4s, v21.s[1]\n"
4495 "ldr q25, [%[b_ptr0], #0x10]\n"
4496 "fmla v26.4s, v24.4s, v1.s[2]\n"
4497 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4498 "fmla v27.4s, v24.4s, v5.s[2]\n"
4499 "fmla v28.4s, v24.4s, v9.s[2]\n"
4500 "fmla v29.4s, v24.4s, v13.s[2]\n"
4501 "fmla v30.4s, v24.4s, v17.s[2]\n"
4502 "fmla v31.4s, v24.4s, v21.s[2]\n"
4503 "ldr q24, [%[b_ptr0]]\n"
4504 "fmla v26.4s, v25.4s, v1.s[3]\n"
4505 "fmla v27.4s, v25.4s, v5.s[3]\n"
4506 "fmla v28.4s, v25.4s, v9.s[3]\n"
4507 "fmla v29.4s, v25.4s, v13.s[3]\n"
4508 "fmla v30.4s, v25.4s, v17.s[3]\n"
4509 "fmla v31.4s, v25.4s, v21.s[3]\n"
4510 "ldr q25, [%[b_ptr0], #0x10]\n"
4511 "fmla v26.4s, v24.4s, v2.s[0]\n"
4512 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4513 "fmla v27.4s, v24.4s, v6.s[0]\n"
4514 "fmla v28.4s, v24.4s, v10.s[0]\n"
4515 "fmla v29.4s, v24.4s, v14.s[0]\n"
4516 "fmla v30.4s, v24.4s, v18.s[0]\n"
4517 "fmla v31.4s, v24.4s, v22.s[0]\n"
4518 "ldr q24, [%[b_ptr0]]\n"
4519 "fmla v26.4s, v25.4s, v2.s[1]\n"
4520 "fmla v27.4s, v25.4s, v6.s[1]\n"
4521 "fmla v28.4s, v25.4s, v10.s[1]\n"
4522 "fmla v29.4s, v25.4s, v14.s[1]\n"
4523 "fmla v30.4s, v25.4s, v18.s[1]\n"
4524 "fmla v31.4s, v25.4s, v22.s[1]\n"
4525 "ldr q25, [%[b_ptr0], #0x10]\n"
4526 "fmla v26.4s, v24.4s, v2.s[2]\n"
4527 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4528 "fmla v27.4s, v24.4s, v6.s[2]\n"
4529 "fmla v28.4s, v24.4s, v10.s[2]\n"
4530 "fmla v29.4s, v24.4s, v14.s[2]\n"
4531 "fmla v30.4s, v24.4s, v18.s[2]\n"
4532 "fmla v31.4s, v24.4s, v22.s[2]\n"
4533 "ldr q24, [%[b_ptr0]]\n"
4534 "fmla v26.4s, v25.4s, v2.s[3]\n"
4535 "fmla v27.4s, v25.4s, v6.s[3]\n"
4536 "fmla v28.4s, v25.4s, v10.s[3]\n"
4537 "fmla v29.4s, v25.4s, v14.s[3]\n"
4538 "fmla v30.4s, v25.4s, v18.s[3]\n"
4539 "fmla v31.4s, v25.4s, v22.s[3]\n"
4540 "ldr q25, [%[b_ptr0], #0x10]\n"
4541 "fmla v26.4s, v24.4s, v3.s[0]\n"
4542 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4543 "fmla v27.4s, v24.4s, v7.s[0]\n"
4544 "fmla v28.4s, v24.4s, v11.s[0]\n"
4545 "fmla v29.4s, v24.4s, v15.s[0]\n"
4546 "fmla v30.4s, v24.4s, v19.s[0]\n"
4547 "fmla v31.4s, v24.4s, v23.s[0]\n"
4548 "ldr q24, [%[b_ptr0]]\n"
4549 "fmla v26.4s, v25.4s, v3.s[1]\n"
4550 "fmla v27.4s, v25.4s, v7.s[1]\n"
4551 "fmla v28.4s, v25.4s, v11.s[1]\n"
4552 "fmla v29.4s, v25.4s, v15.s[1]\n"
4553 "fmla v30.4s, v25.4s, v19.s[1]\n"
4554 "fmla v31.4s, v25.4s, v23.s[1]\n"
4555 "ldr q25, [%[b_ptr0], #0x10]\n"
4556 "fmla v26.4s, v24.4s, v3.s[2]\n"
4557 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4558 "fmla v27.4s, v24.4s, v7.s[2]\n"
4559 "fmla v28.4s, v24.4s, v11.s[2]\n"
4560 "fmla v29.4s, v24.4s, v15.s[2]\n"
4561 "fmla v30.4s, v24.4s, v19.s[2]\n"
4562 "fmla v31.4s, v24.4s, v23.s[2]\n"
4563 "fmla v26.4s, v25.4s, v3.s[3]\n"
4564 "fmla v27.4s, v25.4s, v7.s[3]\n"
4565 "fmla v28.4s, v25.4s, v11.s[3]\n"
4566 "fmla v29.4s, v25.4s, v15.s[3]\n"
4567 "fmla v30.4s, v25.4s, v19.s[3]\n"
4568 "fmla v31.4s, v25.4s, v23.s[3]\n"
4570 "ld1r {v24.4s}, [%[minptr]]\n"
4571 "ld1r {v25.4s}, [%[maxptr]]\n"
4572 "fmax v26.4s, v26.4s, v24.4s\n"
4573 "fmax v27.4s, v27.4s, v24.4s\n"
4574 "fmax v28.4s, v28.4s, v24.4s\n"
4575 "fmax v29.4s, v29.4s, v24.4s\n"
4576 "fmin v26.4s, v26.4s, v25.4s\n"
4577 "fmin v27.4s, v27.4s, v25.4s\n"
4578 "fmin v28.4s, v28.4s, v25.4s\n"
4579 "fmin v29.4s, v29.4s, v25.4s\n"
4580 "str q26, [%[c_ptr0]]\n"
4581 "fmax v30.4s, v30.4s, v24.4s\n"
4582 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4583 "fmax v31.4s, v31.4s, v24.4s\n"
4584 "str q27, [c_ptr1]\n"
4585 "fmin v30.4s, v30.4s, v25.4s\n"
4586 "fmin v31.4s, v31.4s, v25.4s\n"
4587 "str q28, [c_ptr2]\n"
4588 "str q29, [c_ptr3]\n"
4589 "str q30, [c_ptr4]\n"
4590 "str q31, [c_ptr5]\n"
4601 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [biasptr]
"+r" (biasptr)
4602 : [lda]
"r" (ldab), [ldc]
"r" (ldcb), [biasinc]
"r" (biasinc), [minptr]
"r" (minptr), [maxptr]
"r" (maxptr)
4603 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
4612 #endif // __aarch64__