29 void MergeResults<12, 8, false>(uint32_t *out,
const uint32_t *in,
const int ldout,
const int y0,
const int ymax,
const int x0,
const int xmax,
const uint32_t *
bias,
Activation ,
bool append)
31 const uint32_t *inptr = in;
32 uint32_t nullbias[12];
37 memset(nullbias, 0, (12 *
sizeof(uint32_t)));
40 for (
int y=y0; y<ymax; y+=8)
42 uint32_t *outptr0 = out + (y * ldout) + x0;
43 uint32_t *outptr1 = outptr0 + ldout;
44 uint32_t *outptr2 = outptr1 + ldout;
45 uint32_t *outptr3 = outptr2 + ldout;
46 uint32_t *outptr4 = outptr3 + ldout;
47 uint32_t *outptr5 = outptr4 + ldout;
48 uint32_t *outptr6 = outptr5 + ldout;
49 uint32_t *outptr7 = outptr6 + ldout;
51 const int height = ymax - y;
53 for (
int i=x0; i<xmax; i+=12)
63 for (
int xi=0; xi<11; xi++)
67 *outptr0 += inptr[xi];
75 "ldr q2, [%[outptr0]]\n"
76 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
77 "ldr q10, [%[inptr]]\n"
78 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
79 "ldr q3, [%[outptr0], #0x10]\n"
80 "ldr q11, [%[inptr], #0x10]\n"
81 "add v10.4s, v10.4s, v2.4s\n"
82 "ldr q4, [%[outptr0], #0x20]\n"
83 "ldr q12, [%[inptr], #0x20]\n"
84 "add %[inptr], %[inptr], #0x180\n"
85 "add v11.4s, v11.4s, v3.4s\n"
86 "str q10, [%[outptr0]]\n"
87 "add v12.4s, v12.4s, v4.4s\n"
88 "str q11, [%[outptr0], #0x10]\n"
89 "str q12, [%[outptr0], #0x20]\n"
90 "add %[outptr0], %[outptr0], #0x30\n"
91 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
94 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
104 for (
int xi=0; xi<11; xi++)
108 *outptr0 += inptr[xi];
110 *outptr1 += inptr[xi + 12];
118 "ldr q2, [%[outptr0]]\n"
119 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
120 "ldr q10, [%[inptr]]\n"
121 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
122 "ldr q3, [%[outptr0], #0x10]\n"
123 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
124 "add v10.4s, v10.4s, v2.4s\n"
125 "ldr q11, [%[inptr], #0x10]\n"
126 "ldr q4, [%[outptr0], #0x20]\n"
127 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
128 "ldr q12, [%[inptr], #0x20]\n"
129 "add v11.4s, v11.4s, v3.4s\n"
130 "str q10, [%[outptr0]]\n"
131 "ldr q5, [%[outptr1]]\n"
132 "ldr q13, [%[inptr], #0x30]\n"
133 "add v12.4s, v12.4s, v4.4s\n"
134 "str q11, [%[outptr0], #0x10]\n"
135 "ldr q6, [%[outptr1], #0x10]\n"
136 "ldr q14, [%[inptr], #0x40]\n"
137 "add v13.4s, v13.4s, v5.4s\n"
138 "str q12, [%[outptr0], #0x20]\n"
139 "ldr q7, [%[outptr1], #0x20]\n"
140 "add %[outptr0], %[outptr0], #0x30\n"
141 "add v14.4s, v14.4s, v6.4s\n"
142 "str q13, [%[outptr1]]\n"
143 "ldr q15, [%[inptr], #0x50]\n"
144 "add %[inptr], %[inptr], #0x180\n"
145 "str q14, [%[outptr1], #0x10]\n"
146 "add v15.4s, v15.4s, v7.4s\n"
147 "str q15, [%[outptr1], #0x20]\n"
148 "add %[outptr1], %[outptr1], #0x30\n"
149 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
152 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
162 for (
int xi=0; xi<11; xi++)
166 *outptr0 += inptr[xi];
168 *outptr1 += inptr[xi + 12];
170 *outptr2 += inptr[xi + 24];
178 "ldr q2, [%[outptr0]]\n"
179 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
180 "ldr q10, [%[inptr]]\n"
181 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
182 "ldr q3, [%[outptr0], #0x10]\n"
183 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
184 "add v10.4s, v10.4s, v2.4s\n"
185 "ldr q11, [%[inptr], #0x10]\n"
186 "ldr q4, [%[outptr0], #0x20]\n"
187 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
188 "ldr q12, [%[inptr], #0x20]\n"
189 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
190 "add v11.4s, v11.4s, v3.4s\n"
191 "str q10, [%[outptr0]]\n"
192 "ldr q5, [%[outptr1]]\n"
193 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
194 "add v12.4s, v12.4s, v4.4s\n"
195 "str q11, [%[outptr0], #0x10]\n"
196 "ldr q13, [%[inptr], #0x30]\n"
197 "ldr q6, [%[outptr1], #0x10]\n"
198 "ldr q14, [%[inptr], #0x40]\n"
199 "str q12, [%[outptr0], #0x20]\n"
200 "add %[outptr0], %[outptr0], #0x30\n"
201 "add v13.4s, v13.4s, v5.4s\n"
202 "ldr q7, [%[outptr1], #0x20]\n"
203 "add v14.4s, v14.4s, v6.4s\n"
204 "ldr q15, [%[inptr], #0x50]\n"
205 "ldr q8, [%[outptr2]]\n"
206 "ldr q16, [%[inptr], #0x60]\n"
207 "str q13, [%[outptr1]]\n"
208 "add v15.4s, v15.4s, v7.4s\n"
209 "ldr q9, [%[outptr2], #0x10]\n"
210 "ldr q17, [%[inptr], #0x70]\n"
211 "add v16.4s, v16.4s, v8.4s\n"
212 "str q14, [%[outptr1], #0x10]\n"
213 "ldr q2, [%[outptr2], #0x20]\n"
214 "ldr q10, [%[inptr], #0x80]\n"
215 "add %[inptr], %[inptr], #0x180\n"
216 "add v17.4s, v17.4s, v9.4s\n"
217 "str q15, [%[outptr1], #0x20]\n"
218 "add %[outptr1], %[outptr1], #0x30\n"
219 "add v10.4s, v10.4s, v2.4s\n"
220 "str q16, [%[outptr2]]\n"
221 "str q17, [%[outptr2], #0x10]\n"
222 "str q10, [%[outptr2], #0x20]\n"
223 "add %[outptr2], %[outptr2], #0x30\n"
224 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
227 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
237 for (
int xi=0; xi<11; xi++)
241 *outptr0 += inptr[xi];
243 *outptr1 += inptr[xi + 12];
245 *outptr2 += inptr[xi + 24];
247 *outptr3 += inptr[xi + 36];
255 "ldr q2, [%[outptr0]]\n"
256 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
257 "ldr q10, [%[inptr]]\n"
258 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
259 "ldr q3, [%[outptr0], #0x10]\n"
260 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
261 "add v10.4s, v10.4s, v2.4s\n"
262 "ldr q11, [%[inptr], #0x10]\n"
263 "ldr q4, [%[outptr0], #0x20]\n"
264 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
265 "ldr q12, [%[inptr], #0x20]\n"
266 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
267 "add v11.4s, v11.4s, v3.4s\n"
268 "str q10, [%[outptr0]]\n"
269 "ldr q5, [%[outptr1]]\n"
270 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
271 "add v12.4s, v12.4s, v4.4s\n"
272 "str q11, [%[outptr0], #0x10]\n"
273 "ldr q13, [%[inptr], #0x30]\n"
274 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
275 "ldr q6, [%[outptr1], #0x10]\n"
276 "str q12, [%[outptr0], #0x20]\n"
277 "add %[outptr0], %[outptr0], #0x30\n"
278 "add v13.4s, v13.4s, v5.4s\n"
279 "ldr q14, [%[inptr], #0x40]\n"
280 "ldr q7, [%[outptr1], #0x20]\n"
281 "ldr q15, [%[inptr], #0x50]\n"
282 "ldr q8, [%[outptr2]]\n"
283 "add v14.4s, v14.4s, v6.4s\n"
284 "str q13, [%[outptr1]]\n"
285 "ldr q16, [%[inptr], #0x60]\n"
286 "add v15.4s, v15.4s, v7.4s\n"
287 "ldr q9, [%[outptr2], #0x10]\n"
288 "ldr q17, [%[inptr], #0x70]\n"
289 "str q14, [%[outptr1], #0x10]\n"
290 "add v16.4s, v16.4s, v8.4s\n"
291 "ldr q2, [%[outptr2], #0x20]\n"
292 "ldr q10, [%[inptr], #0x80]\n"
293 "add v17.4s, v17.4s, v9.4s\n"
294 "str q15, [%[outptr1], #0x20]\n"
295 "ldr q3, [%[outptr3]]\n"
296 "add %[outptr1], %[outptr1], #0x30\n"
297 "add v10.4s, v10.4s, v2.4s\n"
298 "str q16, [%[outptr2]]\n"
299 "ldr q11, [%[inptr], #0x90]\n"
300 "ldr q4, [%[outptr3], #0x10]\n"
301 "ldr q12, [%[inptr], #0xa0]\n"
302 "str q17, [%[outptr2], #0x10]\n"
303 "add v11.4s, v11.4s, v3.4s\n"
304 "ldr q5, [%[outptr3], #0x20]\n"
305 "ldr q13, [%[inptr], #0xb0]\n"
306 "add %[inptr], %[inptr], #0x180\n"
307 "add v12.4s, v12.4s, v4.4s\n"
308 "str q10, [%[outptr2], #0x20]\n"
309 "add %[outptr2], %[outptr2], #0x30\n"
310 "add v13.4s, v13.4s, v5.4s\n"
311 "str q11, [%[outptr3]]\n"
312 "str q12, [%[outptr3], #0x10]\n"
313 "str q13, [%[outptr3], #0x20]\n"
314 "add %[outptr3], %[outptr3], #0x30\n"
315 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
318 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
328 for (
int xi=0; xi<11; xi++)
332 *outptr0 += inptr[xi];
334 *outptr1 += inptr[xi + 12];
336 *outptr2 += inptr[xi + 24];
338 *outptr3 += inptr[xi + 36];
340 *outptr4 += inptr[xi + 48];
348 "ldr q2, [%[outptr0]]\n"
349 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
350 "ldr q10, [%[inptr]]\n"
351 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
352 "ldr q3, [%[outptr0], #0x10]\n"
353 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
354 "add v10.4s, v10.4s, v2.4s\n"
355 "ldr q11, [%[inptr], #0x10]\n"
356 "ldr q4, [%[outptr0], #0x20]\n"
357 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
358 "ldr q12, [%[inptr], #0x20]\n"
359 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
360 "add v11.4s, v11.4s, v3.4s\n"
361 "str q10, [%[outptr0]]\n"
362 "ldr q5, [%[outptr1]]\n"
363 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
364 "add v12.4s, v12.4s, v4.4s\n"
365 "str q11, [%[outptr0], #0x10]\n"
366 "ldr q13, [%[inptr], #0x30]\n"
367 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
368 "ldr q6, [%[outptr1], #0x10]\n"
369 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
370 "add v13.4s, v13.4s, v5.4s\n"
371 "str q12, [%[outptr0], #0x20]\n"
372 "ldr q14, [%[inptr], #0x40]\n"
373 "add %[outptr0], %[outptr0], #0x30\n"
374 "ldr q7, [%[outptr1], #0x20]\n"
375 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
376 "add v14.4s, v14.4s, v6.4s\n"
377 "str q13, [%[outptr1]]\n"
378 "ldr q15, [%[inptr], #0x50]\n"
379 "ldr q8, [%[outptr2]]\n"
380 "ldr q16, [%[inptr], #0x60]\n"
381 "str q14, [%[outptr1], #0x10]\n"
382 "add v15.4s, v15.4s, v7.4s\n"
383 "ldr q9, [%[outptr2], #0x10]\n"
384 "ldr q17, [%[inptr], #0x70]\n"
385 "add v16.4s, v16.4s, v8.4s\n"
386 "ldr q2, [%[outptr2], #0x20]\n"
387 "ldr q10, [%[inptr], #0x80]\n"
388 "str q15, [%[outptr1], #0x20]\n"
389 "add %[outptr1], %[outptr1], #0x30\n"
390 "add v17.4s, v17.4s, v9.4s\n"
391 "ldr q3, [%[outptr3]]\n"
392 "add v10.4s, v10.4s, v2.4s\n"
393 "str q16, [%[outptr2]]\n"
394 "ldr q11, [%[inptr], #0x90]\n"
395 "ldr q4, [%[outptr3], #0x10]\n"
396 "ldr q12, [%[inptr], #0xa0]\n"
397 "str q17, [%[outptr2], #0x10]\n"
398 "add v11.4s, v11.4s, v3.4s\n"
399 "ldr q5, [%[outptr3], #0x20]\n"
400 "ldr q13, [%[inptr], #0xb0]\n"
401 "add v12.4s, v12.4s, v4.4s\n"
402 "str q10, [%[outptr2], #0x20]\n"
403 "ldr q6, [%[outptr4]]\n"
404 "add %[outptr2], %[outptr2], #0x30\n"
405 "add v13.4s, v13.4s, v5.4s\n"
406 "str q11, [%[outptr3]]\n"
407 "ldr q14, [%[inptr], #0xc0]\n"
408 "ldr q7, [%[outptr4], #0x10]\n"
409 "ldr q15, [%[inptr], #0xd0]\n"
410 "str q12, [%[outptr3], #0x10]\n"
411 "add v14.4s, v14.4s, v6.4s\n"
412 "ldr q8, [%[outptr4], #0x20]\n"
413 "ldr q16, [%[inptr], #0xe0]\n"
414 "add %[inptr], %[inptr], #0x180\n"
415 "add v15.4s, v15.4s, v7.4s\n"
416 "str q13, [%[outptr3], #0x20]\n"
417 "add %[outptr3], %[outptr3], #0x30\n"
418 "add v16.4s, v16.4s, v8.4s\n"
419 "str q14, [%[outptr4]]\n"
420 "str q15, [%[outptr4], #0x10]\n"
421 "str q16, [%[outptr4], #0x20]\n"
422 "add %[outptr4], %[outptr4], #0x30\n"
423 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
426 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
436 for (
int xi=0; xi<11; xi++)
440 *outptr0 += inptr[xi];
442 *outptr1 += inptr[xi + 12];
444 *outptr2 += inptr[xi + 24];
446 *outptr3 += inptr[xi + 36];
448 *outptr4 += inptr[xi + 48];
450 *outptr5 += inptr[xi + 60];
458 "ldr q2, [%[outptr0]]\n"
459 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
460 "ldr q10, [%[inptr]]\n"
461 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
462 "ldr q3, [%[outptr0], #0x10]\n"
463 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
464 "add v10.4s, v10.4s, v2.4s\n"
465 "ldr q11, [%[inptr], #0x10]\n"
466 "ldr q4, [%[outptr0], #0x20]\n"
467 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
468 "ldr q12, [%[inptr], #0x20]\n"
469 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
470 "add v11.4s, v11.4s, v3.4s\n"
471 "str q10, [%[outptr0]]\n"
472 "ldr q5, [%[outptr1]]\n"
473 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
474 "add v12.4s, v12.4s, v4.4s\n"
475 "str q11, [%[outptr0], #0x10]\n"
476 "ldr q13, [%[inptr], #0x30]\n"
477 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
478 "ldr q6, [%[outptr1], #0x10]\n"
479 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
480 "add v13.4s, v13.4s, v5.4s\n"
481 "str q12, [%[outptr0], #0x20]\n"
482 "ldr q14, [%[inptr], #0x40]\n"
483 "add %[outptr0], %[outptr0], #0x30\n"
484 "ldr q7, [%[outptr1], #0x20]\n"
485 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
486 "add v14.4s, v14.4s, v6.4s\n"
487 "str q13, [%[outptr1]]\n"
488 "ldr q15, [%[inptr], #0x50]\n"
489 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
490 "ldr q8, [%[outptr2]]\n"
491 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
492 "add v15.4s, v15.4s, v7.4s\n"
493 "str q14, [%[outptr1], #0x10]\n"
494 "ldr q16, [%[inptr], #0x60]\n"
495 "ldr q9, [%[outptr2], #0x10]\n"
496 "ldr q17, [%[inptr], #0x70]\n"
497 "str q15, [%[outptr1], #0x20]\n"
498 "add %[outptr1], %[outptr1], #0x30\n"
499 "add v16.4s, v16.4s, v8.4s\n"
500 "ldr q2, [%[outptr2], #0x20]\n"
501 "add v17.4s, v17.4s, v9.4s\n"
502 "ldr q10, [%[inptr], #0x80]\n"
503 "ldr q3, [%[outptr3]]\n"
504 "ldr q11, [%[inptr], #0x90]\n"
505 "str q16, [%[outptr2]]\n"
506 "add v10.4s, v10.4s, v2.4s\n"
507 "ldr q4, [%[outptr3], #0x10]\n"
508 "ldr q12, [%[inptr], #0xa0]\n"
509 "add v11.4s, v11.4s, v3.4s\n"
510 "str q17, [%[outptr2], #0x10]\n"
511 "ldr q5, [%[outptr3], #0x20]\n"
512 "ldr q13, [%[inptr], #0xb0]\n"
513 "add v12.4s, v12.4s, v4.4s\n"
514 "str q10, [%[outptr2], #0x20]\n"
515 "ldr q6, [%[outptr4]]\n"
516 "add %[outptr2], %[outptr2], #0x30\n"
517 "add v13.4s, v13.4s, v5.4s\n"
518 "str q11, [%[outptr3]]\n"
519 "ldr q14, [%[inptr], #0xc0]\n"
520 "ldr q7, [%[outptr4], #0x10]\n"
521 "ldr q15, [%[inptr], #0xd0]\n"
522 "str q12, [%[outptr3], #0x10]\n"
523 "add v14.4s, v14.4s, v6.4s\n"
524 "ldr q8, [%[outptr4], #0x20]\n"
525 "ldr q16, [%[inptr], #0xe0]\n"
526 "add v15.4s, v15.4s, v7.4s\n"
527 "str q13, [%[outptr3], #0x20]\n"
528 "ldr q9, [%[outptr5]]\n"
529 "add %[outptr3], %[outptr3], #0x30\n"
530 "add v16.4s, v16.4s, v8.4s\n"
531 "str q14, [%[outptr4]]\n"
532 "ldr q17, [%[inptr], #0xf0]\n"
533 "ldr q2, [%[outptr5], #0x10]\n"
534 "ldr q10, [%[inptr], #0x100]\n"
535 "str q15, [%[outptr4], #0x10]\n"
536 "add v17.4s, v17.4s, v9.4s\n"
537 "ldr q3, [%[outptr5], #0x20]\n"
538 "ldr q11, [%[inptr], #0x110]\n"
539 "add %[inptr], %[inptr], #0x180\n"
540 "add v10.4s, v10.4s, v2.4s\n"
541 "str q16, [%[outptr4], #0x20]\n"
542 "add %[outptr4], %[outptr4], #0x30\n"
543 "add v11.4s, v11.4s, v3.4s\n"
544 "str q17, [%[outptr5]]\n"
545 "str q10, [%[outptr5], #0x10]\n"
546 "str q11, [%[outptr5], #0x20]\n"
547 "add %[outptr5], %[outptr5], #0x30\n"
548 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
551 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
561 for (
int xi=0; xi<11; xi++)
565 *outptr0 += inptr[xi];
567 *outptr1 += inptr[xi + 12];
569 *outptr2 += inptr[xi + 24];
571 *outptr3 += inptr[xi + 36];
573 *outptr4 += inptr[xi + 48];
575 *outptr5 += inptr[xi + 60];
577 *outptr6 += inptr[xi + 72];
585 "ldr q2, [%[outptr0]]\n"
586 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
587 "ldr q10, [%[inptr]]\n"
588 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
589 "ldr q3, [%[outptr0], #0x10]\n"
590 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
591 "add v10.4s, v10.4s, v2.4s\n"
592 "ldr q11, [%[inptr], #0x10]\n"
593 "ldr q4, [%[outptr0], #0x20]\n"
594 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
595 "ldr q12, [%[inptr], #0x20]\n"
596 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
597 "add v11.4s, v11.4s, v3.4s\n"
598 "str q10, [%[outptr0]]\n"
599 "ldr q5, [%[outptr1]]\n"
600 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
601 "add v12.4s, v12.4s, v4.4s\n"
602 "str q11, [%[outptr0], #0x10]\n"
603 "ldr q13, [%[inptr], #0x30]\n"
604 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
605 "ldr q6, [%[outptr1], #0x10]\n"
606 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
607 "add v13.4s, v13.4s, v5.4s\n"
608 "str q12, [%[outptr0], #0x20]\n"
609 "ldr q14, [%[inptr], #0x40]\n"
610 "add %[outptr0], %[outptr0], #0x30\n"
611 "ldr q7, [%[outptr1], #0x20]\n"
612 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
613 "add v14.4s, v14.4s, v6.4s\n"
614 "str q13, [%[outptr1]]\n"
615 "ldr q15, [%[inptr], #0x50]\n"
616 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
617 "ldr q8, [%[outptr2]]\n"
618 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
619 "add v15.4s, v15.4s, v7.4s\n"
620 "str q14, [%[outptr1], #0x10]\n"
621 "ldr q16, [%[inptr], #0x60]\n"
622 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
623 "ldr q9, [%[outptr2], #0x10]\n"
624 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
625 "add v16.4s, v16.4s, v8.4s\n"
626 "str q15, [%[outptr1], #0x20]\n"
627 "ldr q17, [%[inptr], #0x70]\n"
628 "add %[outptr1], %[outptr1], #0x30\n"
629 "ldr q2, [%[outptr2], #0x20]\n"
630 "str q16, [%[outptr2]]\n"
631 "add v17.4s, v17.4s, v9.4s\n"
632 "ldr q10, [%[inptr], #0x80]\n"
633 "ldr q3, [%[outptr3]]\n"
634 "ldr q11, [%[inptr], #0x90]\n"
635 "ldr q4, [%[outptr3], #0x10]\n"
636 "add v10.4s, v10.4s, v2.4s\n"
637 "str q17, [%[outptr2], #0x10]\n"
638 "ldr q12, [%[inptr], #0xa0]\n"
639 "add v11.4s, v11.4s, v3.4s\n"
640 "ldr q5, [%[outptr3], #0x20]\n"
641 "ldr q13, [%[inptr], #0xb0]\n"
642 "str q10, [%[outptr2], #0x20]\n"
643 "add %[outptr2], %[outptr2], #0x30\n"
644 "add v12.4s, v12.4s, v4.4s\n"
645 "ldr q6, [%[outptr4]]\n"
646 "add v13.4s, v13.4s, v5.4s\n"
647 "str q11, [%[outptr3]]\n"
648 "ldr q14, [%[inptr], #0xc0]\n"
649 "ldr q7, [%[outptr4], #0x10]\n"
650 "ldr q15, [%[inptr], #0xd0]\n"
651 "str q12, [%[outptr3], #0x10]\n"
652 "add v14.4s, v14.4s, v6.4s\n"
653 "ldr q8, [%[outptr4], #0x20]\n"
654 "ldr q16, [%[inptr], #0xe0]\n"
655 "add v15.4s, v15.4s, v7.4s\n"
656 "str q13, [%[outptr3], #0x20]\n"
657 "ldr q9, [%[outptr5]]\n"
658 "add %[outptr3], %[outptr3], #0x30\n"
659 "add v16.4s, v16.4s, v8.4s\n"
660 "str q14, [%[outptr4]]\n"
661 "ldr q17, [%[inptr], #0xf0]\n"
662 "ldr q2, [%[outptr5], #0x10]\n"
663 "ldr q10, [%[inptr], #0x100]\n"
664 "str q15, [%[outptr4], #0x10]\n"
665 "add v17.4s, v17.4s, v9.4s\n"
666 "ldr q3, [%[outptr5], #0x20]\n"
667 "ldr q11, [%[inptr], #0x110]\n"
668 "add v10.4s, v10.4s, v2.4s\n"
669 "str q16, [%[outptr4], #0x20]\n"
670 "ldr q4, [%[outptr6]]\n"
671 "add %[outptr4], %[outptr4], #0x30\n"
672 "add v11.4s, v11.4s, v3.4s\n"
673 "str q17, [%[outptr5]]\n"
674 "ldr q12, [%[inptr], #0x120]\n"
675 "ldr q5, [%[outptr6], #0x10]\n"
676 "ldr q13, [%[inptr], #0x130]\n"
677 "str q10, [%[outptr5], #0x10]\n"
678 "add v12.4s, v12.4s, v4.4s\n"
679 "ldr q6, [%[outptr6], #0x20]\n"
680 "ldr q14, [%[inptr], #0x140]\n"
681 "add %[inptr], %[inptr], #0x180\n"
682 "add v13.4s, v13.4s, v5.4s\n"
683 "str q11, [%[outptr5], #0x20]\n"
684 "add %[outptr5], %[outptr5], #0x30\n"
685 "add v14.4s, v14.4s, v6.4s\n"
686 "str q12, [%[outptr6]]\n"
687 "str q13, [%[outptr6], #0x10]\n"
688 "str q14, [%[outptr6], #0x20]\n"
689 "add %[outptr6], %[outptr6], #0x30\n"
690 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
693 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
704 for (
int xi=0; xi<11; xi++)
708 *outptr0 += inptr[xi];
710 *outptr1 += inptr[xi + 12];
712 *outptr2 += inptr[xi + 24];
714 *outptr3 += inptr[xi + 36];
716 *outptr4 += inptr[xi + 48];
718 *outptr5 += inptr[xi + 60];
720 *outptr6 += inptr[xi + 72];
722 *outptr7 += inptr[xi + 84];
730 "ldr q2, [%[outptr0]]\n"
731 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
732 "ldr q10, [%[inptr]]\n"
733 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
734 "ldr q3, [%[outptr0], #0x10]\n"
735 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
736 "add v10.4s, v10.4s, v2.4s\n"
737 "ldr q11, [%[inptr], #0x10]\n"
738 "ldr q4, [%[outptr0], #0x20]\n"
739 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
740 "ldr q12, [%[inptr], #0x20]\n"
741 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
742 "add v11.4s, v11.4s, v3.4s\n"
743 "str q10, [%[outptr0]]\n"
744 "ldr q5, [%[outptr1]]\n"
745 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
746 "add v12.4s, v12.4s, v4.4s\n"
747 "str q11, [%[outptr0], #0x10]\n"
748 "ldr q13, [%[inptr], #0x30]\n"
749 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
750 "ldr q6, [%[outptr1], #0x10]\n"
751 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
752 "add v13.4s, v13.4s, v5.4s\n"
753 "str q12, [%[outptr0], #0x20]\n"
754 "ldr q14, [%[inptr], #0x40]\n"
755 "add %[outptr0], %[outptr0], #0x30\n"
756 "ldr q7, [%[outptr1], #0x20]\n"
757 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
758 "add v14.4s, v14.4s, v6.4s\n"
759 "str q13, [%[outptr1]]\n"
760 "ldr q15, [%[inptr], #0x50]\n"
761 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
762 "ldr q8, [%[outptr2]]\n"
763 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
764 "add v15.4s, v15.4s, v7.4s\n"
765 "str q14, [%[outptr1], #0x10]\n"
766 "ldr q16, [%[inptr], #0x60]\n"
767 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
768 "ldr q9, [%[outptr2], #0x10]\n"
769 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
770 "add v16.4s, v16.4s, v8.4s\n"
771 "str q15, [%[outptr1], #0x20]\n"
772 "ldr q17, [%[inptr], #0x70]\n"
773 "add %[outptr1], %[outptr1], #0x30\n"
774 "ldr q2, [%[outptr2], #0x20]\n"
775 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
776 "add v17.4s, v17.4s, v9.4s\n"
777 "str q16, [%[outptr2]]\n"
778 "ldr q10, [%[inptr], #0x80]\n"
779 "ldr q3, [%[outptr3]]\n"
780 "ldr q11, [%[inptr], #0x90]\n"
781 "str q17, [%[outptr2], #0x10]\n"
782 "add v10.4s, v10.4s, v2.4s\n"
783 "ldr q4, [%[outptr3], #0x10]\n"
784 "ldr q12, [%[inptr], #0xa0]\n"
785 "add v11.4s, v11.4s, v3.4s\n"
786 "ldr q5, [%[outptr3], #0x20]\n"
787 "ldr q13, [%[inptr], #0xb0]\n"
788 "str q10, [%[outptr2], #0x20]\n"
789 "add %[outptr2], %[outptr2], #0x30\n"
790 "add v12.4s, v12.4s, v4.4s\n"
791 "ldr q6, [%[outptr4]]\n"
792 "add v13.4s, v13.4s, v5.4s\n"
793 "str q11, [%[outptr3]]\n"
794 "ldr q14, [%[inptr], #0xc0]\n"
795 "ldr q7, [%[outptr4], #0x10]\n"
796 "ldr q15, [%[inptr], #0xd0]\n"
797 "str q12, [%[outptr3], #0x10]\n"
798 "add v14.4s, v14.4s, v6.4s\n"
799 "ldr q8, [%[outptr4], #0x20]\n"
800 "ldr q16, [%[inptr], #0xe0]\n"
801 "add v15.4s, v15.4s, v7.4s\n"
802 "str q13, [%[outptr3], #0x20]\n"
803 "ldr q9, [%[outptr5]]\n"
804 "add %[outptr3], %[outptr3], #0x30\n"
805 "add v16.4s, v16.4s, v8.4s\n"
806 "str q14, [%[outptr4]]\n"
807 "ldr q17, [%[inptr], #0xf0]\n"
808 "ldr q2, [%[outptr5], #0x10]\n"
809 "ldr q10, [%[inptr], #0x100]\n"
810 "str q15, [%[outptr4], #0x10]\n"
811 "add v17.4s, v17.4s, v9.4s\n"
812 "ldr q3, [%[outptr5], #0x20]\n"
813 "ldr q11, [%[inptr], #0x110]\n"
814 "add v10.4s, v10.4s, v2.4s\n"
815 "str q16, [%[outptr4], #0x20]\n"
816 "ldr q4, [%[outptr6]]\n"
817 "add %[outptr4], %[outptr4], #0x30\n"
818 "add v11.4s, v11.4s, v3.4s\n"
819 "str q17, [%[outptr5]]\n"
820 "ldr q12, [%[inptr], #0x120]\n"
821 "ldr q5, [%[outptr6], #0x10]\n"
822 "ldr q13, [%[inptr], #0x130]\n"
823 "str q10, [%[outptr5], #0x10]\n"
824 "add v12.4s, v12.4s, v4.4s\n"
825 "ldr q6, [%[outptr6], #0x20]\n"
826 "ldr q14, [%[inptr], #0x140]\n"
827 "add v13.4s, v13.4s, v5.4s\n"
828 "str q11, [%[outptr5], #0x20]\n"
829 "ldr q7, [%[outptr7]]\n"
830 "add %[outptr5], %[outptr5], #0x30\n"
831 "add v14.4s, v14.4s, v6.4s\n"
832 "str q12, [%[outptr6]]\n"
833 "ldr q15, [%[inptr], #0x150]\n"
834 "ldr q8, [%[outptr7], #0x10]\n"
835 "ldr q16, [%[inptr], #0x160]\n"
836 "str q13, [%[outptr6], #0x10]\n"
837 "add v15.4s, v15.4s, v7.4s\n"
838 "ldr q9, [%[outptr7], #0x20]\n"
839 "ldr q17, [%[inptr], #0x170]\n"
840 "add %[inptr], %[inptr], #0x180\n"
841 "add v16.4s, v16.4s, v8.4s\n"
842 "str q14, [%[outptr6], #0x20]\n"
843 "add %[outptr6], %[outptr6], #0x30\n"
844 "add v17.4s, v17.4s, v9.4s\n"
845 "str q15, [%[outptr7]]\n"
846 "str q16, [%[outptr7], #0x10]\n"
847 "str q17, [%[outptr7], #0x20]\n"
848 "add %[outptr7], %[outptr7], #0x30\n"
849 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
852 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
863 const uint32_t *biasptr =
bias ?
bias + i : nullbias;
871 for (
int xi=0; xi<11; xi++)
875 *outptr0 = biasptr[xi] + inptr[xi];
883 "ldr q2, [%[biasptr]]\n"
884 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
885 "ldr q3, [%[biasptr], #0x10]\n"
886 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
887 "ldr q4, [%[biasptr], #0x20]\n"
888 "ldr q13, [%[inptr]]\n"
889 "ldr q14, [%[inptr], #0x10]\n"
890 "ldr q15, [%[inptr], #0x20]\n"
891 "add %[inptr], %[inptr], #0x180\n"
892 "add v13.4s, v13.4s, v2.4s\n"
893 "add v14.4s, v14.4s, v3.4s\n"
894 "add v15.4s, v15.4s, v4.4s\n"
895 "str q13, [%[outptr0]]\n"
896 "str q14, [%[outptr0], #0x10]\n"
897 "str q15, [%[outptr0], #0x20]\n"
898 "add %[outptr0], %[outptr0], #0x30\n"
899 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
901 : [biasptr]
"r" (biasptr)
902 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
912 for (
int xi=0; xi<11; xi++)
916 *outptr0 = biasptr[xi] + inptr[xi];
918 *outptr1 = biasptr[xi] + inptr[xi + 12];
926 "ldr q2, [%[biasptr]]\n"
927 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
928 "ldr q3, [%[biasptr], #0x10]\n"
929 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
930 "ldr q4, [%[biasptr], #0x20]\n"
931 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
932 "ldr q13, [%[inptr]]\n"
933 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
934 "ldr q14, [%[inptr], #0x10]\n"
935 "ldr q15, [%[inptr], #0x20]\n"
936 "add v13.4s, v13.4s, v2.4s\n"
937 "ldr q16, [%[inptr], #0x30]\n"
938 "ldr q17, [%[inptr], #0x40]\n"
939 "add v14.4s, v14.4s, v3.4s\n"
940 "ldr q18, [%[inptr], #0x50]\n"
941 "add v15.4s, v15.4s, v4.4s\n"
942 "str q13, [%[outptr0]]\n"
943 "add v16.4s, v16.4s, v2.4s\n"
944 "add %[inptr], %[inptr], #0x180\n"
945 "add v17.4s, v17.4s, v3.4s\n"
946 "str q14, [%[outptr0], #0x10]\n"
947 "add v18.4s, v18.4s, v4.4s\n"
948 "str q15, [%[outptr0], #0x20]\n"
949 "add %[outptr0], %[outptr0], #0x30\n"
950 "str q16, [%[outptr1]]\n"
951 "str q17, [%[outptr1], #0x10]\n"
952 "str q18, [%[outptr1], #0x20]\n"
953 "add %[outptr1], %[outptr1], #0x30\n"
954 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
956 : [biasptr]
"r" (biasptr)
957 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
967 for (
int xi=0; xi<11; xi++)
971 *outptr0 = biasptr[xi] + inptr[xi];
973 *outptr1 = biasptr[xi] + inptr[xi + 12];
975 *outptr2 = biasptr[xi] + inptr[xi + 24];
983 "ldr q2, [%[biasptr]]\n"
984 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
985 "ldr q3, [%[biasptr], #0x10]\n"
986 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
987 "ldr q4, [%[biasptr], #0x20]\n"
988 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
989 "ldr q13, [%[inptr]]\n"
990 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
991 "ldr q14, [%[inptr], #0x10]\n"
992 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
993 "add v13.4s, v13.4s, v2.4s\n"
994 "ldr q15, [%[inptr], #0x20]\n"
995 "ldr q16, [%[inptr], #0x30]\n"
996 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
997 "add v14.4s, v14.4s, v3.4s\n"
998 "str q13, [%[outptr0]]\n"
999 "add v15.4s, v15.4s, v4.4s\n"
1000 "ldr q17, [%[inptr], #0x40]\n"
1001 "add v16.4s, v16.4s, v2.4s\n"
1002 "ldr q18, [%[inptr], #0x50]\n"
1003 "ldr q19, [%[inptr], #0x60]\n"
1004 "str q14, [%[outptr0], #0x10]\n"
1005 "add v17.4s, v17.4s, v3.4s\n"
1006 "ldr q20, [%[inptr], #0x70]\n"
1007 "add v18.4s, v18.4s, v4.4s\n"
1008 "ldr q13, [%[inptr], #0x80]\n"
1009 "add v19.4s, v19.4s, v2.4s\n"
1010 "str q15, [%[outptr0], #0x20]\n"
1011 "add %[outptr0], %[outptr0], #0x30\n"
1012 "add v20.4s, v20.4s, v3.4s\n"
1013 "add %[inptr], %[inptr], #0x180\n"
1014 "add v13.4s, v13.4s, v4.4s\n"
1015 "str q16, [%[outptr1]]\n"
1016 "str q17, [%[outptr1], #0x10]\n"
1017 "str q18, [%[outptr1], #0x20]\n"
1018 "add %[outptr1], %[outptr1], #0x30\n"
1019 "str q19, [%[outptr2]]\n"
1020 "str q20, [%[outptr2], #0x10]\n"
1021 "str q13, [%[outptr2], #0x20]\n"
1022 "add %[outptr2], %[outptr2], #0x30\n"
1023 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1024 [inptr]
"+r" (inptr)
1025 : [biasptr]
"r" (biasptr)
1026 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
1036 for (
int xi=0; xi<11; xi++)
1040 *outptr0 = biasptr[xi] + inptr[xi];
1042 *outptr1 = biasptr[xi] + inptr[xi + 12];
1044 *outptr2 = biasptr[xi] + inptr[xi + 24];
1046 *outptr3 = biasptr[xi] + inptr[xi + 36];
1054 "ldr q2, [%[biasptr]]\n"
1055 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1056 "ldr q3, [%[biasptr], #0x10]\n"
1057 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1058 "ldr q4, [%[biasptr], #0x20]\n"
1059 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1060 "ldr q13, [%[inptr]]\n"
1061 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1062 "ldr q14, [%[inptr], #0x10]\n"
1063 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1064 "add v13.4s, v13.4s, v2.4s\n"
1065 "ldr q15, [%[inptr], #0x20]\n"
1066 "ldr q16, [%[inptr], #0x30]\n"
1067 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1068 "add v14.4s, v14.4s, v3.4s\n"
1069 "str q13, [%[outptr0]]\n"
1070 "add v15.4s, v15.4s, v4.4s\n"
1071 "ldr q17, [%[inptr], #0x40]\n"
1072 "add v16.4s, v16.4s, v2.4s\n"
1073 "ldr q18, [%[inptr], #0x50]\n"
1074 "ldr q19, [%[inptr], #0x60]\n"
1075 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1076 "add v17.4s, v17.4s, v3.4s\n"
1077 "str q14, [%[outptr0], #0x10]\n"
1078 "add v18.4s, v18.4s, v4.4s\n"
1079 "ldr q20, [%[inptr], #0x70]\n"
1080 "add v19.4s, v19.4s, v2.4s\n"
1081 "ldr q13, [%[inptr], #0x80]\n"
1082 "ldr q14, [%[inptr], #0x90]\n"
1083 "str q15, [%[outptr0], #0x20]\n"
1084 "add %[outptr0], %[outptr0], #0x30\n"
1085 "add v20.4s, v20.4s, v3.4s\n"
1086 "ldr q15, [%[inptr], #0xa0]\n"
1087 "add v13.4s, v13.4s, v4.4s\n"
1088 "str q16, [%[outptr1]]\n"
1089 "add v14.4s, v14.4s, v2.4s\n"
1090 "ldr q16, [%[inptr], #0xb0]\n"
1091 "add %[inptr], %[inptr], #0x180\n"
1092 "add v15.4s, v15.4s, v3.4s\n"
1093 "str q17, [%[outptr1], #0x10]\n"
1094 "add v16.4s, v16.4s, v4.4s\n"
1095 "str q18, [%[outptr1], #0x20]\n"
1096 "add %[outptr1], %[outptr1], #0x30\n"
1097 "str q19, [%[outptr2]]\n"
1098 "str q20, [%[outptr2], #0x10]\n"
1099 "str q13, [%[outptr2], #0x20]\n"
1100 "add %[outptr2], %[outptr2], #0x30\n"
1101 "str q14, [%[outptr3]]\n"
1102 "str q15, [%[outptr3], #0x10]\n"
1103 "str q16, [%[outptr3], #0x20]\n"
1104 "add %[outptr3], %[outptr3], #0x30\n"
1105 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1106 [inptr]
"+r" (inptr)
1107 : [biasptr]
"r" (biasptr)
1108 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
1118 for (
int xi=0; xi<11; xi++)
1122 *outptr0 = biasptr[xi] + inptr[xi];
1124 *outptr1 = biasptr[xi] + inptr[xi + 12];
1126 *outptr2 = biasptr[xi] + inptr[xi + 24];
1128 *outptr3 = biasptr[xi] + inptr[xi + 36];
1130 *outptr4 = biasptr[xi] + inptr[xi + 48];
1138 "ldr q2, [%[biasptr]]\n"
1139 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1140 "ldr q3, [%[biasptr], #0x10]\n"
1141 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1142 "ldr q4, [%[biasptr], #0x20]\n"
1143 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1144 "ldr q13, [%[inptr]]\n"
1145 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1146 "ldr q14, [%[inptr], #0x10]\n"
1147 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1148 "add v13.4s, v13.4s, v2.4s\n"
1149 "ldr q15, [%[inptr], #0x20]\n"
1150 "ldr q16, [%[inptr], #0x30]\n"
1151 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1152 "add v14.4s, v14.4s, v3.4s\n"
1153 "str q13, [%[outptr0]]\n"
1154 "add v15.4s, v15.4s, v4.4s\n"
1155 "ldr q17, [%[inptr], #0x40]\n"
1156 "add v16.4s, v16.4s, v2.4s\n"
1157 "ldr q18, [%[inptr], #0x50]\n"
1158 "ldr q19, [%[inptr], #0x60]\n"
1159 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1160 "add v17.4s, v17.4s, v3.4s\n"
1161 "str q14, [%[outptr0], #0x10]\n"
1162 "add v18.4s, v18.4s, v4.4s\n"
1163 "ldr q20, [%[inptr], #0x70]\n"
1164 "add v19.4s, v19.4s, v2.4s\n"
1165 "ldr q13, [%[inptr], #0x80]\n"
1166 "ldr q14, [%[inptr], #0x90]\n"
1167 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1168 "add v20.4s, v20.4s, v3.4s\n"
1169 "str q15, [%[outptr0], #0x20]\n"
1170 "add v13.4s, v13.4s, v4.4s\n"
1171 "ldr q15, [%[inptr], #0xa0]\n"
1172 "add v14.4s, v14.4s, v2.4s\n"
1173 "add %[outptr0], %[outptr0], #0x30\n"
1174 "str q16, [%[outptr1]]\n"
1175 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1176 "add v15.4s, v15.4s, v3.4s\n"
1177 "ldr q16, [%[inptr], #0xb0]\n"
1178 "str q17, [%[outptr1], #0x10]\n"
1179 "ldr q17, [%[inptr], #0xc0]\n"
1180 "add v16.4s, v16.4s, v4.4s\n"
1181 "str q18, [%[outptr1], #0x20]\n"
1182 "add %[outptr1], %[outptr1], #0x30\n"
1183 "add v17.4s, v17.4s, v2.4s\n"
1184 "ldr q18, [%[inptr], #0xd0]\n"
1185 "str q19, [%[outptr2]]\n"
1186 "ldr q19, [%[inptr], #0xe0]\n"
1187 "add %[inptr], %[inptr], #0x180\n"
1188 "add v18.4s, v18.4s, v3.4s\n"
1189 "str q20, [%[outptr2], #0x10]\n"
1190 "add v19.4s, v19.4s, v4.4s\n"
1191 "str q13, [%[outptr2], #0x20]\n"
1192 "add %[outptr2], %[outptr2], #0x30\n"
1193 "str q14, [%[outptr3]]\n"
1194 "str q15, [%[outptr3], #0x10]\n"
1195 "str q16, [%[outptr3], #0x20]\n"
1196 "add %[outptr3], %[outptr3], #0x30\n"
1197 "str q17, [%[outptr4]]\n"
1198 "str q18, [%[outptr4], #0x10]\n"
1199 "str q19, [%[outptr4], #0x20]\n"
1200 "add %[outptr4], %[outptr4], #0x30\n"
1201 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1202 [inptr]
"+r" (inptr)
1203 : [biasptr]
"r" (biasptr)
1204 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
1214 for (
int xi=0; xi<11; xi++)
1218 *outptr0 = biasptr[xi] + inptr[xi];
1220 *outptr1 = biasptr[xi] + inptr[xi + 12];
1222 *outptr2 = biasptr[xi] + inptr[xi + 24];
1224 *outptr3 = biasptr[xi] + inptr[xi + 36];
1226 *outptr4 = biasptr[xi] + inptr[xi + 48];
1228 *outptr5 = biasptr[xi] + inptr[xi + 60];
1236 "ldr q2, [%[biasptr]]\n"
1237 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1238 "ldr q3, [%[biasptr], #0x10]\n"
1239 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1240 "ldr q4, [%[biasptr], #0x20]\n"
1241 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1242 "ldr q13, [%[inptr]]\n"
1243 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1244 "ldr q14, [%[inptr], #0x10]\n"
1245 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1246 "add v13.4s, v13.4s, v2.4s\n"
1247 "ldr q15, [%[inptr], #0x20]\n"
1248 "ldr q16, [%[inptr], #0x30]\n"
1249 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1250 "add v14.4s, v14.4s, v3.4s\n"
1251 "str q13, [%[outptr0]]\n"
1252 "add v15.4s, v15.4s, v4.4s\n"
1253 "ldr q17, [%[inptr], #0x40]\n"
1254 "add v16.4s, v16.4s, v2.4s\n"
1255 "ldr q18, [%[inptr], #0x50]\n"
1256 "ldr q19, [%[inptr], #0x60]\n"
1257 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1258 "add v17.4s, v17.4s, v3.4s\n"
1259 "str q14, [%[outptr0], #0x10]\n"
1260 "add v18.4s, v18.4s, v4.4s\n"
1261 "ldr q20, [%[inptr], #0x70]\n"
1262 "add v19.4s, v19.4s, v2.4s\n"
1263 "ldr q13, [%[inptr], #0x80]\n"
1264 "ldr q14, [%[inptr], #0x90]\n"
1265 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1266 "add v20.4s, v20.4s, v3.4s\n"
1267 "str q15, [%[outptr0], #0x20]\n"
1268 "add v13.4s, v13.4s, v4.4s\n"
1269 "ldr q15, [%[inptr], #0xa0]\n"
1270 "add v14.4s, v14.4s, v2.4s\n"
1271 "add %[outptr0], %[outptr0], #0x30\n"
1272 "str q16, [%[outptr1]]\n"
1273 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1274 "add v15.4s, v15.4s, v3.4s\n"
1275 "ldr q16, [%[inptr], #0xb0]\n"
1276 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1277 "str q17, [%[outptr1], #0x10]\n"
1278 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1279 "add v16.4s, v16.4s, v4.4s\n"
1280 "ldr q17, [%[inptr], #0xc0]\n"
1281 "str q18, [%[outptr1], #0x20]\n"
1282 "add %[outptr1], %[outptr1], #0x30\n"
1283 "add v17.4s, v17.4s, v2.4s\n"
1284 "ldr q18, [%[inptr], #0xd0]\n"
1285 "str q19, [%[outptr2]]\n"
1286 "ldr q19, [%[inptr], #0xe0]\n"
1287 "add v18.4s, v18.4s, v3.4s\n"
1288 "str q20, [%[outptr2], #0x10]\n"
1289 "add v19.4s, v19.4s, v4.4s\n"
1290 "ldr q20, [%[inptr], #0xf0]\n"
1291 "str q13, [%[outptr2], #0x20]\n"
1292 "add %[outptr2], %[outptr2], #0x30\n"
1293 "add v20.4s, v20.4s, v2.4s\n"
1294 "ldr q13, [%[inptr], #0x100]\n"
1295 "str q14, [%[outptr3]]\n"
1296 "ldr q14, [%[inptr], #0x110]\n"
1297 "add %[inptr], %[inptr], #0x180\n"
1298 "add v13.4s, v13.4s, v3.4s\n"
1299 "str q15, [%[outptr3], #0x10]\n"
1300 "add v14.4s, v14.4s, v4.4s\n"
1301 "str q16, [%[outptr3], #0x20]\n"
1302 "add %[outptr3], %[outptr3], #0x30\n"
1303 "str q17, [%[outptr4]]\n"
1304 "str q18, [%[outptr4], #0x10]\n"
1305 "str q19, [%[outptr4], #0x20]\n"
1306 "add %[outptr4], %[outptr4], #0x30\n"
1307 "str q20, [%[outptr5]]\n"
1308 "str q13, [%[outptr5], #0x10]\n"
1309 "str q14, [%[outptr5], #0x20]\n"
1310 "add %[outptr5], %[outptr5], #0x30\n"
1311 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1312 [inptr]
"+r" (inptr)
1313 : [biasptr]
"r" (biasptr)
1314 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
1324 for (
int xi=0; xi<11; xi++)
1328 *outptr0 = biasptr[xi] + inptr[xi];
1330 *outptr1 = biasptr[xi] + inptr[xi + 12];
1332 *outptr2 = biasptr[xi] + inptr[xi + 24];
1334 *outptr3 = biasptr[xi] + inptr[xi + 36];
1336 *outptr4 = biasptr[xi] + inptr[xi + 48];
1338 *outptr5 = biasptr[xi] + inptr[xi + 60];
1340 *outptr6 = biasptr[xi] + inptr[xi + 72];
1348 "ldr q2, [%[biasptr]]\n"
1349 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1350 "ldr q3, [%[biasptr], #0x10]\n"
1351 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1352 "ldr q4, [%[biasptr], #0x20]\n"
1353 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1354 "ldr q13, [%[inptr]]\n"
1355 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1356 "ldr q14, [%[inptr], #0x10]\n"
1357 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1358 "add v13.4s, v13.4s, v2.4s\n"
1359 "ldr q15, [%[inptr], #0x20]\n"
1360 "ldr q16, [%[inptr], #0x30]\n"
1361 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1362 "add v14.4s, v14.4s, v3.4s\n"
1363 "str q13, [%[outptr0]]\n"
1364 "add v15.4s, v15.4s, v4.4s\n"
1365 "ldr q17, [%[inptr], #0x40]\n"
1366 "add v16.4s, v16.4s, v2.4s\n"
1367 "ldr q18, [%[inptr], #0x50]\n"
1368 "ldr q19, [%[inptr], #0x60]\n"
1369 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1370 "add v17.4s, v17.4s, v3.4s\n"
1371 "str q14, [%[outptr0], #0x10]\n"
1372 "add v18.4s, v18.4s, v4.4s\n"
1373 "ldr q20, [%[inptr], #0x70]\n"
1374 "add v19.4s, v19.4s, v2.4s\n"
1375 "ldr q13, [%[inptr], #0x80]\n"
1376 "ldr q14, [%[inptr], #0x90]\n"
1377 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1378 "add v20.4s, v20.4s, v3.4s\n"
1379 "str q15, [%[outptr0], #0x20]\n"
1380 "add v13.4s, v13.4s, v4.4s\n"
1381 "ldr q15, [%[inptr], #0xa0]\n"
1382 "add v14.4s, v14.4s, v2.4s\n"
1383 "add %[outptr0], %[outptr0], #0x30\n"
1384 "str q16, [%[outptr1]]\n"
1385 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1386 "add v15.4s, v15.4s, v3.4s\n"
1387 "ldr q16, [%[inptr], #0xb0]\n"
1388 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1389 "str q17, [%[outptr1], #0x10]\n"
1390 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1391 "add v16.4s, v16.4s, v4.4s\n"
1392 "ldr q17, [%[inptr], #0xc0]\n"
1393 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1394 "str q18, [%[outptr1], #0x20]\n"
1395 "add %[outptr1], %[outptr1], #0x30\n"
1396 "add v17.4s, v17.4s, v2.4s\n"
1397 "ldr q18, [%[inptr], #0xd0]\n"
1398 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1399 "str q19, [%[outptr2]]\n"
1400 "ldr q19, [%[inptr], #0xe0]\n"
1401 "add v18.4s, v18.4s, v3.4s\n"
1402 "str q20, [%[outptr2], #0x10]\n"
1403 "add v19.4s, v19.4s, v4.4s\n"
1404 "ldr q20, [%[inptr], #0xf0]\n"
1405 "str q13, [%[outptr2], #0x20]\n"
1406 "add %[outptr2], %[outptr2], #0x30\n"
1407 "add v20.4s, v20.4s, v2.4s\n"
1408 "ldr q13, [%[inptr], #0x100]\n"
1409 "str q14, [%[outptr3]]\n"
1410 "ldr q14, [%[inptr], #0x110]\n"
1411 "add v13.4s, v13.4s, v3.4s\n"
1412 "str q15, [%[outptr3], #0x10]\n"
1413 "add v14.4s, v14.4s, v4.4s\n"
1414 "ldr q15, [%[inptr], #0x120]\n"
1415 "str q16, [%[outptr3], #0x20]\n"
1416 "add %[outptr3], %[outptr3], #0x30\n"
1417 "add v15.4s, v15.4s, v2.4s\n"
1418 "ldr q16, [%[inptr], #0x130]\n"
1419 "str q17, [%[outptr4]]\n"
1420 "ldr q17, [%[inptr], #0x140]\n"
1421 "add %[inptr], %[inptr], #0x180\n"
1422 "add v16.4s, v16.4s, v3.4s\n"
1423 "str q18, [%[outptr4], #0x10]\n"
1424 "add v17.4s, v17.4s, v4.4s\n"
1425 "str q19, [%[outptr4], #0x20]\n"
1426 "add %[outptr4], %[outptr4], #0x30\n"
1427 "str q20, [%[outptr5]]\n"
1428 "str q13, [%[outptr5], #0x10]\n"
1429 "str q14, [%[outptr5], #0x20]\n"
1430 "add %[outptr5], %[outptr5], #0x30\n"
1431 "str q15, [%[outptr6]]\n"
1432 "str q16, [%[outptr6], #0x10]\n"
1433 "str q17, [%[outptr6], #0x20]\n"
1434 "add %[outptr6], %[outptr6], #0x30\n"
1435 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1436 [inptr]
"+r" (inptr)
1437 : [biasptr]
"r" (biasptr)
1438 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
1449 for (
int xi=0; xi<11; xi++)
1453 *outptr0 = biasptr[xi] + inptr[xi];
1455 *outptr1 = biasptr[xi] + inptr[xi + 12];
1457 *outptr2 = biasptr[xi] + inptr[xi + 24];
1459 *outptr3 = biasptr[xi] + inptr[xi + 36];
1461 *outptr4 = biasptr[xi] + inptr[xi + 48];
1463 *outptr5 = biasptr[xi] + inptr[xi + 60];
1465 *outptr6 = biasptr[xi] + inptr[xi + 72];
1467 *outptr7 = biasptr[xi] + inptr[xi + 84];
1475 "ldr q2, [%[biasptr]]\n"
1476 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1477 "ldr q3, [%[biasptr], #0x10]\n"
1478 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1479 "ldr q4, [%[biasptr], #0x20]\n"
1480 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1481 "ldr q13, [%[inptr]]\n"
1482 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1483 "ldr q14, [%[inptr], #0x10]\n"
1484 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1485 "add v13.4s, v13.4s, v2.4s\n"
1486 "ldr q15, [%[inptr], #0x20]\n"
1487 "ldr q16, [%[inptr], #0x30]\n"
1488 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1489 "add v14.4s, v14.4s, v3.4s\n"
1490 "str q13, [%[outptr0]]\n"
1491 "add v15.4s, v15.4s, v4.4s\n"
1492 "ldr q17, [%[inptr], #0x40]\n"
1493 "add v16.4s, v16.4s, v2.4s\n"
1494 "ldr q18, [%[inptr], #0x50]\n"
1495 "ldr q19, [%[inptr], #0x60]\n"
1496 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1497 "add v17.4s, v17.4s, v3.4s\n"
1498 "str q14, [%[outptr0], #0x10]\n"
1499 "add v18.4s, v18.4s, v4.4s\n"
1500 "ldr q20, [%[inptr], #0x70]\n"
1501 "add v19.4s, v19.4s, v2.4s\n"
1502 "ldr q13, [%[inptr], #0x80]\n"
1503 "ldr q14, [%[inptr], #0x90]\n"
1504 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1505 "add v20.4s, v20.4s, v3.4s\n"
1506 "str q15, [%[outptr0], #0x20]\n"
1507 "add v13.4s, v13.4s, v4.4s\n"
1508 "ldr q15, [%[inptr], #0xa0]\n"
1509 "add v14.4s, v14.4s, v2.4s\n"
1510 "add %[outptr0], %[outptr0], #0x30\n"
1511 "str q16, [%[outptr1]]\n"
1512 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1513 "add v15.4s, v15.4s, v3.4s\n"
1514 "ldr q16, [%[inptr], #0xb0]\n"
1515 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1516 "str q17, [%[outptr1], #0x10]\n"
1517 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1518 "add v16.4s, v16.4s, v4.4s\n"
1519 "ldr q17, [%[inptr], #0xc0]\n"
1520 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1521 "str q18, [%[outptr1], #0x20]\n"
1522 "add %[outptr1], %[outptr1], #0x30\n"
1523 "add v17.4s, v17.4s, v2.4s\n"
1524 "ldr q18, [%[inptr], #0xd0]\n"
1525 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1526 "str q19, [%[outptr2]]\n"
1527 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1528 "add v18.4s, v18.4s, v3.4s\n"
1529 "ldr q19, [%[inptr], #0xe0]\n"
1530 "str q20, [%[outptr2], #0x10]\n"
1531 "ldr q20, [%[inptr], #0xf0]\n"
1532 "add v19.4s, v19.4s, v4.4s\n"
1533 "str q13, [%[outptr2], #0x20]\n"
1534 "add %[outptr2], %[outptr2], #0x30\n"
1535 "add v20.4s, v20.4s, v2.4s\n"
1536 "ldr q13, [%[inptr], #0x100]\n"
1537 "str q14, [%[outptr3]]\n"
1538 "ldr q14, [%[inptr], #0x110]\n"
1539 "add v13.4s, v13.4s, v3.4s\n"
1540 "str q15, [%[outptr3], #0x10]\n"
1541 "add v14.4s, v14.4s, v4.4s\n"
1542 "ldr q15, [%[inptr], #0x120]\n"
1543 "str q16, [%[outptr3], #0x20]\n"
1544 "add %[outptr3], %[outptr3], #0x30\n"
1545 "add v15.4s, v15.4s, v2.4s\n"
1546 "ldr q16, [%[inptr], #0x130]\n"
1547 "str q17, [%[outptr4]]\n"
1548 "ldr q17, [%[inptr], #0x140]\n"
1549 "add v16.4s, v16.4s, v3.4s\n"
1550 "str q18, [%[outptr4], #0x10]\n"
1551 "add v17.4s, v17.4s, v4.4s\n"
1552 "ldr q18, [%[inptr], #0x150]\n"
1553 "str q19, [%[outptr4], #0x20]\n"
1554 "add %[outptr4], %[outptr4], #0x30\n"
1555 "add v18.4s, v18.4s, v2.4s\n"
1556 "ldr q19, [%[inptr], #0x160]\n"
1557 "str q20, [%[outptr5]]\n"
1558 "ldr q20, [%[inptr], #0x170]\n"
1559 "add %[inptr], %[inptr], #0x180\n"
1560 "add v19.4s, v19.4s, v3.4s\n"
1561 "str q13, [%[outptr5], #0x10]\n"
1562 "add v20.4s, v20.4s, v4.4s\n"
1563 "str q14, [%[outptr5], #0x20]\n"
1564 "add %[outptr5], %[outptr5], #0x30\n"
1565 "str q15, [%[outptr6]]\n"
1566 "str q16, [%[outptr6], #0x10]\n"
1567 "str q17, [%[outptr6], #0x20]\n"
1568 "add %[outptr6], %[outptr6], #0x30\n"
1569 "str q18, [%[outptr7]]\n"
1570 "str q19, [%[outptr7], #0x10]\n"
1571 "str q20, [%[outptr7], #0x20]\n"
1572 "add %[outptr7], %[outptr7], #0x30\n"
1573 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1574 [inptr]
"+r" (inptr)
1575 : [biasptr]
"r" (biasptr)
1576 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"memory"
1589 #endif // __aarch64__