26 #ifdef ARM_COMPUTE_ENABLE_SVE
29 void MergeResults<3, 8, true>(uint32_t *out,
const uint32_t *in,
const int ldout,
const int y0,
const int ymax,
const int x0,
const int xmax,
const uint32_t *
bias,
Activation ,
bool append)
31 const uint32_t *inptr = in;
32 uint32_t nullbias[192];
37 memset(nullbias, 0, (3 * get_vector_length<uint32_t>() *
sizeof(uint32_t)));
40 for (
int y=y0; y<ymax; y+=8)
42 uint32_t *outptr0 = out + (y * ldout) + x0;
43 uint32_t *outptr1 = outptr0 + ldout;
44 uint32_t *outptr2 = outptr1 + ldout;
45 uint32_t *outptr3 = outptr2 + ldout;
46 uint32_t *outptr4 = outptr3 + ldout;
47 uint32_t *outptr5 = outptr4 + ldout;
48 uint32_t *outptr6 = outptr5 + ldout;
49 uint32_t *outptr7 = outptr6 + ldout;
51 const int height = ymax - y;
53 for (
int i=x0; i<xmax; i+=(3 * get_vector_length<uint32_t>()))
65 "addvl x8, %[inptr], #16\n"
66 "whilelt p0.s, %[p], %[w]\n"
67 "incw %[p], all, mul #1\n"
68 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
69 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
70 "ld1w z2.s, p0/z, [%[outptr0]]\n"
71 "whilelt p1.s, %[p], %[w]\n"
72 "ld1w z10.s, p0/z, [%[inptr]]\n"
73 "incw %[p], all, mul #1\n"
74 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
75 "add z10.s, z10.s, z2.s\n"
76 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
77 "whilelt p2.s, %[p], %[w]\n"
78 "add z11.s, z11.s, z3.s\n"
79 "st1w z10.s, p0, [%[outptr0]]\n"
80 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
81 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
82 "addvl %[inptr], %[inptr], #24\n"
83 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
84 "add z12.s, z12.s, z4.s\n"
85 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
86 "addvl %[outptr0], %[outptr0], #3\n"
87 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
88 [inptr]
"+r" (inptr), [p]
"+r" (p)
90 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
101 "addvl x8, %[inptr], #16\n"
102 "whilelt p0.s, %[p], %[w]\n"
103 "incw %[p], all, mul #1\n"
104 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
105 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
106 "ld1w z2.s, p0/z, [%[outptr0]]\n"
107 "whilelt p1.s, %[p], %[w]\n"
108 "ld1w z10.s, p0/z, [%[inptr]]\n"
109 "incw %[p], all, mul #1\n"
110 "ld1w z5.s, p0/z, [%[outptr1]]\n"
111 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
112 "add z10.s, z10.s, z2.s\n"
113 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
114 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
115 "whilelt p2.s, %[p], %[w]\n"
116 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
117 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
118 "add z11.s, z11.s, z3.s\n"
119 "st1w z10.s, p0, [%[outptr0]]\n"
120 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
121 "add z13.s, z13.s, z5.s\n"
122 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
123 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
124 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
125 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
126 "add z12.s, z12.s, z4.s\n"
127 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
128 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
129 "addvl %[inptr], %[inptr], #24\n"
130 "add z14.s, z14.s, z6.s\n"
131 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
132 "addvl %[outptr0], %[outptr0], #3\n"
133 "add z15.s, z15.s, z7.s\n"
134 "st1w z13.s, p0, [%[outptr1]]\n"
135 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
136 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
137 "addvl %[outptr1], %[outptr1], #3\n"
138 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
139 [inptr]
"+r" (inptr), [p]
"+r" (p)
141 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
152 "addvl x8, %[inptr], #16\n"
153 "whilelt p0.s, %[p], %[w]\n"
154 "incw %[p], all, mul #1\n"
155 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
156 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
157 "ld1w z2.s, p0/z, [%[outptr0]]\n"
158 "whilelt p1.s, %[p], %[w]\n"
159 "ld1w z10.s, p0/z, [%[inptr]]\n"
160 "incw %[p], all, mul #1\n"
161 "ld1w z5.s, p0/z, [%[outptr1]]\n"
162 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
163 "add z10.s, z10.s, z2.s\n"
164 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
165 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
166 "whilelt p2.s, %[p], %[w]\n"
167 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
168 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
169 "add z11.s, z11.s, z3.s\n"
170 "st1w z10.s, p0, [%[outptr0]]\n"
171 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
172 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
173 "add z13.s, z13.s, z5.s\n"
174 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
175 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
176 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
177 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
178 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
179 "add z12.s, z12.s, z4.s\n"
180 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
181 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
182 "ld1w z8.s, p0/z, [%[outptr2]]\n"
183 "add z14.s, z14.s, z6.s\n"
184 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
185 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
186 "addvl %[outptr0], %[outptr0], #3\n"
187 "add z15.s, z15.s, z7.s\n"
188 "st1w z13.s, p0, [%[outptr1]]\n"
189 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
190 "add z16.s, z16.s, z8.s\n"
191 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
192 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
193 "addvl %[inptr], %[inptr], #24\n"
194 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
195 "add z17.s, z17.s, z9.s\n"
196 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
197 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
198 "addvl %[outptr1], %[outptr1], #3\n"
199 "add z10.s, z10.s, z2.s\n"
200 "st1w z16.s, p0, [%[outptr2]]\n"
201 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
202 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
203 "addvl %[outptr2], %[outptr2], #3\n"
204 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
205 [inptr]
"+r" (inptr), [p]
"+r" (p)
207 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
218 "addvl x8, %[inptr], #16\n"
219 "whilelt p0.s, %[p], %[w]\n"
220 "incw %[p], all, mul #1\n"
221 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
222 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
223 "ld1w z2.s, p0/z, [%[outptr0]]\n"
224 "whilelt p1.s, %[p], %[w]\n"
225 "ld1w z10.s, p0/z, [%[inptr]]\n"
226 "incw %[p], all, mul #1\n"
227 "ld1w z5.s, p0/z, [%[outptr1]]\n"
228 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
229 "add z10.s, z10.s, z2.s\n"
230 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
231 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
232 "whilelt p2.s, %[p], %[w]\n"
233 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
234 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
235 "add z11.s, z11.s, z3.s\n"
236 "st1w z10.s, p0, [%[outptr0]]\n"
237 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
238 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
239 "add z13.s, z13.s, z5.s\n"
240 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
241 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
242 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
243 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
244 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
245 "add z12.s, z12.s, z4.s\n"
246 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
247 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
248 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
249 "ld1w z8.s, p0/z, [%[outptr2]]\n"
250 "add z14.s, z14.s, z6.s\n"
251 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
252 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
253 "addvl %[outptr0], %[outptr0], #3\n"
254 "add z15.s, z15.s, z7.s\n"
255 "st1w z13.s, p0, [%[outptr1]]\n"
256 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
257 "add z16.s, z16.s, z8.s\n"
258 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
259 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
260 "addvl %[inptr], %[inptr], #24\n"
261 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
262 "add z17.s, z17.s, z9.s\n"
263 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
264 "ld1w z3.s, p0/z, [%[outptr3]]\n"
265 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
266 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
267 "addvl %[outptr1], %[outptr1], #3\n"
268 "add z10.s, z10.s, z2.s\n"
269 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
270 "add z11.s, z11.s, z3.s\n"
271 "st1w z16.s, p0, [%[outptr2]]\n"
272 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
273 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
274 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
275 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
276 "add z12.s, z12.s, z4.s\n"
277 "add z13.s, z13.s, z5.s\n"
278 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
279 "addvl %[outptr2], %[outptr2], #3\n"
280 "st1w z11.s, p0, [%[outptr3]]\n"
281 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
282 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
283 "addvl %[outptr3], %[outptr3], #3\n"
284 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
285 [inptr]
"+r" (inptr), [p]
"+r" (p)
287 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
298 "addvl x8, %[inptr], #16\n"
299 "whilelt p0.s, %[p], %[w]\n"
300 "incw %[p], all, mul #1\n"
301 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
302 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
303 "ld1w z2.s, p0/z, [%[outptr0]]\n"
304 "whilelt p1.s, %[p], %[w]\n"
305 "ld1w z10.s, p0/z, [%[inptr]]\n"
306 "incw %[p], all, mul #1\n"
307 "ld1w z5.s, p0/z, [%[outptr1]]\n"
308 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
309 "add z10.s, z10.s, z2.s\n"
310 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
311 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
312 "whilelt p2.s, %[p], %[w]\n"
313 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
314 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
315 "add z11.s, z11.s, z3.s\n"
316 "st1w z10.s, p0, [%[outptr0]]\n"
317 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
318 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
319 "add z13.s, z13.s, z5.s\n"
320 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
321 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
322 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
323 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
324 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
325 "add z12.s, z12.s, z4.s\n"
326 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
327 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
328 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
329 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
330 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
331 "add z14.s, z14.s, z6.s\n"
332 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
333 "ld1w z8.s, p0/z, [%[outptr2]]\n"
334 "addvl %[outptr0], %[outptr0], #3\n"
335 "add z15.s, z15.s, z7.s\n"
336 "st1w z13.s, p0, [%[outptr1]]\n"
337 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
338 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
339 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
340 "addvl %[inptr], %[inptr], #24\n"
341 "add z16.s, z16.s, z8.s\n"
342 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
343 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
344 "add z17.s, z17.s, z9.s\n"
345 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
346 "ld1w z3.s, p0/z, [%[outptr3]]\n"
347 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
348 "addvl %[outptr1], %[outptr1], #3\n"
349 "add z10.s, z10.s, z2.s\n"
350 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
351 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
352 "st1w z16.s, p0, [%[outptr2]]\n"
353 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
354 "add z11.s, z11.s, z3.s\n"
355 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
356 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
357 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
358 "add z12.s, z12.s, z4.s\n"
359 "ld1w z6.s, p0/z, [%[outptr4]]\n"
360 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
361 "add z13.s, z13.s, z5.s\n"
362 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
363 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
364 "addvl %[outptr2], %[outptr2], #3\n"
365 "add z14.s, z14.s, z6.s\n"
366 "st1w z11.s, p0, [%[outptr3]]\n"
367 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
368 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
369 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
370 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
371 "add z15.s, z15.s, z7.s\n"
372 "add z16.s, z16.s, z8.s\n"
373 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
374 "addvl %[outptr3], %[outptr3], #3\n"
375 "st1w z14.s, p0, [%[outptr4]]\n"
376 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
377 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
378 "addvl %[outptr4], %[outptr4], #3\n"
379 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
380 [inptr]
"+r" (inptr), [p]
"+r" (p)
382 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
393 "addvl x8, %[inptr], #16\n"
394 "whilelt p0.s, %[p], %[w]\n"
395 "incw %[p], all, mul #1\n"
396 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
397 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
398 "ld1w z2.s, p0/z, [%[outptr0]]\n"
399 "whilelt p1.s, %[p], %[w]\n"
400 "ld1w z10.s, p0/z, [%[inptr]]\n"
401 "incw %[p], all, mul #1\n"
402 "ld1w z5.s, p0/z, [%[outptr1]]\n"
403 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
404 "add z10.s, z10.s, z2.s\n"
405 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
406 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
407 "whilelt p2.s, %[p], %[w]\n"
408 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
409 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
410 "add z11.s, z11.s, z3.s\n"
411 "st1w z10.s, p0, [%[outptr0]]\n"
412 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
413 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
414 "add z13.s, z13.s, z5.s\n"
415 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
416 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
417 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
418 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
419 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
420 "add z12.s, z12.s, z4.s\n"
421 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
422 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
423 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
424 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
425 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
426 "add z14.s, z14.s, z6.s\n"
427 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
428 "ld1w z8.s, p0/z, [%[outptr2]]\n"
429 "addvl %[outptr0], %[outptr0], #3\n"
430 "add z15.s, z15.s, z7.s\n"
431 "st1w z13.s, p0, [%[outptr1]]\n"
432 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
433 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
434 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
435 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
436 "add z16.s, z16.s, z8.s\n"
437 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
438 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
439 "addvl %[inptr], %[inptr], #24\n"
440 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
441 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
442 "addvl %[outptr1], %[outptr1], #3\n"
443 "add z17.s, z17.s, z9.s\n"
444 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
445 "ld1w z3.s, p0/z, [%[outptr3]]\n"
446 "st1w z16.s, p0, [%[outptr2]]\n"
447 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
448 "add z10.s, z10.s, z2.s\n"
449 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
450 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
451 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
452 "add z11.s, z11.s, z3.s\n"
453 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
454 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
455 "add z12.s, z12.s, z4.s\n"
456 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
457 "ld1w z6.s, p0/z, [%[outptr4]]\n"
458 "addvl %[outptr2], %[outptr2], #3\n"
459 "add z13.s, z13.s, z5.s\n"
460 "st1w z11.s, p0, [%[outptr3]]\n"
461 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
462 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
463 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
464 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
465 "add z14.s, z14.s, z6.s\n"
466 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
467 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
468 "add z15.s, z15.s, z7.s\n"
469 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
470 "ld1w z9.s, p0/z, [%[outptr5]]\n"
471 "addvl %[outptr3], %[outptr3], #3\n"
472 "add z16.s, z16.s, z8.s\n"
473 "st1w z14.s, p0, [%[outptr4]]\n"
474 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
475 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
476 "ld1w z10.s, p1/z, [x8]\n"
477 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
478 "add z17.s, z17.s, z9.s\n"
479 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
480 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
481 "add z10.s, z10.s, z2.s\n"
482 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
483 "addvl %[outptr4], %[outptr4], #3\n"
484 "add z11.s, z11.s, z3.s\n"
485 "st1w z17.s, p0, [%[outptr5]]\n"
486 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
487 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
488 "addvl %[outptr5], %[outptr5], #3\n"
489 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
490 [inptr]
"+r" (inptr), [p]
"+r" (p)
492 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
503 "addvl x8, %[inptr], #16\n"
504 "whilelt p0.s, %[p], %[w]\n"
505 "incw %[p], all, mul #1\n"
506 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
507 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
508 "ld1w z2.s, p0/z, [%[outptr0]]\n"
509 "whilelt p1.s, %[p], %[w]\n"
510 "ld1w z10.s, p0/z, [%[inptr]]\n"
511 "incw %[p], all, mul #1\n"
512 "ld1w z5.s, p0/z, [%[outptr1]]\n"
513 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
514 "add z10.s, z10.s, z2.s\n"
515 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
516 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
517 "whilelt p2.s, %[p], %[w]\n"
518 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
519 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
520 "add z11.s, z11.s, z3.s\n"
521 "st1w z10.s, p0, [%[outptr0]]\n"
522 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
523 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
524 "add z13.s, z13.s, z5.s\n"
525 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
526 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
527 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
528 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
529 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
530 "add z12.s, z12.s, z4.s\n"
531 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
532 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
533 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
534 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
535 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
536 "add z14.s, z14.s, z6.s\n"
537 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
538 "ld1w z8.s, p0/z, [%[outptr2]]\n"
539 "addvl %[outptr0], %[outptr0], #3\n"
540 "add z15.s, z15.s, z7.s\n"
541 "st1w z13.s, p0, [%[outptr1]]\n"
542 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
543 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
544 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
545 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
546 "add z16.s, z16.s, z8.s\n"
547 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
548 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
549 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
550 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
551 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
552 "add z17.s, z17.s, z9.s\n"
553 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
554 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
555 "addvl %[outptr1], %[outptr1], #3\n"
556 "ld1w z3.s, p0/z, [%[outptr3]]\n"
557 "addvl %[inptr], %[inptr], #24\n"
558 "add z10.s, z10.s, z2.s\n"
559 "st1w z16.s, p0, [%[outptr2]]\n"
560 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
561 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
562 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
563 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
564 "add z11.s, z11.s, z3.s\n"
565 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
566 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
567 "add z12.s, z12.s, z4.s\n"
568 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
569 "ld1w z6.s, p0/z, [%[outptr4]]\n"
570 "addvl %[outptr2], %[outptr2], #3\n"
571 "add z13.s, z13.s, z5.s\n"
572 "st1w z11.s, p0, [%[outptr3]]\n"
573 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
574 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
575 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
576 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
577 "add z14.s, z14.s, z6.s\n"
578 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
579 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
580 "add z15.s, z15.s, z7.s\n"
581 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
582 "ld1w z9.s, p0/z, [%[outptr5]]\n"
583 "addvl %[outptr3], %[outptr3], #3\n"
584 "add z16.s, z16.s, z8.s\n"
585 "st1w z14.s, p0, [%[outptr4]]\n"
586 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
587 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
588 "ld1w z10.s, p1/z, [x8]\n"
589 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
590 "add z17.s, z17.s, z9.s\n"
591 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
592 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
593 "add z10.s, z10.s, z2.s\n"
594 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
595 "ld1w z4.s, p0/z, [%[outptr6]]\n"
596 "addvl %[outptr4], %[outptr4], #3\n"
597 "add z11.s, z11.s, z3.s\n"
598 "st1w z17.s, p0, [%[outptr5]]\n"
599 "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
600 "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
601 "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
602 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
603 "add z12.s, z12.s, z4.s\n"
604 "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
605 "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
606 "add z13.s, z13.s, z5.s\n"
607 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
608 "addvl %[outptr5], %[outptr5], #3\n"
609 "add z14.s, z14.s, z6.s\n"
610 "st1w z12.s, p0, [%[outptr6]]\n"
611 "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
612 "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
613 "addvl %[outptr6], %[outptr6], #3\n"
614 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
615 [inptr]
"+r" (inptr), [p]
"+r" (p)
617 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
629 "addvl x8, %[inptr], #16\n"
630 "whilelt p0.s, %[p], %[w]\n"
631 "incw %[p], all, mul #1\n"
632 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
633 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
634 "ld1w z2.s, p0/z, [%[outptr0]]\n"
635 "whilelt p1.s, %[p], %[w]\n"
636 "ld1w z10.s, p0/z, [%[inptr]]\n"
637 "incw %[p], all, mul #1\n"
638 "ld1w z5.s, p0/z, [%[outptr1]]\n"
639 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
640 "add z10.s, z10.s, z2.s\n"
641 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
642 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
643 "whilelt p2.s, %[p], %[w]\n"
644 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
645 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
646 "add z11.s, z11.s, z3.s\n"
647 "st1w z10.s, p0, [%[outptr0]]\n"
648 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
649 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
650 "add z13.s, z13.s, z5.s\n"
651 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
652 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
653 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
654 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
655 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
656 "add z12.s, z12.s, z4.s\n"
657 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
658 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
659 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
660 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
661 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
662 "add z14.s, z14.s, z6.s\n"
663 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
664 "ld1w z8.s, p0/z, [%[outptr2]]\n"
665 "addvl %[outptr0], %[outptr0], #3\n"
666 "add z15.s, z15.s, z7.s\n"
667 "st1w z13.s, p0, [%[outptr1]]\n"
668 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
669 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
670 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
671 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
672 "add z16.s, z16.s, z8.s\n"
673 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
674 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
675 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
676 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
677 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
678 "add z17.s, z17.s, z9.s\n"
679 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
680 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
681 "addvl %[outptr1], %[outptr1], #3\n"
682 "ld1w z3.s, p0/z, [%[outptr3]]\n"
683 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
684 "add z10.s, z10.s, z2.s\n"
685 "st1w z16.s, p0, [%[outptr2]]\n"
686 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
687 "addvl %[inptr], %[inptr], #24\n"
688 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
689 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
690 "add z11.s, z11.s, z3.s\n"
691 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
692 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
693 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
694 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
695 "addvl %[outptr2], %[outptr2], #3\n"
696 "add z12.s, z12.s, z4.s\n"
697 "ld1w z6.s, p0/z, [%[outptr4]]\n"
698 "add z13.s, z13.s, z5.s\n"
699 "st1w z11.s, p0, [%[outptr3]]\n"
700 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
701 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
702 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
703 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
704 "add z14.s, z14.s, z6.s\n"
705 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
706 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
707 "add z15.s, z15.s, z7.s\n"
708 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
709 "ld1w z9.s, p0/z, [%[outptr5]]\n"
710 "addvl %[outptr3], %[outptr3], #3\n"
711 "add z16.s, z16.s, z8.s\n"
712 "st1w z14.s, p0, [%[outptr4]]\n"
713 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
714 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
715 "ld1w z10.s, p1/z, [x8]\n"
716 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
717 "add z17.s, z17.s, z9.s\n"
718 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
719 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
720 "add z10.s, z10.s, z2.s\n"
721 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
722 "ld1w z4.s, p0/z, [%[outptr6]]\n"
723 "addvl %[outptr4], %[outptr4], #3\n"
724 "add z11.s, z11.s, z3.s\n"
725 "st1w z17.s, p0, [%[outptr5]]\n"
726 "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
727 "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
728 "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
729 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
730 "add z12.s, z12.s, z4.s\n"
731 "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
732 "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
733 "add z13.s, z13.s, z5.s\n"
734 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
735 "ld1w z7.s, p0/z, [%[outptr7]]\n"
736 "addvl %[outptr5], %[outptr5], #3\n"
737 "add z14.s, z14.s, z6.s\n"
738 "st1w z12.s, p0, [%[outptr6]]\n"
739 "ld1w z15.s, p0/z, [x8, #5, MUL VL]\n"
740 "ld1w z8.s, p1/z, [%[outptr7], #1, MUL VL]\n"
741 "ld1w z16.s, p1/z, [x8, #6, MUL VL]\n"
742 "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
743 "add z15.s, z15.s, z7.s\n"
744 "ld1w z9.s, p2/z, [%[outptr7], #2, MUL VL]\n"
745 "ld1w z17.s, p2/z, [x8, #7, MUL VL]\n"
746 "add z16.s, z16.s, z8.s\n"
747 "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
748 "addvl %[outptr6], %[outptr6], #3\n"
749 "add z17.s, z17.s, z9.s\n"
750 "st1w z15.s, p0, [%[outptr7]]\n"
751 "st1w z16.s, p1, [%[outptr7], #1, MUL VL]\n"
752 "st1w z17.s, p2, [%[outptr7], #2, MUL VL]\n"
753 "addvl %[outptr7], %[outptr7], #3\n"
754 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
755 [inptr]
"+r" (inptr), [p]
"+r" (p)
757 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
767 const uint32_t *biasptr =
bias ?
bias + i : nullbias;
777 "addvl x8, %[inptr], #16\n"
778 "whilelt p0.s, %[p], %[w]\n"
779 "incw %[p], all, mul #1\n"
780 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
781 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
782 "ld1w z2.s, p0/z, [%[biasptr]]\n"
783 "whilelt p1.s, %[p], %[w]\n"
784 "ld1w z13.s, p0/z, [%[inptr]]\n"
785 "incw %[p], all, mul #1\n"
786 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
787 "add z13.s, z13.s, z2.s\n"
788 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
789 "whilelt p2.s, %[p], %[w]\n"
790 "add z14.s, z14.s, z3.s\n"
791 "st1w z13.s, p0, [%[outptr0]]\n"
792 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
793 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
794 "addvl %[inptr], %[inptr], #24\n"
795 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
796 "add z15.s, z15.s, z4.s\n"
797 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
798 "addvl %[outptr0], %[outptr0], #3\n"
799 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
800 [inptr]
"+r" (inptr), [p]
"+r" (p)
801 : [
w]
"r" (
w), [biasptr]
"r" (biasptr)
802 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
813 "addvl x8, %[inptr], #16\n"
814 "whilelt p0.s, %[p], %[w]\n"
815 "incw %[p], all, mul #1\n"
816 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
817 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
818 "ld1w z2.s, p0/z, [%[biasptr]]\n"
819 "whilelt p1.s, %[p], %[w]\n"
820 "ld1w z13.s, p0/z, [%[inptr]]\n"
821 "incw %[p], all, mul #1\n"
822 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
823 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
824 "add z13.s, z13.s, z2.s\n"
825 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
826 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
827 "whilelt p2.s, %[p], %[w]\n"
828 "add z16.s, z16.s, z2.s\n"
829 "st1w z13.s, p0, [%[outptr0]]\n"
830 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
831 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
832 "add z14.s, z14.s, z3.s\n"
833 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
834 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
835 "add z17.s, z17.s, z3.s\n"
836 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
837 "addvl %[inptr], %[inptr], #24\n"
838 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
839 "add z15.s, z15.s, z4.s\n"
840 "add z18.s, z18.s, z4.s\n"
841 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
842 "addvl %[outptr0], %[outptr0], #3\n"
843 "st1w z16.s, p0, [%[outptr1]]\n"
844 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
845 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
846 "addvl %[outptr1], %[outptr1], #3\n"
847 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
848 [inptr]
"+r" (inptr), [p]
"+r" (p)
849 : [
w]
"r" (
w), [biasptr]
"r" (biasptr)
850 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
861 "addvl x8, %[inptr], #16\n"
862 "whilelt p0.s, %[p], %[w]\n"
863 "incw %[p], all, mul #1\n"
864 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
865 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
866 "ld1w z2.s, p0/z, [%[biasptr]]\n"
867 "whilelt p1.s, %[p], %[w]\n"
868 "ld1w z13.s, p0/z, [%[inptr]]\n"
869 "incw %[p], all, mul #1\n"
870 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
871 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
872 "add z13.s, z13.s, z2.s\n"
873 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
874 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
875 "whilelt p2.s, %[p], %[w]\n"
876 "add z16.s, z16.s, z2.s\n"
877 "st1w z13.s, p0, [%[outptr0]]\n"
878 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
879 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
880 "add z14.s, z14.s, z3.s\n"
881 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
882 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
883 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
884 "add z17.s, z17.s, z3.s\n"
885 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
886 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
887 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
888 "add z15.s, z15.s, z4.s\n"
889 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
890 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
891 "addvl %[inptr], %[inptr], #24\n"
892 "add z18.s, z18.s, z4.s\n"
893 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
894 "add z19.s, z19.s, z2.s\n"
895 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
896 "add z20.s, z20.s, z3.s\n"
897 "addvl %[outptr0], %[outptr0], #3\n"
898 "st1w z16.s, p0, [%[outptr1]]\n"
899 "add z13.s, z13.s, z4.s\n"
900 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
901 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
902 "addvl %[outptr1], %[outptr1], #3\n"
903 "st1w z19.s, p0, [%[outptr2]]\n"
904 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
905 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
906 "addvl %[outptr2], %[outptr2], #3\n"
907 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
908 [inptr]
"+r" (inptr), [p]
"+r" (p)
909 : [
w]
"r" (
w), [biasptr]
"r" (biasptr)
910 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
921 "addvl x8, %[inptr], #16\n"
922 "whilelt p0.s, %[p], %[w]\n"
923 "incw %[p], all, mul #1\n"
924 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
925 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
926 "ld1w z2.s, p0/z, [%[biasptr]]\n"
927 "whilelt p1.s, %[p], %[w]\n"
928 "ld1w z13.s, p0/z, [%[inptr]]\n"
929 "incw %[p], all, mul #1\n"
930 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
931 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
932 "add z13.s, z13.s, z2.s\n"
933 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
934 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
935 "whilelt p2.s, %[p], %[w]\n"
936 "add z16.s, z16.s, z2.s\n"
937 "st1w z13.s, p0, [%[outptr0]]\n"
938 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
939 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
940 "add z14.s, z14.s, z3.s\n"
941 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
942 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
943 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
944 "add z17.s, z17.s, z3.s\n"
945 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
946 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
947 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
948 "add z15.s, z15.s, z4.s\n"
949 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
950 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
951 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
952 "add z18.s, z18.s, z4.s\n"
953 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
954 "add z19.s, z19.s, z2.s\n"
955 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
956 "add z20.s, z20.s, z3.s\n"
957 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
958 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
959 "addvl %[outptr0], %[outptr0], #3\n"
960 "add z13.s, z13.s, z4.s\n"
961 "st1w z16.s, p0, [%[outptr1]]\n"
962 "add z14.s, z14.s, z2.s\n"
963 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
964 "add z15.s, z15.s, z3.s\n"
965 "addvl %[inptr], %[inptr], #24\n"
966 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
967 "add z16.s, z16.s, z4.s\n"
968 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
969 "addvl %[outptr1], %[outptr1], #3\n"
970 "st1w z19.s, p0, [%[outptr2]]\n"
971 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
972 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
973 "addvl %[outptr2], %[outptr2], #3\n"
974 "st1w z14.s, p0, [%[outptr3]]\n"
975 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
976 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
977 "addvl %[outptr3], %[outptr3], #3\n"
978 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
979 [inptr]
"+r" (inptr), [p]
"+r" (p)
980 : [
w]
"r" (
w), [biasptr]
"r" (biasptr)
981 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
992 "addvl x8, %[inptr], #16\n"
993 "whilelt p0.s, %[p], %[w]\n"
994 "incw %[p], all, mul #1\n"
995 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
996 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
997 "ld1w z2.s, p0/z, [%[biasptr]]\n"
998 "whilelt p1.s, %[p], %[w]\n"
999 "ld1w z13.s, p0/z, [%[inptr]]\n"
1000 "incw %[p], all, mul #1\n"
1001 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1002 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1003 "add z13.s, z13.s, z2.s\n"
1004 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1005 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1006 "whilelt p2.s, %[p], %[w]\n"
1007 "add z16.s, z16.s, z2.s\n"
1008 "st1w z13.s, p0, [%[outptr0]]\n"
1009 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1010 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1011 "add z14.s, z14.s, z3.s\n"
1012 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1013 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1014 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1015 "add z17.s, z17.s, z3.s\n"
1016 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1017 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1018 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1019 "add z15.s, z15.s, z4.s\n"
1020 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1021 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1022 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1023 "add z18.s, z18.s, z4.s\n"
1024 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1025 "add z19.s, z19.s, z2.s\n"
1026 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1027 "add z20.s, z20.s, z3.s\n"
1028 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1029 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1030 "addvl %[outptr0], %[outptr0], #3\n"
1031 "add z13.s, z13.s, z4.s\n"
1032 "st1w z16.s, p0, [%[outptr1]]\n"
1033 "add z14.s, z14.s, z2.s\n"
1034 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1035 "add z15.s, z15.s, z3.s\n"
1036 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1037 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1038 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1039 "add z16.s, z16.s, z4.s\n"
1040 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1041 "addvl %[inptr], %[inptr], #24\n"
1042 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1043 "addvl %[outptr1], %[outptr1], #3\n"
1044 "add z17.s, z17.s, z2.s\n"
1045 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1046 "st1w z19.s, p0, [%[outptr2]]\n"
1047 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1048 "add z18.s, z18.s, z3.s\n"
1049 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1050 "add z19.s, z19.s, z4.s\n"
1051 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1052 "addvl %[outptr2], %[outptr2], #3\n"
1053 "st1w z14.s, p0, [%[outptr3]]\n"
1054 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1055 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1056 "addvl %[outptr3], %[outptr3], #3\n"
1057 "st1w z17.s, p0, [%[outptr4]]\n"
1058 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1059 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1060 "addvl %[outptr4], %[outptr4], #3\n"
1061 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1062 [inptr]
"+r" (inptr), [p]
"+r" (p)
1063 : [
w]
"r" (
w), [biasptr]
"r" (biasptr)
1064 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1075 "addvl x8, %[inptr], #16\n"
1076 "whilelt p0.s, %[p], %[w]\n"
1077 "incw %[p], all, mul #1\n"
1078 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1079 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1080 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1081 "whilelt p1.s, %[p], %[w]\n"
1082 "ld1w z13.s, p0/z, [%[inptr]]\n"
1083 "incw %[p], all, mul #1\n"
1084 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1085 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1086 "add z13.s, z13.s, z2.s\n"
1087 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1088 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1089 "whilelt p2.s, %[p], %[w]\n"
1090 "add z16.s, z16.s, z2.s\n"
1091 "st1w z13.s, p0, [%[outptr0]]\n"
1092 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1093 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1094 "add z14.s, z14.s, z3.s\n"
1095 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1096 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1097 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1098 "add z17.s, z17.s, z3.s\n"
1099 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1100 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1101 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1102 "add z15.s, z15.s, z4.s\n"
1103 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1104 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1105 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1106 "add z18.s, z18.s, z4.s\n"
1107 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1108 "add z19.s, z19.s, z2.s\n"
1109 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1110 "add z20.s, z20.s, z3.s\n"
1111 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1112 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1113 "addvl %[outptr0], %[outptr0], #3\n"
1114 "add z13.s, z13.s, z4.s\n"
1115 "st1w z16.s, p0, [%[outptr1]]\n"
1116 "add z14.s, z14.s, z2.s\n"
1117 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1118 "add z15.s, z15.s, z3.s\n"
1119 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1120 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1121 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1122 "add z16.s, z16.s, z4.s\n"
1123 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1124 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1125 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1126 "addvl %[outptr1], %[outptr1], #3\n"
1127 "add z17.s, z17.s, z2.s\n"
1128 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1129 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1130 "st1w z19.s, p0, [%[outptr2]]\n"
1131 "addvl %[inptr], %[inptr], #24\n"
1132 "add z18.s, z18.s, z3.s\n"
1133 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1134 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1135 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1136 "add z19.s, z19.s, z4.s\n"
1137 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1138 "addvl %[outptr2], %[outptr2], #3\n"
1139 "add z20.s, z20.s, z2.s\n"
1140 "ld1w z13.s, p1/z, [x8]\n"
1141 "st1w z14.s, p0, [%[outptr3]]\n"
1142 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1143 "add z13.s, z13.s, z3.s\n"
1144 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1145 "add z14.s, z14.s, z4.s\n"
1146 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1147 "addvl %[outptr3], %[outptr3], #3\n"
1148 "st1w z17.s, p0, [%[outptr4]]\n"
1149 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1150 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1151 "addvl %[outptr4], %[outptr4], #3\n"
1152 "st1w z20.s, p0, [%[outptr5]]\n"
1153 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1154 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1155 "addvl %[outptr5], %[outptr5], #3\n"
1156 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1157 [inptr]
"+r" (inptr), [p]
"+r" (p)
1158 : [
w]
"r" (
w), [biasptr]
"r" (biasptr)
1159 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1170 "addvl x8, %[inptr], #16\n"
1171 "whilelt p0.s, %[p], %[w]\n"
1172 "incw %[p], all, mul #1\n"
1173 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1174 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1175 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1176 "whilelt p1.s, %[p], %[w]\n"
1177 "ld1w z13.s, p0/z, [%[inptr]]\n"
1178 "incw %[p], all, mul #1\n"
1179 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1180 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1181 "add z13.s, z13.s, z2.s\n"
1182 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1183 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1184 "whilelt p2.s, %[p], %[w]\n"
1185 "add z16.s, z16.s, z2.s\n"
1186 "st1w z13.s, p0, [%[outptr0]]\n"
1187 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1188 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1189 "add z14.s, z14.s, z3.s\n"
1190 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1191 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1192 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1193 "add z17.s, z17.s, z3.s\n"
1194 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1195 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1196 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1197 "add z15.s, z15.s, z4.s\n"
1198 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1199 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1200 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1201 "add z18.s, z18.s, z4.s\n"
1202 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1203 "add z19.s, z19.s, z2.s\n"
1204 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1205 "add z20.s, z20.s, z3.s\n"
1206 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1207 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1208 "addvl %[outptr0], %[outptr0], #3\n"
1209 "add z13.s, z13.s, z4.s\n"
1210 "st1w z16.s, p0, [%[outptr1]]\n"
1211 "add z14.s, z14.s, z2.s\n"
1212 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1213 "add z15.s, z15.s, z3.s\n"
1214 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1215 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1216 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1217 "add z16.s, z16.s, z4.s\n"
1218 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1219 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1220 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1221 "addvl %[outptr1], %[outptr1], #3\n"
1222 "add z17.s, z17.s, z2.s\n"
1223 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1224 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1225 "st1w z19.s, p0, [%[outptr2]]\n"
1226 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1227 "add z18.s, z18.s, z3.s\n"
1228 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1229 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1230 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1231 "addvl %[inptr], %[inptr], #24\n"
1232 "add z19.s, z19.s, z4.s\n"
1233 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1234 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1235 "addvl %[outptr2], %[outptr2], #3\n"
1236 "add z20.s, z20.s, z2.s\n"
1237 "ld1w z13.s, p1/z, [x8]\n"
1238 "st1w z14.s, p0, [%[outptr3]]\n"
1239 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1240 "add z13.s, z13.s, z3.s\n"
1241 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1242 "add z14.s, z14.s, z4.s\n"
1243 "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1244 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1245 "addvl %[outptr3], %[outptr3], #3\n"
1246 "add z15.s, z15.s, z2.s\n"
1247 "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1248 "st1w z17.s, p0, [%[outptr4]]\n"
1249 "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1250 "add z16.s, z16.s, z3.s\n"
1251 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1252 "add z17.s, z17.s, z4.s\n"
1253 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1254 "addvl %[outptr4], %[outptr4], #3\n"
1255 "st1w z20.s, p0, [%[outptr5]]\n"
1256 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1257 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1258 "addvl %[outptr5], %[outptr5], #3\n"
1259 "st1w z15.s, p0, [%[outptr6]]\n"
1260 "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1261 "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1262 "addvl %[outptr6], %[outptr6], #3\n"
1263 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1264 [inptr]
"+r" (inptr), [p]
"+r" (p)
1265 : [
w]
"r" (
w), [biasptr]
"r" (biasptr)
1266 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1278 "addvl x8, %[inptr], #16\n"
1279 "whilelt p0.s, %[p], %[w]\n"
1280 "incw %[p], all, mul #1\n"
1281 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1282 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1283 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1284 "whilelt p1.s, %[p], %[w]\n"
1285 "ld1w z13.s, p0/z, [%[inptr]]\n"
1286 "incw %[p], all, mul #1\n"
1287 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1288 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1289 "add z13.s, z13.s, z2.s\n"
1290 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1291 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1292 "whilelt p2.s, %[p], %[w]\n"
1293 "add z16.s, z16.s, z2.s\n"
1294 "st1w z13.s, p0, [%[outptr0]]\n"
1295 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1296 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1297 "add z14.s, z14.s, z3.s\n"
1298 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1299 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1300 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1301 "add z17.s, z17.s, z3.s\n"
1302 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1303 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1304 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1305 "add z15.s, z15.s, z4.s\n"
1306 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1307 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1308 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1309 "add z18.s, z18.s, z4.s\n"
1310 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1311 "add z19.s, z19.s, z2.s\n"
1312 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1313 "add z20.s, z20.s, z3.s\n"
1314 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1315 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1316 "addvl %[outptr0], %[outptr0], #3\n"
1317 "add z13.s, z13.s, z4.s\n"
1318 "st1w z16.s, p0, [%[outptr1]]\n"
1319 "add z14.s, z14.s, z2.s\n"
1320 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1321 "add z15.s, z15.s, z3.s\n"
1322 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1323 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1324 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1325 "add z16.s, z16.s, z4.s\n"
1326 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1327 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1328 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1329 "addvl %[outptr1], %[outptr1], #3\n"
1330 "add z17.s, z17.s, z2.s\n"
1331 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1332 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1333 "st1w z19.s, p0, [%[outptr2]]\n"
1334 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1335 "add z18.s, z18.s, z3.s\n"
1336 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1337 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1338 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1339 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1340 "add z19.s, z19.s, z4.s\n"
1341 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1342 "addvl %[inptr], %[inptr], #24\n"
1343 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1344 "addvl %[outptr2], %[outptr2], #3\n"
1345 "add z20.s, z20.s, z2.s\n"
1346 "ld1w z13.s, p1/z, [x8]\n"
1347 "st1w z14.s, p0, [%[outptr3]]\n"
1348 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1349 "add z13.s, z13.s, z3.s\n"
1350 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1351 "add z14.s, z14.s, z4.s\n"
1352 "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1353 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1354 "addvl %[outptr3], %[outptr3], #3\n"
1355 "add z15.s, z15.s, z2.s\n"
1356 "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1357 "st1w z17.s, p0, [%[outptr4]]\n"
1358 "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1359 "add z16.s, z16.s, z3.s\n"
1360 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1361 "add z17.s, z17.s, z4.s\n"
1362 "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
1363 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1364 "addvl %[outptr4], %[outptr4], #3\n"
1365 "add z18.s, z18.s, z2.s\n"
1366 "ld1w z19.s, p1/z, [x8, #6, MUL VL]\n"
1367 "st1w z20.s, p0, [%[outptr5]]\n"
1368 "ld1w z20.s, p2/z, [x8, #7, MUL VL]\n"
1369 "add z19.s, z19.s, z3.s\n"
1370 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1371 "add z20.s, z20.s, z4.s\n"
1372 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1373 "addvl %[outptr5], %[outptr5], #3\n"
1374 "st1w z15.s, p0, [%[outptr6]]\n"
1375 "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1376 "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1377 "addvl %[outptr6], %[outptr6], #3\n"
1378 "st1w z18.s, p0, [%[outptr7]]\n"
1379 "st1w z19.s, p1, [%[outptr7], #1, MUL VL]\n"
1380 "st1w z20.s, p2, [%[outptr7], #2, MUL VL]\n"
1381 "addvl %[outptr7], %[outptr7], #3\n"
1382 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1383 [inptr]
"+r" (inptr), [p]
"+r" (p)
1384 : [
w]
"r" (
w), [biasptr]
"r" (biasptr)
1385 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1397 #endif // ARM_COMPUTE_ENABLE_SVE