26 #ifdef ARM_COMPUTE_ENABLE_SVE
29 void MergeResults<3, 8, true>(
float *out,
const float *in,
const int ldout,
const int y0,
const int ymax,
const int x0,
const int xmax,
const float *
bias,
Activation act,
bool append)
31 const float *inptr = in;
33 float minval = - std::numeric_limits<float>::infinity();
34 float maxval = std::numeric_limits<float>::infinity();
41 case Activation::Type::BoundedReLU:
42 maxval =
static_cast<float>(act.param1);
44 case Activation::Type::ReLU:
51 memset(nullbias, 0, (3 * get_vector_length<float>() *
sizeof(
float)));
54 for (
int y=y0; y<ymax; y+=8)
56 float *outptr0 = out + (y * ldout) + x0;
57 float *outptr1 = outptr0 + ldout;
58 float *outptr2 = outptr1 + ldout;
59 float *outptr3 = outptr2 + ldout;
60 float *outptr4 = outptr3 + ldout;
61 float *outptr5 = outptr4 + ldout;
62 float *outptr6 = outptr5 + ldout;
63 float *outptr7 = outptr6 + ldout;
65 const int height = ymax - y;
67 for (
int i=x0; i<xmax; i+=(3 * get_vector_length<float>()))
79 "mov z0.s, %s[maxval]\n"
80 "addvl x8, %[inptr], #16\n"
81 "mov z1.s, %s[minval]\n"
82 "whilelt p0.s, %[p], %[w]\n"
83 "incw %[p], all, mul #1\n"
84 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
85 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
86 "ld1w z2.s, p0/z, [%[outptr0]]\n"
87 "whilelt p1.s, %[p], %[w]\n"
88 "ld1w z10.s, p0/z, [%[inptr]]\n"
89 "incw %[p], all, mul #1\n"
90 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
91 "fadd z10.s, z10.s, z2.s\n"
92 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
93 "whilelt p2.s, %[p], %[w]\n"
94 "fmin z10.s, p0/m, z10.s, z0.s\n"
95 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
96 "fadd z11.s, z11.s, z3.s\n"
97 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
98 "addvl %[inptr], %[inptr], #24\n"
99 "fmax z10.s, p0/m, z10.s, z1.s\n"
100 "fmin z11.s, p1/m, z11.s, z0.s\n"
101 "fadd z12.s, z12.s, z4.s\n"
102 "st1w z10.s, p0, [%[outptr0]]\n"
103 "fmax z11.s, p1/m, z11.s, z1.s\n"
104 "fmin z12.s, p2/m, z12.s, z0.s\n"
105 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
106 "fmax z12.s, p2/m, z12.s, z1.s\n"
107 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
108 "addvl %[outptr0], %[outptr0], #3\n"
109 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
110 [inptr]
"+r" (inptr), [p]
"+r" (p)
111 : [
w]
"r" (
w), [minval]
"w" (minval), [maxval]
"w" (maxval)
112 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
123 "mov z0.s, %s[maxval]\n"
124 "addvl x8, %[inptr], #16\n"
125 "mov z1.s, %s[minval]\n"
126 "whilelt p0.s, %[p], %[w]\n"
127 "incw %[p], all, mul #1\n"
128 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
129 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
130 "ld1w z2.s, p0/z, [%[outptr0]]\n"
131 "whilelt p1.s, %[p], %[w]\n"
132 "ld1w z10.s, p0/z, [%[inptr]]\n"
133 "incw %[p], all, mul #1\n"
134 "ld1w z5.s, p0/z, [%[outptr1]]\n"
135 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
136 "fadd z10.s, z10.s, z2.s\n"
137 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
138 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
139 "whilelt p2.s, %[p], %[w]\n"
140 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
141 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
142 "fmin z10.s, p0/m, z10.s, z0.s\n"
143 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
144 "fadd z11.s, z11.s, z3.s\n"
145 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
146 "fadd z13.s, z13.s, z5.s\n"
147 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
148 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
149 "fmax z10.s, p0/m, z10.s, z1.s\n"
150 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
151 "fmin z11.s, p1/m, z11.s, z0.s\n"
152 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
153 "fadd z12.s, z12.s, z4.s\n"
154 "addvl %[inptr], %[inptr], #24\n"
155 "fmin z13.s, p0/m, z13.s, z0.s\n"
156 "st1w z10.s, p0, [%[outptr0]]\n"
157 "fmax z11.s, p1/m, z11.s, z1.s\n"
158 "fmin z12.s, p2/m, z12.s, z0.s\n"
159 "fadd z14.s, z14.s, z6.s\n"
160 "fmax z13.s, p0/m, z13.s, z1.s\n"
161 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
162 "fadd z15.s, z15.s, z7.s\n"
163 "fmax z12.s, p2/m, z12.s, z1.s\n"
164 "fmin z14.s, p1/m, z14.s, z0.s\n"
165 "fmin z15.s, p2/m, z15.s, z0.s\n"
166 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
167 "addvl %[outptr0], %[outptr0], #3\n"
168 "fmax z14.s, p1/m, z14.s, z1.s\n"
169 "fmax z15.s, p2/m, z15.s, z1.s\n"
170 "st1w z13.s, p0, [%[outptr1]]\n"
171 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
172 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
173 "addvl %[outptr1], %[outptr1], #3\n"
174 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
175 [inptr]
"+r" (inptr), [p]
"+r" (p)
176 : [
w]
"r" (
w), [minval]
"w" (minval), [maxval]
"w" (maxval)
177 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
188 "mov z0.s, %s[maxval]\n"
189 "addvl x8, %[inptr], #16\n"
190 "mov z1.s, %s[minval]\n"
191 "whilelt p0.s, %[p], %[w]\n"
192 "incw %[p], all, mul #1\n"
193 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
194 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
195 "ld1w z2.s, p0/z, [%[outptr0]]\n"
196 "whilelt p1.s, %[p], %[w]\n"
197 "ld1w z10.s, p0/z, [%[inptr]]\n"
198 "incw %[p], all, mul #1\n"
199 "ld1w z5.s, p0/z, [%[outptr1]]\n"
200 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
201 "fadd z10.s, z10.s, z2.s\n"
202 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
203 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
204 "whilelt p2.s, %[p], %[w]\n"
205 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
206 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
207 "fmin z10.s, p0/m, z10.s, z0.s\n"
208 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
209 "fadd z11.s, z11.s, z3.s\n"
210 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
211 "fadd z13.s, z13.s, z5.s\n"
212 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
213 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
214 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
215 "fmax z10.s, p0/m, z10.s, z1.s\n"
216 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
217 "fmin z11.s, p1/m, z11.s, z0.s\n"
218 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
219 "fadd z12.s, z12.s, z4.s\n"
220 "ld1w z8.s, p0/z, [%[outptr2]]\n"
221 "fmin z13.s, p0/m, z13.s, z0.s\n"
222 "st1w z10.s, p0, [%[outptr0]]\n"
223 "fadd z14.s, z14.s, z6.s\n"
224 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
225 "fmax z11.s, p1/m, z11.s, z1.s\n"
226 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
227 "fmin z12.s, p2/m, z12.s, z0.s\n"
228 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
229 "fmax z13.s, p0/m, z13.s, z1.s\n"
230 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
231 "fmin z14.s, p1/m, z14.s, z0.s\n"
232 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
233 "fadd z15.s, z15.s, z7.s\n"
234 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
235 "fmax z12.s, p2/m, z12.s, z1.s\n"
236 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
237 "fmax z14.s, p1/m, z14.s, z1.s\n"
238 "addvl %[inptr], %[inptr], #24\n"
239 "fmin z15.s, p2/m, z15.s, z0.s\n"
240 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
241 "fadd z16.s, z16.s, z8.s\n"
242 "addvl %[outptr0], %[outptr0], #3\n"
243 "fadd z17.s, z17.s, z9.s\n"
244 "st1w z13.s, p0, [%[outptr1]]\n"
245 "fmax z15.s, p2/m, z15.s, z1.s\n"
246 "fmin z16.s, p0/m, z16.s, z0.s\n"
247 "fadd z10.s, z10.s, z2.s\n"
248 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
249 "fmin z17.s, p1/m, z17.s, z0.s\n"
250 "fmax z16.s, p0/m, z16.s, z1.s\n"
251 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
252 "fmin z10.s, p2/m, z10.s, z0.s\n"
253 "addvl %[outptr1], %[outptr1], #3\n"
254 "fmax z17.s, p1/m, z17.s, z1.s\n"
255 "st1w z16.s, p0, [%[outptr2]]\n"
256 "fmax z10.s, p2/m, z10.s, z1.s\n"
257 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
258 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
259 "addvl %[outptr2], %[outptr2], #3\n"
260 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
261 [inptr]
"+r" (inptr), [p]
"+r" (p)
262 : [
w]
"r" (
w), [minval]
"w" (minval), [maxval]
"w" (maxval)
263 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
274 "mov z0.s, %s[maxval]\n"
275 "addvl x8, %[inptr], #16\n"
276 "mov z1.s, %s[minval]\n"
277 "whilelt p0.s, %[p], %[w]\n"
278 "incw %[p], all, mul #1\n"
279 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
280 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
281 "ld1w z2.s, p0/z, [%[outptr0]]\n"
282 "whilelt p1.s, %[p], %[w]\n"
283 "ld1w z10.s, p0/z, [%[inptr]]\n"
284 "incw %[p], all, mul #1\n"
285 "ld1w z5.s, p0/z, [%[outptr1]]\n"
286 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
287 "fadd z10.s, z10.s, z2.s\n"
288 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
289 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
290 "whilelt p2.s, %[p], %[w]\n"
291 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
292 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
293 "fmin z10.s, p0/m, z10.s, z0.s\n"
294 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
295 "fadd z11.s, z11.s, z3.s\n"
296 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
297 "fadd z13.s, z13.s, z5.s\n"
298 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
299 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
300 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
301 "fmax z10.s, p0/m, z10.s, z1.s\n"
302 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
303 "fmin z11.s, p1/m, z11.s, z0.s\n"
304 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
305 "fadd z12.s, z12.s, z4.s\n"
306 "ld1w z8.s, p0/z, [%[outptr2]]\n"
307 "fmin z13.s, p0/m, z13.s, z0.s\n"
308 "st1w z10.s, p0, [%[outptr0]]\n"
309 "fadd z14.s, z14.s, z6.s\n"
310 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
311 "fmax z11.s, p1/m, z11.s, z1.s\n"
312 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
313 "fmin z12.s, p2/m, z12.s, z0.s\n"
314 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
315 "fmax z13.s, p0/m, z13.s, z1.s\n"
316 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
317 "fmin z14.s, p1/m, z14.s, z0.s\n"
318 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
319 "fadd z15.s, z15.s, z7.s\n"
320 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
321 "fmax z12.s, p2/m, z12.s, z1.s\n"
322 "ld1w z3.s, p0/z, [%[outptr3]]\n"
323 "fadd z16.s, z16.s, z8.s\n"
324 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
325 "fmax z14.s, p1/m, z14.s, z1.s\n"
326 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
327 "fmin z15.s, p2/m, z15.s, z0.s\n"
328 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
329 "fadd z17.s, z17.s, z9.s\n"
330 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
331 "fmin z16.s, p0/m, z16.s, z0.s\n"
332 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
333 "fadd z10.s, z10.s, z2.s\n"
334 "st1w z13.s, p0, [%[outptr1]]\n"
335 "fmax z15.s, p2/m, z15.s, z1.s\n"
336 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
337 "fmin z17.s, p1/m, z17.s, z0.s\n"
338 "addvl %[outptr0], %[outptr0], #3\n"
339 "fmax z16.s, p0/m, z16.s, z1.s\n"
340 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
341 "fmin z10.s, p2/m, z10.s, z0.s\n"
342 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
343 "fmax z17.s, p1/m, z17.s, z1.s\n"
344 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
345 "fadd z11.s, z11.s, z3.s\n"
346 "addvl %[outptr1], %[outptr1], #3\n"
347 "fmax z10.s, p2/m, z10.s, z1.s\n"
348 "st1w z16.s, p0, [%[outptr2]]\n"
349 "fadd z12.s, z12.s, z4.s\n"
350 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
351 "fmin z11.s, p0/m, z11.s, z0.s\n"
352 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
353 "fadd z13.s, z13.s, z5.s\n"
354 "addvl %[inptr], %[inptr], #24\n"
355 "fmin z12.s, p1/m, z12.s, z0.s\n"
356 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
357 "fmax z11.s, p0/m, z11.s, z1.s\n"
358 "addvl %[outptr2], %[outptr2], #3\n"
359 "fmin z13.s, p2/m, z13.s, z0.s\n"
360 "fmax z12.s, p1/m, z12.s, z1.s\n"
361 "st1w z11.s, p0, [%[outptr3]]\n"
362 "fmax z13.s, p2/m, z13.s, z1.s\n"
363 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
364 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
365 "addvl %[outptr3], %[outptr3], #3\n"
366 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
367 [inptr]
"+r" (inptr), [p]
"+r" (p)
368 : [
w]
"r" (
w), [minval]
"w" (minval), [maxval]
"w" (maxval)
369 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
380 "mov z0.s, %s[maxval]\n"
381 "addvl x8, %[inptr], #16\n"
382 "mov z1.s, %s[minval]\n"
383 "whilelt p0.s, %[p], %[w]\n"
384 "incw %[p], all, mul #1\n"
385 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
386 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
387 "ld1w z2.s, p0/z, [%[outptr0]]\n"
388 "whilelt p1.s, %[p], %[w]\n"
389 "ld1w z10.s, p0/z, [%[inptr]]\n"
390 "incw %[p], all, mul #1\n"
391 "ld1w z5.s, p0/z, [%[outptr1]]\n"
392 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
393 "fadd z10.s, z10.s, z2.s\n"
394 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
395 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
396 "whilelt p2.s, %[p], %[w]\n"
397 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
398 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
399 "fmin z10.s, p0/m, z10.s, z0.s\n"
400 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
401 "fadd z11.s, z11.s, z3.s\n"
402 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
403 "fadd z13.s, z13.s, z5.s\n"
404 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
405 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
406 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
407 "fmax z10.s, p0/m, z10.s, z1.s\n"
408 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
409 "fmin z11.s, p1/m, z11.s, z0.s\n"
410 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
411 "fadd z12.s, z12.s, z4.s\n"
412 "ld1w z8.s, p0/z, [%[outptr2]]\n"
413 "fmin z13.s, p0/m, z13.s, z0.s\n"
414 "st1w z10.s, p0, [%[outptr0]]\n"
415 "fadd z14.s, z14.s, z6.s\n"
416 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
417 "fmax z11.s, p1/m, z11.s, z1.s\n"
418 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
419 "fmin z12.s, p2/m, z12.s, z0.s\n"
420 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
421 "fmax z13.s, p0/m, z13.s, z1.s\n"
422 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
423 "fmin z14.s, p1/m, z14.s, z0.s\n"
424 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
425 "fadd z15.s, z15.s, z7.s\n"
426 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
427 "fmax z12.s, p2/m, z12.s, z1.s\n"
428 "ld1w z3.s, p0/z, [%[outptr3]]\n"
429 "fadd z16.s, z16.s, z8.s\n"
430 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
431 "fmax z14.s, p1/m, z14.s, z1.s\n"
432 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
433 "fmin z15.s, p2/m, z15.s, z0.s\n"
434 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
435 "fadd z17.s, z17.s, z9.s\n"
436 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
437 "fmin z16.s, p0/m, z16.s, z0.s\n"
438 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
439 "fadd z10.s, z10.s, z2.s\n"
440 "st1w z13.s, p0, [%[outptr1]]\n"
441 "fmax z15.s, p2/m, z15.s, z1.s\n"
442 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
443 "fmin z17.s, p1/m, z17.s, z0.s\n"
444 "ld1w z6.s, p0/z, [%[outptr4]]\n"
445 "fmax z16.s, p0/m, z16.s, z1.s\n"
446 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
447 "fmin z10.s, p2/m, z10.s, z0.s\n"
448 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
449 "fadd z11.s, z11.s, z3.s\n"
450 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
451 "fmax z17.s, p1/m, z17.s, z1.s\n"
452 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
453 "fadd z12.s, z12.s, z4.s\n"
454 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
455 "fmax z10.s, p2/m, z10.s, z1.s\n"
456 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
457 "fmin z11.s, p0/m, z11.s, z0.s\n"
458 "st1w z16.s, p0, [%[outptr2]]\n"
459 "fadd z13.s, z13.s, z5.s\n"
460 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
461 "fmin z12.s, p1/m, z12.s, z0.s\n"
462 "addvl %[outptr0], %[outptr0], #3\n"
463 "fmax z11.s, p0/m, z11.s, z1.s\n"
464 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
465 "fmin z13.s, p2/m, z13.s, z0.s\n"
466 "addvl %[outptr1], %[outptr1], #3\n"
467 "fmax z12.s, p1/m, z12.s, z1.s\n"
468 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
469 "fadd z14.s, z14.s, z6.s\n"
470 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
471 "fmax z13.s, p2/m, z13.s, z1.s\n"
472 "st1w z11.s, p0, [%[outptr3]]\n"
473 "fadd z15.s, z15.s, z7.s\n"
474 "addvl %[outptr2], %[outptr2], #3\n"
475 "fmin z14.s, p0/m, z14.s, z0.s\n"
476 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
477 "fadd z16.s, z16.s, z8.s\n"
478 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
479 "fmin z15.s, p1/m, z15.s, z0.s\n"
480 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
481 "fmax z14.s, p0/m, z14.s, z1.s\n"
482 "addvl %[outptr3], %[outptr3], #3\n"
483 "fmin z16.s, p2/m, z16.s, z0.s\n"
484 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
485 "fmax z15.s, p1/m, z15.s, z1.s\n"
486 "st1w z14.s, p0, [%[outptr4]]\n"
487 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
488 "fmax z16.s, p2/m, z16.s, z1.s\n"
489 "addvl %[inptr], %[inptr], #24\n"
490 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
491 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
492 "addvl %[outptr4], %[outptr4], #3\n"
493 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
494 [inptr]
"+r" (inptr), [p]
"+r" (p)
495 : [
w]
"r" (
w), [minval]
"w" (minval), [maxval]
"w" (maxval)
496 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
507 "mov z0.s, %s[maxval]\n"
508 "addvl x8, %[inptr], #16\n"
509 "mov z1.s, %s[minval]\n"
510 "whilelt p0.s, %[p], %[w]\n"
511 "incw %[p], all, mul #1\n"
512 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
513 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
514 "ld1w z2.s, p0/z, [%[outptr0]]\n"
515 "whilelt p1.s, %[p], %[w]\n"
516 "ld1w z10.s, p0/z, [%[inptr]]\n"
517 "incw %[p], all, mul #1\n"
518 "ld1w z5.s, p0/z, [%[outptr1]]\n"
519 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
520 "fadd z10.s, z10.s, z2.s\n"
521 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
522 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
523 "whilelt p2.s, %[p], %[w]\n"
524 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
525 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
526 "fmin z10.s, p0/m, z10.s, z0.s\n"
527 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
528 "fadd z11.s, z11.s, z3.s\n"
529 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
530 "fadd z13.s, z13.s, z5.s\n"
531 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
532 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
533 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
534 "fmax z10.s, p0/m, z10.s, z1.s\n"
535 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
536 "fmin z11.s, p1/m, z11.s, z0.s\n"
537 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
538 "fadd z12.s, z12.s, z4.s\n"
539 "ld1w z8.s, p0/z, [%[outptr2]]\n"
540 "fmin z13.s, p0/m, z13.s, z0.s\n"
541 "st1w z10.s, p0, [%[outptr0]]\n"
542 "fadd z14.s, z14.s, z6.s\n"
543 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
544 "fmax z11.s, p1/m, z11.s, z1.s\n"
545 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
546 "fmin z12.s, p2/m, z12.s, z0.s\n"
547 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
548 "fmax z13.s, p0/m, z13.s, z1.s\n"
549 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
550 "fmin z14.s, p1/m, z14.s, z0.s\n"
551 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
552 "fadd z15.s, z15.s, z7.s\n"
553 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
554 "fmax z12.s, p2/m, z12.s, z1.s\n"
555 "ld1w z3.s, p0/z, [%[outptr3]]\n"
556 "fadd z16.s, z16.s, z8.s\n"
557 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
558 "fmax z14.s, p1/m, z14.s, z1.s\n"
559 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
560 "fmin z15.s, p2/m, z15.s, z0.s\n"
561 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
562 "fadd z17.s, z17.s, z9.s\n"
563 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
564 "fmin z16.s, p0/m, z16.s, z0.s\n"
565 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
566 "fadd z10.s, z10.s, z2.s\n"
567 "st1w z13.s, p0, [%[outptr1]]\n"
568 "fmax z15.s, p2/m, z15.s, z1.s\n"
569 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
570 "fmin z17.s, p1/m, z17.s, z0.s\n"
571 "ld1w z6.s, p0/z, [%[outptr4]]\n"
572 "fmax z16.s, p0/m, z16.s, z1.s\n"
573 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
574 "fmin z10.s, p2/m, z10.s, z0.s\n"
575 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
576 "fadd z11.s, z11.s, z3.s\n"
577 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
578 "fmax z17.s, p1/m, z17.s, z1.s\n"
579 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
580 "fadd z12.s, z12.s, z4.s\n"
581 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
582 "fmax z10.s, p2/m, z10.s, z1.s\n"
583 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
584 "fmin z11.s, p0/m, z11.s, z0.s\n"
585 "st1w z16.s, p0, [%[outptr2]]\n"
586 "fadd z13.s, z13.s, z5.s\n"
587 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
588 "fmin z12.s, p1/m, z12.s, z0.s\n"
589 "ld1w z9.s, p0/z, [%[outptr5]]\n"
590 "fadd z14.s, z14.s, z6.s\n"
591 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
592 "fmax z11.s, p0/m, z11.s, z1.s\n"
593 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
594 "fmin z13.s, p2/m, z13.s, z0.s\n"
595 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
596 "fmax z12.s, p1/m, z12.s, z1.s\n"
597 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
598 "fmin z14.s, p0/m, z14.s, z0.s\n"
599 "ld1w z10.s, p1/z, [x8]\n"
600 "fadd z15.s, z15.s, z7.s\n"
601 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
602 "fmax z13.s, p2/m, z13.s, z1.s\n"
603 "st1w z11.s, p0, [%[outptr3]]\n"
604 "fadd z16.s, z16.s, z8.s\n"
605 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
606 "fmax z14.s, p0/m, z14.s, z1.s\n"
607 "addvl %[outptr0], %[outptr0], #3\n"
608 "fmin z15.s, p1/m, z15.s, z0.s\n"
609 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
610 "fmin z16.s, p2/m, z16.s, z0.s\n"
611 "addvl %[outptr1], %[outptr1], #3\n"
612 "fadd z17.s, z17.s, z9.s\n"
613 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
614 "fmax z15.s, p1/m, z15.s, z1.s\n"
615 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
616 "fmax z16.s, p2/m, z16.s, z1.s\n"
617 "st1w z14.s, p0, [%[outptr4]]\n"
618 "fmin z17.s, p0/m, z17.s, z0.s\n"
619 "addvl %[outptr2], %[outptr2], #3\n"
620 "fadd z10.s, z10.s, z2.s\n"
621 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
622 "fadd z11.s, z11.s, z3.s\n"
623 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
624 "fmax z17.s, p0/m, z17.s, z1.s\n"
625 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
626 "fmin z10.s, p1/m, z10.s, z0.s\n"
627 "addvl %[outptr3], %[outptr3], #3\n"
628 "fmin z11.s, p2/m, z11.s, z0.s\n"
629 "st1w z17.s, p0, [%[outptr5]]\n"
630 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
631 "fmax z10.s, p1/m, z10.s, z1.s\n"
632 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
633 "fmax z11.s, p2/m, z11.s, z1.s\n"
634 "addvl %[outptr4], %[outptr4], #3\n"
635 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
636 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
637 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
638 "addvl %[inptr], %[inptr], #24\n"
639 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
640 "addvl %[outptr5], %[outptr5], #3\n"
641 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
642 [inptr]
"+r" (inptr), [p]
"+r" (p)
643 : [
w]
"r" (
w), [minval]
"w" (minval), [maxval]
"w" (maxval)
644 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
655 "mov z0.s, %s[maxval]\n"
656 "addvl x8, %[inptr], #16\n"
657 "mov z1.s, %s[minval]\n"
658 "whilelt p0.s, %[p], %[w]\n"
659 "incw %[p], all, mul #1\n"
660 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
661 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
662 "ld1w z2.s, p0/z, [%[outptr0]]\n"
663 "whilelt p1.s, %[p], %[w]\n"
664 "ld1w z10.s, p0/z, [%[inptr]]\n"
665 "incw %[p], all, mul #1\n"
666 "ld1w z5.s, p0/z, [%[outptr1]]\n"
667 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
668 "fadd z10.s, z10.s, z2.s\n"
669 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
670 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
671 "whilelt p2.s, %[p], %[w]\n"
672 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
673 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
674 "fmin z10.s, p0/m, z10.s, z0.s\n"
675 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
676 "fadd z11.s, z11.s, z3.s\n"
677 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
678 "fadd z13.s, z13.s, z5.s\n"
679 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
680 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
681 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
682 "fmax z10.s, p0/m, z10.s, z1.s\n"
683 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
684 "fmin z11.s, p1/m, z11.s, z0.s\n"
685 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
686 "fadd z12.s, z12.s, z4.s\n"
687 "ld1w z8.s, p0/z, [%[outptr2]]\n"
688 "fmin z13.s, p0/m, z13.s, z0.s\n"
689 "st1w z10.s, p0, [%[outptr0]]\n"
690 "fadd z14.s, z14.s, z6.s\n"
691 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
692 "fmax z11.s, p1/m, z11.s, z1.s\n"
693 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
694 "fmin z12.s, p2/m, z12.s, z0.s\n"
695 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
696 "fmax z13.s, p0/m, z13.s, z1.s\n"
697 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
698 "fmin z14.s, p1/m, z14.s, z0.s\n"
699 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
700 "fadd z15.s, z15.s, z7.s\n"
701 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
702 "fmax z12.s, p2/m, z12.s, z1.s\n"
703 "ld1w z3.s, p0/z, [%[outptr3]]\n"
704 "fadd z16.s, z16.s, z8.s\n"
705 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
706 "fmax z14.s, p1/m, z14.s, z1.s\n"
707 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
708 "fmin z15.s, p2/m, z15.s, z0.s\n"
709 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
710 "fadd z17.s, z17.s, z9.s\n"
711 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
712 "fmin z16.s, p0/m, z16.s, z0.s\n"
713 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
714 "fadd z10.s, z10.s, z2.s\n"
715 "st1w z13.s, p0, [%[outptr1]]\n"
716 "fmax z15.s, p2/m, z15.s, z1.s\n"
717 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
718 "fmin z17.s, p1/m, z17.s, z0.s\n"
719 "ld1w z6.s, p0/z, [%[outptr4]]\n"
720 "fmax z16.s, p0/m, z16.s, z1.s\n"
721 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
722 "fmin z10.s, p2/m, z10.s, z0.s\n"
723 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
724 "fadd z11.s, z11.s, z3.s\n"
725 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
726 "fmax z17.s, p1/m, z17.s, z1.s\n"
727 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
728 "fadd z12.s, z12.s, z4.s\n"
729 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
730 "fmax z10.s, p2/m, z10.s, z1.s\n"
731 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
732 "fmin z11.s, p0/m, z11.s, z0.s\n"
733 "st1w z16.s, p0, [%[outptr2]]\n"
734 "fadd z13.s, z13.s, z5.s\n"
735 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
736 "fmin z12.s, p1/m, z12.s, z0.s\n"
737 "ld1w z9.s, p0/z, [%[outptr5]]\n"
738 "fadd z14.s, z14.s, z6.s\n"
739 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
740 "fmax z11.s, p0/m, z11.s, z1.s\n"
741 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
742 "fmin z13.s, p2/m, z13.s, z0.s\n"
743 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
744 "fmax z12.s, p1/m, z12.s, z1.s\n"
745 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
746 "fmin z14.s, p0/m, z14.s, z0.s\n"
747 "ld1w z10.s, p1/z, [x8]\n"
748 "fadd z15.s, z15.s, z7.s\n"
749 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
750 "fmax z13.s, p2/m, z13.s, z1.s\n"
751 "st1w z11.s, p0, [%[outptr3]]\n"
752 "fadd z16.s, z16.s, z8.s\n"
753 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
754 "fmax z14.s, p0/m, z14.s, z1.s\n"
755 "ld1w z4.s, p0/z, [%[outptr6]]\n"
756 "fmin z15.s, p1/m, z15.s, z0.s\n"
757 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
758 "fadd z17.s, z17.s, z9.s\n"
759 "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
760 "fmin z16.s, p2/m, z16.s, z0.s\n"
761 "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
762 "fadd z10.s, z10.s, z2.s\n"
763 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
764 "fmax z15.s, p1/m, z15.s, z1.s\n"
765 "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
766 "fmin z17.s, p0/m, z17.s, z0.s\n"
767 "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
768 "fmax z16.s, p2/m, z16.s, z1.s\n"
769 "st1w z14.s, p0, [%[outptr4]]\n"
770 "fmin z10.s, p1/m, z10.s, z0.s\n"
771 "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
772 "fadd z11.s, z11.s, z3.s\n"
773 "addvl %[outptr0], %[outptr0], #3\n"
774 "fmax z17.s, p0/m, z17.s, z1.s\n"
775 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
776 "fmax z10.s, p1/m, z10.s, z1.s\n"
777 "addvl %[outptr1], %[outptr1], #3\n"
778 "fmin z11.s, p2/m, z11.s, z0.s\n"
779 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
780 "fadd z12.s, z12.s, z4.s\n"
781 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
782 "fadd z13.s, z13.s, z5.s\n"
783 "st1w z17.s, p0, [%[outptr5]]\n"
784 "fmax z11.s, p2/m, z11.s, z1.s\n"
785 "addvl %[outptr2], %[outptr2], #3\n"
786 "fmin z12.s, p0/m, z12.s, z0.s\n"
787 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
788 "fmin z13.s, p1/m, z13.s, z0.s\n"
789 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
790 "fadd z14.s, z14.s, z6.s\n"
791 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
792 "fmax z12.s, p0/m, z12.s, z1.s\n"
793 "addvl %[outptr3], %[outptr3], #3\n"
794 "fmax z13.s, p1/m, z13.s, z1.s\n"
795 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
796 "fmin z14.s, p2/m, z14.s, z0.s\n"
797 "st1w z12.s, p0, [%[outptr6]]\n"
798 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
799 "addvl %[outptr4], %[outptr4], #3\n"
800 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
801 "fmax z14.s, p2/m, z14.s, z1.s\n"
802 "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
803 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
804 "addvl %[outptr5], %[outptr5], #3\n"
805 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
806 "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
807 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
808 "addvl %[outptr6], %[outptr6], #3\n"
809 "addvl %[inptr], %[inptr], #24\n"
810 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
811 [inptr]
"+r" (inptr), [p]
"+r" (p)
812 : [
w]
"r" (
w), [minval]
"w" (minval), [maxval]
"w" (maxval)
813 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
825 "mov z0.s, %s[maxval]\n"
826 "addvl x8, %[inptr], #16\n"
827 "mov z1.s, %s[minval]\n"
828 "whilelt p0.s, %[p], %[w]\n"
829 "incw %[p], all, mul #1\n"
830 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
831 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
832 "ld1w z2.s, p0/z, [%[outptr0]]\n"
833 "whilelt p1.s, %[p], %[w]\n"
834 "ld1w z10.s, p0/z, [%[inptr]]\n"
835 "incw %[p], all, mul #1\n"
836 "ld1w z5.s, p0/z, [%[outptr1]]\n"
837 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
838 "fadd z10.s, z10.s, z2.s\n"
839 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
840 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
841 "whilelt p2.s, %[p], %[w]\n"
842 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
843 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
844 "fmin z10.s, p0/m, z10.s, z0.s\n"
845 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
846 "fadd z11.s, z11.s, z3.s\n"
847 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
848 "fadd z13.s, z13.s, z5.s\n"
849 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
850 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
851 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
852 "fmax z10.s, p0/m, z10.s, z1.s\n"
853 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
854 "fmin z11.s, p1/m, z11.s, z0.s\n"
855 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
856 "fadd z12.s, z12.s, z4.s\n"
857 "ld1w z8.s, p0/z, [%[outptr2]]\n"
858 "fmin z13.s, p0/m, z13.s, z0.s\n"
859 "st1w z10.s, p0, [%[outptr0]]\n"
860 "fadd z14.s, z14.s, z6.s\n"
861 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
862 "fmax z11.s, p1/m, z11.s, z1.s\n"
863 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
864 "fmin z12.s, p2/m, z12.s, z0.s\n"
865 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
866 "fmax z13.s, p0/m, z13.s, z1.s\n"
867 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
868 "fmin z14.s, p1/m, z14.s, z0.s\n"
869 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
870 "fadd z15.s, z15.s, z7.s\n"
871 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
872 "fmax z12.s, p2/m, z12.s, z1.s\n"
873 "ld1w z3.s, p0/z, [%[outptr3]]\n"
874 "fadd z16.s, z16.s, z8.s\n"
875 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
876 "fmax z14.s, p1/m, z14.s, z1.s\n"
877 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
878 "fmin z15.s, p2/m, z15.s, z0.s\n"
879 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
880 "fadd z17.s, z17.s, z9.s\n"
881 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
882 "fmin z16.s, p0/m, z16.s, z0.s\n"
883 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
884 "fadd z10.s, z10.s, z2.s\n"
885 "st1w z13.s, p0, [%[outptr1]]\n"
886 "fmax z15.s, p2/m, z15.s, z1.s\n"
887 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
888 "fmin z17.s, p1/m, z17.s, z0.s\n"
889 "ld1w z6.s, p0/z, [%[outptr4]]\n"
890 "fmax z16.s, p0/m, z16.s, z1.s\n"
891 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
892 "fmin z10.s, p2/m, z10.s, z0.s\n"
893 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
894 "fadd z11.s, z11.s, z3.s\n"
895 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
896 "fmax z17.s, p1/m, z17.s, z1.s\n"
897 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
898 "fadd z12.s, z12.s, z4.s\n"
899 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
900 "fmax z10.s, p2/m, z10.s, z1.s\n"
901 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
902 "fmin z11.s, p0/m, z11.s, z0.s\n"
903 "st1w z16.s, p0, [%[outptr2]]\n"
904 "fadd z13.s, z13.s, z5.s\n"
905 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
906 "fmin z12.s, p1/m, z12.s, z0.s\n"
907 "ld1w z9.s, p0/z, [%[outptr5]]\n"
908 "fadd z14.s, z14.s, z6.s\n"
909 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
910 "fmax z11.s, p0/m, z11.s, z1.s\n"
911 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
912 "fmin z13.s, p2/m, z13.s, z0.s\n"
913 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
914 "fmax z12.s, p1/m, z12.s, z1.s\n"
915 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
916 "fmin z14.s, p0/m, z14.s, z0.s\n"
917 "ld1w z10.s, p1/z, [x8]\n"
918 "fadd z15.s, z15.s, z7.s\n"
919 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
920 "fmax z13.s, p2/m, z13.s, z1.s\n"
921 "st1w z11.s, p0, [%[outptr3]]\n"
922 "fadd z16.s, z16.s, z8.s\n"
923 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
924 "fmax z14.s, p0/m, z14.s, z1.s\n"
925 "ld1w z4.s, p0/z, [%[outptr6]]\n"
926 "fmin z15.s, p1/m, z15.s, z0.s\n"
927 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
928 "fadd z17.s, z17.s, z9.s\n"
929 "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
930 "fmin z16.s, p2/m, z16.s, z0.s\n"
931 "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
932 "fadd z10.s, z10.s, z2.s\n"
933 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
934 "fmax z15.s, p1/m, z15.s, z1.s\n"
935 "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
936 "fmin z17.s, p0/m, z17.s, z0.s\n"
937 "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
938 "fmax z16.s, p2/m, z16.s, z1.s\n"
939 "st1w z14.s, p0, [%[outptr4]]\n"
940 "fmin z10.s, p1/m, z10.s, z0.s\n"
941 "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
942 "fadd z11.s, z11.s, z3.s\n"
943 "ld1w z7.s, p0/z, [%[outptr7]]\n"
944 "fmax z17.s, p0/m, z17.s, z1.s\n"
945 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
946 "fadd z12.s, z12.s, z4.s\n"
947 "ld1w z15.s, p0/z, [x8, #5, MUL VL]\n"
948 "fmax z10.s, p1/m, z10.s, z1.s\n"
949 "ld1w z8.s, p1/z, [%[outptr7], #1, MUL VL]\n"
950 "fmin z11.s, p2/m, z11.s, z0.s\n"
951 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
952 "fadd z13.s, z13.s, z5.s\n"
953 "ld1w z16.s, p1/z, [x8, #6, MUL VL]\n"
954 "fmin z12.s, p0/m, z12.s, z0.s\n"
955 "ld1w z9.s, p2/z, [%[outptr7], #2, MUL VL]\n"
956 "fadd z14.s, z14.s, z6.s\n"
957 "st1w z17.s, p0, [%[outptr5]]\n"
958 "fmax z11.s, p2/m, z11.s, z1.s\n"
959 "ld1w z17.s, p2/z, [x8, #7, MUL VL]\n"
960 "fmin z13.s, p1/m, z13.s, z0.s\n"
961 "addvl %[outptr0], %[outptr0], #3\n"
962 "fmax z12.s, p0/m, z12.s, z1.s\n"
963 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
964 "fmin z14.s, p2/m, z14.s, z0.s\n"
965 "addvl %[outptr1], %[outptr1], #3\n"
966 "fmax z13.s, p1/m, z13.s, z1.s\n"
967 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
968 "fadd z15.s, z15.s, z7.s\n"
969 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
970 "fmax z14.s, p2/m, z14.s, z1.s\n"
971 "st1w z12.s, p0, [%[outptr6]]\n"
972 "fadd z16.s, z16.s, z8.s\n"
973 "addvl %[outptr2], %[outptr2], #3\n"
974 "fmin z15.s, p0/m, z15.s, z0.s\n"
975 "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
976 "fadd z17.s, z17.s, z9.s\n"
977 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
978 "fmin z16.s, p1/m, z16.s, z0.s\n"
979 "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
980 "fmax z15.s, p0/m, z15.s, z1.s\n"
981 "addvl %[outptr3], %[outptr3], #3\n"
982 "fmin z17.s, p2/m, z17.s, z0.s\n"
983 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
984 "fmax z16.s, p1/m, z16.s, z1.s\n"
985 "st1w z15.s, p0, [%[outptr7]]\n"
986 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
987 "fmax z17.s, p2/m, z17.s, z1.s\n"
988 "addvl %[outptr4], %[outptr4], #3\n"
989 "st1w z16.s, p1, [%[outptr7], #1, MUL VL]\n"
990 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
991 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
992 "addvl %[outptr5], %[outptr5], #3\n"
993 "st1w z17.s, p2, [%[outptr7], #2, MUL VL]\n"
994 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
995 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
996 "addvl %[outptr6], %[outptr6], #3\n"
997 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
998 "addvl %[outptr7], %[outptr7], #3\n"
999 "addvl %[inptr], %[inptr], #24\n"
1000 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1001 [inptr]
"+r" (inptr), [p]
"+r" (p)
1002 : [
w]
"r" (
w), [minval]
"w" (minval), [maxval]
"w" (maxval)
1003 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1013 const float *biasptr =
bias ?
bias + i : nullbias;
1023 "mov z0.s, %s[maxval]\n"
1024 "addvl x8, %[inptr], #16\n"
1025 "mov z1.s, %s[minval]\n"
1026 "whilelt p0.s, %[p], %[w]\n"
1027 "incw %[p], all, mul #1\n"
1028 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1029 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1030 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1031 "whilelt p1.s, %[p], %[w]\n"
1032 "ld1w z13.s, p0/z, [%[inptr]]\n"
1033 "incw %[p], all, mul #1\n"
1034 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1035 "fadd z13.s, z13.s, z2.s\n"
1036 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1037 "whilelt p2.s, %[p], %[w]\n"
1038 "fmin z13.s, p0/m, z13.s, z0.s\n"
1039 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1040 "fadd z14.s, z14.s, z3.s\n"
1041 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1042 "addvl %[inptr], %[inptr], #24\n"
1043 "fmax z13.s, p0/m, z13.s, z1.s\n"
1044 "fmin z14.s, p1/m, z14.s, z0.s\n"
1045 "fadd z15.s, z15.s, z4.s\n"
1046 "st1w z13.s, p0, [%[outptr0]]\n"
1047 "fmax z14.s, p1/m, z14.s, z1.s\n"
1048 "fmin z15.s, p2/m, z15.s, z0.s\n"
1049 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1050 "fmax z15.s, p2/m, z15.s, z1.s\n"
1051 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1052 "addvl %[outptr0], %[outptr0], #3\n"
1053 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1054 [inptr]
"+r" (inptr), [p]
"+r" (p)
1055 : [
w]
"r" (
w), [biasptr]
"r" (biasptr), [minval]
"w" (minval), [maxval]
"w" (maxval)
1056 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1067 "mov z0.s, %s[maxval]\n"
1068 "addvl x8, %[inptr], #16\n"
1069 "mov z1.s, %s[minval]\n"
1070 "whilelt p0.s, %[p], %[w]\n"
1071 "incw %[p], all, mul #1\n"
1072 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1073 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1074 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1075 "whilelt p1.s, %[p], %[w]\n"
1076 "ld1w z13.s, p0/z, [%[inptr]]\n"
1077 "incw %[p], all, mul #1\n"
1078 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1079 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1080 "fadd z13.s, z13.s, z2.s\n"
1081 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1082 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1083 "whilelt p2.s, %[p], %[w]\n"
1084 "fadd z16.s, z16.s, z2.s\n"
1085 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1086 "fmin z13.s, p0/m, z13.s, z0.s\n"
1087 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1088 "fadd z14.s, z14.s, z3.s\n"
1089 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1090 "fmin z16.s, p0/m, z16.s, z0.s\n"
1091 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1092 "fmax z13.s, p0/m, z13.s, z1.s\n"
1093 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1094 "fadd z17.s, z17.s, z3.s\n"
1095 "addvl %[inptr], %[inptr], #24\n"
1096 "fmin z14.s, p1/m, z14.s, z0.s\n"
1097 "st1w z13.s, p0, [%[outptr0]]\n"
1098 "fadd z15.s, z15.s, z4.s\n"
1099 "fmax z16.s, p0/m, z16.s, z1.s\n"
1100 "fmin z17.s, p1/m, z17.s, z0.s\n"
1101 "fmax z14.s, p1/m, z14.s, z1.s\n"
1102 "fmin z15.s, p2/m, z15.s, z0.s\n"
1103 "fadd z18.s, z18.s, z4.s\n"
1104 "fmax z17.s, p1/m, z17.s, z1.s\n"
1105 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1106 "fmax z15.s, p2/m, z15.s, z1.s\n"
1107 "fmin z18.s, p2/m, z18.s, z0.s\n"
1108 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1109 "addvl %[outptr0], %[outptr0], #3\n"
1110 "fmax z18.s, p2/m, z18.s, z1.s\n"
1111 "st1w z16.s, p0, [%[outptr1]]\n"
1112 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1113 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1114 "addvl %[outptr1], %[outptr1], #3\n"
1115 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1116 [inptr]
"+r" (inptr), [p]
"+r" (p)
1117 : [
w]
"r" (
w), [biasptr]
"r" (biasptr), [minval]
"w" (minval), [maxval]
"w" (maxval)
1118 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1129 "mov z0.s, %s[maxval]\n"
1130 "addvl x8, %[inptr], #16\n"
1131 "mov z1.s, %s[minval]\n"
1132 "whilelt p0.s, %[p], %[w]\n"
1133 "incw %[p], all, mul #1\n"
1134 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1135 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1136 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1137 "whilelt p1.s, %[p], %[w]\n"
1138 "ld1w z13.s, p0/z, [%[inptr]]\n"
1139 "incw %[p], all, mul #1\n"
1140 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1141 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1142 "fadd z13.s, z13.s, z2.s\n"
1143 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1144 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1145 "whilelt p2.s, %[p], %[w]\n"
1146 "fadd z16.s, z16.s, z2.s\n"
1147 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1148 "fmin z13.s, p0/m, z13.s, z0.s\n"
1149 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1150 "fadd z14.s, z14.s, z3.s\n"
1151 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1152 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1153 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1154 "fmax z13.s, p0/m, z13.s, z1.s\n"
1155 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1156 "fmin z14.s, p1/m, z14.s, z0.s\n"
1157 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1158 "fadd z15.s, z15.s, z4.s\n"
1159 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1160 "fmin z16.s, p0/m, z16.s, z0.s\n"
1161 "st1w z13.s, p0, [%[outptr0]]\n"
1162 "fmax z14.s, p1/m, z14.s, z1.s\n"
1163 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1164 "fmin z15.s, p2/m, z15.s, z0.s\n"
1165 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1166 "fmax z16.s, p0/m, z16.s, z1.s\n"
1167 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1168 "fadd z17.s, z17.s, z3.s\n"
1169 "addvl %[inptr], %[inptr], #24\n"
1170 "fmax z15.s, p2/m, z15.s, z1.s\n"
1171 "fadd z18.s, z18.s, z4.s\n"
1172 "fmin z17.s, p1/m, z17.s, z0.s\n"
1173 "fadd z19.s, z19.s, z2.s\n"
1174 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1175 "fadd z20.s, z20.s, z3.s\n"
1176 "addvl %[outptr0], %[outptr0], #3\n"
1177 "fmax z17.s, p1/m, z17.s, z1.s\n"
1178 "st1w z16.s, p0, [%[outptr1]]\n"
1179 "fmin z18.s, p2/m, z18.s, z0.s\n"
1180 "fmin z19.s, p0/m, z19.s, z0.s\n"
1181 "fmin z20.s, p1/m, z20.s, z0.s\n"
1182 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1183 "fadd z13.s, z13.s, z4.s\n"
1184 "fmax z18.s, p2/m, z18.s, z1.s\n"
1185 "fmax z19.s, p0/m, z19.s, z1.s\n"
1186 "fmax z20.s, p1/m, z20.s, z1.s\n"
1187 "fmin z13.s, p2/m, z13.s, z0.s\n"
1188 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1189 "addvl %[outptr1], %[outptr1], #3\n"
1190 "fmax z13.s, p2/m, z13.s, z1.s\n"
1191 "st1w z19.s, p0, [%[outptr2]]\n"
1192 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1193 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1194 "addvl %[outptr2], %[outptr2], #3\n"
1195 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1196 [inptr]
"+r" (inptr), [p]
"+r" (p)
1197 : [
w]
"r" (
w), [biasptr]
"r" (biasptr), [minval]
"w" (minval), [maxval]
"w" (maxval)
1198 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1209 "mov z0.s, %s[maxval]\n"
1210 "addvl x8, %[inptr], #16\n"
1211 "mov z1.s, %s[minval]\n"
1212 "whilelt p0.s, %[p], %[w]\n"
1213 "incw %[p], all, mul #1\n"
1214 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1215 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1216 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1217 "whilelt p1.s, %[p], %[w]\n"
1218 "ld1w z13.s, p0/z, [%[inptr]]\n"
1219 "incw %[p], all, mul #1\n"
1220 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1221 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1222 "fadd z13.s, z13.s, z2.s\n"
1223 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1224 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1225 "whilelt p2.s, %[p], %[w]\n"
1226 "fadd z16.s, z16.s, z2.s\n"
1227 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1228 "fmin z13.s, p0/m, z13.s, z0.s\n"
1229 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1230 "fadd z14.s, z14.s, z3.s\n"
1231 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1232 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1233 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1234 "fmax z13.s, p0/m, z13.s, z1.s\n"
1235 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1236 "fmin z14.s, p1/m, z14.s, z0.s\n"
1237 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1238 "fadd z15.s, z15.s, z4.s\n"
1239 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1240 "fmin z16.s, p0/m, z16.s, z0.s\n"
1241 "st1w z13.s, p0, [%[outptr0]]\n"
1242 "fmax z14.s, p1/m, z14.s, z1.s\n"
1243 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1244 "fmin z15.s, p2/m, z15.s, z0.s\n"
1245 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1246 "fmax z16.s, p0/m, z16.s, z1.s\n"
1247 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1248 "fadd z17.s, z17.s, z3.s\n"
1249 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1250 "fmax z15.s, p2/m, z15.s, z1.s\n"
1251 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1252 "fadd z18.s, z18.s, z4.s\n"
1253 "addvl %[inptr], %[inptr], #24\n"
1254 "fmin z17.s, p1/m, z17.s, z0.s\n"
1255 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1256 "fadd z19.s, z19.s, z2.s\n"
1257 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1258 "fmin z18.s, p2/m, z18.s, z0.s\n"
1259 "addvl %[outptr0], %[outptr0], #3\n"
1260 "fmax z17.s, p1/m, z17.s, z1.s\n"
1261 "st1w z16.s, p0, [%[outptr1]]\n"
1262 "fmin z19.s, p0/m, z19.s, z0.s\n"
1263 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1264 "fmax z18.s, p2/m, z18.s, z1.s\n"
1265 "fadd z20.s, z20.s, z3.s\n"
1266 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1267 "fadd z13.s, z13.s, z4.s\n"
1268 "fmax z19.s, p0/m, z19.s, z1.s\n"
1269 "fadd z14.s, z14.s, z2.s\n"
1270 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1271 "fmin z20.s, p1/m, z20.s, z0.s\n"
1272 "addvl %[outptr1], %[outptr1], #3\n"
1273 "fmin z13.s, p2/m, z13.s, z0.s\n"
1274 "st1w z19.s, p0, [%[outptr2]]\n"
1275 "fmin z14.s, p0/m, z14.s, z0.s\n"
1276 "fmax z20.s, p1/m, z20.s, z1.s\n"
1277 "fadd z15.s, z15.s, z3.s\n"
1278 "fmax z13.s, p2/m, z13.s, z1.s\n"
1279 "fmax z14.s, p0/m, z14.s, z1.s\n"
1280 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1281 "fadd z16.s, z16.s, z4.s\n"
1282 "fmin z15.s, p1/m, z15.s, z0.s\n"
1283 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1284 "addvl %[outptr2], %[outptr2], #3\n"
1285 "fmax z15.s, p1/m, z15.s, z1.s\n"
1286 "fmin z16.s, p2/m, z16.s, z0.s\n"
1287 "st1w z14.s, p0, [%[outptr3]]\n"
1288 "fmax z16.s, p2/m, z16.s, z1.s\n"
1289 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1290 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1291 "addvl %[outptr3], %[outptr3], #3\n"
1292 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1293 [inptr]
"+r" (inptr), [p]
"+r" (p)
1294 : [
w]
"r" (
w), [biasptr]
"r" (biasptr), [minval]
"w" (minval), [maxval]
"w" (maxval)
1295 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1306 "mov z0.s, %s[maxval]\n"
1307 "addvl x8, %[inptr], #16\n"
1308 "mov z1.s, %s[minval]\n"
1309 "whilelt p0.s, %[p], %[w]\n"
1310 "incw %[p], all, mul #1\n"
1311 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1312 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1313 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1314 "whilelt p1.s, %[p], %[w]\n"
1315 "ld1w z13.s, p0/z, [%[inptr]]\n"
1316 "incw %[p], all, mul #1\n"
1317 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1318 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1319 "fadd z13.s, z13.s, z2.s\n"
1320 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1321 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1322 "whilelt p2.s, %[p], %[w]\n"
1323 "fadd z16.s, z16.s, z2.s\n"
1324 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1325 "fmin z13.s, p0/m, z13.s, z0.s\n"
1326 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1327 "fadd z14.s, z14.s, z3.s\n"
1328 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1329 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1330 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1331 "fmax z13.s, p0/m, z13.s, z1.s\n"
1332 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1333 "fmin z14.s, p1/m, z14.s, z0.s\n"
1334 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1335 "fadd z15.s, z15.s, z4.s\n"
1336 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1337 "fmin z16.s, p0/m, z16.s, z0.s\n"
1338 "st1w z13.s, p0, [%[outptr0]]\n"
1339 "fmax z14.s, p1/m, z14.s, z1.s\n"
1340 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1341 "fmin z15.s, p2/m, z15.s, z0.s\n"
1342 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1343 "fmax z16.s, p0/m, z16.s, z1.s\n"
1344 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1345 "fadd z17.s, z17.s, z3.s\n"
1346 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1347 "fmax z15.s, p2/m, z15.s, z1.s\n"
1348 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1349 "fadd z18.s, z18.s, z4.s\n"
1350 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1351 "fmin z17.s, p1/m, z17.s, z0.s\n"
1352 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1353 "fadd z19.s, z19.s, z2.s\n"
1354 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1355 "fmin z18.s, p2/m, z18.s, z0.s\n"
1356 "addvl %[outptr0], %[outptr0], #3\n"
1357 "fmax z17.s, p1/m, z17.s, z1.s\n"
1358 "st1w z16.s, p0, [%[outptr1]]\n"
1359 "fmin z19.s, p0/m, z19.s, z0.s\n"
1360 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1361 "fmax z18.s, p2/m, z18.s, z1.s\n"
1362 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1363 "fadd z20.s, z20.s, z3.s\n"
1364 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1365 "fmax z19.s, p0/m, z19.s, z1.s\n"
1366 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1367 "fadd z13.s, z13.s, z4.s\n"
1368 "addvl %[inptr], %[inptr], #24\n"
1369 "fmin z20.s, p1/m, z20.s, z0.s\n"
1370 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1371 "fadd z14.s, z14.s, z2.s\n"
1372 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1373 "fmin z13.s, p2/m, z13.s, z0.s\n"
1374 "addvl %[outptr1], %[outptr1], #3\n"
1375 "fmax z20.s, p1/m, z20.s, z1.s\n"
1376 "st1w z19.s, p0, [%[outptr2]]\n"
1377 "fmin z14.s, p0/m, z14.s, z0.s\n"
1378 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1379 "fmax z13.s, p2/m, z13.s, z1.s\n"
1380 "fadd z15.s, z15.s, z3.s\n"
1381 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1382 "fadd z16.s, z16.s, z4.s\n"
1383 "fmax z14.s, p0/m, z14.s, z1.s\n"
1384 "fadd z17.s, z17.s, z2.s\n"
1385 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1386 "fmin z15.s, p1/m, z15.s, z0.s\n"
1387 "addvl %[outptr2], %[outptr2], #3\n"
1388 "fmin z16.s, p2/m, z16.s, z0.s\n"
1389 "st1w z14.s, p0, [%[outptr3]]\n"
1390 "fmin z17.s, p0/m, z17.s, z0.s\n"
1391 "fmax z15.s, p1/m, z15.s, z1.s\n"
1392 "fadd z18.s, z18.s, z3.s\n"
1393 "fmax z16.s, p2/m, z16.s, z1.s\n"
1394 "fmax z17.s, p0/m, z17.s, z1.s\n"
1395 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1396 "fadd z19.s, z19.s, z4.s\n"
1397 "fmin z18.s, p1/m, z18.s, z0.s\n"
1398 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1399 "addvl %[outptr3], %[outptr3], #3\n"
1400 "fmax z18.s, p1/m, z18.s, z1.s\n"
1401 "fmin z19.s, p2/m, z19.s, z0.s\n"
1402 "st1w z17.s, p0, [%[outptr4]]\n"
1403 "fmax z19.s, p2/m, z19.s, z1.s\n"
1404 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1405 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1406 "addvl %[outptr4], %[outptr4], #3\n"
1407 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1408 [inptr]
"+r" (inptr), [p]
"+r" (p)
1409 : [
w]
"r" (
w), [biasptr]
"r" (biasptr), [minval]
"w" (minval), [maxval]
"w" (maxval)
1410 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1421 "mov z0.s, %s[maxval]\n"
1422 "addvl x8, %[inptr], #16\n"
1423 "mov z1.s, %s[minval]\n"
1424 "whilelt p0.s, %[p], %[w]\n"
1425 "incw %[p], all, mul #1\n"
1426 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1427 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1428 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1429 "whilelt p1.s, %[p], %[w]\n"
1430 "ld1w z13.s, p0/z, [%[inptr]]\n"
1431 "incw %[p], all, mul #1\n"
1432 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1433 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1434 "fadd z13.s, z13.s, z2.s\n"
1435 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1436 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1437 "whilelt p2.s, %[p], %[w]\n"
1438 "fadd z16.s, z16.s, z2.s\n"
1439 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1440 "fmin z13.s, p0/m, z13.s, z0.s\n"
1441 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1442 "fadd z14.s, z14.s, z3.s\n"
1443 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1444 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1445 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1446 "fmax z13.s, p0/m, z13.s, z1.s\n"
1447 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1448 "fmin z14.s, p1/m, z14.s, z0.s\n"
1449 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1450 "fadd z15.s, z15.s, z4.s\n"
1451 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1452 "fmin z16.s, p0/m, z16.s, z0.s\n"
1453 "st1w z13.s, p0, [%[outptr0]]\n"
1454 "fmax z14.s, p1/m, z14.s, z1.s\n"
1455 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1456 "fmin z15.s, p2/m, z15.s, z0.s\n"
1457 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1458 "fmax z16.s, p0/m, z16.s, z1.s\n"
1459 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1460 "fadd z17.s, z17.s, z3.s\n"
1461 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1462 "fmax z15.s, p2/m, z15.s, z1.s\n"
1463 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1464 "fadd z18.s, z18.s, z4.s\n"
1465 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1466 "fmin z17.s, p1/m, z17.s, z0.s\n"
1467 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1468 "fadd z19.s, z19.s, z2.s\n"
1469 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1470 "fmin z18.s, p2/m, z18.s, z0.s\n"
1471 "addvl %[outptr0], %[outptr0], #3\n"
1472 "fmax z17.s, p1/m, z17.s, z1.s\n"
1473 "st1w z16.s, p0, [%[outptr1]]\n"
1474 "fmin z19.s, p0/m, z19.s, z0.s\n"
1475 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1476 "fmax z18.s, p2/m, z18.s, z1.s\n"
1477 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1478 "fadd z20.s, z20.s, z3.s\n"
1479 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1480 "fmax z19.s, p0/m, z19.s, z1.s\n"
1481 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1482 "fadd z13.s, z13.s, z4.s\n"
1483 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1484 "fmin z20.s, p1/m, z20.s, z0.s\n"
1485 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1486 "fadd z14.s, z14.s, z2.s\n"
1487 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1488 "fmin z13.s, p2/m, z13.s, z0.s\n"
1489 "addvl %[outptr1], %[outptr1], #3\n"
1490 "fmax z20.s, p1/m, z20.s, z1.s\n"
1491 "st1w z19.s, p0, [%[outptr2]]\n"
1492 "fmin z14.s, p0/m, z14.s, z0.s\n"
1493 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1494 "fmax z13.s, p2/m, z13.s, z1.s\n"
1495 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1496 "fadd z15.s, z15.s, z3.s\n"
1497 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1498 "fmax z14.s, p0/m, z14.s, z1.s\n"
1499 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1500 "fadd z16.s, z16.s, z4.s\n"
1501 "addvl %[inptr], %[inptr], #24\n"
1502 "fmin z15.s, p1/m, z15.s, z0.s\n"
1503 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1504 "fadd z17.s, z17.s, z2.s\n"
1505 "ld1w z13.s, p1/z, [x8]\n"
1506 "fmin z16.s, p2/m, z16.s, z0.s\n"
1507 "addvl %[outptr2], %[outptr2], #3\n"
1508 "fmax z15.s, p1/m, z15.s, z1.s\n"
1509 "st1w z14.s, p0, [%[outptr3]]\n"
1510 "fmin z17.s, p0/m, z17.s, z0.s\n"
1511 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1512 "fmax z16.s, p2/m, z16.s, z1.s\n"
1513 "fadd z18.s, z18.s, z3.s\n"
1514 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1515 "fadd z19.s, z19.s, z4.s\n"
1516 "fmax z17.s, p0/m, z17.s, z1.s\n"
1517 "fadd z20.s, z20.s, z2.s\n"
1518 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1519 "fmin z18.s, p1/m, z18.s, z0.s\n"
1520 "addvl %[outptr3], %[outptr3], #3\n"
1521 "fmin z19.s, p2/m, z19.s, z0.s\n"
1522 "st1w z17.s, p0, [%[outptr4]]\n"
1523 "fmin z20.s, p0/m, z20.s, z0.s\n"
1524 "fmax z18.s, p1/m, z18.s, z1.s\n"
1525 "fadd z13.s, z13.s, z3.s\n"
1526 "fmax z19.s, p2/m, z19.s, z1.s\n"
1527 "fmax z20.s, p0/m, z20.s, z1.s\n"
1528 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1529 "fadd z14.s, z14.s, z4.s\n"
1530 "fmin z13.s, p1/m, z13.s, z0.s\n"
1531 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1532 "addvl %[outptr4], %[outptr4], #3\n"
1533 "fmax z13.s, p1/m, z13.s, z1.s\n"
1534 "fmin z14.s, p2/m, z14.s, z0.s\n"
1535 "st1w z20.s, p0, [%[outptr5]]\n"
1536 "fmax z14.s, p2/m, z14.s, z1.s\n"
1537 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1538 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1539 "addvl %[outptr5], %[outptr5], #3\n"
1540 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1541 [inptr]
"+r" (inptr), [p]
"+r" (p)
1542 : [
w]
"r" (
w), [biasptr]
"r" (biasptr), [minval]
"w" (minval), [maxval]
"w" (maxval)
1543 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
1554 "mov z0.s, %s[maxval]\n"
1555 "addvl x8, %[inptr], #16\n"
1556 "mov z1.s, %s[minval]\n"
1557 "whilelt p0.s, %[p], %[w]\n"
1558 "incw %[p], all, mul #1\n"
1559 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1560 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1561 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1562 "whilelt p1.s, %[p], %[w]\n"
1563 "ld1w z13.s, p0/z, [%[inptr]]\n"
1564 "incw %[p], all, mul #1\n"
1565 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1566 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1567 "fadd z13.s, z13.s, z2.s\n"
1568 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1569 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1570 "whilelt p2.s, %[p], %[w]\n"
1571 "fadd z16.s, z16.s, z2.s\n"
1572 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1573 "fmin z13.s, p0/m, z13.s, z0.s\n"
1574 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1575 "fadd z14.s, z14.s, z3.s\n"
1576 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1577 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1578 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1579 "fmax z13.s, p0/m, z13.s, z1.s\n"
1580 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1581 "fmin z14.s, p1/m, z14.s, z0.s\n"
1582 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1583 "fadd z15.s, z15.s, z4.s\n"
1584 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1585 "fmin z16.s, p0/m, z16.s, z0.s\n"
1586 "st1w z13.s, p0, [%[outptr0]]\n"
1587 "fmax z14.s, p1/m, z14.s, z1.s\n"
1588 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1589 "fmin z15.s, p2/m, z15.s, z0.s\n"
1590 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1591 "fmax z16.s, p0/m, z16.s, z1.s\n"
1592 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1593 "fadd z17.s, z17.s, z3.s\n"
1594 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1595 "fmax z15.s, p2/m, z15.s, z1.s\n"
1596 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1597 "fadd z18.s, z18.s, z4.s\n"
1598 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1599 "fmin z17.s, p1/m, z17.s, z0.s\n"
1600 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1601 "fadd z19.s, z19.s, z2.s\n"
1602 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1603 "fmin z18.s, p2/m, z18.s, z0.s\n"
1604 "addvl %[outptr0], %[outptr0], #3\n"
1605 "fmax z17.s, p1/m, z17.s, z1.s\n"
1606 "st1w z16.s, p0, [%[outptr1]]\n"
1607 "fmin z19.s, p0/m, z19.s, z0.s\n"
1608 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1609 "fmax z18.s, p2/m, z18.s, z1.s\n"
1610 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1611 "fadd z20.s, z20.s, z3.s\n"
1612 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1613 "fmax z19.s, p0/m, z19.s, z1.s\n"
1614 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1615 "fadd z13.s, z13.s, z4.s\n"
1616 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1617 "fmin z20.s, p1/m, z20.s, z0.s\n"
1618 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1619 "fadd z14.s, z14.s, z2.s\n"
1620 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1621 "fmin z13.s, p2/m, z13.s, z0.s\n"
1622 "addvl %[outptr1], %[outptr1], #3\n"
1623 "fmax z20.s, p1/m, z20.s, z1.s\n"
1624 "st1w z19.s, p0, [%[outptr2]]\n"
1625 "fmin z14.s, p0/m, z14.s, z0.s\n"
1626 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1627 "fmax z13.s, p2/m, z13.s, z1.s\n"
1628 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1629 "fadd z15.s, z15.s, z3.s\n"
1630 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1631 "fmax z14.s, p0/m, z14.s, z1.s\n"
1632 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1633 "fadd z16.s, z16.s, z4.s\n"
1634 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1635 "fmin z15.s, p1/m, z15.s, z0.s\n"
1636 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1637 "fadd z17.s, z17.s, z2.s\n"
1638 "ld1w z13.s, p1/z, [x8]\n"
1639 "fmin z16.s, p2/m, z16.s, z0.s\n"
1640 "addvl %[outptr2], %[outptr2], #3\n"
1641 "fmax z15.s, p1/m, z15.s, z1.s\n"
1642 "st1w z14.s, p0, [%[outptr3]]\n"
1643 "fmin z17.s, p0/m, z17.s, z0.s\n"
1644 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1645 "fmax z16.s, p2/m, z16.s, z1.s\n"
1646 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1647 "fadd z18.s, z18.s, z3.s\n"
1648 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1649 "fmax z17.s, p0/m, z17.s, z1.s\n"
1650 "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1651 "fadd z19.s, z19.s, z4.s\n"
1652 "addvl %[inptr], %[inptr], #24\n"
1653 "fmin z18.s, p1/m, z18.s, z0.s\n"
1654 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1655 "fadd z20.s, z20.s, z2.s\n"
1656 "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1657 "fmin z19.s, p2/m, z19.s, z0.s\n"
1658 "addvl %[outptr3], %[outptr3], #3\n"
1659 "fmax z18.s, p1/m, z18.s, z1.s\n"
1660 "st1w z17.s, p0, [%[outptr4]]\n"
1661 "fmin z20.s, p0/m, z20.s, z0.s\n"
1662 "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1663 "fmax z19.s, p2/m, z19.s, z1.s\n"
1664 "fadd z13.s, z13.s, z3.s\n"
1665 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1666 "fadd z14.s, z14.s, z4.s\n"
1667 "fmax z20.s, p0/m, z20.s, z1.s\n"
1668 "fadd z15.s, z15.s, z2.s\n"
1669 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1670 "fmin z13.s, p1/m, z13.s, z0.s\n"
1671 "addvl %[outptr4], %[outptr4], #3\n"
1672 "fmin z14.s, p2/m, z14.s, z0.s\n"
1673 "st1w z20.s, p0, [%[outptr5]]\n"
1674 "fmin z15.s, p0/m, z15.s, z0.s\n"
1675 "fmax z13.s, p1/m, z13.s, z1.s\n"
1676 "fadd z16.s, z16.s, z3.s\n"
1677 "fmax z14.s, p2/m, z14.s, z1.s\n"
1678 "fmax z15.s, p0/m, z15.s, z1.s\n"
1679 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1680 "fadd z17.s, z17.s, z4.s\n"
1681 "fmin z16.s, p1/m, z16.s, z0.s\n"
1682 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1683 "addvl %[outptr5], %[outptr5], #3\n"
1684 "fmax z16.s, p1/m, z16.s, z1.s\n"
1685 "fmin z17.s, p2/m, z17.s, z0.s\n"
1686 "st1w z15.s, p0, [%[outptr6]]\n"
1687 "fmax z17.s, p2/m, z17.s, z1.s\n"
1688 "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1689 "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1690 "addvl %[outptr6], %[outptr6], #3\n"
1691 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1692 [inptr]
"+r" (inptr), [p]
"+r" (p)
1693 : [
w]
"r" (
w), [biasptr]
"r" (biasptr), [minval]
"w" (minval), [maxval]
"w" (maxval)
1694 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"memory",
"cc"
// Eight-row bias-add + clamp tile: three predicated vector columns per row.
//   z0 / z1  : maxval / minval broadcast to every 32-bit lane.
//   z2 - z4  : the three bias vectors loaded from biasptr (one per column).
//   z13 - z20: rotating temporaries for the load -> fadd -> fmin -> fmax -> st1w chain.
//   p0 - p2  : per-column lane masks produced by whilelt from the running index p and w
//              (w is presumably the number of columns still to be written - confirm where
//              it is declared).
// Each row r reads input vectors 3r..3r+2 and writes them to outptr<r>, which is then
// advanced by three vector lengths.
1706 "mov z0.s, %s[maxval]\n"
// x8 = inptr + 16 vector lengths, so the #-8..#7 MUL VL immediates used below can reach
// input vectors 8-23 of this tile.
1707 "addvl x8, %[inptr], #16\n"
1708 "mov z1.s, %s[minval]\n"
// Build the mask for column 0, then step p by one vector's worth of 32-bit lanes.
1709 "whilelt p0.s, %[p], %[w]\n"
1710 "incw %[p], all, mul #1\n"
1711 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1712 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1713 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1714 "whilelt p1.s, %[p], %[w]\n"
1715 "ld1w z13.s, p0/z, [%[inptr]]\n"
1716 "incw %[p], all, mul #1\n"
1717 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
1718 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1719 "fadd z13.s, z13.s, z2.s\n"
1720 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1721 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1722 "whilelt p2.s, %[p], %[w]\n"
1723 "fadd z16.s, z16.s, z2.s\n"
1724 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1725 "fmin z13.s, p0/m, z13.s, z0.s\n"
1726 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1727 "fadd z14.s, z14.s, z3.s\n"
1728 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1729 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1730 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1731 "fmax z13.s, p0/m, z13.s, z1.s\n"
1732 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1733 "fmin z14.s, p1/m, z14.s, z0.s\n"
1734 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1735 "fadd z15.s, z15.s, z4.s\n"
1736 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1737 "fmin z16.s, p0/m, z16.s, z0.s\n"
1738 "st1w z13.s, p0, [%[outptr0]]\n"
1739 "fmax z14.s, p1/m, z14.s, z1.s\n"
1740 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1741 "fmin z15.s, p2/m, z15.s, z0.s\n"
1742 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1743 "fmax z16.s, p0/m, z16.s, z1.s\n"
1744 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1745 "fadd z17.s, z17.s, z3.s\n"
1746 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1747 "fmax z15.s, p2/m, z15.s, z1.s\n"
1748 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1749 "fadd z18.s, z18.s, z4.s\n"
1750 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1751 "fmin z17.s, p1/m, z17.s, z0.s\n"
1752 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1753 "fadd z19.s, z19.s, z2.s\n"
1754 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1755 "fmin z18.s, p2/m, z18.s, z0.s\n"
1756 "addvl %[outptr0], %[outptr0], #3\n"
1757 "fmax z17.s, p1/m, z17.s, z1.s\n"
1758 "st1w z16.s, p0, [%[outptr1]]\n"
1759 "fmin z19.s, p0/m, z19.s, z0.s\n"
1760 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1761 "fmax z18.s, p2/m, z18.s, z1.s\n"
1762 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1763 "fadd z20.s, z20.s, z3.s\n"
1764 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1765 "fmax z19.s, p0/m, z19.s, z1.s\n"
1766 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1767 "fadd z13.s, z13.s, z4.s\n"
1768 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1769 "fmin z20.s, p1/m, z20.s, z0.s\n"
1770 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1771 "fadd z14.s, z14.s, z2.s\n"
1772 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1773 "fmin z13.s, p2/m, z13.s, z0.s\n"
1774 "addvl %[outptr1], %[outptr1], #3\n"
1775 "fmax z20.s, p1/m, z20.s, z1.s\n"
1776 "st1w z19.s, p0, [%[outptr2]]\n"
1777 "fmin z14.s, p0/m, z14.s, z0.s\n"
1778 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1779 "fmax z13.s, p2/m, z13.s, z1.s\n"
1780 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1781 "fadd z15.s, z15.s, z3.s\n"
1782 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1783 "fmax z14.s, p0/m, z14.s, z1.s\n"
1784 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1785 "fadd z16.s, z16.s, z4.s\n"
1786 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1787 "fmin z15.s, p1/m, z15.s, z0.s\n"
1788 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1789 "fadd z17.s, z17.s, z2.s\n"
1790 "ld1w z13.s, p1/z, [x8]\n"
1791 "fmin z16.s, p2/m, z16.s, z0.s\n"
1792 "addvl %[outptr2], %[outptr2], #3\n"
1793 "fmax z15.s, p1/m, z15.s, z1.s\n"
1794 "st1w z14.s, p0, [%[outptr3]]\n"
1795 "fmin z17.s, p0/m, z17.s, z0.s\n"
1796 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1797 "fmax z16.s, p2/m, z16.s, z1.s\n"
1798 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1799 "fadd z18.s, z18.s, z3.s\n"
1800 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1801 "fmax z17.s, p0/m, z17.s, z1.s\n"
1802 "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1803 "fadd z19.s, z19.s, z4.s\n"
1804 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1805 "fmin z18.s, p1/m, z18.s, z0.s\n"
1806 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1807 "fadd z20.s, z20.s, z2.s\n"
1808 "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1809 "fmin z19.s, p2/m, z19.s, z0.s\n"
1810 "addvl %[outptr3], %[outptr3], #3\n"
1811 "fmax z18.s, p1/m, z18.s, z1.s\n"
1812 "st1w z17.s, p0, [%[outptr4]]\n"
1813 "fmin z20.s, p0/m, z20.s, z0.s\n"
1814 "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1815 "fmax z19.s, p2/m, z19.s, z1.s\n"
// Step inptr past the 24 vectors this tile consumes; every remaining load is addressed
// through x8, so moving inptr here does not affect them.
1816 "addvl %[inptr], %[inptr], #24\n"
1817 "fadd z13.s, z13.s, z3.s\n"
1818 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1819 "fmax z20.s, p0/m, z20.s, z1.s\n"
1820 "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
1821 "fadd z14.s, z14.s, z4.s\n"
1822 "fadd z15.s, z15.s, z2.s\n"
1823 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1824 "fmin z13.s, p1/m, z13.s, z0.s\n"
1825 "ld1w z19.s, p1/z, [x8, #6, MUL VL]\n"
1826 "fadd z16.s, z16.s, z3.s\n"
1827 "addvl %[outptr4], %[outptr4], #3\n"
1828 "fmin z14.s, p2/m, z14.s, z0.s\n"
1829 "st1w z20.s, p0, [%[outptr5]]\n"
1830 "fmax z13.s, p1/m, z13.s, z1.s\n"
1831 "ld1w z20.s, p2/z, [x8, #7, MUL VL]\n"
1832 "fmin z15.s, p0/m, z15.s, z0.s\n"
1833 "fmin z16.s, p1/m, z16.s, z0.s\n"
1834 "fmax z14.s, p2/m, z14.s, z1.s\n"
1835 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1836 "fadd z17.s, z17.s, z4.s\n"
1837 "fmax z15.s, p0/m, z15.s, z1.s\n"
1838 "fmax z16.s, p1/m, z16.s, z1.s\n"
1839 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1840 "fadd z18.s, z18.s, z2.s\n"
1841 "addvl %[outptr5], %[outptr5], #3\n"
1842 "fmin z17.s, p2/m, z17.s, z0.s\n"
1843 "st1w z15.s, p0, [%[outptr6]]\n"
1844 "fadd z19.s, z19.s, z3.s\n"
1845 "fmin z18.s, p0/m, z18.s, z0.s\n"
1846 "fadd z20.s, z20.s, z4.s\n"
1847 "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1848 "fmax z17.s, p2/m, z17.s, z1.s\n"
1849 "fmin z19.s, p1/m, z19.s, z0.s\n"
1850 "fmax z18.s, p0/m, z18.s, z1.s\n"
1851 "fmin z20.s, p2/m, z20.s, z0.s\n"
1852 "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1853 "addvl %[outptr6], %[outptr6], #3\n"
1854 "fmax z19.s, p1/m, z19.s, z1.s\n"
1855 "fmax z20.s, p2/m, z20.s, z1.s\n"
1856 "st1w z18.s, p0, [%[outptr7]]\n"
1857 "st1w z19.s, p1, [%[outptr7], #1, MUL VL]\n"
1858 "st1w z20.s, p2, [%[outptr7], #2, MUL VL]\n"
1859 "addvl %[outptr7], %[outptr7], #3\n"
// Read-write operands: all eight row pointers, the input cursor and the lane index p
// are advanced by the assembly above.
1860 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3), [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [outptr6]
"+r" (outptr6), [outptr7]
"+r" (outptr7),
1861 [inptr]
"+r" (inptr), [p]
"+r" (p)
// Read-only operands; minval/maxval use the "w" constraint so that %s[...] names the
// scalar FP register that gets broadcast into z1/z0.
1862 : [
w]
"r" (
w), [biasptr]
"r" (biasptr), [minval]
"w" (minval), [maxval]
"w" (maxval)
1863 :
"x8",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
// p0-p2 are overwritten by the whilelt instructions above, so they must be declared
// as clobbered; otherwise the compiler may assume predicate values survive the asm.
"p0",
"p1",
"p2",
"memory",
"cc"
1875 #endif // ARM_COMPUTE_ENABLE_SVE