28 #if defined(ARM_COMPUTE_ENABLE_SVE)
// Kernel entry point implemented with inline SVE assembly.
//
// What the visible code does:
//   1. Copies the 25 pointers of `input_ptrs` into an `Args` object in a
//      permuted order, together with the output pointer array, the parameter
//      blob and the clamp bounds.
//   2. Runs an assembly body that, for one vector of 32-bit lanes at a time,
//      seeds nine accumulators (z21-z29) from z14, applies multiply-accumulates
//      (fmla) of the loaded input vectors against z0-z8, clamps each
//      accumulator with fmax against Args::min and fmin against Args::max, and
//      stores one vector through each of the nine pointers read from `outptrs`.
//
// Parameters visible here:
//   input_ptrs - array of input row pointers; indices 0..24 are read.
//   outptrs    - array of output pointers; byte offsets 0x0..0x40 are read by
//                the assembly (nine 8-byte entries).
//   n_channels - upper bound used by the `whilelt` predicates and `cmp`.
//
// NOTE(review): judging by the name this is an fp32, NHWC, 3x3 kernel with
// stride 1 producing a 3x3 output tile -- confirm against the caller.
// NOTE(review): the embedded original line numbers skip (e.g. 94-95, 121-122,
// 149-150, 265, 273, 307-308, 335-336, 468), so some statements are not
// visible in this view. In particular p3, p1, x15 and x16 are used below but
// no visible line initialises them, and no label or branch is visible.
33 void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
34 const float *
const *
const input_ptrs,
35 float *
const *
const outptrs,
37 unsigned int n_channels,
// Members of the argument struct handed to the assembly (the struct header and
// the remaining members are not visible in this view).
44 float *
const *outptrs;
// Input pointers in the order used by the assembly; 25 entries of 8 bytes each
// match the offsets 0x0..0xc0 read relative to x17 below.
47 const float *inptrs[25];
// Args constructor: stores the output pointers, parameter blob and clamp
// bounds, then fills inptrs[] with a fixed permutation of input_ptrs[].
50 const float *
const *
const input_ptrs,
51 float *
const *
const outptrs,
52 const void *
const params,
55 ) : outptrs(outptrs), params(params), min(min), max(max)
// Every index 0..24 of input_ptrs appears exactly once on the right-hand side.
// Presumably the permutation matches the order in which the assembly consumes
// the pointers -- TODO confirm.
57 inptrs[0] = input_ptrs[12];
58 inptrs[1] = input_ptrs[0];
59 inptrs[2] = input_ptrs[4];
60 inptrs[3] = input_ptrs[20];
61 inptrs[4] = input_ptrs[7];
62 inptrs[5] = input_ptrs[24];
63 inptrs[6] = input_ptrs[11];
64 inptrs[7] = input_ptrs[1];
65 inptrs[8] = input_ptrs[3];
66 inptrs[9] = input_ptrs[13];
67 inptrs[10] = input_ptrs[5];
68 inptrs[11] = input_ptrs[9];
69 inptrs[12] = input_ptrs[15];
70 inptrs[13] = input_ptrs[17];
71 inptrs[14] = input_ptrs[19];
72 inptrs[15] = input_ptrs[21];
73 inptrs[16] = input_ptrs[6];
74 inptrs[17] = input_ptrs[8];
75 inptrs[18] = input_ptrs[23];
76 inptrs[19] = input_ptrs[16];
77 inptrs[20] = input_ptrs[2];
78 inptrs[21] = input_ptrs[18];
79 inptrs[22] = input_ptrs[10];
80 inptrs[23] = input_ptrs[14];
81 inptrs[24] = input_ptrs[22];
// Build the argument struct whose address is passed to the assembly as
// %x[params_struct] (the rest of this statement is not visible in this view).
86 Args params_struct(input_ptrs, outptrs, params,
// ---- Prologue -------------------------------------------------------------
// x8  <- Args::params (pointer into the packed parameter blob)
// x17 <- address of Args::inptrs[0]
// z14 <- first vector of the blob; it seeds every accumulator below.
// z0-z8 <- the next nine vectors; they are the multiplier operands of the fmla
// instructions. Presumably z14 is the bias and z0-z8 the 3x3 weights -- TODO
// confirm against the packing code.
91 "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
92 "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
93 "ld1w { z14.s }, p3/Z, [x8]\n"
96 "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
97 "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
// p2 enables the lanes whose index (starting at 0) is below n_channels.
98 "whilelt p2.s, XZR, %x[n_channels]\n"
99 "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
100 "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
101 "cmp x16, %x[n_channels]\n"
102 "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
103 "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
// x14 is the element index used by the stores; it starts at -x16.
104 "sub x14, XZR, x16\n"
105 "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
// x8 moves forward 16 vectors here and back 6 below: a net advance of 10
// vectors (z14 plus z0-z8) per pass.
106 "addvl x8, x8, #16\n"
// Fetch inptrs[0..4], the output pointer array (x13) and the clamp bounds
// broadcast to all lanes (z31 = Args::min, z30 = Args::max).
107 "ldp x24, x23, [x17, #0x0]\n"
108 "ldp x22, x21, [x17, #0x10]\n"
109 "ldr x20, [x17, #0x20]\n"
110 "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
111 "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
112 "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
113 "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
114 "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
// z9-z13 <- data behind inptrs[0..4] at element index x15, under predicate p2.
115 "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
116 "addvl x8, x8, #-6\n"
117 "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
118 "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
119 "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
120 "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
// ---- First copy of the compute body (original lines 123-306) ----------------
// Each `movprfx zN, z14` + fmla pair seeds accumulator zN from z14 and applies
// its first multiply-accumulate. Pointer loads from x17/x13 are interleaved
// with the arithmetic. Stores in this copy are predicated on p1 and the data
// for the following pass is preloaded under p0. Presumably this is the main
// channel loop -- the label and branch are not visible here.
123 "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
124 "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
125 "ldr x23, [x17, #0x30]\n"
126 "ldr x26, [x17, #0x38]\n"
127 "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
128 "fmla z29.s, p3/M, z0.s, z10.s\n"
129 "ldr x22, [x17, #0x28]\n"
130 "ldr x21, [x17, #0x48]\n"
131 "fmla z28.s, p3/M, z4.s, z13.s\n"
132 "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
133 "ldr x20, [x17, #0x40]\n"
134 "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
135 "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
136 "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
137 "ldr x25, [x17, #0x50]\n"
138 "ldr x24, [x17, #0x58]\n"
139 "fmla z27.s, p3/M, z2.s, z11.s\n"
140 "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
141 "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
142 "ldr x23, [x17, #0x60]\n"
143 "fmla z29.s, p3/M, z5.s, z13.s\n"
144 "fmla z28.s, p3/M, z6.s, z18.s\n"
145 "ldr x12, [x17, #0x70]\n"
146 "ldr x11, [x17, #0x88]\n"
147 "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
148 "fmla z27.s, p3/M, z3.s, z13.s\n"
151 "fmla z26.s, p3/M, z2.s, z13.s\n"
152 "fmla z25.s, p3/M, z1.s, z13.s\n"
153 "ldr x10, [x13, #0x0]\n"
// p0 enables the lanes whose index (starting at x16) is below n_channels; it
// guards the preloads for the following pass.
154 "whilelt p0.s, x16, %x[n_channels]\n"
155 "fmla z24.s, p3/M, z0.s, z13.s\n"
156 "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
157 "fmla z23.s, p3/M, z6.s, z12.s\n"
158 "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
159 "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
160 "fmla z29.s, p3/M, z7.s, z18.s\n"
161 "ldr x22, [x17, #0x68]\n"
162 "ldr x21, [x17, #0x78]\n"
163 "fmla z28.s, p3/M, z0.s, z17.s\n"
164 "fmla z22.s, p3/M, z8.s, z16.s\n"
165 "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
166 "ldr x20, [x17, #0x80]\n"
167 "fmla z26.s, p3/M, z4.s, z18.s\n"
168 "fmla z25.s, p3/M, z3.s, z18.s\n"
169 "ldr x9, [x13, #0x8]\n"
170 "ldr x28, [x13, #0x10]\n"
171 "fmla z21.s, p3/M, z0.s, z18.s\n"
172 "fmla z24.s, p3/M, z4.s, z19.s\n"
173 "ldr x27, [x13, #0x18]\n"
// All nine accumulators have been seeded, so z14 can be reloaded from the
// advanced parameter pointer for the following pass.
174 "ld1w { z14.s }, p3/Z, [x8]\n"
175 "fmla z23.s, p3/M, z1.s, z18.s\n"
176 "fmla z29.s, p3/M, z1.s, z17.s\n"
177 "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
178 "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
179 "fmla z28.s, p3/M, z2.s, z16.s\n"
180 "fmla z27.s, p3/M, z1.s, z16.s\n"
181 "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
182 "ldr x26, [x17, #0x90]\n"
183 "fmla z25.s, p3/M, z5.s, z19.s\n"
184 "fmla z21.s, p3/M, z2.s, z19.s\n"
185 "ldr x25, [x17, #0xa0]\n"
186 "ldr x24, [x17, #0x98]\n"
187 "fmla z26.s, p3/M, z0.s, z20.s\n"
188 "fmla z24.s, p3/M, z2.s, z17.s\n"
189 "fmla z28.s, p3/M, z8.s, z19.s\n"
190 "fmla z27.s, p3/M, z7.s, z19.s\n"
191 "fmla z22.s, p3/M, z1.s, z19.s\n"
192 "fmla z23.s, p3/M, z3.s, z16.s\n"
193 "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
194 "ldr x23, [x17, #0xa8]\n"
195 "fmla z26.s, p3/M, z6.s, z16.s\n"
196 "fmla z25.s, p3/M, z7.s, z18.s\n"
197 "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
198 "ldr x22, [x17, #0xc0]\n"
199 "fmla z24.s, p3/M, z6.s, z18.s\n"
200 "fmla z21.s, p3/M, z4.s, z18.s\n"
201 "fmla z29.s, p3/M, z3.s, z20.s\n"
202 "fmla z27.s, p3/M, z5.s, z17.s\n"
203 "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
204 "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
205 "fmla z23.s, p3/M, z5.s, z18.s\n"
206 "fmla z22.s, p3/M, z3.s, z18.s\n"
207 "ldr x21, [x17, #0xb0]\n"
208 "ldr x20, [x17, #0xb8]\n"
209 "fmla z26.s, p3/M, z8.s, z18.s\n"
210 "fmla z24.s, p3/M, z8.s, z17.s\n"
211 "fmla z21.s, p3/M, z6.s, z16.s\n"
212 "fmla z28.s, p3/M, z3.s, z19.s\n"
213 "fmla z25.s, p3/M, z0.s, z19.s\n"
214 "fmla z22.s, p3/M, z5.s, z17.s\n"
215 "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
216 "fmla z23.s, p3/M, z7.s, z16.s\n"
217 "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
218 "fmla z29.s, p3/M, z4.s, z19.s\n"
219 "fmla z26.s, p3/M, z1.s, z19.s\n"
220 "fmla z28.s, p3/M, z5.s, z17.s\n"
221 "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
222 "fmla z27.s, p3/M, z4.s, z17.s\n"
223 "fmla z25.s, p3/M, z2.s, z17.s\n"
224 "fmla z24.s, p3/M, z1.s, z17.s\n"
225 "fmla z21.s, p3/M, z8.s, z18.s\n"
226 "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
// x25 is free again: re-fetch inptrs[4] for the preload of z13 further down.
227 "ldr x25, [x17, #0x20]\n"
228 "fmla z22.s, p3/M, z7.s, z18.s\n"
229 "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
230 "fmla z29.s, p3/M, z2.s, z17.s\n"
231 "fmla z26.s, p3/M, z7.s, z16.s\n"
232 "fmla z25.s, p3/M, z6.s, z16.s\n"
233 "fmla z23.s, p3/M, z4.s, z16.s\n"
234 "fmla z21.s, p3/M, z3.s, z16.s\n"
235 "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
236 "fmla z22.s, p3/M, z4.s, z18.s\n"
237 "fmla z28.s, p3/M, z1.s, z17.s\n"
// From here each finished accumulator is clamped (fmax with z31 = min, then
// fmin with z30 = max) and stored through its output pointer at index x14.
238 "fmax z28.s, p3/M, z28.s, z31.s\n"
239 "fmin z28.s, p3/M, z28.s, z30.s\n"
240 "fmla z27.s, p3/M, z0.s, z17.s\n"
241 "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
242 "fmla z29.s, p3/M, z6.s, z16.s\n"
243 "fmax z29.s, p3/M, z29.s, z31.s\n"
244 "fmla z24.s, p3/M, z7.s, z18.s\n"
245 "fmla z21.s, p3/M, z5.s, z18.s\n"
246 "fmin z29.s, p3/M, z29.s, z30.s\n"
247 "st1w { z29.s }, p1, [x10, x14, LSL #2]\n"
248 "fmla z23.s, p3/M, z0.s, z16.s\n"
249 "fmla z22.s, p3/M, z2.s, z17.s\n"
250 "ldr x24, [x13, #0x20]\n"
251 "st1w { z28.s }, p1, [x9, x14, LSL #2]\n"
252 "fmla z25.s, p3/M, z8.s, z18.s\n"
253 "fmla z26.s, p3/M, z3.s, z16.s\n"
254 "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
// Re-fetch inptrs[0..3] for the preload of z9-z12 below.
255 "ldp x23, x22, [x17, #0x0]\n"
256 "fmla z27.s, p3/M, z8.s, z17.s\n"
257 "fmla z24.s, p3/M, z5.s, z17.s\n"
258 "ldp x21, x20, [x17, #0x10]\n"
259 "fmax z27.s, p3/M, z27.s, z31.s\n"
260 "fmla z23.s, p3/M, z8.s, z16.s\n"
261 "fmla z21.s, p3/M, z7.s, z16.s\n"
262 "fmax z26.s, p3/M, z26.s, z31.s\n"
263 "fmax z25.s, p3/M, z25.s, z31.s\n"
264 "fmla z22.s, p3/M, z6.s, z16.s\n"
// Preload z9-z13 for the following pass at element index x16 under p0.
266 "ld1w { z9.s }, p0/Z, [x23, x16, LSL #2]\n"
267 "ld1w { z10.s }, p0/Z, [x22, x16, LSL #2]\n"
268 "ld1w { z11.s }, p0/Z, [x21, x16, LSL #2]\n"
269 "ld1w { z12.s }, p0/Z, [x20, x16, LSL #2]\n"
270 "fmin z27.s, p3/M, z27.s, z30.s\n"
271 "fmin z26.s, p3/M, z26.s, z30.s\n"
272 "ld1w { z13.s }, p0/Z, [x25, x16, LSL #2]\n"
274 "fmin z25.s, p3/M, z25.s, z30.s\n"
275 "st1w { z27.s }, p1, [x28, x14, LSL #2]\n"
276 "fmax z24.s, p3/M, z24.s, z31.s\n"
277 "fmax z23.s, p3/M, z23.s, z31.s\n"
278 "st1w { z26.s }, p1, [x27, x14, LSL #2]\n"
279 "ldr x23, [x13, #0x28]\n"
280 "fmax z21.s, p3/M, z21.s, z31.s\n"
281 "fmax z22.s, p3/M, z22.s, z31.s\n"
282 "st1w { z25.s }, p1, [x24, x14, LSL #2]\n"
283 "ldr x22, [x13, #0x30]\n"
284 "ldr x21, [x13, #0x38]\n"
285 "ldr x20, [x13, #0x40]\n"
// Recompute p2 for index x15, set the flags from x16 vs n_channels, and reload
// z0-z8 from the advanced parameter pointer (again +16 / -6 vectors on x8).
286 "whilelt p2.s, x15, %x[n_channels]\n"
287 "cmp x16, %x[n_channels]\n"
288 "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
289 "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
290 "fmin z24.s, p3/M, z24.s, z30.s\n"
291 "fmin z23.s, p3/M, z23.s, z30.s\n"
292 "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
293 "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
294 "fmin z21.s, p3/M, z21.s, z30.s\n"
295 "fmin z22.s, p3/M, z22.s, z30.s\n"
296 "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
297 "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
298 "st1w { z24.s }, p1, [x23, x14, LSL #2]\n"
299 "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
300 "addvl x8, x8, #16\n"
301 "st1w { z23.s }, p1, [x22, x14, LSL #2]\n"
302 "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
// Note the register-to-output mapping: z21 goes to outptrs offset 0x38 and
// z22 to offset 0x40.
303 "st1w { z21.s }, p1, [x21, x14, LSL #2]\n"
304 "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
305 "addvl x8, x8, #-6\n"
306 "st1w { z22.s }, p1, [x20, x14, LSL #2]\n"
// ---- Second copy of the compute body (original lines 309-467) ---------------
// Same multiply-accumulate and clamp sequence as above, but the stores are
// predicated on p0 and nothing is reloaded from x8 or preloaded for a further
// pass. Presumably this handles the final (possibly partial) vector of
// channels -- TODO confirm; the label is not visible here.
309 "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
310 "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
311 "ldr x23, [x17, #0x30]\n"
312 "ldr x26, [x17, #0x38]\n"
313 "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
314 "fmla z29.s, p3/M, z0.s, z10.s\n"
315 "ldr x22, [x17, #0x28]\n"
316 "ldr x21, [x17, #0x48]\n"
317 "fmla z28.s, p3/M, z4.s, z13.s\n"
318 "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
319 "ldr x20, [x17, #0x40]\n"
320 "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
321 "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
322 "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
323 "ldr x25, [x17, #0x50]\n"
324 "ldr x24, [x17, #0x58]\n"
325 "fmla z27.s, p3/M, z2.s, z11.s\n"
326 "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
327 "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
328 "ldr x23, [x17, #0x60]\n"
329 "fmla z29.s, p3/M, z5.s, z13.s\n"
330 "fmla z28.s, p3/M, z6.s, z18.s\n"
331 "ldr x12, [x17, #0x70]\n"
332 "ldr x11, [x17, #0x88]\n"
333 "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
334 "fmla z27.s, p3/M, z3.s, z13.s\n"
337 "fmla z26.s, p3/M, z2.s, z13.s\n"
338 "fmla z25.s, p3/M, z1.s, z13.s\n"
339 "ldr x10, [x13, #0x0]\n"
340 "ldr x9, [x13, #0x8]\n"
341 "fmla z24.s, p3/M, z0.s, z13.s\n"
342 "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
343 "fmla z23.s, p3/M, z6.s, z12.s\n"
344 "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
345 "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
346 "fmla z29.s, p3/M, z7.s, z18.s\n"
347 "ldr x22, [x17, #0x68]\n"
348 "ldr x21, [x17, #0x78]\n"
349 "fmla z28.s, p3/M, z0.s, z17.s\n"
350 "fmla z22.s, p3/M, z8.s, z16.s\n"
351 "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
352 "ldr x20, [x17, #0x80]\n"
353 "fmla z26.s, p3/M, z4.s, z18.s\n"
354 "fmla z25.s, p3/M, z3.s, z18.s\n"
355 "ldr x28, [x13, #0x10]\n"
356 "ldr x27, [x13, #0x18]\n"
357 "fmla z21.s, p3/M, z0.s, z18.s\n"
358 "fmla z24.s, p3/M, z4.s, z19.s\n"
359 "fmla z23.s, p3/M, z1.s, z18.s\n"
360 "fmla z29.s, p3/M, z1.s, z17.s\n"
361 "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
362 "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
363 "fmla z28.s, p3/M, z2.s, z16.s\n"
364 "fmla z27.s, p3/M, z1.s, z16.s\n"
365 "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
366 "ldr x26, [x17, #0x90]\n"
367 "fmla z25.s, p3/M, z5.s, z19.s\n"
368 "fmla z21.s, p3/M, z2.s, z19.s\n"
369 "ldr x25, [x17, #0xa0]\n"
370 "ldr x24, [x17, #0x98]\n"
371 "fmla z26.s, p3/M, z0.s, z20.s\n"
372 "fmla z24.s, p3/M, z2.s, z17.s\n"
373 "fmla z28.s, p3/M, z8.s, z19.s\n"
374 "fmla z27.s, p3/M, z7.s, z19.s\n"
375 "fmla z22.s, p3/M, z1.s, z19.s\n"
376 "fmla z23.s, p3/M, z3.s, z16.s\n"
377 "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
378 "ldr x23, [x17, #0xa8]\n"
379 "fmla z26.s, p3/M, z6.s, z16.s\n"
380 "fmla z25.s, p3/M, z7.s, z18.s\n"
381 "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
382 "ldr x22, [x17, #0xc0]\n"
383 "fmla z24.s, p3/M, z6.s, z18.s\n"
384 "fmla z21.s, p3/M, z4.s, z18.s\n"
385 "fmla z29.s, p3/M, z3.s, z20.s\n"
386 "fmla z27.s, p3/M, z5.s, z17.s\n"
387 "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
388 "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
389 "fmla z23.s, p3/M, z5.s, z18.s\n"
390 "fmla z22.s, p3/M, z3.s, z18.s\n"
391 "ldr x21, [x17, #0xb0]\n"
392 "ldr x20, [x17, #0xb8]\n"
393 "fmla z26.s, p3/M, z8.s, z18.s\n"
394 "fmla z24.s, p3/M, z8.s, z17.s\n"
395 "fmla z21.s, p3/M, z6.s, z16.s\n"
396 "fmla z28.s, p3/M, z3.s, z19.s\n"
397 "fmla z25.s, p3/M, z0.s, z19.s\n"
398 "fmla z22.s, p3/M, z5.s, z17.s\n"
399 "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
400 "fmla z23.s, p3/M, z7.s, z16.s\n"
401 "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
402 "fmla z29.s, p3/M, z4.s, z19.s\n"
403 "fmla z26.s, p3/M, z1.s, z19.s\n"
404 "fmla z28.s, p3/M, z5.s, z17.s\n"
405 "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
406 "fmla z27.s, p3/M, z4.s, z17.s\n"
407 "fmla z25.s, p3/M, z2.s, z17.s\n"
408 "fmla z24.s, p3/M, z1.s, z17.s\n"
409 "fmla z21.s, p3/M, z8.s, z18.s\n"
410 "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
411 "fmla z22.s, p3/M, z7.s, z18.s\n"
412 "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
413 "fmla z29.s, p3/M, z2.s, z17.s\n"
414 "fmla z26.s, p3/M, z7.s, z16.s\n"
415 "fmla z25.s, p3/M, z6.s, z16.s\n"
416 "fmla z23.s, p3/M, z4.s, z16.s\n"
417 "fmla z21.s, p3/M, z3.s, z16.s\n"
418 "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
419 "fmla z22.s, p3/M, z4.s, z18.s\n"
420 "fmla z28.s, p3/M, z1.s, z17.s\n"
// Clamp and store, as in the first copy, but under predicate p0.
421 "fmax z28.s, p3/M, z28.s, z31.s\n"
422 "fmin z28.s, p3/M, z28.s, z30.s\n"
423 "fmla z27.s, p3/M, z0.s, z17.s\n"
424 "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
425 "fmla z29.s, p3/M, z6.s, z16.s\n"
426 "fmax z29.s, p3/M, z29.s, z31.s\n"
427 "fmla z24.s, p3/M, z7.s, z18.s\n"
428 "fmla z21.s, p3/M, z5.s, z18.s\n"
429 "fmin z29.s, p3/M, z29.s, z30.s\n"
430 "st1w { z29.s }, p0, [x10, x14, LSL #2]\n"
431 "fmla z23.s, p3/M, z0.s, z16.s\n"
432 "fmla z22.s, p3/M, z2.s, z17.s\n"
433 "ldr x20, [x13, #0x20]\n"
434 "st1w { z28.s }, p0, [x9, x14, LSL #2]\n"
435 "fmla z25.s, p3/M, z8.s, z18.s\n"
436 "fmla z26.s, p3/M, z3.s, z16.s\n"
437 "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
438 "fmax z26.s, p3/M, z26.s, z31.s\n"
439 "fmla z27.s, p3/M, z8.s, z17.s\n"
440 "fmla z24.s, p3/M, z5.s, z17.s\n"
441 "fmax z27.s, p3/M, z27.s, z31.s\n"
442 "fmax z25.s, p3/M, z25.s, z31.s\n"
443 "fmla z23.s, p3/M, z8.s, z16.s\n"
444 "fmla z21.s, p3/M, z7.s, z16.s\n"
445 "fmin z27.s, p3/M, z27.s, z30.s\n"
446 "fmin z26.s, p3/M, z26.s, z30.s\n"
447 "fmla z22.s, p3/M, z6.s, z16.s\n"
448 "fmin z25.s, p3/M, z25.s, z30.s\n"
449 "fmax z24.s, p3/M, z24.s, z31.s\n"
450 "st1w { z27.s }, p0, [x28, x14, LSL #2]\n"
451 "fmax z23.s, p3/M, z23.s, z31.s\n"
452 "fmax z21.s, p3/M, z21.s, z31.s\n"
453 "st1w { z26.s }, p0, [x27, x14, LSL #2]\n"
454 "ldr x23, [x13, #0x28]\n"
455 "fmax z22.s, p3/M, z22.s, z31.s\n"
456 "st1w { z25.s }, p0, [x20, x14, LSL #2]\n"
457 "ldr x22, [x13, #0x30]\n"
458 "ldr x21, [x13, #0x38]\n"
459 "ldr x20, [x13, #0x40]\n"
460 "fmin z24.s, p3/M, z24.s, z30.s\n"
461 "fmin z23.s, p3/M, z23.s, z30.s\n"
462 "st1w { z24.s }, p0, [x23, x14, LSL #2]\n"
463 "fmin z21.s, p3/M, z21.s, z30.s\n"
464 "fmin z22.s, p3/M, z22.s, z30.s\n"
465 "st1w { z23.s }, p0, [x22, x14, LSL #2]\n"
466 "st1w { z21.s }, p0, [x21, x14, LSL #2]\n"
467 "st1w { z22.s }, p0, [x20, x14, LSL #2]\n"
// Named operands referenced by the assembly above: the channel count in a
// general register, the Args member offsets as immediates, and the address of
// the argument struct.
469 : [n_channels]
"r" ((
unsigned long) n_channels), [offsetof_Args_inptrs]
"I" (offsetof(Args, inptrs)), [offsetof_args_max]
"I" (offsetof(Args, max)), [offsetof_args_min]
"I" (offsetof(Args, min)), [offsetof_args_outptrs]
"I" (offsetof(Args, outptrs)), [offsetof_args_params]
"I" (offsetof(Args, params)), [params_struct]
// NOTE(review): `¶ms_struct` below looks like a mis-encoded `&params_struct`
// (an HTML "&para" entity substitution); as written it is not a valid
// expression -- confirm against the upstream file and restore the `&`.
"r" (¶ms_struct)
// Clobbers: condition flags, memory (results are written through pointers),
// the predicate, general-purpose and SVE vector registers named in the
// assembly text.
470 :
"cc",
"memory",
"p0",
"p1",
"p2",
"p3",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z16",
"z17",
"z18",
"z19",
"z20",
"z21",
"z22",
"z23",
"z24",
"z25",
"z26",
"z27",
"z28",
"z29",
"z30",
"z31"
477 #endif // defined(ARM_COMPUTE_ENABLE_SVE)