28 #if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
33 void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
34 const __fp16 *
const *
const input_ptrs,
35 __fp16 *
const *
const outptrs,
37 unsigned int n_channels,
44 __fp16 *
const *outptrs;
46 const __fp16 min, max;
47 const __fp16 *inptrs[16];
50 const __fp16 *
const *
const input_ptrs,
51 __fp16 *
const *
const outptrs,
52 const void *
const params,
55 ) : outptrs(outptrs), params(params), min(min), max(max)
57 inptrs[0] = input_ptrs[5];
58 inptrs[1] = input_ptrs[0];
59 inptrs[2] = input_ptrs[3];
60 inptrs[3] = input_ptrs[6];
61 inptrs[4] = input_ptrs[9];
62 inptrs[5] = input_ptrs[12];
63 inptrs[6] = input_ptrs[15];
64 inptrs[7] = input_ptrs[1];
65 inptrs[8] = input_ptrs[2];
66 inptrs[9] = input_ptrs[10];
67 inptrs[10] = input_ptrs[4];
68 inptrs[11] = input_ptrs[7];
69 inptrs[12] = input_ptrs[8];
70 inptrs[13] = input_ptrs[11];
71 inptrs[14] = input_ptrs[13];
72 inptrs[15] = input_ptrs[14];
77 Args params_struct(input_ptrs, outptrs, params,
82 "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
83 "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
84 "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
86 "ldp x13, x12, [x20, #0x0]\n"
87 "ldp x11, x10, [x20, #0x10]\n"
89 "whilelt p2.h, XZR, %x[n_channels]\n"
90 "ld1h { z20.h }, p3/Z, [x16]\n"
91 "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
92 "cmp x14, %x[n_channels]\n"
93 "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
94 "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
96 "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
97 "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
98 "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
99 "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
100 "addvl x16, x16, #16\n"
101 "ldp x24, x23, [x15, #0x0]\n"
102 "ldp x22, x21, [x15, #0x10]\n"
103 "ldr x20, [x15, #0x20]\n"
104 "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
105 "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
106 "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
107 "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
108 "ld1h { z9.h }, p2/Z, [x24, x9, LSL #1]\n"
109 "addvl x16, x16, #-6\n"
110 "ld1h { z10.h }, p2/Z, [x23, x9, LSL #1]\n"
111 "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
112 "ld1h { z12.h }, p2/Z, [x21, x9, LSL #1]\n"
113 "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
116 "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
117 "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
118 "ldr x21, [x15, #0x28]\n"
119 "ldr x20, [x15, #0x30]\n"
120 "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
121 "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
122 "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
123 "ldr x22, [x15, #0x38]\n"
124 "fmla z24.h, p3/M, z0.h, z10.h\n"
125 "fmla z23.h, p3/M, z2.h, z11.h\n"
126 "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
127 "ldr x21, [x15, #0x48]\n"
128 "fmla z22.h, p3/M, z2.h, z12.h\n"
129 "fmla z21.h, p3/M, z1.h, z12.h\n"
130 "ldr x20, [x15, #0x40]\n"
131 "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
132 "fmla z24.h, p3/M, z5.h, z12.h\n"
133 "fmla z23.h, p3/M, z4.h, z12.h\n"
134 "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
135 "ldr x22, [x15, #0x50]\n"
136 "fmla z22.h, p3/M, z6.h, z18.h\n"
137 "fmla z21.h, p3/M, z3.h, z13.h\n"
138 "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
139 "ldr x21, [x15, #0x58]\n"
140 "fmla z24.h, p3/M, z7.h, z13.h\n"
141 "fmla z23.h, p3/M, z6.h, z13.h\n"
142 "ldr x20, [x15, #0x60]\n"
143 "ldr x27, [x15, #0x68]\n"
144 "fmla z22.h, p3/M, z4.h, z13.h\n"
145 "fmla z21.h, p3/M, z8.h, z17.h\n"
146 "ld1h { z17.h }, p2/Z, [x22, x9, LSL #1]\n"
147 "ldr x26, [x15, #0x70]\n"
148 "fmla z24.h, p3/M, z1.h, z16.h\n"
149 "fmla z23.h, p3/M, z0.h, z16.h\n"
150 "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
151 "ldr x25, [x15, #0x78]\n"
152 "fmla z22.h, p3/M, z5.h, z20.h\n"
153 "fmla z21.h, p3/M, z4.h, z20.h\n"
154 "whilelt p1.h, x14, %x[n_channels]\n"
155 "ldp x24, x23, [x15, #0x0]\n"
156 "fmla z24.h, p3/M, z2.h, z18.h\n"
157 "fmla z23.h, p3/M, z1.h, z18.h\n"
158 "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
159 "ldp x22, x21, [x15, #0x10]\n"
160 "fmla z22.h, p3/M, z0.h, z17.h\n"
161 "fmla z21.h, p3/M, z2.h, z16.h\n"
162 "ldr x20, [x15, #0x20]\n"
163 "ld1h { z13.h }, p1/Z, [x20, x14, LSL #1]\n"
164 "fmla z24.h, p3/M, z8.h, z20.h\n"
165 "fmla z23.h, p3/M, z7.h, z20.h\n"
166 "ld1h { z18.h }, p2/Z, [x27, x9, LSL #1]\n"
168 "fmla z22.h, p3/M, z3.h, z19.h\n"
169 "fmla z21.h, p3/M, z5.h, z18.h\n"
171 "ld1h { z20.h }, p3/Z, [x16]\n"
172 "fmla z24.h, p3/M, z3.h, z17.h\n"
173 "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
174 "fmla z23.h, p3/M, z5.h, z16.h\n"
175 "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
176 "fmla z22.h, p3/M, z7.h, z17.h\n"
177 "fmla z21.h, p3/M, z6.h, z17.h\n"
179 "ld1h { z11.h }, p1/Z, [x22, x14, LSL #1]\n"
180 "fmla z24.h, p3/M, z6.h, z19.h\n"
181 "fmla z23.h, p3/M, z8.h, z18.h\n"
182 "ld1h { z9.h }, p1/Z, [x24, x14, LSL #1]\n"
183 "ld1h { z10.h }, p1/Z, [x23, x14, LSL #1]\n"
184 "fmla z22.h, p3/M, z8.h, z16.h\n"
185 "fmla z21.h, p3/M, z7.h, z16.h\n"
186 "ld1h { z12.h }, p1/Z, [x21, x14, LSL #1]\n"
188 "fmax z24.h, p3/M, z24.h, z26.h\n"
189 "fmax z23.h, p3/M, z23.h, z26.h\n"
190 "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
191 "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
192 "fmax z22.h, p3/M, z22.h, z26.h\n"
193 "fmax z21.h, p3/M, z21.h, z26.h\n"
194 "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
195 "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
196 "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
197 "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
198 "whilelt p2.h, x9, %x[n_channels]\n"
199 "cmp x14, %x[n_channels]\n"
200 "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
201 "addvl x16, x16, #16\n"
202 "fmin z24.h, p3/M, z24.h, z25.h\n"
203 "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
204 "fmin z23.h, p3/M, z23.h, z25.h\n"
205 "fmin z22.h, p3/M, z22.h, z25.h\n"
206 "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
207 "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
208 "fmin z21.h, p3/M, z21.h, z25.h\n"
209 "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
210 "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
211 "addvl x16, x16, #-6\n"
212 "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
215 "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
216 "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
217 "ldr x21, [x15, #0x28]\n"
218 "ldr x20, [x15, #0x30]\n"
219 "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
220 "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
221 "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
222 "ldr x22, [x15, #0x38]\n"
223 "fmla z24.h, p3/M, z0.h, z10.h\n"
224 "fmla z23.h, p3/M, z2.h, z11.h\n"
225 "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
226 "ldr x21, [x15, #0x48]\n"
227 "fmla z22.h, p3/M, z2.h, z12.h\n"
228 "fmla z21.h, p3/M, z1.h, z12.h\n"
229 "ldr x20, [x15, #0x40]\n"
230 "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
231 "fmla z24.h, p3/M, z5.h, z12.h\n"
232 "fmla z23.h, p3/M, z4.h, z12.h\n"
233 "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
234 "ldr x21, [x15, #0x50]\n"
235 "fmla z22.h, p3/M, z6.h, z18.h\n"
236 "fmla z21.h, p3/M, z3.h, z13.h\n"
237 "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
238 "ldr x20, [x15, #0x58]\n"
239 "fmla z24.h, p3/M, z7.h, z13.h\n"
240 "fmla z23.h, p3/M, z6.h, z13.h\n"
241 "ldr x23, [x15, #0x60]\n"
242 "ldr x22, [x15, #0x68]\n"
243 "fmla z22.h, p3/M, z4.h, z13.h\n"
244 "fmla z21.h, p3/M, z8.h, z17.h\n"
245 "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
246 "ldr x21, [x15, #0x70]\n"
247 "fmla z24.h, p3/M, z1.h, z16.h\n"
248 "fmla z23.h, p3/M, z0.h, z16.h\n"
249 "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
250 "ldr x20, [x15, #0x78]\n"
251 "fmla z22.h, p3/M, z5.h, z20.h\n"
252 "fmla z21.h, p3/M, z4.h, z20.h\n"
255 "fmla z24.h, p3/M, z2.h, z18.h\n"
256 "fmla z23.h, p3/M, z1.h, z18.h\n"
257 "ld1h { z19.h }, p2/Z, [x23, x9, LSL #1]\n"
258 "fmla z22.h, p3/M, z0.h, z17.h\n"
259 "fmla z21.h, p3/M, z2.h, z16.h\n"
260 "fmla z24.h, p3/M, z8.h, z20.h\n"
261 "fmla z23.h, p3/M, z7.h, z20.h\n"
262 "ld1h { z18.h }, p2/Z, [x22, x9, LSL #1]\n"
263 "fmla z22.h, p3/M, z3.h, z19.h\n"
264 "fmla z21.h, p3/M, z5.h, z18.h\n"
265 "fmla z24.h, p3/M, z3.h, z17.h\n"
266 "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
267 "fmla z23.h, p3/M, z5.h, z16.h\n"
268 "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
269 "fmla z22.h, p3/M, z7.h, z17.h\n"
270 "fmla z21.h, p3/M, z6.h, z17.h\n"
271 "fmla z24.h, p3/M, z6.h, z19.h\n"
272 "fmla z23.h, p3/M, z8.h, z18.h\n"
273 "fmax z24.h, p3/M, z24.h, z26.h\n"
274 "fmax z23.h, p3/M, z23.h, z26.h\n"
275 "fmla z22.h, p3/M, z8.h, z16.h\n"
276 "fmla z21.h, p3/M, z7.h, z16.h\n"
277 "fmax z22.h, p3/M, z22.h, z26.h\n"
278 "fmax z21.h, p3/M, z21.h, z26.h\n"
279 "fmin z24.h, p3/M, z24.h, z25.h\n"
280 "fmin z23.h, p3/M, z23.h, z25.h\n"
281 "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
282 "fmin z22.h, p3/M, z22.h, z25.h\n"
283 "fmin z21.h, p3/M, z21.h, z25.h\n"
284 "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
285 "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
286 "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
288 : [n_channels]
"r" ((
unsigned long) n_channels), [offsetof_Args_inptrs]
"I" (offsetof(Args, inptrs)), [offsetof_args_max]
"I" (offsetof(Args, max)), [offsetof_args_min]
"I" (offsetof(Args, min)), [offsetof_args_outptrs]
"I" (offsetof(Args, outptrs)), [offsetof_args_params]
"I" (offsetof(Args, params)), [params_struct]
"r" (&params_struct)
289 :
"cc",
"memory",
"p0",
"p1",
"p2",
"p3",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z16",
"z17",
"z18",
"z19",
"z20",
"z21",
"z22",
"z23",
"z24",
"z25",
"z26"
296 #endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)