25 #if defined(ARM_COMPUTE_ENABLE_SME2)
28 #include "../../utils.hpp"
// GEMV kernel written in SME2 inline assembly.
// What the visible code shows:
//   * A_ptr and B_ptr point at fp16 data; products are accumulated in fp32 in
//     the ZA array (fdot za.s with .h sources); accumulators are converted
//     back to fp16 (fcvt) and stored through output_ptr (st1h).
//   * Outputs are handled in "column blocks" of x28 elements, where
//     x28 = 4 * (number of 32-bit lanes per vector). Four code paths follow,
//     using one, two, three or four ZA vector groups respectively.
//   * Bit 1 of `flags` selects a store path that clamps results to
//     [minval, maxval] with fclamp before storing.
// NOTE(review): several lines of the definition are not visible in this view
// (rest of the parameter list, the struct header, the asm statement opener,
// branch labels/branches, the closing brace). Comments that rely on those
// missing pieces are marked as assumptions.
35 void sme2_gemv_fp16fp32fp16_dot_16VL (
36 const __fp16 *A_ptr,
const __fp16 *B_ptr, __fp16 *output_ptr,
// Fields of the argument block passed to the assembly as args_ptr (&ka).
// The clamp bounds default to +infinity / -infinity. The assembly reads them
// at offset_max / offset_min, i.e. offsetof(KernelArgs, maxval / minval).
// NOTE(review): presumably these are members of a local `KernelArgs` struct
// whose header line is not visible here - confirm.
42 __fp16 maxval =
static_cast<__fp16
>(std::numeric_limits<float>::infinity());
43 __fp16 minval = -
static_cast<__fp16
>(std::numeric_limits<float>::infinity());
// Remaining fields are value-initialised; no visible instruction reads them
// through args_ptr.
44 const __fp16 *B_ptr = {};
45 size_t output_offset = {};
46 unsigned int input_initial_col = {};
// Passed to the assembly as the [flags] operand; only bit 1 is tested there
// (tbz ... #1). The code that sets bits in it is not visible in this view.
49 unsigned long flags=0;
// Upper clamp bound taken from the activation's first parameter.
// NOTE(review): the condition guarding this assignment is not visible here.
56 ka.maxval =
static_cast<__fp16
>(act.param1);
// ---- Assembly prologue -------------------------------------------------
// SMSTART enables SME state before any ZA instruction is executed.
65 ".inst 0xd503477f // SMSTART ZA\n"
// x28 = outputs per column block (4 * number of 32-bit lanes per vector).
// x27 = running B pointer, x25 = running output pointer.
// x26 = (N + x28 - 1) / x28 = number of column blocks, rounded up.
67 "cntw x28, ALL, MUL #4\n"
68 "mov x27, %x[B_ptr]\n"
69 "add x26, %x[N], x28\n"
70 "mov x25, %x[output_ptr]\n"
71 "sub x26, x26, #0x1\n"
73 "udiv x26, x26, x28\n"
// pn9 = all-true predicate-as-counter, used for every full-width multi-vector
// load and store below.
74 ".inst 0x25207811 // ptrue pn9.b\n"
// x22 = (block count rounded up to a multiple of 4) * K * 2, i.e. a byte
// count for fp16 data, which is then compared with 0x200000.
// NOTE(review): presumably the size of the B data to prefetch; the branch
// that consumes the comparison is not visible here.
75 "add x22, x26, #0x3\n"
77 "and x22, x22, #0xfffffffffffffffc\n"
79 "mul x22, x22, %x[K]\n"
80 "lsl x22, x22, #0x1\n"
82 "cmp x22, #0x200000\n"
// Shifts that pack fields into x22 for the range prefetch (rprfm) over B
// issued below on [x27].
// NOTE(review): the instructions that combine x20/x21 into x22 are not
// visible here; see the RPRFM description for the metadata layout.
85 "lsr x22, x22, #0x1\n"
86 "lsl x21, x21, #0x1\n"
89 "lsl x20, x22, #0x26\n"
90 "sub x21, x21, #0x1\n"
91 "lsl x21, x21, #0x16\n"
94 ".inst 0xf8b64b7a // rprfm pldonce, x22, [x27]\n"
// ---- Path using ONE ZA vector group (one column block) ---------------
// x23 = running A pointer; x21 = K * 2 (bytes of fp16 in A) is the range
// operand for the prefetch of A.
103 "mov x23, %x[A_ptr]\n"
104 "lsl x21, %x[K], #0x1\n"
107 ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
// pn8 = predicate-as-counter covering x20 halfword elements across a vector
// pair; it masks the store of the last block of this path.
// NOTE(review): the instruction that sets x20 for this path is not visible.
108 ".inst 0x257447f0 // whilelt p8.h, XZR, x20, VLx2\n"
// Accumulator initialisation from fp16 values at [x24]: each halfword is
// loaded into a 32-bit lane, widened to fp32, and moved into ZA group 0.
// NOTE(review): x24 presumably holds the bias pointer and p1 an all-true
// predicate; neither initialisation is visible here.
110 "ld1h { z20.s }, p1/Z, [x24]\n"
111 "addvl x20, x24, #4\n"
112 "ld1h { z21.s }, p1/Z, [x24, #1, MUL VL]\n"
113 "ld1h { z22.s }, p1/Z, [x24, #2, MUL VL]\n"
114 "ld1h { z23.s }, p1/Z, [x24, #3, MUL VL]\n"
115 "fcvt z20.s, p1/m, z20.h\n"
116 "fcvt z21.s, p1/m, z21.h\n"
117 "fcvt z22.s, p1/m, z22.h\n"
118 "fcvt z23.s, p1/m, z23.h\n"
119 ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n"
// Alternative accumulator initialisation: clear ZA.
// NOTE(review): presumably taken when there is no bias; the branch and label
// selecting it are not visible here.
122 ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
// Main K step: consumes 8 fp16 values of A per pass (x22 -= 8, x23 += 16
// bytes). ld1rqh replicates those 8 values to every 128-bit segment, and each
// indexed fdot (z0.h[0..3]) uses one pair of them against 4 vectors of B.
// x27 advances by 16 vectors after each B load even though this path only
// uses the first 4 vectors of every 16.
127 "whilelt p0.h, XZR, x22\n"
128 ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
129 "addvl x27, x27, #16\n"
130 "ld1rqh { z0.h }, p0/Z, [x23]\n"
131 "sub x22, x22, #0x8\n"
132 "add x23, x23, #0x10\n"
133 ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
134 "addvl x27, x27, #16\n"
136 ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
137 "addvl x27, x27, #16\n"
138 ".inst 0xc150b208 // fdot za.s[x9, 0], { z16.h-z19.h }, z0.h[0]\n"
139 ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
140 "addvl x27, x27, #16\n"
141 ".inst 0xc150b788 // fdot za.s[x9, 0], { z28.h-z31.h }, z0.h[1]\n"
142 ".inst 0xc150bb08 // fdot za.s[x9, 0], { z24.h-z27.h }, z0.h[2]\n"
143 ".inst 0xc150bc88 // fdot za.s[x9, 0], { z4.h-z7.h }, z0.h[3]\n"
// K tail: p0 masks the remaining x22 halfwords of A (inactive lanes load as
// zero). x22 drops by 2 before each further step and `subs` sets the flags.
// NOTE(review): the conditional branches that leave the tail early are not
// visible here.
146 "whilelt p0.h, XZR, x22\n"
147 ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
148 "subs x22, x22, #0x2\n"
149 "ld1rqh { z11.h }, p0/Z, [x23]\n"
150 "add x23, x23, #0x10\n"
151 "addvl x27, x27, #16\n"
152 ".inst 0xc15bb308 // fdot za.s[x9, 0], { z24.h-z27.h }, z11.h[0]\n"
154 ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
155 "subs x22, x22, #0x2\n"
156 "addvl x27, x27, #16\n"
157 ".inst 0xc15bb708 // fdot za.s[x9, 0], { z24.h-z27.h }, z11.h[1]\n"
159 ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
160 "subs x22, x22, #0x2\n"
161 "addvl x27, x27, #16\n"
162 ".inst 0xc15bbb08 // fdot za.s[x9, 0], { z24.h-z27.h }, z11.h[2]\n"
164 ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
165 "addvl x27, x27, #16\n"
166 ".inst 0xc15bbf88 // fdot za.s[x9, 0], { z28.h-z31.h }, z11.h[3]\n"
// Store with clamping (skipped when bit 1 of flags is clear): read the fp32
// accumulators out of ZA, broadcast minval/maxval from the argument block,
// narrow to fp16, clamp, and store two vectors under pn8.
168 "tbz %x[flags], #1, 10f\n"
169 ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n"
170 "add x21, %x[args_ptr], %[offset_min]\n"
171 "add x20, %x[args_ptr], %[offset_max]\n"
172 "ld1rh { z29.h }, p1/Z, [x21]\n"
173 "ld1rh { z20.h }, p1/Z, [x20]\n"
174 ".inst 0xc120e204 // fcvt z4.h, { z16.s-z17.s }\n"
175 ".inst 0xc120e245 // fcvt z5.h, { z18.s-z19.s }\n"
176 ".inst 0xc174c3a4 // fclamp { z4.h-z5.h }, z29.h, z20.h\n"
177 ".inst 0xa0602324 // st1h { z4.h-z5.h }, p8, [x25]\n"
178 "addvl x25, x25, #2\n"
// Store without clamping: same narrowing and store, no fclamp.
181 ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n"
182 ".inst 0xc120e012 // fcvt z18.h, { z0.s-z1.s }\n"
183 ".inst 0xc120e05a // fcvt z26.h, { z2.s-z3.s }\n"
184 ".inst 0xa1602332 // st1h { z18.h, z26.h }, p8, [x25]\n"
185 "addvl x25, x25, #2\n"
// ---- Path using TWO ZA vector groups (two column blocks) -------------
// Same structure as the one-group path. x20 = N - x28 is the number of
// outputs left for the second block, so pn8 masks only the second block's
// store while the first block is stored with pn9 (all true).
189 "mov x23, %x[A_ptr]\n"
190 "lsl x21, %x[K], #0x1\n"
191 "sub x20, %x[N], x28\n"
193 ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
194 ".inst 0x257447f0 // whilelt p8.h, XZR, x20, VLx2\n"
// Accumulator initialisation from [x24]: 8 half-width loads (two blocks),
// widened to fp32 and placed in ZA groups 0 and 1.
196 "ld1h { z12.s }, p1/Z, [x24]\n"
197 "addvl x20, x24, #4\n"
198 "ld1h { z13.s }, p1/Z, [x24, #1, MUL VL]\n"
199 "ld1h { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
200 "ld1h { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
201 "fcvt z12.s, p1/m, z12.h\n"
202 "ld1h { z28.s }, p1/Z, [x24, #4, MUL VL]\n"
203 "fcvt z13.s, p1/m, z13.h\n"
204 "ld1h { z29.s }, p1/Z, [x24, #5, MUL VL]\n"
205 "fcvt z14.s, p1/m, z14.h\n"
206 "ld1h { z30.s }, p1/Z, [x24, #6, MUL VL]\n"
207 "fcvt z15.s, p1/m, z15.h\n"
208 "ld1h { z31.s }, p1/Z, [x24, #7, MUL VL]\n"
209 "fcvt z28.s, p1/m, z28.h\n"
210 "fcvt z29.s, p1/m, z29.h\n"
211 "fcvt z30.s, p1/m, z30.h\n"
212 "fcvt z31.s, p1/m, z31.h\n"
213 ".inst 0xc0042d80 // mova za.d[x9, #0], { z12.d-z15.d }\n"
214 ".inst 0xc0042f81 // mova za.d[x9, #1], { z28.d-z31.d }\n"
// Alternative accumulator initialisation: clear ZA (selection logic not
// visible here).
217 ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
// Main K step (8 values of A per pass). For each pair of A values, B vectors
// at [x27] feed group 0 and B vectors at [x27, #4, MUL VL] feed group 1.
222 "whilelt p0.h, XZR, x22\n"
223 ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
224 "sub x22, x22, #0x8\n"
225 "ld1rqh { z8.h }, p0/Z, [x23]\n"
227 "add x23, x23, #0x10\n"
228 ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
229 "addvl x27, x27, #16\n"
230 ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
231 ".inst 0xc158b088 // fdot za.s[x9, 0], { z4.h-z7.h }, z8.h[0]\n"
232 ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
233 "addvl x27, x27, #16\n"
234 ".inst 0xc158b009 // fdot za.s[x9, 1], { z0.h-z3.h }, z8.h[0]\n"
235 ".inst 0xa040a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27]\n"
236 ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
237 "addvl x27, x27, #16\n"
238 ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
239 ".inst 0xc158b608 // fdot za.s[x9, 0], { z16.h-z19.h }, z8.h[1]\n"
240 ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
241 "addvl x27, x27, #16\n"
242 ".inst 0xc158b589 // fdot za.s[x9, 1], { z12.h-z15.h }, z8.h[1]\n"
243 ".inst 0xc158bb08 // fdot za.s[x9, 0], { z24.h-z27.h }, z8.h[2]\n"
244 ".inst 0xc158b809 // fdot za.s[x9, 1], { z0.h-z3.h }, z8.h[2]\n"
245 ".inst 0xc158bc88 // fdot za.s[x9, 0], { z4.h-z7.h }, z8.h[3]\n"
246 ".inst 0xc158bf89 // fdot za.s[x9, 1], { z28.h-z31.h }, z8.h[3]\n"
// K tail for two groups: up to four further steps of 2 A values each
// (early-exit branches not visible here).
249 "whilelt p0.h, XZR, x22\n"
250 ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
251 "subs x22, x22, #0x2\n"
252 "ld1rqh { z11.h }, p0/Z, [x23]\n"
253 "add x23, x23, #0x10\n"
254 ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
255 "addvl x27, x27, #16\n"
256 ".inst 0xc15bb088 // fdot za.s[x9, 0], { z4.h-z7.h }, z11.h[0]\n"
257 ".inst 0xc15bb189 // fdot za.s[x9, 1], { z12.h-z15.h }, z11.h[0]\n"
259 ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
260 "subs x22, x22, #0x2\n"
261 ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
262 "addvl x27, x27, #16\n"
263 ".inst 0xc15bb608 // fdot za.s[x9, 0], { z16.h-z19.h }, z11.h[1]\n"
264 ".inst 0xc15bb689 // fdot za.s[x9, 1], { z20.h-z23.h }, z11.h[1]\n"
266 ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
267 "subs x22, x22, #0x2\n"
268 ".inst 0xa041a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
269 "addvl x27, x27, #16\n"
270 ".inst 0xc15bb988 // fdot za.s[x9, 0], { z12.h-z15.h }, z11.h[2]\n"
271 ".inst 0xc15bba89 // fdot za.s[x9, 1], { z20.h-z23.h }, z11.h[2]\n"
273 ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n"
274 ".inst 0xa041a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
275 "addvl x27, x27, #16\n"
276 ".inst 0xc15bbc08 // fdot za.s[x9, 0], { z0.h-z3.h }, z11.h[3]\n"
277 ".inst 0xc15bbf09 // fdot za.s[x9, 1], { z24.h-z27.h }, z11.h[3]\n"
// Store with clamping: first block stored unmasked (pn9), second block
// stored under pn8; output pointer advances by 4 vectors.
279 "tbz %x[flags], #1, 18f\n"
280 ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
281 "add x21, %x[args_ptr], %[offset_min]\n"
282 "add x20, %x[args_ptr], %[offset_max]\n"
283 ".inst 0xc0062c3c // mova { z28.d-z31.d }, za.d[x9, #1]\n"
284 "ld1rh { z5.h }, p1/Z, [x21]\n"
285 "ld1rh { z21.h }, p1/Z, [x20]\n"
286 ".inst 0xc120e188 // fcvt z8.h, { z12.s-z13.s }\n"
287 ".inst 0xc120e1c9 // fcvt z9.h, { z14.s-z15.s }\n"
288 ".inst 0xc120e39c // fcvt z28.h, { z28.s-z29.s }\n"
289 ".inst 0xc120e3dd // fcvt z29.h, { z30.s-z31.s }\n"
290 ".inst 0xc175c0a8 // fclamp { z8.h-z9.h }, z5.h, z21.h\n"
291 ".inst 0xc175c0bc // fclamp { z28.h-z29.h }, z5.h, z21.h\n"
292 ".inst 0xa0602728 // st1h { z8.h-z9.h }, pn9.b, [x25]\n"
293 ".inst 0xa061233c // st1h { z28.h-z29.h }, p8, [x25, #0x2, MUL VL]\n"
294 "addvl x25, x25, #4\n"
// Store without clamping.
297 ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
298 ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n"
299 ".inst 0xc120e194 // fcvt z20.h, { z12.s-z13.s }\n"
300 ".inst 0xc120e1dc // fcvt z28.h, { z14.s-z15.s }\n"
301 ".inst 0xa1602734 // st1h { z20.h, z28.h }, pn9.b, [x25]\n"
302 ".inst 0xc120e09a // fcvt z26.h, { z4.s-z5.s }\n"
303 ".inst 0xc120e0db // fcvt z27.h, { z6.s-z7.s }\n"
304 ".inst 0xa061233a // st1h { z26.h-z27.h }, p8, [x25, #0x2, MUL VL]\n"
305 "addvl x25, x25, #4\n"
// ---- Path using THREE ZA vector groups (three column blocks) ---------
// x20 = N - x28 * x20: outputs left for the last block of this path.
// NOTE(review): the incoming value of x20 (presumably 2, the number of full
// blocks before the last one) is set on a line not visible here.
310 "mov x23, %x[A_ptr]\n"
311 "lsl x21, %x[K], #0x1\n"
312 "msub x20, x28, x20, %x[N]\n"
314 ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
315 ".inst 0x257447f0 // whilelt p8.h, XZR, x20, VLx2\n"
// Accumulator initialisation from [x24] and [x20] (x20 = x24 + 4 vector
// lengths): 12 half-width loads, widened to fp32, placed in ZA groups 0-2.
317 "addvl x20, x24, #4\n"
318 "ld1h { z16.s }, p1/Z, [x24]\n"
319 "ld1h { z17.s }, p1/Z, [x24, #1, MUL VL]\n"
320 "ld1h { z18.s }, p1/Z, [x24, #2, MUL VL]\n"
321 "ld1h { z19.s }, p1/Z, [x24, #3, MUL VL]\n"
322 "fcvt z16.s, p1/m, z16.h\n"
323 "ld1h { z8.s }, p1/Z, [x24, #4, MUL VL]\n"
324 "fcvt z17.s, p1/m, z17.h\n"
325 "ld1h { z9.s }, p1/Z, [x24, #5, MUL VL]\n"
326 "fcvt z18.s, p1/m, z18.h\n"
327 "ld1h { z10.s }, p1/Z, [x24, #6, MUL VL]\n"
328 "fcvt z19.s, p1/m, z19.h\n"
329 "ld1h { z11.s }, p1/Z, [x24, #7, MUL VL]\n"
330 "fcvt z8.s, p1/m, z8.h\n"
331 "ld1h { z24.s }, p1/Z, [x20]\n"
332 "fcvt z9.s, p1/m, z9.h\n"
333 "ld1h { z25.s }, p1/Z, [x20, #1, MUL VL]\n"
334 "fcvt z10.s, p1/m, z10.h\n"
335 "ld1h { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
336 "fcvt z11.s, p1/m, z11.h\n"
337 ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
338 "ld1h { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
339 "fcvt z24.s, p1/m, z24.h\n"
340 "fcvt z25.s, p1/m, z25.h\n"
341 "fcvt z26.s, p1/m, z26.h\n"
342 "fcvt z27.s, p1/m, z27.h\n"
343 ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
344 ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n"
// Alternative accumulator initialisation: clear ZA (selection logic not
// visible here).
347 ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
// Main K step (8 values of A per pass). B vectors at offsets 0, 4 and 8
// (MUL VL) from x27 feed groups 0, 1 and 2 respectively.
352 "whilelt p0.h, XZR, x22\n"
353 ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
354 "sub x22, x22, #0x8\n"
355 "ld1rqh { z6.h }, p0/Z, [x23]\n"
357 "add x23, x23, #0x10\n"
358 ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
359 ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
360 "addvl x27, x27, #16\n"
361 ".inst 0xc156b288 // fdot za.s[x9, 0], { z20.h-z23.h }, z6.h[0]\n"
362 ".inst 0xa040a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27]\n"
363 ".inst 0xc156b189 // fdot za.s[x9, 1], { z12.h-z15.h }, z6.h[0]\n"
364 ".inst 0xa041a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
365 ".inst 0xc156b00a // fdot za.s[x9, 2], { z0.h-z3.h }, z6.h[0]\n"
366 ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
367 "addvl x27, x27, #16\n"
368 ".inst 0xa040a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27]\n"
369 ".inst 0xc156b788 // fdot za.s[x9, 0], { z28.h-z31.h }, z6.h[1]\n"
370 ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
371 ".inst 0xc156b589 // fdot za.s[x9, 1], { z12.h-z15.h }, z6.h[1]\n"
372 ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
373 "addvl x27, x27, #16\n"
374 ".inst 0xc156b40a // fdot za.s[x9, 2], { z0.h-z3.h }, z6.h[1]\n"
375 ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n"
376 ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
377 ".inst 0xc156ba88 // fdot za.s[x9, 0], { z20.h-z23.h }, z6.h[2]\n"
378 ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
379 "addvl x27, x27, #16\n"
380 ".inst 0xc156b909 // fdot za.s[x9, 1], { z8.h-z11.h }, z6.h[2]\n"
381 ".inst 0xc156b98a // fdot za.s[x9, 2], { z12.h-z15.h }, z6.h[2]\n"
382 ".inst 0xc156bc08 // fdot za.s[x9, 0], { z0.h-z3.h }, z6.h[3]\n"
383 ".inst 0xc156be09 // fdot za.s[x9, 1], { z16.h-z19.h }, z6.h[3]\n"
384 ".inst 0xc156be8a // fdot za.s[x9, 2], { z20.h-z23.h }, z6.h[3]\n"
// K tail for three groups (early-exit branches not visible here).
387 "whilelt p0.h, XZR, x22\n"
388 ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
389 "subs x22, x22, #0x2\n"
390 "ld1rqh { z11.h }, p0/Z, [x23]\n"
391 "add x23, x23, #0x10\n"
392 ".inst 0xa041a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
393 ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
394 "addvl x27, x27, #16\n"
395 ".inst 0xc15bb188 // fdot za.s[x9, 0], { z12.h-z15.h }, z11.h[0]\n"
396 ".inst 0xc15bb009 // fdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n"
397 ".inst 0xc15bb20a // fdot za.s[x9, 2], { z16.h-z19.h }, z11.h[0]\n"
399 ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
400 "subs x22, x22, #0x2\n"
401 ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
402 ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
403 "addvl x27, x27, #16\n"
404 ".inst 0xc15bb588 // fdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n"
405 ".inst 0xc15bb609 // fdot za.s[x9, 1], { z16.h-z19.h }, z11.h[1]\n"
406 ".inst 0xc15bb68a // fdot za.s[x9, 2], { z20.h-z23.h }, z11.h[1]\n"
408 ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
409 "subs x22, x22, #0x2\n"
410 ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
411 ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
412 "addvl x27, x27, #16\n"
413 ".inst 0xc15bb888 // fdot za.s[x9, 0], { z4.h-z7.h }, z11.h[2]\n"
414 ".inst 0xc15bbb89 // fdot za.s[x9, 1], { z28.h-z31.h }, z11.h[2]\n"
415 ".inst 0xc15bba8a // fdot za.s[x9, 2], { z20.h-z23.h }, z11.h[2]\n"
417 ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
418 ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
419 ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
420 "addvl x27, x27, #16\n"
421 ".inst 0xc15bbc88 // fdot za.s[x9, 0], { z4.h-z7.h }, z11.h[3]\n"
422 ".inst 0xc15bbf89 // fdot za.s[x9, 1], { z28.h-z31.h }, z11.h[3]\n"
423 ".inst 0xc15bbd8a // fdot za.s[x9, 2], { z12.h-z15.h }, z11.h[3]\n"
// Store with clamping: blocks 0 and 1 stored unmasked (pn9), block 2 stored
// under pn8; output pointer advances by 6 vectors.
425 "tbz %x[flags], #1, 26f\n"
426 ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
427 "add x21, %x[args_ptr], %[offset_min]\n"
428 "add x20, %x[args_ptr], %[offset_max]\n"
429 ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n"
430 "ld1rh { z17.h }, p1/Z, [x21]\n"
431 ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
432 "ld1rh { z16.h }, p1/Z, [x20]\n"
433 ".inst 0xc120e18c // fcvt z12.h, { z12.s-z13.s }\n"
434 ".inst 0xc120e1cd // fcvt z13.h, { z14.s-z15.s }\n"
435 ".inst 0xc120e00e // fcvt z14.h, { z0.s-z1.s }\n"
436 ".inst 0xc120e04f // fcvt z15.h, { z2.s-z3.s }\n"
437 ".inst 0xc170c22c // fclamp { z12.h-z13.h }, z17.h, z16.h\n"
438 ".inst 0xc120e092 // fcvt z18.h, { z4.s-z5.s }\n"
439 ".inst 0xc120e0d3 // fcvt z19.h, { z6.s-z7.s }\n"
440 ".inst 0xc170c22e // fclamp { z14.h-z15.h }, z17.h, z16.h\n"
441 ".inst 0xc170c232 // fclamp { z18.h-z19.h }, z17.h, z16.h\n"
442 ".inst 0xa060272c // st1h { z12.h-z13.h }, pn9.b, [x25]\n"
443 ".inst 0xa061272e // st1h { z14.h-z15.h }, pn9.b, [x25, #0x2, MUL VL]\n"
444 ".inst 0xa0622332 // st1h { z18.h-z19.h }, p8, [x25, #0x4, MUL VL]\n"
445 "addvl x25, x25, #6\n"
// Store without clamping.
448 ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n"
449 ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n"
450 ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n"
451 ".inst 0xc120e311 // fcvt z17.h, { z24.s-z25.s }\n"
452 ".inst 0xc120e359 // fcvt z25.h, { z26.s-z27.s }\n"
453 ".inst 0xa1602731 // st1h { z17.h, z25.h }, pn9.b, [x25]\n"
454 ".inst 0xc120e112 // fcvt z18.h, { z8.s-z9.s }\n"
455 ".inst 0xc120e153 // fcvt z19.h, { z10.s-z11.s }\n"
456 ".inst 0xa0612732 // st1h { z18.h-z19.h }, pn9.b, [x25, #0x2, MUL VL]\n"
457 ".inst 0xc120e191 // fcvt z17.h, { z12.s-z13.s }\n"
458 ".inst 0xc120e1d9 // fcvt z25.h, { z14.s-z15.s }\n"
459 ".inst 0xa1622331 // st1h { z17.h, z25.h }, p8, [x25, #0x4, MUL VL]\n"
460 "addvl x25, x25, #6\n"
// ---- Path using FOUR ZA vector groups (four column blocks) -----------
// x20 = N - x28 * x20: outputs left for the last block of this path.
// NOTE(review): the incoming value of x20 (presumably 3) is set on a line
// not visible here.
465 "mov x23, %x[A_ptr]\n"
466 "lsl x21, %x[K], #0x1\n"
467 "msub x20, x28, x20, %x[N]\n"
469 ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
470 ".inst 0x257447f0 // whilelt p8.h, XZR, x20, VLx2\n"
// Accumulator initialisation from [x24] and [x20] (x20 = x24 + 4 vector
// lengths): 16 half-width loads, widened to fp32, placed in ZA groups 0-3.
// Unlike the narrower paths, x24 is advanced here (by 8 vector lengths,
// i.e. past the 16 half-width vectors just consumed).
472 "addvl x20, x24, #4\n"
473 "ld1h { z28.s }, p1/Z, [x24]\n"
474 "ld1h { z29.s }, p1/Z, [x24, #1, MUL VL]\n"
475 "ld1h { z30.s }, p1/Z, [x24, #2, MUL VL]\n"
476 "ld1h { z31.s }, p1/Z, [x24, #3, MUL VL]\n"
477 "fcvt z28.s, p1/m, z28.h\n"
478 "ld1h { z8.s }, p1/Z, [x24, #4, MUL VL]\n"
479 "fcvt z29.s, p1/m, z29.h\n"
480 "ld1h { z9.s }, p1/Z, [x24, #5, MUL VL]\n"
481 "fcvt z30.s, p1/m, z30.h\n"
482 "ld1h { z10.s }, p1/Z, [x24, #6, MUL VL]\n"
483 "fcvt z31.s, p1/m, z31.h\n"
484 "ld1h { z11.s }, p1/Z, [x24, #7, MUL VL]\n"
485 "fcvt z8.s, p1/m, z8.h\n"
486 "addvl x24, x24, #8\n"
487 "ld1h { z0.s }, p1/Z, [x20]\n"
488 "fcvt z9.s, p1/m, z9.h\n"
489 "ld1h { z1.s }, p1/Z, [x20, #1, MUL VL]\n"
490 "fcvt z10.s, p1/m, z10.h\n"
491 "ld1h { z2.s }, p1/Z, [x20, #2, MUL VL]\n"
492 "fcvt z11.s, p1/m, z11.h\n"
493 ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n"
494 "ld1h { z3.s }, p1/Z, [x20, #3, MUL VL]\n"
495 "fcvt z0.s, p1/m, z0.h\n"
496 "ld1h { z28.s }, p1/Z, [x20, #4, MUL VL]\n"
497 "fcvt z1.s, p1/m, z1.h\n"
498 "ld1h { z29.s }, p1/Z, [x20, #5, MUL VL]\n"
499 "fcvt z2.s, p1/m, z2.h\n"
500 "ld1h { z30.s }, p1/Z, [x20, #6, MUL VL]\n"
501 "fcvt z3.s, p1/m, z3.h\n"
502 ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
503 "ld1h { z31.s }, p1/Z, [x20, #7, MUL VL]\n"
504 "fcvt z28.s, p1/m, z28.h\n"
505 "fcvt z29.s, p1/m, z29.h\n"
506 "fcvt z30.s, p1/m, z30.h\n"
507 "fcvt z31.s, p1/m, z31.h\n"
508 ".inst 0xc0042c02 // mova za.d[x9, #2], { z0.d-z3.d }\n"
509 ".inst 0xc0042f83 // mova za.d[x9, #3], { z28.d-z31.d }\n"
// Alternative accumulator initialisation: clear ZA (selection logic not
// visible here).
512 ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
// Main K step (8 values of A per pass). All 16 B vectors of each step are
// used: offsets 0, 4, 8 and 12 (MUL VL) from x27 feed groups 0-3.
517 "whilelt p0.h, XZR, x22\n"
518 ".inst 0xa040a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27]\n"
519 "sub x22, x22, #0x8\n"
520 "ld1rqh { z3.h }, p0/Z, [x23]\n"
522 "add x23, x23, #0x10\n"
523 ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
524 ".inst 0xa042a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
525 ".inst 0xa043a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
526 ".inst 0xc153b108 // fdot za.s[x9, 0], { z8.h-z11.h }, z3.h[0]\n"
527 "addvl x27, x27, #16\n"
528 ".inst 0xc153b389 // fdot za.s[x9, 1], { z28.h-z31.h }, z3.h[0]\n"
529 ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
530 ".inst 0xc153b30a // fdot za.s[x9, 2], { z24.h-z27.h }, z3.h[0]\n"
531 ".inst 0xa041a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
532 ".inst 0xc153b08b // fdot za.s[x9, 3], { z4.h-z7.h }, z3.h[0]\n"
533 ".inst 0xa042a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
534 ".inst 0xa043a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
535 ".inst 0xc153b588 // fdot za.s[x9, 0], { z12.h-z15.h }, z3.h[1]\n"
536 "addvl x27, x27, #16\n"
537 ".inst 0xc153b509 // fdot za.s[x9, 1], { z8.h-z11.h }, z3.h[1]\n"
538 ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
539 ".inst 0xc153b60a // fdot za.s[x9, 2], { z16.h-z19.h }, z3.h[1]\n"
540 ".inst 0xa041a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
541 ".inst 0xc153b70b // fdot za.s[x9, 3], { z24.h-z27.h }, z3.h[1]\n"
542 ".inst 0xa042a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
543 ".inst 0xa043a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
544 ".inst 0xc153b988 // fdot za.s[x9, 0], { z12.h-z15.h }, z3.h[2]\n"
545 "addvl x27, x27, #16\n"
546 ".inst 0xc153b889 // fdot za.s[x9, 1], { z4.h-z7.h }, z3.h[2]\n"
547 ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
548 ".inst 0xc153ba8a // fdot za.s[x9, 2], { z20.h-z23.h }, z3.h[2]\n"
549 ".inst 0xa041a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
550 ".inst 0xc153b90b // fdot za.s[x9, 3], { z8.h-z11.h }, z3.h[2]\n"
551 ".inst 0xa042a769 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
552 ".inst 0xa043a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
553 ".inst 0xc153bd88 // fdot za.s[x9, 0], { z12.h-z15.h }, z3.h[3]\n"
554 "addvl x27, x27, #16\n"
555 ".inst 0xc153bc89 // fdot za.s[x9, 1], { z4.h-z7.h }, z3.h[3]\n"
556 ".inst 0xc153bd0a // fdot za.s[x9, 2], { z8.h-z11.h }, z3.h[3]\n"
557 ".inst 0xc153be0b // fdot za.s[x9, 3], { z16.h-z19.h }, z3.h[3]\n"
// K tail for four groups (early-exit branches not visible here).
560 "whilelt p0.h, XZR, x22\n"
561 ".inst 0xa040a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27]\n"
562 "subs x22, x22, #0x2\n"
563 "ld1rqh { z11.h }, p0/Z, [x23]\n"
564 "add x23, x23, #0x10\n"
565 ".inst 0xa041a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
566 ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
567 ".inst 0xa043a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
568 ".inst 0xc15bb208 // fdot za.s[x9, 0], { z16.h-z19.h }, z11.h[0]\n"
569 "addvl x27, x27, #16\n"
570 ".inst 0xc15bb089 // fdot za.s[x9, 1], { z4.h-z7.h }, z11.h[0]\n"
571 ".inst 0xc15bb18a // fdot za.s[x9, 2], { z12.h-z15.h }, z11.h[0]\n"
572 ".inst 0xc15bb38b // fdot za.s[x9, 3], { z28.h-z31.h }, z11.h[0]\n"
574 ".inst 0xa040a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27]\n"
575 "subs x22, x22, #0x2\n"
576 ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
577 ".inst 0xa042a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
578 ".inst 0xa043a779 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
579 ".inst 0xc15bb488 // fdot za.s[x9, 0], { z4.h-z7.h }, z11.h[1]\n"
580 "addvl x27, x27, #16\n"
581 ".inst 0xc15bb609 // fdot za.s[x9, 1], { z16.h-z19.h }, z11.h[1]\n"
582 ".inst 0xc15bb58a // fdot za.s[x9, 2], { z12.h-z15.h }, z11.h[1]\n"
583 ".inst 0xc15bb70b // fdot za.s[x9, 3], { z24.h-z27.h }, z11.h[1]\n"
585 ".inst 0xa040a76d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x27]\n"
586 "subs x22, x22, #0x2\n"
587 ".inst 0xa041a77d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
588 ".inst 0xa042a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
589 ".inst 0xa043a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
590 ".inst 0xc15bb988 // fdot za.s[x9, 0], { z12.h-z15.h }, z11.h[2]\n"
591 "addvl x27, x27, #16\n"
592 ".inst 0xc15bbb89 // fdot za.s[x9, 1], { z28.h-z31.h }, z11.h[2]\n"
593 ".inst 0xc15bb80a // fdot za.s[x9, 2], { z0.h-z3.h }, z11.h[2]\n"
594 ".inst 0xc15bb88b // fdot za.s[x9, 3], { z4.h-z7.h }, z11.h[2]\n"
596 ".inst 0xa040a761 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x27]\n"
597 ".inst 0xa041a771 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n"
598 ".inst 0xa042a765 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x27, #0x8, MUL VL]\n"
599 ".inst 0xa043a775 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x27, #0xc, MUL VL]\n"
600 ".inst 0xc15bbc08 // fdot za.s[x9, 0], { z0.h-z3.h }, z11.h[3]\n"
601 "addvl x27, x27, #16\n"
602 ".inst 0xc15bbe09 // fdot za.s[x9, 1], { z16.h-z19.h }, z11.h[3]\n"
603 ".inst 0xc15bbc8a // fdot za.s[x9, 2], { z4.h-z7.h }, z11.h[3]\n"
604 ".inst 0xc15bbe8b // fdot za.s[x9, 3], { z20.h-z23.h }, z11.h[3]\n"
// Store with clamping: blocks 0-2 stored unmasked (pn9), block 3 stored
// under pn8; output pointer advances by 8 vectors.
606 "tbz %x[flags], #1, 34f\n"
607 ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
608 "add x21, %x[args_ptr], %[offset_min]\n"
609 "add x20, %x[args_ptr], %[offset_max]\n"
610 ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
611 "ld1rh { z19.h }, p1/Z, [x21]\n"
612 ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
613 "ld1rh { z18.h }, p1/Z, [x20]\n"
614 ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
615 ".inst 0xc120e38a // fcvt z10.h, { z28.s-z29.s }\n"
616 ".inst 0xc120e3cb // fcvt z11.h, { z30.s-z31.s }\n"
617 ".inst 0xc120e18c // fcvt z12.h, { z12.s-z13.s }\n"
618 ".inst 0xc120e1cd // fcvt z13.h, { z14.s-z15.s }\n"
619 ".inst 0xc172c26a // fclamp { z10.h-z11.h }, z19.h, z18.h\n"
620 ".inst 0xc120e00e // fcvt z14.h, { z0.s-z1.s }\n"
621 ".inst 0xc120e04f // fcvt z15.h, { z2.s-z3.s }\n"
622 ".inst 0xc172c26c // fclamp { z12.h-z13.h }, z19.h, z18.h\n"
623 ".inst 0xc120e090 // fcvt z16.h, { z4.s-z5.s }\n"
624 ".inst 0xc120e0d1 // fcvt z17.h, { z6.s-z7.s }\n"
625 ".inst 0xc172c26e // fclamp { z14.h-z15.h }, z19.h, z18.h\n"
626 ".inst 0xc172c270 // fclamp { z16.h-z17.h }, z19.h, z18.h\n"
627 ".inst 0xa060272a // st1h { z10.h-z11.h }, pn9.b, [x25]\n"
628 ".inst 0xa061272c // st1h { z12.h-z13.h }, pn9.b, [x25, #0x2, MUL VL]\n"
629 ".inst 0xa062272e // st1h { z14.h-z15.h }, pn9.b, [x25, #0x4, MUL VL]\n"
630 ".inst 0xa0632330 // st1h { z16.h-z17.h }, p8, [x25, #0x6, MUL VL]\n"
631 "addvl x25, x25, #8\n"
// Store without clamping.
634 ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
635 ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
636 ".inst 0xc0062c5c // mova { z28.d-z31.d }, za.d[x9, #2]\n"
637 ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n"
638 ".inst 0xc120e187 // fcvt z7.h, { z12.s-z13.s }\n"
639 ".inst 0xc120e1cf // fcvt z15.h, { z14.s-z15.s }\n"
640 ".inst 0xa1602727 // st1h { z7.h, z15.h }, pn9.b, [x25]\n"
641 ".inst 0xc120e207 // fcvt z7.h, { z16.s-z17.s }\n"
642 ".inst 0xc120e24f // fcvt z15.h, { z18.s-z19.s }\n"
643 ".inst 0xa1612727 // st1h { z7.h, z15.h }, pn9.b, [x25, #0x2, MUL VL]\n"
644 ".inst 0xc120e38e // fcvt z14.h, { z28.s-z29.s }\n"
645 ".inst 0xc120e3cf // fcvt z15.h, { z30.s-z31.s }\n"
646 ".inst 0xa062272e // st1h { z14.h-z15.h }, pn9.b, [x25, #0x4, MUL VL]\n"
647 ".inst 0xc120e112 // fcvt z18.h, { z8.s-z9.s }\n"
648 ".inst 0xc120e15a // fcvt z26.h, { z10.s-z11.s }\n"
649 ".inst 0xa1632332 // st1h { z18.h, z26.h }, p8, [x25, #0x6, MUL VL]\n"
650 "addvl x25, x25, #8\n"
// ---- Loop bookkeeping and epilogue -------------------------------------
// Four column blocks consumed: x26 -= 4 (flags set for a following branch
// that is not visible here) and N -= 4 * x28.
// NOTE(review): %x[N] is written here, so it must be a read-write operand;
// its declaration is not among the operands visible below - confirm.
652 "subs x26, x26, #0x4\n"
653 "sub %x[N], %x[N], x28, LSL #2\n"
// Leave SME state before returning to ordinary code.
656 ".inst 0xd503467f // SMSTOP\n"
// Input operands. offset_max / offset_min are compile-time immediates giving
// the positions of maxval / minval inside the argument block at args_ptr.
659 : [A_ptr]
"r" (A_ptr), [B_ptr]
"r" (B_ptr), [
K]
"r" (
K), [args_ptr]
"r" (&ka), [
bias]
"r" (
bias), [flags]
"r" (flags), [offset_max]
"I" (offsetof(KernelArgs, maxval)), [offset_min]
"I" (offsetof(KernelArgs, minval)), [output_ptr]
"r" (output_ptr)
// Clobbers: condition flags, memory, all predicate registers, the scratch
// general-purpose registers used above (x9, x20-x28) and all z registers.
660 :
"cc",
"memory",
"p0",
"p1",
"p2",
"p3",
"p4",
"p5",
"p6",
"p7",
"p8",
"p9",
"p10",
"p11",
"p12",
"p13",
"p14",
"p15",
"x9",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"z21",
"z22",
"z23",
"z24",
"z25",
"z26",
"z27",
"z28",
"z29",
"z30",
"z31"
666 #endif // defined(ARM_COMPUTE_ENABLE_SME2)