31 #include "../../asmlib.hpp"
32 #include "../../utils.hpp"
36 void a64_smallK_hybrid_u8u32_dot_6x4_a55(
const uint8_t *A,
int lda,
const uint8_t *B, uint32_t *C,
int ldc,
int M,
int N,
int K,
const uint32_t *,
Activation,
bool) {
37 const long loops_count =
iceildiv(
N, (
int)4) - 1;
38 const long ldab = lda *
sizeof(uint8_t);
39 const long ldcb = ldc *
sizeof(uint32_t);
40 const long odds_count =
K % 4;
43 for (
int y0=0; y0<
M; y0+=6) {
44 long loops = loops_count;
45 long oob_rows = std::max(6 - (
M-y0), 0);
46 long odds = odds_count;
47 const uint8_t *b_ptr0 =
B;
48 const uint8_t *a_ptr0 =
A + (y0 * lda);
50 uint32_t *c_ptr0 = C + (y0 * ldc);
65 "temploadreg0 .req X10\n"
66 "temploadreg1 .req X11\n"
67 "temploadreg2 .req X12\n"
68 "temploadreg3 .req X13\n"
69 "add a_ptr1, %[a_ptr0], %[lda]\n"
70 "add c_ptr1, %[c_ptr0], %[ldc]\n"
71 "add a_ptr2, a_ptr1, %[lda]\n"
72 "add c_ptr2, c_ptr1, %[ldc]\n"
73 "add a_ptr3, a_ptr2, %[lda]\n"
74 "add c_ptr3, c_ptr2, %[ldc]\n"
75 "add a_ptr4, a_ptr3, %[lda]\n"
76 "add c_ptr4, c_ptr3, %[ldc]\n"
77 "add a_ptr5, a_ptr4, %[lda]\n"
78 "add c_ptr5, c_ptr4, %[ldc]\n"
79 "cbz %[oob_rows], 1f\n"
80 "subs %[oob_rows], %[oob_rows], #0x1\n"
81 "add c_ptr5, %[c_ptr0], #0x0\n"
82 "add a_ptr5, %[a_ptr0], #0x0\n"
84 "subs %[oob_rows], %[oob_rows], #0x1\n"
85 "add c_ptr4, %[c_ptr0], #0x0\n"
86 "add a_ptr4, %[a_ptr0], #0x0\n"
88 "subs %[oob_rows], %[oob_rows], #0x1\n"
89 "add c_ptr3, %[c_ptr0], #0x0\n"
90 "add a_ptr3, %[a_ptr0], #0x0\n"
92 "subs %[oob_rows], %[oob_rows], #0x1\n"
93 "add c_ptr2, %[c_ptr0], #0x0\n"
94 "add a_ptr2, %[a_ptr0], #0x0\n"
96 "subs %[oob_rows], %[oob_rows], #0x1\n"
97 "add c_ptr1, %[c_ptr0], #0x0\n"
98 "add a_ptr1, %[a_ptr0], #0x0\n"
101 "ldr q0, [%[a_ptr0]], #0x10\n"
102 "ldr q3, [a_ptr1], #0x10\n"
103 "ldr q6, [a_ptr2], #0x10\n"
104 "ldr q9, [a_ptr3], #0x10\n"
105 "ldr q12, [a_ptr4], #0x10\n"
106 "ldr q15, [a_ptr5], #0x10\n"
107 "ldr q1, [%[a_ptr0]], #0x10\n"
108 "ldr q4, [a_ptr1], #0x10\n"
109 "ldr q7, [a_ptr2], #0x10\n"
110 "ldr q10, [a_ptr3], #0x10\n"
111 "ldr s2, [%[a_ptr0]]\n"
112 "ldr q13, [a_ptr4], #0x10\n"
114 "ldr q16, [a_ptr5], #0x10\n"
116 "ldr s11, [a_ptr3]\n"
117 "ldr s14, [a_ptr4]\n"
118 "ldr s17, [a_ptr5]\n"
121 "ldr q0, [%[a_ptr0]], #0x10\n"
122 "subs %[odds], %[odds], #0x1\n"
123 "ldr q3, [a_ptr1], #0x10\n"
124 "ldr q6, [a_ptr2], #0x10\n"
125 "ldr q9, [a_ptr3], #0x10\n"
126 "ldr q12, [a_ptr4], #0x10\n"
127 "ldr q15, [a_ptr5], #0x10\n"
128 "ldr q1, [%[a_ptr0]], #0x10\n"
129 "ldr q4, [a_ptr1], #0x10\n"
130 "ldr q7, [a_ptr2], #0x10\n"
131 "ldr q10, [a_ptr3], #0x10\n"
132 "ldr q13, [a_ptr4], #0x10\n"
133 "ldr q16, [a_ptr5], #0x10\n"
135 "ldr b2, [%[a_ptr0]]\n"
138 "ldr b11, [a_ptr3]\n"
139 "ldr b14, [a_ptr4]\n"
140 "ldr b17, [a_ptr5]\n"
143 "ldr h2, [%[a_ptr0]], #0x2\n"
144 "ldr h5, [a_ptr1], #0x2\n"
145 "ldr h8, [a_ptr2], #0x2\n"
146 "ldr h11, [a_ptr3], #0x2\n"
147 "ldr h14, [a_ptr4], #0x2\n"
148 "ldr h17, [a_ptr5], #0x2\n"
149 "subs %[odds], %[odds], #0x1\n"
153 "ld1 {v2.b}[2], [%[a_ptr0]]\n"
154 "ld1 {v5.b}[2], [a_ptr1]\n"
155 "ld1 {v8.b}[2], [a_ptr2]\n"
156 "ld1 {v11.b}[2], [a_ptr3]\n"
157 "ld1 {v14.b}[2], [a_ptr4]\n"
158 "ld1 {v17.b}[2], [a_ptr5]\n"
160 "ldr q18, [%[b_ptr0]]\n"
161 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
162 "ldr q19, [%[b_ptr0], #0x10]\n"
163 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
164 "ldr q20, [%[b_ptr0], #0x20]\n"
165 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
166 "ldr q21, [%[b_ptr0], #0x30]\n"
167 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
168 "ldr q22, [%[b_ptr0], #0x40]\n"
169 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
170 "ldr q23, [%[b_ptr0], #0x50]\n"
171 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
172 "ldr q24, [%[b_ptr0], #0x60]\n"
173 "ldr q25, [%[b_ptr0], #0x70]\n"
174 "add %[b_ptr0], %[b_ptr0], #0x80\n"
177 "subs %[loops], %[loops], #0x1\n"
183 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
184 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
185 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
186 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
187 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
188 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
189 "ldr q18, [%[b_ptr0]]\n"
190 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
191 "add %[b_ptr0], %[b_ptr0], #0x10\n"
192 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
193 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
194 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
195 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
196 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
197 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
198 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
199 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
200 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
201 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
202 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
203 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
204 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
205 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
206 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
207 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
208 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
209 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
210 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
211 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
212 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
213 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
214 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
215 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
216 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
217 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
218 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
219 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
220 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
221 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
222 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
223 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
224 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
225 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
226 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
227 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
228 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
229 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
230 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
231 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
232 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
233 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
234 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
235 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
236 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
237 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
238 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
241 "str q26, [%[c_ptr0]]\n"
242 "subs %[loops], %[loops], #0x1\n"
244 "ldr d18, [%[b_ptr0]]\n"
245 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
246 "add %[c_ptr0], %[c_ptr0], #0x10\n"
247 "str q27, [c_ptr1]\n"
248 "add c_ptr1, c_ptr1, #0x10\n"
250 "ldr d19, [%[b_ptr0], #0x10]\n"
251 "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
252 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
253 "str q28, [c_ptr2]\n"
254 "add c_ptr2, c_ptr2, #0x10\n"
256 "ldr d20, [%[b_ptr0], #0x20]\n"
257 "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
258 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
259 "str q29, [c_ptr3]\n"
260 "add c_ptr3, c_ptr3, #0x10\n"
262 "ldr d21, [%[b_ptr0], #0x30]\n"
263 "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
264 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
265 "str q30, [c_ptr4]\n"
266 "add c_ptr4, c_ptr4, #0x10\n"
268 "ldr d22, [%[b_ptr0], #0x40]\n"
269 "ins v18.d[1], temploadreg2\n"
270 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
271 "str q31, [c_ptr5]\n"
272 "add c_ptr5, c_ptr5, #0x10\n"
274 "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
275 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
276 "ldr d23, [%[b_ptr0], #0x50]\n"
277 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
278 "ins v19.d[1], temploadreg3\n"
279 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
280 "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
281 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
282 "ldr d24, [%[b_ptr0], #0x60]\n"
283 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
284 "ins v20.d[1], temploadreg0\n"
285 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
286 "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
287 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
288 "ldr d25, [%[b_ptr0], #0x70]\n"
289 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
290 "ins v21.d[1], temploadreg1\n"
291 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
292 "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
293 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
294 "ins v22.d[1], temploadreg2\n"
295 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
296 "ins v23.d[1], temploadreg3\n"
297 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
298 "ins v24.d[1], temploadreg0\n"
299 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
300 "ins v25.d[1], temploadreg1\n"
301 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
302 "add %[b_ptr0], %[b_ptr0], #0x80\n"
303 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
304 "ldr d18, [%[b_ptr0]]\n"
305 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
306 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
307 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
308 "add %[b_ptr0], %[b_ptr0], #0x10\n"
309 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
310 "ins v18.d[1], temploadreg2\n"
311 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
312 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
313 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
314 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
315 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
316 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
317 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
318 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
319 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
320 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
321 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
322 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
323 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
324 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
325 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
326 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
327 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
328 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
329 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
330 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
331 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
332 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
333 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
334 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
335 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
336 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
337 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
338 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
339 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
340 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
341 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
342 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
343 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
344 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
345 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
346 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
347 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
348 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
351 "str q26, [%[c_ptr0]]\n"
352 "add %[c_ptr0], %[c_ptr0], #0x10\n"
354 "ldr q18, [%[b_ptr0]]\n"
355 "ldr q19, [%[b_ptr0], #0x10]\n"
356 "str q27, [c_ptr1]\n"
357 "add c_ptr1, c_ptr1, #0x10\n"
359 "ldr q20, [%[b_ptr0], #0x20]\n"
360 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
361 "str q28, [c_ptr2]\n"
363 "ldr q21, [%[b_ptr0], #0x30]\n"
364 "ldr q22, [%[b_ptr0], #0x40]\n"
365 "add c_ptr2, c_ptr2, #0x10\n"
366 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
367 "str q29, [c_ptr3]\n"
369 "ldr q23, [%[b_ptr0], #0x50]\n"
370 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
371 "ldr q24, [%[b_ptr0], #0x60]\n"
372 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
373 "str q30, [c_ptr4]\n"
375 "ldr q25, [%[b_ptr0], #0x70]\n"
376 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
377 "add c_ptr3, c_ptr3, #0x10\n"
378 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
379 "str q31, [c_ptr5]\n"
381 "add c_ptr4, c_ptr4, #0x10\n"
382 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
383 "add c_ptr5, c_ptr5, #0x10\n"
384 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
385 "add %[b_ptr0], %[b_ptr0], #0x80\n"
386 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
387 "ldr q18, [%[b_ptr0]]\n"
388 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
389 "add %[b_ptr0], %[b_ptr0], #0x10\n"
390 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
391 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
392 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
393 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
394 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
395 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
396 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
397 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
398 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
399 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
400 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
401 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
402 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
403 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
404 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
405 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
406 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
407 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
408 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
409 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
410 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
411 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
412 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
413 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
414 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
415 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
416 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
417 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
418 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
419 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
420 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
421 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
422 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
423 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
424 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
425 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
426 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
427 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
428 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
429 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
430 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
431 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
432 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
433 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
442 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
443 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
444 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
445 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
446 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
447 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
448 "ldr q18, [%[b_ptr0]]\n"
449 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
450 "add %[b_ptr0], %[b_ptr0], #0x10\n"
451 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
452 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
453 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
454 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
455 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
456 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
457 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
458 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
459 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
460 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
461 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
462 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
463 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
464 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
465 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
466 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
467 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
468 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
469 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
470 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
471 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
472 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
473 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
474 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
475 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
476 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
477 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
478 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
479 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
480 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
481 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
482 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
483 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
484 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
485 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
486 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
487 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
488 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
489 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
490 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
491 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
492 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
493 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
494 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
495 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
496 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
497 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
499 "str q26, [%[c_ptr0]]\n"
500 "add %[c_ptr0], %[c_ptr0], #0x10\n"
501 "str q27, [c_ptr1]\n"
502 "str q28, [c_ptr2]\n"
503 "str q29, [c_ptr3]\n"
504 "str q30, [c_ptr4]\n"
505 "str q31, [c_ptr5]\n"
516 ".unreq temploadreg0\n"
517 ".unreq temploadreg1\n"
518 ".unreq temploadreg2\n"
519 ".unreq temploadreg3\n"
520 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
521 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
522 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
537 "temploadreg0 .req X10\n"
538 "temploadreg1 .req X11\n"
539 "temploadreg2 .req X12\n"
540 "temploadreg3 .req X13\n"
541 "add a_ptr1, %[a_ptr0], %[lda]\n"
542 "add c_ptr1, %[c_ptr0], %[ldc]\n"
543 "add a_ptr2, a_ptr1, %[lda]\n"
544 "add c_ptr2, c_ptr1, %[ldc]\n"
545 "add a_ptr3, a_ptr2, %[lda]\n"
546 "add c_ptr3, c_ptr2, %[ldc]\n"
547 "add a_ptr4, a_ptr3, %[lda]\n"
548 "add c_ptr4, c_ptr3, %[ldc]\n"
549 "add a_ptr5, a_ptr4, %[lda]\n"
550 "add c_ptr5, c_ptr4, %[ldc]\n"
551 "cbz %[oob_rows], 1f\n"
552 "subs %[oob_rows], %[oob_rows], #0x1\n"
553 "add c_ptr5, %[c_ptr0], #0x0\n"
554 "add a_ptr5, %[a_ptr0], #0x0\n"
556 "subs %[oob_rows], %[oob_rows], #0x1\n"
557 "add c_ptr4, %[c_ptr0], #0x0\n"
558 "add a_ptr4, %[a_ptr0], #0x0\n"
560 "subs %[oob_rows], %[oob_rows], #0x1\n"
561 "add c_ptr3, %[c_ptr0], #0x0\n"
562 "add a_ptr3, %[a_ptr0], #0x0\n"
564 "subs %[oob_rows], %[oob_rows], #0x1\n"
565 "add c_ptr2, %[c_ptr0], #0x0\n"
566 "add a_ptr2, %[a_ptr0], #0x0\n"
568 "subs %[oob_rows], %[oob_rows], #0x1\n"
569 "add c_ptr1, %[c_ptr0], #0x0\n"
570 "add a_ptr1, %[a_ptr0], #0x0\n"
573 "ldr q0, [%[a_ptr0]], #0x10\n"
574 "ldr q3, [a_ptr1], #0x10\n"
575 "ldr q6, [a_ptr2], #0x10\n"
576 "ldr q9, [a_ptr3], #0x10\n"
577 "ldr q12, [a_ptr4], #0x10\n"
578 "ldr q15, [a_ptr5], #0x10\n"
579 "ldr q1, [%[a_ptr0]], #0x10\n"
580 "ldr q4, [a_ptr1], #0x10\n"
581 "ldr q7, [a_ptr2], #0x10\n"
582 "ldr q10, [a_ptr3], #0x10\n"
583 "ldr d2, [%[a_ptr0]]\n"
584 "ldr q13, [a_ptr4], #0x10\n"
586 "ldr q16, [a_ptr5], #0x10\n"
588 "ldr d11, [a_ptr3]\n"
589 "ldr d14, [a_ptr4]\n"
590 "ldr d17, [a_ptr5]\n"
593 "ldr q0, [%[a_ptr0]], #0x10\n"
594 "subs %[odds], %[odds], #0x1\n"
595 "ldr q3, [a_ptr1], #0x10\n"
596 "ldr q6, [a_ptr2], #0x10\n"
597 "ldr q9, [a_ptr3], #0x10\n"
598 "ldr q12, [a_ptr4], #0x10\n"
599 "ldr q15, [a_ptr5], #0x10\n"
600 "ldr q1, [%[a_ptr0]], #0x10\n"
601 "ldr q4, [a_ptr1], #0x10\n"
602 "ldr q7, [a_ptr2], #0x10\n"
603 "ldr q10, [a_ptr3], #0x10\n"
604 "ldr s2, [%[a_ptr0]], #0x4\n"
605 "ldr q13, [a_ptr4], #0x10\n"
606 "ldr s5, [a_ptr1], #0x4\n"
607 "ldr q16, [a_ptr5], #0x10\n"
608 "ldr s8, [a_ptr2], #0x4\n"
609 "ldr s11, [a_ptr3], #0x4\n"
610 "ldr s14, [a_ptr4], #0x4\n"
611 "ldr s17, [a_ptr5], #0x4\n"
613 "ld1 {v2.b}[4], [%[a_ptr0]]\n"
614 "ld1 {v5.b}[4], [a_ptr1]\n"
615 "ld1 {v8.b}[4], [a_ptr2]\n"
616 "ld1 {v11.b}[4], [a_ptr3]\n"
617 "ld1 {v14.b}[4], [a_ptr4]\n"
618 "ld1 {v17.b}[4], [a_ptr5]\n"
621 "ld1 {v2.h}[2], [%[a_ptr0]], #2\n"
622 "ld1 {v5.h}[2], [a_ptr1], #2\n"
623 "ld1 {v8.h}[2], [a_ptr2], #2\n"
624 "ld1 {v11.h}[2], [a_ptr3], #2\n"
625 "ld1 {v14.h}[2], [a_ptr4], #2\n"
626 "ld1 {v17.h}[2], [a_ptr5], #2\n"
627 "subs %[odds], %[odds], #0x1\n"
631 "ld1 {v2.b}[6], [%[a_ptr0]]\n"
632 "ld1 {v5.b}[6], [a_ptr1]\n"
633 "ld1 {v8.b}[6], [a_ptr2]\n"
634 "ld1 {v11.b}[6], [a_ptr3]\n"
635 "ld1 {v14.b}[6], [a_ptr4]\n"
636 "ld1 {v17.b}[6], [a_ptr5]\n"
638 "ldr q18, [%[b_ptr0]]\n"
639 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
640 "ldr q19, [%[b_ptr0], #0x10]\n"
641 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
642 "ldr q20, [%[b_ptr0], #0x20]\n"
643 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
644 "ldr q21, [%[b_ptr0], #0x30]\n"
645 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
646 "ldr q22, [%[b_ptr0], #0x40]\n"
647 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
648 "ldr q23, [%[b_ptr0], #0x50]\n"
649 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
650 "ldr q24, [%[b_ptr0], #0x60]\n"
651 "ldr q25, [%[b_ptr0], #0x70]\n"
652 "add %[b_ptr0], %[b_ptr0], #0x80\n"
655 "subs %[loops], %[loops], #0x1\n"
661 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
662 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
663 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
664 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
665 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
666 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
667 "ldr q18, [%[b_ptr0]]\n"
668 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
669 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
670 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
671 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
672 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
673 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
674 "ldr q19, [%[b_ptr0], #0x10]\n"
675 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
676 "add %[b_ptr0], %[b_ptr0], #0x20\n"
677 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
678 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
679 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
680 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
681 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
682 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
683 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
684 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
685 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
686 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
687 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
688 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
689 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
690 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
691 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
692 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
693 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
694 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
695 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
696 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
697 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
698 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
699 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
700 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
701 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
702 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
703 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
704 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
705 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
706 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
707 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
708 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
709 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
710 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
711 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
712 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
713 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
714 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
715 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
716 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
717 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
718 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
719 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
720 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
721 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
722 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
723 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
726 "str q26, [%[c_ptr0]]\n"
727 "subs %[loops], %[loops], #0x1\n"
729 "ldr d18, [%[b_ptr0]]\n"
730 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
731 "add %[c_ptr0], %[c_ptr0], #0x10\n"
732 "str q27, [c_ptr1]\n"
733 "add c_ptr1, c_ptr1, #0x10\n"
735 "ldr d19, [%[b_ptr0], #0x10]\n"
736 "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
737 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
738 "str q28, [c_ptr2]\n"
739 "add c_ptr2, c_ptr2, #0x10\n"
741 "ldr d20, [%[b_ptr0], #0x20]\n"
742 "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
743 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
744 "str q29, [c_ptr3]\n"
745 "add c_ptr3, c_ptr3, #0x10\n"
747 "ldr d21, [%[b_ptr0], #0x30]\n"
748 "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
749 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
750 "str q30, [c_ptr4]\n"
751 "add c_ptr4, c_ptr4, #0x10\n"
753 "ldr d22, [%[b_ptr0], #0x40]\n"
754 "ins v18.d[1], temploadreg2\n"
755 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
756 "str q31, [c_ptr5]\n"
757 "add c_ptr5, c_ptr5, #0x10\n"
759 "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
760 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
761 "ldr d23, [%[b_ptr0], #0x50]\n"
762 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
763 "ins v19.d[1], temploadreg3\n"
764 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
765 "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
766 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
767 "ldr d24, [%[b_ptr0], #0x60]\n"
768 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
769 "ins v20.d[1], temploadreg0\n"
770 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
771 "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
772 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
773 "ldr d25, [%[b_ptr0], #0x70]\n"
774 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
775 "ins v21.d[1], temploadreg1\n"
776 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
777 "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
778 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
779 "ins v22.d[1], temploadreg2\n"
780 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
781 "ins v23.d[1], temploadreg3\n"
782 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
783 "ins v24.d[1], temploadreg0\n"
784 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
785 "ins v25.d[1], temploadreg1\n"
786 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
787 "add %[b_ptr0], %[b_ptr0], #0x80\n"
788 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
789 "ldr d18, [%[b_ptr0]]\n"
790 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
791 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
792 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
793 "ldr d19, [%[b_ptr0], #0x10]\n"
794 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
795 "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
796 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
797 "ins v18.d[1], temploadreg2\n"
798 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
799 "add %[b_ptr0], %[b_ptr0], #0x20\n"
800 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
801 "ins v19.d[1], temploadreg3\n"
802 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
803 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
804 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
805 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
806 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
807 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
808 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
809 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
810 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
811 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
812 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
813 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
814 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
815 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
816 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
817 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
818 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
819 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
820 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
821 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
822 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
823 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
824 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
825 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
826 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
827 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
828 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
829 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
830 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
831 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
832 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
833 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
834 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
835 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
836 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
837 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
838 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
839 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
840 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
841 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
842 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
845 "str q26, [%[c_ptr0]]\n"
846 "add %[c_ptr0], %[c_ptr0], #0x10\n"
848 "ldr q18, [%[b_ptr0]]\n"
849 "ldr q19, [%[b_ptr0], #0x10]\n"
850 "str q27, [c_ptr1]\n"
851 "add c_ptr1, c_ptr1, #0x10\n"
853 "ldr q20, [%[b_ptr0], #0x20]\n"
854 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
855 "str q28, [c_ptr2]\n"
857 "ldr q21, [%[b_ptr0], #0x30]\n"
858 "ldr q22, [%[b_ptr0], #0x40]\n"
859 "add c_ptr2, c_ptr2, #0x10\n"
860 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
861 "str q29, [c_ptr3]\n"
863 "ldr q23, [%[b_ptr0], #0x50]\n"
864 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
865 "ldr q24, [%[b_ptr0], #0x60]\n"
866 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
867 "str q30, [c_ptr4]\n"
869 "ldr q25, [%[b_ptr0], #0x70]\n"
870 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
871 "add c_ptr3, c_ptr3, #0x10\n"
872 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
873 "str q31, [c_ptr5]\n"
875 "add c_ptr4, c_ptr4, #0x10\n"
876 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
877 "add c_ptr5, c_ptr5, #0x10\n"
878 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
879 "add %[b_ptr0], %[b_ptr0], #0x80\n"
880 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
881 "ldr q18, [%[b_ptr0]]\n"
882 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
883 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
884 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
885 "ldr q19, [%[b_ptr0], #0x10]\n"
886 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
887 "add %[b_ptr0], %[b_ptr0], #0x20\n"
888 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
889 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
890 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
891 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
892 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
893 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
894 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
895 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
896 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
897 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
898 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
899 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
900 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
901 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
902 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
903 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
904 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
905 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
906 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
907 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
908 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
909 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
910 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
911 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
912 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
913 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
914 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
915 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
916 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
917 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
918 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
919 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
920 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
921 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
922 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
923 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
924 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
925 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
926 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
927 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
928 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
929 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
930 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
931 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
932 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
933 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
934 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
943 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
944 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
945 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
946 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
947 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
948 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
949 "ldr q18, [%[b_ptr0]]\n"
950 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
951 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
952 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
953 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
954 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
955 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
956 "ldr q19, [%[b_ptr0], #0x10]\n"
957 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
958 "add %[b_ptr0], %[b_ptr0], #0x20\n"
959 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
960 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
961 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
962 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
963 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
964 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
965 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
966 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
967 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
968 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
969 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
970 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
971 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
972 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
973 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
974 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
975 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
976 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
977 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
978 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
979 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
980 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
981 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
982 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
983 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
984 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
985 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
986 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
987 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
988 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
989 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
990 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
991 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
992 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
993 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
994 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
995 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
996 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
997 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
998 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
999 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1000 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1001 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1002 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1003 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1004 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1005 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1007 "str q26, [%[c_ptr0]]\n"
1008 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1009 "str q27, [c_ptr1]\n"
1010 "str q28, [c_ptr2]\n"
1011 "str q29, [c_ptr3]\n"
1012 "str q30, [c_ptr4]\n"
1013 "str q31, [c_ptr5]\n"
1024 ".unreq temploadreg0\n"
1025 ".unreq temploadreg1\n"
1026 ".unreq temploadreg2\n"
1027 ".unreq temploadreg3\n"
1028 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
1029 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
1030 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1045 "temploadreg0 .req X10\n"
1046 "temploadreg1 .req X11\n"
1047 "temploadreg2 .req X12\n"
1048 "temploadreg3 .req X13\n"
1049 "add a_ptr1, %[a_ptr0], %[lda]\n"
1050 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1051 "add a_ptr2, a_ptr1, %[lda]\n"
1052 "add c_ptr2, c_ptr1, %[ldc]\n"
1053 "add a_ptr3, a_ptr2, %[lda]\n"
1054 "add c_ptr3, c_ptr2, %[ldc]\n"
1055 "add a_ptr4, a_ptr3, %[lda]\n"
1056 "add c_ptr4, c_ptr3, %[ldc]\n"
1057 "add a_ptr5, a_ptr4, %[lda]\n"
1058 "add c_ptr5, c_ptr4, %[ldc]\n"
1059 "cbz %[oob_rows], 1f\n"
1060 "subs %[oob_rows], %[oob_rows], #0x1\n"
1061 "add c_ptr5, %[c_ptr0], #0x0\n"
1062 "add a_ptr5, %[a_ptr0], #0x0\n"
1064 "subs %[oob_rows], %[oob_rows], #0x1\n"
1065 "add c_ptr4, %[c_ptr0], #0x0\n"
1066 "add a_ptr4, %[a_ptr0], #0x0\n"
1068 "subs %[oob_rows], %[oob_rows], #0x1\n"
1069 "add c_ptr3, %[c_ptr0], #0x0\n"
1070 "add a_ptr3, %[a_ptr0], #0x0\n"
1072 "subs %[oob_rows], %[oob_rows], #0x1\n"
1073 "add c_ptr2, %[c_ptr0], #0x0\n"
1074 "add a_ptr2, %[a_ptr0], #0x0\n"
1076 "subs %[oob_rows], %[oob_rows], #0x1\n"
1077 "add c_ptr1, %[c_ptr0], #0x0\n"
1078 "add a_ptr1, %[a_ptr0], #0x0\n"
1080 "ldr q0, [%[a_ptr0]], #0x10\n"
1081 "ldr q3, [a_ptr1], #0x10\n"
1082 "ldr q6, [a_ptr2], #0x10\n"
1083 "ldr q9, [a_ptr3], #0x10\n"
1084 "ldr q12, [a_ptr4], #0x10\n"
1085 "ldr q15, [a_ptr5], #0x10\n"
1086 "ldr q1, [%[a_ptr0]], #0x10\n"
1087 "ldr q4, [a_ptr1], #0x10\n"
1088 "ldr q7, [a_ptr2], #0x10\n"
1089 "ldr q10, [a_ptr3], #0x10\n"
1090 "ldr d2, [%[a_ptr0]], #0x8\n"
1091 "ldr q13, [a_ptr4], #0x10\n"
1092 "ldr d5, [a_ptr1], #0x8\n"
1093 "ldr q16, [a_ptr5], #0x10\n"
1094 "ldr d8, [a_ptr2], #0x8\n"
1095 "ldr d11, [a_ptr3], #0x8\n"
1096 "ldr d14, [a_ptr4], #0x8\n"
1097 "ldr d17, [a_ptr5], #0x8\n"
1098 "cbnz %[odds], 2f\n"
1099 "ld1 {v2.s}[2], [%[a_ptr0]]\n"
1100 "ld1 {v5.s}[2], [a_ptr1]\n"
1101 "ld1 {v8.s}[2], [a_ptr2]\n"
1102 "ld1 {v11.s}[2], [a_ptr3]\n"
1103 "ld1 {v14.s}[2], [a_ptr4]\n"
1104 "ld1 {v17.s}[2], [a_ptr5]\n"
1107 "subs %[odds], %[odds], #0x1\n"
1109 "ld1 {v2.b}[8], [%[a_ptr0]]\n"
1110 "ld1 {v5.b}[8], [a_ptr1]\n"
1111 "ld1 {v8.b}[8], [a_ptr2]\n"
1112 "ld1 {v11.b}[8], [a_ptr3]\n"
1113 "ld1 {v14.b}[8], [a_ptr4]\n"
1114 "ld1 {v17.b}[8], [a_ptr5]\n"
1117 "ld1 {v2.h}[4], [%[a_ptr0]], #2\n"
1118 "ld1 {v5.h}[4], [a_ptr1], #2\n"
1119 "ld1 {v8.h}[4], [a_ptr2], #2\n"
1120 "ld1 {v11.h}[4], [a_ptr3], #2\n"
1121 "ld1 {v14.h}[4], [a_ptr4], #2\n"
1122 "ld1 {v17.h}[4], [a_ptr5], #2\n"
1123 "subs %[odds], %[odds], #0x1\n"
1127 "ld1 {v2.b}[10], [%[a_ptr0]]\n"
1128 "ld1 {v5.b}[10], [a_ptr1]\n"
1129 "ld1 {v8.b}[10], [a_ptr2]\n"
1130 "ld1 {v11.b}[10], [a_ptr3]\n"
1131 "ld1 {v14.b}[10], [a_ptr4]\n"
1132 "ld1 {v17.b}[10], [a_ptr5]\n"
1134 "ldr q18, [%[b_ptr0]]\n"
1135 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
1136 "ldr q19, [%[b_ptr0], #0x10]\n"
1137 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
1138 "ldr q20, [%[b_ptr0], #0x20]\n"
1139 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
1140 "ldr q21, [%[b_ptr0], #0x30]\n"
1141 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
1142 "ldr q22, [%[b_ptr0], #0x40]\n"
1143 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
1144 "ldr q23, [%[b_ptr0], #0x50]\n"
1145 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
1146 "ldr q24, [%[b_ptr0], #0x60]\n"
1147 "ldr q25, [%[b_ptr0], #0x70]\n"
1148 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1149 "cbz %[loops], 6f\n"
1151 "subs %[loops], %[loops], #0x1\n"
1157 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1158 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1159 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1160 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1161 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1162 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1163 "ldr q18, [%[b_ptr0]]\n"
1164 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1165 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1166 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1167 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1168 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1169 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1170 "ldr q19, [%[b_ptr0], #0x10]\n"
1171 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1172 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1173 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1174 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1175 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1176 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1177 "ldr q20, [%[b_ptr0], #0x20]\n"
1178 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1179 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1180 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1181 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1182 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1183 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1184 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1185 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1186 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1187 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1188 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1189 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1190 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1191 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1192 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1193 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1194 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1195 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1196 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1197 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1198 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1199 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1200 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1201 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1202 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1203 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1204 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1205 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1206 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1207 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1208 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1209 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1210 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1211 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1212 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1213 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1214 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1215 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1216 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1217 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1218 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1219 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1220 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1221 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1222 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1223 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1224 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1225 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1226 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1229 "str q26, [%[c_ptr0]]\n"
1230 "subs %[loops], %[loops], #0x1\n"
1232 "ldr d18, [%[b_ptr0]]\n"
1233 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
1234 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1235 "str q27, [c_ptr1]\n"
1236 "add c_ptr1, c_ptr1, #0x10\n"
1238 "ldr d19, [%[b_ptr0], #0x10]\n"
1239 "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
1240 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1241 "str q28, [c_ptr2]\n"
1242 "add c_ptr2, c_ptr2, #0x10\n"
1244 "ldr d20, [%[b_ptr0], #0x20]\n"
1245 "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
1246 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1247 "str q29, [c_ptr3]\n"
1248 "add c_ptr3, c_ptr3, #0x10\n"
1250 "ldr d21, [%[b_ptr0], #0x30]\n"
1251 "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
1252 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1253 "str q30, [c_ptr4]\n"
1254 "add c_ptr4, c_ptr4, #0x10\n"
1256 "ldr d22, [%[b_ptr0], #0x40]\n"
1257 "ins v18.d[1], temploadreg2\n"
1258 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1259 "str q31, [c_ptr5]\n"
1260 "add c_ptr5, c_ptr5, #0x10\n"
1262 "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
1263 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1264 "ldr d23, [%[b_ptr0], #0x50]\n"
1265 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1266 "ins v19.d[1], temploadreg3\n"
1267 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1268 "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
1269 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1270 "ldr d24, [%[b_ptr0], #0x60]\n"
1271 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1272 "ins v20.d[1], temploadreg0\n"
1273 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1274 "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
1275 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1276 "ldr d25, [%[b_ptr0], #0x70]\n"
1277 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1278 "ins v21.d[1], temploadreg1\n"
1279 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1280 "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
1281 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1282 "ins v22.d[1], temploadreg2\n"
1283 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1284 "ins v23.d[1], temploadreg3\n"
1285 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1286 "ins v24.d[1], temploadreg0\n"
1287 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1288 "ins v25.d[1], temploadreg1\n"
1289 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1290 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1291 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1292 "ldr d18, [%[b_ptr0]]\n"
1293 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1294 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
1295 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1296 "ldr d19, [%[b_ptr0], #0x10]\n"
1297 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1298 "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
1299 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1300 "ldr d20, [%[b_ptr0], #0x20]\n"
1301 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1302 "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
1303 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1304 "ins v18.d[1], temploadreg2\n"
1305 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1306 "ins v19.d[1], temploadreg3\n"
1307 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1308 "ins v20.d[1], temploadreg0\n"
1309 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1310 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1311 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1312 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1313 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1314 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1315 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1316 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1317 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1318 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1319 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1320 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1321 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1322 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1323 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1324 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1325 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1326 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1327 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1328 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1329 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1330 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1331 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1332 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1333 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1334 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1335 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1336 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1337 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1338 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1339 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1340 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1341 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1342 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1343 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1344 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1345 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1346 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1347 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1348 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1349 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1350 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1351 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1352 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1353 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1354 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1357 "str q26, [%[c_ptr0]]\n"
1358 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1360 "ldr q18, [%[b_ptr0]]\n"
1361 "ldr q19, [%[b_ptr0], #0x10]\n"
1362 "str q27, [c_ptr1]\n"
1363 "add c_ptr1, c_ptr1, #0x10\n"
1365 "ldr q20, [%[b_ptr0], #0x20]\n"
1366 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1367 "str q28, [c_ptr2]\n"
1369 "ldr q21, [%[b_ptr0], #0x30]\n"
1370 "ldr q22, [%[b_ptr0], #0x40]\n"
1371 "add c_ptr2, c_ptr2, #0x10\n"
1372 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1373 "str q29, [c_ptr3]\n"
1375 "ldr q23, [%[b_ptr0], #0x50]\n"
1376 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1377 "ldr q24, [%[b_ptr0], #0x60]\n"
1378 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1379 "str q30, [c_ptr4]\n"
1381 "ldr q25, [%[b_ptr0], #0x70]\n"
1382 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1383 "add c_ptr3, c_ptr3, #0x10\n"
1384 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1385 "str q31, [c_ptr5]\n"
1387 "add c_ptr4, c_ptr4, #0x10\n"
1388 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1389 "add c_ptr5, c_ptr5, #0x10\n"
1390 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1391 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1392 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1393 "ldr q18, [%[b_ptr0]]\n"
1394 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1395 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1396 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1397 "ldr q19, [%[b_ptr0], #0x10]\n"
1398 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1399 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1400 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1401 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1402 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1403 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1404 "ldr q20, [%[b_ptr0], #0x20]\n"
1405 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1406 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1407 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1408 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1409 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1410 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1411 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1412 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1413 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1414 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1415 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1416 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1417 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1418 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1419 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1420 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1421 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1422 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1423 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1424 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1425 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1426 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1427 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1428 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1429 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1430 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1431 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1432 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1433 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1434 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1435 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1436 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1437 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1438 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1439 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1440 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1441 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1442 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1443 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1444 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1445 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1446 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1447 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1448 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1449 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1450 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1451 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1452 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1453 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1462 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1463 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1464 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1465 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1466 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1467 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1468 "ldr q18, [%[b_ptr0]]\n"
1469 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1470 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1471 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1472 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1473 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1474 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1475 "ldr q19, [%[b_ptr0], #0x10]\n"
1476 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1477 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1478 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1479 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1480 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1481 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1482 "ldr q20, [%[b_ptr0], #0x20]\n"
1483 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1484 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1485 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1486 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1487 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1488 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1489 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1490 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1491 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1492 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1493 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1494 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1495 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1496 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1497 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1498 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1499 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1500 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1501 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1502 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1503 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1504 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1505 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1506 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1507 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1508 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1509 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1510 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1511 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1512 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1513 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1514 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1515 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1516 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1517 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1518 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1519 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1520 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1521 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1522 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1523 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1524 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1525 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1526 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1527 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1528 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1529 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1530 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1531 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1533 "str q26, [%[c_ptr0]]\n"
1534 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1535 "str q27, [c_ptr1]\n"
1536 "str q28, [c_ptr2]\n"
1537 "str q29, [c_ptr3]\n"
1538 "str q30, [c_ptr4]\n"
1539 "str q31, [c_ptr5]\n"
1550 ".unreq temploadreg0\n"
1551 ".unreq temploadreg1\n"
1552 ".unreq temploadreg2\n"
1553 ".unreq temploadreg3\n"
1554 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
1555 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
1556 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1571 "temploadreg0 .req X10\n"
1572 "temploadreg1 .req X11\n"
1573 "temploadreg2 .req X12\n"
1574 "temploadreg3 .req X13\n"
1575 "add a_ptr1, %[a_ptr0], %[lda]\n"
1576 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1577 "add a_ptr2, a_ptr1, %[lda]\n"
1578 "add c_ptr2, c_ptr1, %[ldc]\n"
1579 "add a_ptr3, a_ptr2, %[lda]\n"
1580 "add c_ptr3, c_ptr2, %[ldc]\n"
1581 "add a_ptr4, a_ptr3, %[lda]\n"
1582 "add c_ptr4, c_ptr3, %[ldc]\n"
1583 "add a_ptr5, a_ptr4, %[lda]\n"
1584 "add c_ptr5, c_ptr4, %[ldc]\n"
1585 "cbz %[oob_rows], 1f\n"
1586 "subs %[oob_rows], %[oob_rows], #0x1\n"
1587 "add c_ptr5, %[c_ptr0], #0x0\n"
1588 "add a_ptr5, %[a_ptr0], #0x0\n"
1590 "subs %[oob_rows], %[oob_rows], #0x1\n"
1591 "add c_ptr4, %[c_ptr0], #0x0\n"
1592 "add a_ptr4, %[a_ptr0], #0x0\n"
1594 "subs %[oob_rows], %[oob_rows], #0x1\n"
1595 "add c_ptr3, %[c_ptr0], #0x0\n"
1596 "add a_ptr3, %[a_ptr0], #0x0\n"
1598 "subs %[oob_rows], %[oob_rows], #0x1\n"
1599 "add c_ptr2, %[c_ptr0], #0x0\n"
1600 "add a_ptr2, %[a_ptr0], #0x0\n"
1602 "subs %[oob_rows], %[oob_rows], #0x1\n"
1603 "add c_ptr1, %[c_ptr0], #0x0\n"
1604 "add a_ptr1, %[a_ptr0], #0x0\n"
1606 "cbnz %[odds], 2f\n"
1607 "ldr q0, [%[a_ptr0]], #0x10\n"
1608 "ldr q3, [a_ptr1], #0x10\n"
1609 "ldr q6, [a_ptr2], #0x10\n"
1610 "ldr q9, [a_ptr3], #0x10\n"
1611 "ldr q12, [a_ptr4], #0x10\n"
1612 "ldr q15, [a_ptr5], #0x10\n"
1613 "ldr q1, [%[a_ptr0]], #0x10\n"
1614 "ldr q4, [a_ptr1], #0x10\n"
1615 "ldr q7, [a_ptr2], #0x10\n"
1616 "ldr q10, [a_ptr3], #0x10\n"
1617 "ldr q13, [a_ptr4], #0x10\n"
1618 "ldr q16, [a_ptr5], #0x10\n"
1619 "ldr q2, [%[a_ptr0]]\n"
1620 "ldr q5, [a_ptr1]\n"
1621 "ldr q8, [a_ptr2]\n"
1622 "ldr q11, [a_ptr3]\n"
1623 "ldr q14, [a_ptr4]\n"
1624 "ldr q17, [a_ptr5]\n"
1627 "ldr q0, [%[a_ptr0]], #0x10\n"
1628 "subs %[odds], %[odds], #0x1\n"
1629 "ldr q3, [a_ptr1], #0x10\n"
1630 "ldr q6, [a_ptr2], #0x10\n"
1631 "ldr q9, [a_ptr3], #0x10\n"
1632 "ldr q12, [a_ptr4], #0x10\n"
1633 "ldr q15, [a_ptr5], #0x10\n"
1634 "ldr q1, [%[a_ptr0]], #0x10\n"
1635 "ldr q4, [a_ptr1], #0x10\n"
1636 "ldr q7, [a_ptr2], #0x10\n"
1637 "ldr q10, [a_ptr3], #0x10\n"
1638 "ldr d2, [%[a_ptr0]], #0x8\n"
1639 "ldr q13, [a_ptr4], #0x10\n"
1640 "ldr d5, [a_ptr1], #0x8\n"
1641 "ldr q16, [a_ptr5], #0x10\n"
1642 "ldr d8, [a_ptr2], #0x8\n"
1643 "ldr d11, [a_ptr3], #0x8\n"
1644 "ldr d14, [a_ptr4], #0x8\n"
1645 "ldr d17, [a_ptr5], #0x8\n"
1646 "ld1 {v2.s}[2], [%[a_ptr0]], #4\n"
1647 "ld1 {v5.s}[2], [a_ptr1], #4\n"
1648 "ld1 {v8.s}[2], [a_ptr2], #4\n"
1649 "ld1 {v11.s}[2], [a_ptr3], #4\n"
1650 "ld1 {v14.s}[2], [a_ptr4], #4\n"
1651 "ld1 {v17.s}[2], [a_ptr5], #4\n"
1653 "ld1 {v2.b}[12], [%[a_ptr0]]\n"
1654 "ld1 {v5.b}[12], [a_ptr1]\n"
1655 "ld1 {v8.b}[12], [a_ptr2]\n"
1656 "ld1 {v11.b}[12], [a_ptr3]\n"
1657 "ld1 {v14.b}[12], [a_ptr4]\n"
1658 "ld1 {v17.b}[12], [a_ptr5]\n"
1661 "ld1 {v2.h}[6], [%[a_ptr0]], #2\n"
1662 "ld1 {v5.h}[6], [a_ptr1], #2\n"
1663 "ld1 {v8.h}[6], [a_ptr2], #2\n"
1664 "ld1 {v11.h}[6], [a_ptr3], #2\n"
1665 "ld1 {v14.h}[6], [a_ptr4], #2\n"
1666 "ld1 {v17.h}[6], [a_ptr5], #2\n"
1667 "subs %[odds], %[odds], #0x1\n"
1671 "ld1 {v2.b}[14], [%[a_ptr0]]\n"
1672 "ld1 {v5.b}[14], [a_ptr1]\n"
1673 "ld1 {v8.b}[14], [a_ptr2]\n"
1674 "ld1 {v11.b}[14], [a_ptr3]\n"
1675 "ld1 {v14.b}[14], [a_ptr4]\n"
1676 "ld1 {v17.b}[14], [a_ptr5]\n"
1678 "ldr q18, [%[b_ptr0]]\n"
1679 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
1680 "ldr q19, [%[b_ptr0], #0x10]\n"
1681 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
1682 "ldr q20, [%[b_ptr0], #0x20]\n"
1683 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
1684 "ldr q21, [%[b_ptr0], #0x30]\n"
1685 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
1686 "ldr q22, [%[b_ptr0], #0x40]\n"
1687 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
1688 "ldr q23, [%[b_ptr0], #0x50]\n"
1689 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
1690 "ldr q24, [%[b_ptr0], #0x60]\n"
1691 "ldr q25, [%[b_ptr0], #0x70]\n"
1692 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1693 "cbz %[loops], 6f\n"
1695 "subs %[loops], %[loops], #0x1\n"
1701 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1702 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1703 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1704 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1705 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1706 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1707 "ldr q18, [%[b_ptr0]]\n"
1708 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1709 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1710 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1711 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1712 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1713 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1714 "ldr q19, [%[b_ptr0], #0x10]\n"
1715 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1716 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1717 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1718 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1719 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1720 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1721 "ldr q20, [%[b_ptr0], #0x20]\n"
1722 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1723 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1724 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1725 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1726 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1727 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1728 "ldr q21, [%[b_ptr0], #0x30]\n"
1729 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1730 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1731 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1732 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1733 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1734 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1735 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1736 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1737 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1738 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1739 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1740 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1741 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1742 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1743 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1744 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1745 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1746 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1747 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1748 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1749 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1750 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1751 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1752 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1753 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1754 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1755 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1756 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1757 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1758 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1759 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1760 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1761 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1762 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1763 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1764 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1765 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1766 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1767 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1768 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1769 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1770 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1771 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1772 ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
1773 ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
1774 ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
1775 ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
1776 ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
1777 ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
1780 "str q26, [%[c_ptr0]]\n"
1781 "subs %[loops], %[loops], #0x1\n"
1783 "ldr d18, [%[b_ptr0]]\n"
1784 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
1785 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1786 "str q27, [c_ptr1]\n"
1787 "add c_ptr1, c_ptr1, #0x10\n"
1789 "ldr d19, [%[b_ptr0], #0x10]\n"
1790 "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
1791 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1792 "str q28, [c_ptr2]\n"
1793 "add c_ptr2, c_ptr2, #0x10\n"
1795 "ldr d20, [%[b_ptr0], #0x20]\n"
1796 "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
1797 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1798 "str q29, [c_ptr3]\n"
1799 "add c_ptr3, c_ptr3, #0x10\n"
1801 "ldr d21, [%[b_ptr0], #0x30]\n"
1802 "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
1803 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1804 "str q30, [c_ptr4]\n"
1805 "add c_ptr4, c_ptr4, #0x10\n"
1807 "ldr d22, [%[b_ptr0], #0x40]\n"
1808 "ins v18.d[1], temploadreg2\n"
1809 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1810 "str q31, [c_ptr5]\n"
1811 "add c_ptr5, c_ptr5, #0x10\n"
1813 "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
1814 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1815 "ldr d23, [%[b_ptr0], #0x50]\n"
1816 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1817 "ins v19.d[1], temploadreg3\n"
1818 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1819 "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
1820 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1821 "ldr d24, [%[b_ptr0], #0x60]\n"
1822 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1823 "ins v20.d[1], temploadreg0\n"
1824 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1825 "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
1826 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1827 "ldr d25, [%[b_ptr0], #0x70]\n"
1828 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1829 "ins v21.d[1], temploadreg1\n"
1830 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1831 "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
1832 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1833 "ins v22.d[1], temploadreg2\n"
1834 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1835 "ins v23.d[1], temploadreg3\n"
1836 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1837 "ins v24.d[1], temploadreg0\n"
1838 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1839 "ins v25.d[1], temploadreg1\n"
1840 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1841 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1842 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1843 "ldr d18, [%[b_ptr0]]\n"
1844 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1845 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
1846 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1847 "ldr d19, [%[b_ptr0], #0x10]\n"
1848 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1849 "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
1850 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1851 "ldr d20, [%[b_ptr0], #0x20]\n"
1852 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1853 "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
1854 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1855 "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
1856 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1857 "ins v18.d[1], temploadreg2\n"
1858 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1859 "ins v19.d[1], temploadreg3\n"
1860 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1861 "ldr d21, [%[b_ptr0], #0x30]\n"
1862 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1863 "ins v20.d[1], temploadreg0\n"
1864 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1865 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1866 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1867 "ins v21.d[1], temploadreg1\n"
1868 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1869 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1870 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1871 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1872 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1873 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1874 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1875 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1876 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1877 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1878 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1879 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1880 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1881 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1882 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1883 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1884 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1885 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1886 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1887 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1888 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1889 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1890 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1891 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1892 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1893 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1894 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1895 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1896 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1897 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1898 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1899 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1900 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1901 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1902 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1903 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1904 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1905 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1906 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1907 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1908 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1909 ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
1910 ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
1911 ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
1912 ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
1913 ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
1914 ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
1917 "str q26, [%[c_ptr0]]\n"
1918 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1920 "ldr q18, [%[b_ptr0]]\n"
1921 "ldr q19, [%[b_ptr0], #0x10]\n"
1922 "str q27, [c_ptr1]\n"
1923 "add c_ptr1, c_ptr1, #0x10\n"
1925 "ldr q20, [%[b_ptr0], #0x20]\n"
1926 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1927 "str q28, [c_ptr2]\n"
1929 "ldr q21, [%[b_ptr0], #0x30]\n"
1930 "ldr q22, [%[b_ptr0], #0x40]\n"
1931 "add c_ptr2, c_ptr2, #0x10\n"
1932 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1933 "str q29, [c_ptr3]\n"
1935 "ldr q23, [%[b_ptr0], #0x50]\n"
1936 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1937 "ldr q24, [%[b_ptr0], #0x60]\n"
1938 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1939 "str q30, [c_ptr4]\n"
1941 "ldr q25, [%[b_ptr0], #0x70]\n"
1942 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1943 "add c_ptr3, c_ptr3, #0x10\n"
1944 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1945 "str q31, [c_ptr5]\n"
1947 "add c_ptr4, c_ptr4, #0x10\n"
1948 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1949 "add c_ptr5, c_ptr5, #0x10\n"
1950 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1951 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1952 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1953 "ldr q18, [%[b_ptr0]]\n"
1954 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1955 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1956 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1957 "ldr q19, [%[b_ptr0], #0x10]\n"
1958 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1959 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1960 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1961 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1962 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1963 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1964 "ldr q20, [%[b_ptr0], #0x20]\n"
1965 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1966 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1967 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1968 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1969 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1970 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1971 "ldr q21, [%[b_ptr0], #0x30]\n"
1972 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1973 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1974 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1975 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1976 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1977 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1978 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1979 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1980 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1981 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1982 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1983 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1984 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1985 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1986 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1987 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1988 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1989 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1990 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1991 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1992 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1993 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1994 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1995 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1996 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1997 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1998 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1999 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
2000 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
2001 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
2002 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
2003 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
2004 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
2005 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
2006 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
2007 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
2008 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
2009 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
2010 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
2011 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
2012 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
2013 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
2014 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
2015 ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
2016 ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
2017 ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
2018 ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
2019 ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
2020 ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
2029 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
2030 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
2031 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
2032 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
2033 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
2034 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
2035 "ldr q18, [%[b_ptr0]]\n"
2036 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
2037 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
2038 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
2039 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
2040 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
2041 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
2042 "ldr q19, [%[b_ptr0], #0x10]\n"
2043 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
2044 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
2045 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
2046 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
2047 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
2048 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
2049 "ldr q20, [%[b_ptr0], #0x20]\n"
2050 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
2051 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
2052 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
2053 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
2054 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
2055 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
2056 "ldr q21, [%[b_ptr0], #0x30]\n"
2057 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
2058 "add %[b_ptr0], %[b_ptr0], #0x40\n"
2059 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
2060 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
2061 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
2062 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
2063 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
2064 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
2065 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
2066 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
2067 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
2068 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
2069 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
2070 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2071 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2072 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
2073 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
2074 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
2075 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
2076 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2077 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2078 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
2079 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
2080 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
2081 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
2082 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
2083 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
2084 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
2085 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
2086 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
2087 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
2088 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
2089 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
2090 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
2091 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
2092 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
2093 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
2094 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
2095 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
2096 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
2097 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
2098 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
2099 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
2100 ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
2101 ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
2102 ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
2103 ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
2104 ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
2105 ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
2107 "str q26, [%[c_ptr0]]\n"
2108 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2109 "str q27, [c_ptr1]\n"
2110 "str q28, [c_ptr2]\n"
2111 "str q29, [c_ptr3]\n"
2112 "str q30, [c_ptr4]\n"
2113 "str q31, [c_ptr5]\n"
2124 ".unreq temploadreg0\n"
2125 ".unreq temploadreg1\n"
2126 ".unreq temploadreg2\n"
2127 ".unreq temploadreg3\n"
2128 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
2129 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
2130 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2145 "temploadreg0 .req X10\n"
2146 "temploadreg1 .req X11\n"
2147 "temploadreg2 .req X12\n"
2148 "temploadreg3 .req X13\n"
2149 "add a_ptr1, %[a_ptr0], %[lda]\n"
2150 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2151 "add a_ptr2, a_ptr1, %[lda]\n"
2152 "add c_ptr2, c_ptr1, %[ldc]\n"
2153 "add a_ptr3, a_ptr2, %[lda]\n"
2154 "add c_ptr3, c_ptr2, %[ldc]\n"
2155 "add a_ptr4, a_ptr3, %[lda]\n"
2156 "add c_ptr4, c_ptr3, %[ldc]\n"
2157 "add a_ptr5, a_ptr4, %[lda]\n"
2158 "add c_ptr5, c_ptr4, %[ldc]\n"
2159 "cbz %[oob_rows], 1f\n"
2160 "subs %[oob_rows], %[oob_rows], #0x1\n"
2161 "add c_ptr5, %[c_ptr0], #0x0\n"
2162 "add a_ptr5, %[a_ptr0], #0x0\n"
2164 "subs %[oob_rows], %[oob_rows], #0x1\n"
2165 "add c_ptr4, %[c_ptr0], #0x0\n"
2166 "add a_ptr4, %[a_ptr0], #0x0\n"
2168 "subs %[oob_rows], %[oob_rows], #0x1\n"
2169 "add c_ptr3, %[c_ptr0], #0x0\n"
2170 "add a_ptr3, %[a_ptr0], #0x0\n"
2172 "subs %[oob_rows], %[oob_rows], #0x1\n"
2173 "add c_ptr2, %[c_ptr0], #0x0\n"
2174 "add a_ptr2, %[a_ptr0], #0x0\n"
2176 "subs %[oob_rows], %[oob_rows], #0x1\n"
2177 "add c_ptr1, %[c_ptr0], #0x0\n"
2178 "add a_ptr1, %[a_ptr0], #0x0\n"
2180 "cbnz %[odds], 2f\n"
2181 "ldr q0, [%[a_ptr0]], #0x10\n"
2182 "ldr q4, [a_ptr1], #0x10\n"
2183 "ldr q8, [a_ptr2], #0x10\n"
2184 "ldr q12, [a_ptr3], #0x10\n"
2185 "ldr q16, [a_ptr4], #0x10\n"
2186 "ldr q20, [a_ptr5], #0x10\n"
2187 "ldr q1, [%[a_ptr0]], #0x10\n"
2188 "ldr q5, [a_ptr1], #0x10\n"
2189 "ldr q9, [a_ptr2], #0x10\n"
2190 "ldr q13, [a_ptr3], #0x10\n"
2191 "ldr q17, [a_ptr4], #0x10\n"
2192 "ldr q21, [a_ptr5], #0x10\n"
2193 "ldr q2, [%[a_ptr0]], #0x10\n"
2194 "ldr q6, [a_ptr1], #0x10\n"
2195 "ldr q10, [a_ptr2], #0x10\n"
2196 "ldr q14, [a_ptr3], #0x10\n"
2197 "ldr s3, [%[a_ptr0]]\n"
2198 "ldr q18, [a_ptr4], #0x10\n"
2199 "ldr s7, [a_ptr1]\n"
2200 "ldr q22, [a_ptr5], #0x10\n"
2201 "ldr s11, [a_ptr2]\n"
2202 "ldr s15, [a_ptr3]\n"
2203 "ldr s19, [a_ptr4]\n"
2204 "ldr s23, [a_ptr5]\n"
2207 "ldr q0, [%[a_ptr0]], #0x10\n"
2208 "subs %[odds], %[odds], #0x1\n"
2209 "ldr q4, [a_ptr1], #0x10\n"
2210 "ldr q8, [a_ptr2], #0x10\n"
2211 "ldr q12, [a_ptr3], #0x10\n"
2212 "ldr q16, [a_ptr4], #0x10\n"
2213 "ldr q20, [a_ptr5], #0x10\n"
2214 "ldr q1, [%[a_ptr0]], #0x10\n"
2215 "ldr q5, [a_ptr1], #0x10\n"
2216 "ldr q9, [a_ptr2], #0x10\n"
2217 "ldr q13, [a_ptr3], #0x10\n"
2218 "ldr q17, [a_ptr4], #0x10\n"
2219 "ldr q21, [a_ptr5], #0x10\n"
2220 "ldr q2, [%[a_ptr0]], #0x10\n"
2221 "ldr q6, [a_ptr1], #0x10\n"
2222 "ldr q10, [a_ptr2], #0x10\n"
2223 "ldr q14, [a_ptr3], #0x10\n"
2224 "ldr q18, [a_ptr4], #0x10\n"
2225 "ldr q22, [a_ptr5], #0x10\n"
2227 "ldr b3, [%[a_ptr0]]\n"
2228 "ldr b7, [a_ptr1]\n"
2229 "ldr b11, [a_ptr2]\n"
2230 "ldr b15, [a_ptr3]\n"
2231 "ldr b19, [a_ptr4]\n"
2232 "ldr b23, [a_ptr5]\n"
2235 "ldr h3, [%[a_ptr0]], #0x2\n"
2236 "ldr h7, [a_ptr1], #0x2\n"
2237 "ldr h11, [a_ptr2], #0x2\n"
2238 "ldr h15, [a_ptr3], #0x2\n"
2239 "ldr h19, [a_ptr4], #0x2\n"
2240 "ldr h23, [a_ptr5], #0x2\n"
2241 "subs %[odds], %[odds], #0x1\n"
2245 "ld1 {v3.b}[2], [%[a_ptr0]]\n"
2246 "ld1 {v7.b}[2], [a_ptr1]\n"
2247 "ld1 {v11.b}[2], [a_ptr2]\n"
2248 "ld1 {v15.b}[2], [a_ptr3]\n"
2249 "ld1 {v19.b}[2], [a_ptr4]\n"
2250 "ld1 {v23.b}[2], [a_ptr5]\n"
2252 "ldr q24, [%[b_ptr0]]\n"
2253 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
2254 "ldr q25, [%[b_ptr0], #0x10]\n"
2255 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
2256 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
2257 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
2258 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
2259 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
2260 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2261 "cbz %[loops], 6f\n"
2263 "subs %[loops], %[loops], #0x1\n"
2269 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2270 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2271 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2272 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2273 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2274 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2275 "ldr q24, [%[b_ptr0]]\n"
2276 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2277 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2278 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2279 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2280 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2281 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2282 "ldr q25, [%[b_ptr0], #0x10]\n"
2283 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2284 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2285 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2286 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2287 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2288 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2289 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2290 "ldr q24, [%[b_ptr0]]\n"
2291 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2292 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2293 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2294 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2295 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2296 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2297 "ldr q25, [%[b_ptr0], #0x10]\n"
2298 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2299 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2300 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2301 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2302 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2303 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2304 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2305 "ldr q24, [%[b_ptr0]]\n"
2306 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2307 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2308 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2309 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2310 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2311 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2312 "ldr q25, [%[b_ptr0], #0x10]\n"
2313 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2314 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2315 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2316 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2317 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2318 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2319 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2320 "ldr q24, [%[b_ptr0]]\n"
2321 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2322 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2323 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2324 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2325 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2326 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2327 "ldr q25, [%[b_ptr0], #0x10]\n"
2328 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2329 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2330 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2331 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2332 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2333 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2334 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2335 "ldr q24, [%[b_ptr0]]\n"
2336 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2337 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2338 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2339 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2340 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2341 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2342 "ldr q25, [%[b_ptr0], #0x10]\n"
2343 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2344 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2345 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2346 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2347 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2348 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2349 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2350 "ldr q24, [%[b_ptr0]]\n"
2351 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2352 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2353 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2354 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2355 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2356 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2357 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2358 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2359 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2360 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2361 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2362 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2363 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2366 "str q26, [%[c_ptr0]]\n"
2367 "subs %[loops], %[loops], #0x1\n"
2369 "ldr d24, [%[b_ptr0]]\n"
2370 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
2371 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2372 "str q27, [c_ptr1]\n"
2373 "add c_ptr1, c_ptr1, #0x10\n"
2375 "ldr d25, [%[b_ptr0], #0x10]\n"
2376 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
2377 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2378 "str q28, [c_ptr2]\n"
2379 "add c_ptr2, c_ptr2, #0x10\n"
2381 "ins v24.d[1], temploadreg0\n"
2382 "ins v25.d[1], temploadreg1\n"
2383 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2384 "str q29, [c_ptr3]\n"
2385 "add c_ptr3, c_ptr3, #0x10\n"
2387 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
2388 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2389 "str q30, [c_ptr4]\n"
2391 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
2392 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2393 "add c_ptr4, c_ptr4, #0x10\n"
2394 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2395 "str q31, [c_ptr5]\n"
2397 "add c_ptr5, c_ptr5, #0x10\n"
2398 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2399 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2400 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2401 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2402 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2403 "ldr d24, [%[b_ptr0]]\n"
2404 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2405 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2406 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2407 "ins v24.d[1], temploadreg0\n"
2408 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2409 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2410 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2411 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2412 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2413 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2414 "ldr d25, [%[b_ptr0], #0x10]\n"
2415 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2416 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2417 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2418 "ins v25.d[1], temploadreg1\n"
2419 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2420 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
2421 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2422 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
2423 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2424 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2425 "ldr d24, [%[b_ptr0]]\n"
2426 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2427 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2428 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2429 "ins v24.d[1], temploadreg0\n"
2430 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2431 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2432 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2433 "ldr d25, [%[b_ptr0], #0x10]\n"
2434 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2435 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2436 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2437 "ins v25.d[1], temploadreg1\n"
2438 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2439 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
2440 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2441 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
2442 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2443 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2444 "ldr d24, [%[b_ptr0]]\n"
2445 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2446 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2447 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2448 "ins v24.d[1], temploadreg0\n"
2449 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2450 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2451 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2452 "ldr d25, [%[b_ptr0], #0x10]\n"
2453 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2454 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2455 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2456 "ins v25.d[1], temploadreg1\n"
2457 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2458 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
2459 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2460 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
2461 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2462 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2463 "ldr d24, [%[b_ptr0]]\n"
2464 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2465 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2466 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2467 "ins v24.d[1], temploadreg0\n"
2468 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2469 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2470 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2471 "ldr d25, [%[b_ptr0], #0x10]\n"
2472 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2473 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2474 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2475 "ins v25.d[1], temploadreg1\n"
2476 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2477 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
2478 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2479 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
2480 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2481 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2482 "ldr d24, [%[b_ptr0]]\n"
2483 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2484 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2485 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2486 "ins v24.d[1], temploadreg0\n"
2487 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2488 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2489 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2490 "ldr d25, [%[b_ptr0], #0x10]\n"
2491 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2492 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2493 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2494 "ins v25.d[1], temploadreg1\n"
2495 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2496 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
2497 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2498 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2499 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2500 "ldr d24, [%[b_ptr0]]\n"
2501 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2502 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2503 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2504 "ins v24.d[1], temploadreg0\n"
2505 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2506 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2507 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2508 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2509 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2510 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2511 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2512 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2513 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2514 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2517 "str q26, [%[c_ptr0]]\n"
2518 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2520 "ldr q24, [%[b_ptr0]]\n"
2521 "ldr q25, [%[b_ptr0], #0x10]\n"
2522 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2523 "str q27, [c_ptr1]\n"
2524 "add c_ptr1, c_ptr1, #0x10\n"
2526 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2527 "str q28, [c_ptr2]\n"
2529 "add c_ptr2, c_ptr2, #0x10\n"
2530 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2531 "str q29, [c_ptr3]\n"
2533 "add c_ptr3, c_ptr3, #0x10\n"
2534 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2535 "str q30, [c_ptr4]\n"
2537 "add c_ptr4, c_ptr4, #0x10\n"
2538 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2539 "str q31, [c_ptr5]\n"
2541 "add c_ptr5, c_ptr5, #0x10\n"
2542 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2543 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2544 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2545 "ldr q24, [%[b_ptr0]]\n"
2546 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2547 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2548 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2549 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2550 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2551 "ldr q25, [%[b_ptr0], #0x10]\n"
2552 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2553 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2554 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2555 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2556 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2557 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2558 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2559 "ldr q24, [%[b_ptr0]]\n"
2560 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2561 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2562 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2563 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2564 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2565 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2566 "ldr q25, [%[b_ptr0], #0x10]\n"
2567 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2568 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2569 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2570 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2571 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2572 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2573 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2574 "ldr q24, [%[b_ptr0]]\n"
2575 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2576 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2577 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2578 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2579 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2580 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2581 "ldr q25, [%[b_ptr0], #0x10]\n"
2582 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2583 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2584 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2585 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2586 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2587 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2588 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2589 "ldr q24, [%[b_ptr0]]\n"
2590 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2591 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2592 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2593 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2594 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2595 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2596 "ldr q25, [%[b_ptr0], #0x10]\n"
2597 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2598 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2599 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2600 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2601 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2602 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2603 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2604 "ldr q24, [%[b_ptr0]]\n"
2605 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2606 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2607 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2608 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2609 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2610 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2611 "ldr q25, [%[b_ptr0], #0x10]\n"
2612 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2613 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2614 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2615 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2616 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2617 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2618 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2619 "ldr q24, [%[b_ptr0]]\n"
2620 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2621 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2622 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2623 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2624 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2625 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2626 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2627 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2628 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2629 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2630 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2631 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2632 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2641 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2642 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2643 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2644 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2645 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2646 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2647 "ldr q24, [%[b_ptr0]]\n"
2648 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2649 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2650 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2651 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2652 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2653 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2654 "ldr q25, [%[b_ptr0], #0x10]\n"
2655 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2656 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2657 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2658 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2659 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2660 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2661 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2662 "ldr q24, [%[b_ptr0]]\n"
2663 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2664 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2665 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2666 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2667 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2668 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2669 "ldr q25, [%[b_ptr0], #0x10]\n"
2670 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2671 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2672 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2673 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2674 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2675 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2676 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2677 "ldr q24, [%[b_ptr0]]\n"
2678 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2679 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2680 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2681 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2682 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2683 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2684 "ldr q25, [%[b_ptr0], #0x10]\n"
2685 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2686 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2687 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2688 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2689 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2690 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2691 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2692 "ldr q24, [%[b_ptr0]]\n"
2693 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2694 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2695 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2696 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2697 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2698 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2699 "ldr q25, [%[b_ptr0], #0x10]\n"
2700 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2701 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2702 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2703 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2704 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2705 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2706 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2707 "ldr q24, [%[b_ptr0]]\n"
2708 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2709 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2710 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2711 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2712 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2713 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2714 "ldr q25, [%[b_ptr0], #0x10]\n"
2715 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2716 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2717 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2718 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2719 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2720 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2721 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2722 "ldr q24, [%[b_ptr0]]\n"
2723 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2724 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2725 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2726 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2727 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2728 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2729 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2730 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2731 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2732 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2733 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2734 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2735 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2737 "str q26, [%[c_ptr0]]\n"
2738 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2739 "str q27, [c_ptr1]\n"
2740 "str q28, [c_ptr2]\n"
2741 "str q29, [c_ptr3]\n"
2742 "str q30, [c_ptr4]\n"
2743 "str q31, [c_ptr5]\n"
2754 ".unreq temploadreg0\n"
2755 ".unreq temploadreg1\n"
2756 ".unreq temploadreg2\n"
2757 ".unreq temploadreg3\n"
2758 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
2759 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
2760 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2775 "temploadreg0 .req X10\n"
2776 "temploadreg1 .req X11\n"
2777 "temploadreg2 .req X12\n"
2778 "temploadreg3 .req X13\n"
2779 "add a_ptr1, %[a_ptr0], %[lda]\n"
2780 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2781 "add a_ptr2, a_ptr1, %[lda]\n"
2782 "add c_ptr2, c_ptr1, %[ldc]\n"
2783 "add a_ptr3, a_ptr2, %[lda]\n"
2784 "add c_ptr3, c_ptr2, %[ldc]\n"
2785 "add a_ptr4, a_ptr3, %[lda]\n"
2786 "add c_ptr4, c_ptr3, %[ldc]\n"
2787 "add a_ptr5, a_ptr4, %[lda]\n"
2788 "add c_ptr5, c_ptr4, %[ldc]\n"
2789 "cbz %[oob_rows], 1f\n"
2790 "subs %[oob_rows], %[oob_rows], #0x1\n"
2791 "add c_ptr5, %[c_ptr0], #0x0\n"
2792 "add a_ptr5, %[a_ptr0], #0x0\n"
2794 "subs %[oob_rows], %[oob_rows], #0x1\n"
2795 "add c_ptr4, %[c_ptr0], #0x0\n"
2796 "add a_ptr4, %[a_ptr0], #0x0\n"
2798 "subs %[oob_rows], %[oob_rows], #0x1\n"
2799 "add c_ptr3, %[c_ptr0], #0x0\n"
2800 "add a_ptr3, %[a_ptr0], #0x0\n"
2802 "subs %[oob_rows], %[oob_rows], #0x1\n"
2803 "add c_ptr2, %[c_ptr0], #0x0\n"
2804 "add a_ptr2, %[a_ptr0], #0x0\n"
2806 "subs %[oob_rows], %[oob_rows], #0x1\n"
2807 "add c_ptr1, %[c_ptr0], #0x0\n"
2808 "add a_ptr1, %[a_ptr0], #0x0\n"
2810 "cbnz %[odds], 2f\n"
2811 "ldr q0, [%[a_ptr0]], #0x10\n"
2812 "ldr q4, [a_ptr1], #0x10\n"
2813 "ldr q8, [a_ptr2], #0x10\n"
2814 "ldr q12, [a_ptr3], #0x10\n"
2815 "ldr q16, [a_ptr4], #0x10\n"
2816 "ldr q20, [a_ptr5], #0x10\n"
2817 "ldr q1, [%[a_ptr0]], #0x10\n"
2818 "ldr q5, [a_ptr1], #0x10\n"
2819 "ldr q9, [a_ptr2], #0x10\n"
2820 "ldr q13, [a_ptr3], #0x10\n"
2821 "ldr q17, [a_ptr4], #0x10\n"
2822 "ldr q21, [a_ptr5], #0x10\n"
2823 "ldr q2, [%[a_ptr0]], #0x10\n"
2824 "ldr q6, [a_ptr1], #0x10\n"
2825 "ldr q10, [a_ptr2], #0x10\n"
2826 "ldr q14, [a_ptr3], #0x10\n"
2827 "ldr d3, [%[a_ptr0]]\n"
2828 "ldr q18, [a_ptr4], #0x10\n"
2829 "ldr d7, [a_ptr1]\n"
2830 "ldr q22, [a_ptr5], #0x10\n"
2831 "ldr d11, [a_ptr2]\n"
2832 "ldr d15, [a_ptr3]\n"
2833 "ldr d19, [a_ptr4]\n"
2834 "ldr d23, [a_ptr5]\n"
2837 "ldr q0, [%[a_ptr0]], #0x10\n"
2838 "subs %[odds], %[odds], #0x1\n"
2839 "ldr q4, [a_ptr1], #0x10\n"
2840 "ldr q8, [a_ptr2], #0x10\n"
2841 "ldr q12, [a_ptr3], #0x10\n"
2842 "ldr q16, [a_ptr4], #0x10\n"
2843 "ldr q20, [a_ptr5], #0x10\n"
2844 "ldr q1, [%[a_ptr0]], #0x10\n"
2845 "ldr q5, [a_ptr1], #0x10\n"
2846 "ldr q9, [a_ptr2], #0x10\n"
2847 "ldr q13, [a_ptr3], #0x10\n"
2848 "ldr q17, [a_ptr4], #0x10\n"
2849 "ldr q21, [a_ptr5], #0x10\n"
2850 "ldr q2, [%[a_ptr0]], #0x10\n"
2851 "ldr q6, [a_ptr1], #0x10\n"
2852 "ldr q10, [a_ptr2], #0x10\n"
2853 "ldr q14, [a_ptr3], #0x10\n"
2854 "ldr s3, [%[a_ptr0]], #0x4\n"
2855 "ldr q18, [a_ptr4], #0x10\n"
2856 "ldr s7, [a_ptr1], #0x4\n"
2857 "ldr q22, [a_ptr5], #0x10\n"
2858 "ldr s11, [a_ptr2], #0x4\n"
2859 "ldr s15, [a_ptr3], #0x4\n"
2860 "ldr s19, [a_ptr4], #0x4\n"
2861 "ldr s23, [a_ptr5], #0x4\n"
2863 "ld1 {v3.b}[4], [%[a_ptr0]]\n"
2864 "ld1 {v7.b}[4], [a_ptr1]\n"
2865 "ld1 {v11.b}[4], [a_ptr2]\n"
2866 "ld1 {v15.b}[4], [a_ptr3]\n"
2867 "ld1 {v19.b}[4], [a_ptr4]\n"
2868 "ld1 {v23.b}[4], [a_ptr5]\n"
2871 "ld1 {v3.h}[2], [%[a_ptr0]], #2\n"
2872 "ld1 {v7.h}[2], [a_ptr1], #2\n"
2873 "ld1 {v11.h}[2], [a_ptr2], #2\n"
2874 "ld1 {v15.h}[2], [a_ptr3], #2\n"
2875 "ld1 {v19.h}[2], [a_ptr4], #2\n"
2876 "ld1 {v23.h}[2], [a_ptr5], #2\n"
2877 "subs %[odds], %[odds], #0x1\n"
2881 "ld1 {v3.b}[6], [%[a_ptr0]]\n"
2882 "ld1 {v7.b}[6], [a_ptr1]\n"
2883 "ld1 {v11.b}[6], [a_ptr2]\n"
2884 "ld1 {v15.b}[6], [a_ptr3]\n"
2885 "ld1 {v19.b}[6], [a_ptr4]\n"
2886 "ld1 {v23.b}[6], [a_ptr5]\n"
2888 "ldr q24, [%[b_ptr0]]\n"
2889 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
2890 "ldr q25, [%[b_ptr0], #0x10]\n"
2891 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
2892 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
2893 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
2894 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
2895 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
2896 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2897 "cbz %[loops], 6f\n"
2899 "subs %[loops], %[loops], #0x1\n"
2905 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2906 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2907 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2908 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2909 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2910 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2911 "ldr q24, [%[b_ptr0]]\n"
2912 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2913 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2914 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2915 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2916 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2917 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2918 "ldr q25, [%[b_ptr0], #0x10]\n"
2919 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2920 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2921 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2922 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2923 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2924 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2925 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2926 "ldr q24, [%[b_ptr0]]\n"
2927 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2928 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2929 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2930 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2931 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2932 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2933 "ldr q25, [%[b_ptr0], #0x10]\n"
2934 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2935 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2936 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2937 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2938 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2939 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2940 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2941 "ldr q24, [%[b_ptr0]]\n"
2942 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2943 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2944 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2945 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2946 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2947 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2948 "ldr q25, [%[b_ptr0], #0x10]\n"
2949 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2950 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2951 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2952 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2953 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2954 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2955 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2956 "ldr q24, [%[b_ptr0]]\n"
2957 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2958 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2959 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2960 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2961 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2962 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2963 "ldr q25, [%[b_ptr0], #0x10]\n"
2964 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2965 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2966 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2967 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2968 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2969 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2970 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2971 "ldr q24, [%[b_ptr0]]\n"
2972 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2973 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2974 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2975 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2976 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2977 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2978 "ldr q25, [%[b_ptr0], #0x10]\n"
2979 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2980 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2981 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2982 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2983 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2984 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2985 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2986 "ldr q24, [%[b_ptr0]]\n"
2987 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2988 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2989 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2990 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2991 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2992 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2993 "ldr q25, [%[b_ptr0], #0x10]\n"
2994 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2995 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2996 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2997 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2998 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2999 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3000 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3001 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3002 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3003 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3004 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3005 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3006 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3009 "str q26, [%[c_ptr0]]\n"
3010 "subs %[loops], %[loops], #0x1\n"
3012 "ldr d24, [%[b_ptr0]]\n"
3013 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3014 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3015 "str q27, [c_ptr1]\n"
3016 "add c_ptr1, c_ptr1, #0x10\n"
3018 "ldr d25, [%[b_ptr0], #0x10]\n"
3019 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3020 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3021 "str q28, [c_ptr2]\n"
3022 "add c_ptr2, c_ptr2, #0x10\n"
3024 "ins v24.d[1], temploadreg0\n"
3025 "ins v25.d[1], temploadreg1\n"
3026 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
3027 "str q29, [c_ptr3]\n"
3028 "add c_ptr3, c_ptr3, #0x10\n"
3030 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3031 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3032 "str q30, [c_ptr4]\n"
3034 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3035 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3036 "add c_ptr4, c_ptr4, #0x10\n"
3037 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3038 "str q31, [c_ptr5]\n"
3040 "add c_ptr5, c_ptr5, #0x10\n"
3041 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3042 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
3043 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3044 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
3045 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3046 "ldr d24, [%[b_ptr0]]\n"
3047 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3048 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
3049 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3050 "ins v24.d[1], temploadreg0\n"
3051 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3052 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
3053 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3054 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
3055 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3056 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3057 "ldr d25, [%[b_ptr0], #0x10]\n"
3058 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3059 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3060 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3061 "ins v25.d[1], temploadreg1\n"
3062 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3063 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3064 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3065 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3066 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3067 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3068 "ldr d24, [%[b_ptr0]]\n"
3069 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3070 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3071 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3072 "ins v24.d[1], temploadreg0\n"
3073 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3074 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3075 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3076 "ldr d25, [%[b_ptr0], #0x10]\n"
3077 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3078 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3079 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3080 "ins v25.d[1], temploadreg1\n"
3081 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3082 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3083 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3084 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3085 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3086 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3087 "ldr d24, [%[b_ptr0]]\n"
3088 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3089 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3090 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3091 "ins v24.d[1], temploadreg0\n"
3092 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3093 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3094 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3095 "ldr d25, [%[b_ptr0], #0x10]\n"
3096 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3097 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3098 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3099 "ins v25.d[1], temploadreg1\n"
3100 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3101 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3102 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3103 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3104 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3105 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3106 "ldr d24, [%[b_ptr0]]\n"
3107 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3108 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3109 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3110 "ins v24.d[1], temploadreg0\n"
3111 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3112 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3113 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3114 "ldr d25, [%[b_ptr0], #0x10]\n"
3115 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3116 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3117 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3118 "ins v25.d[1], temploadreg1\n"
3119 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3120 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3121 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3122 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3123 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3124 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3125 "ldr d24, [%[b_ptr0]]\n"
3126 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3127 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3128 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3129 "ins v24.d[1], temploadreg0\n"
3130 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3131 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3132 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3133 "ldr d25, [%[b_ptr0], #0x10]\n"
3134 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3135 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3136 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3137 "ins v25.d[1], temploadreg1\n"
3138 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3139 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3140 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3141 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3142 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3143 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3144 "ldr d24, [%[b_ptr0]]\n"
3145 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3146 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3147 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3148 "ins v24.d[1], temploadreg0\n"
3149 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3150 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3151 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3152 "ldr d25, [%[b_ptr0], #0x10]\n"
3153 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3154 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3155 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3156 "ins v25.d[1], temploadreg1\n"
3157 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3158 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3159 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3160 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3161 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3162 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3163 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3164 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3165 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3166 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3169 "str q26, [%[c_ptr0]]\n"
3170 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3172 "ldr q24, [%[b_ptr0]]\n"
3173 "ldr q25, [%[b_ptr0], #0x10]\n"
3174 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3175 "str q27, [c_ptr1]\n"
3176 "add c_ptr1, c_ptr1, #0x10\n"
3178 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3179 "str q28, [c_ptr2]\n"
3181 "add c_ptr2, c_ptr2, #0x10\n"
3182 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3183 "str q29, [c_ptr3]\n"
3185 "add c_ptr3, c_ptr3, #0x10\n"
3186 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3187 "str q30, [c_ptr4]\n"
3189 "add c_ptr4, c_ptr4, #0x10\n"
3190 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3191 "str q31, [c_ptr5]\n"
3193 "add c_ptr5, c_ptr5, #0x10\n"
3194 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3195 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3196 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3197 "ldr q24, [%[b_ptr0]]\n"
3198 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3199 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3200 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3201 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3202 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3203 "ldr q25, [%[b_ptr0], #0x10]\n"
3204 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3205 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3206 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3207 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3208 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3209 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3210 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3211 "ldr q24, [%[b_ptr0]]\n"
3212 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3213 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3214 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3215 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3216 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3217 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3218 "ldr q25, [%[b_ptr0], #0x10]\n"
3219 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3220 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3221 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3222 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3223 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3224 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3225 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3226 "ldr q24, [%[b_ptr0]]\n"
3227 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3228 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3229 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3230 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3231 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3232 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3233 "ldr q25, [%[b_ptr0], #0x10]\n"
3234 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3235 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3236 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3237 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3238 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3239 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3240 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3241 "ldr q24, [%[b_ptr0]]\n"
3242 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3243 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3244 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3245 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3246 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3247 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3248 "ldr q25, [%[b_ptr0], #0x10]\n"
3249 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3250 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3251 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3252 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3253 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3254 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3255 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3256 "ldr q24, [%[b_ptr0]]\n"
3257 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3258 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3259 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3260 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3261 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3262 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3263 "ldr q25, [%[b_ptr0], #0x10]\n"
3264 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3265 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3266 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3267 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3268 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3269 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3270 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3271 "ldr q24, [%[b_ptr0]]\n"
3272 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3273 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3274 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3275 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3276 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3277 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3278 "ldr q25, [%[b_ptr0], #0x10]\n"
3279 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3280 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3281 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3282 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3283 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3284 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3285 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3286 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3287 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3288 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3289 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3290 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3291 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3300 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3301 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3302 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3303 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3304 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3305 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3306 "ldr q24, [%[b_ptr0]]\n"
3307 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3308 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3309 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3310 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3311 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3312 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3313 "ldr q25, [%[b_ptr0], #0x10]\n"
3314 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3315 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3316 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3317 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3318 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3319 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3320 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3321 "ldr q24, [%[b_ptr0]]\n"
3322 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3323 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3324 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3325 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3326 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3327 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3328 "ldr q25, [%[b_ptr0], #0x10]\n"
3329 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3330 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3331 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3332 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3333 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3334 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3335 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3336 "ldr q24, [%[b_ptr0]]\n"
3337 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3338 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3339 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3340 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3341 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3342 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3343 "ldr q25, [%[b_ptr0], #0x10]\n"
3344 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3345 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3346 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3347 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3348 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3349 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3350 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3351 "ldr q24, [%[b_ptr0]]\n"
3352 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3353 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3354 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3355 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3356 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3357 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3358 "ldr q25, [%[b_ptr0], #0x10]\n"
3359 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3360 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3361 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3362 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3363 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3364 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3365 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3366 "ldr q24, [%[b_ptr0]]\n"
3367 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3368 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3369 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3370 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3371 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3372 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3373 "ldr q25, [%[b_ptr0], #0x10]\n"
3374 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3375 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3376 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3377 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3378 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3379 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3380 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3381 "ldr q24, [%[b_ptr0]]\n"
3382 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3383 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3384 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3385 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3386 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3387 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3388 "ldr q25, [%[b_ptr0], #0x10]\n"
3389 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3390 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3391 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3392 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3393 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3394 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3395 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3396 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3397 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3398 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3399 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3400 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3401 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3403 "str q26, [%[c_ptr0]]\n"
3404 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3405 "str q27, [c_ptr1]\n"
3406 "str q28, [c_ptr2]\n"
3407 "str q29, [c_ptr3]\n"
3408 "str q30, [c_ptr4]\n"
3409 "str q31, [c_ptr5]\n"
3420 ".unreq temploadreg0\n"
3421 ".unreq temploadreg1\n"
3422 ".unreq temploadreg2\n"
3423 ".unreq temploadreg3\n"
3424 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
3425 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
3426 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
3441 "temploadreg0 .req X10\n"
3442 "temploadreg1 .req X11\n"
3443 "temploadreg2 .req X12\n"
3444 "temploadreg3 .req X13\n"
3445 "add a_ptr1, %[a_ptr0], %[lda]\n"
3446 "add c_ptr1, %[c_ptr0], %[ldc]\n"
3447 "add a_ptr2, a_ptr1, %[lda]\n"
3448 "add c_ptr2, c_ptr1, %[ldc]\n"
3449 "add a_ptr3, a_ptr2, %[lda]\n"
3450 "add c_ptr3, c_ptr2, %[ldc]\n"
3451 "add a_ptr4, a_ptr3, %[lda]\n"
3452 "add c_ptr4, c_ptr3, %[ldc]\n"
3453 "add a_ptr5, a_ptr4, %[lda]\n"
3454 "add c_ptr5, c_ptr4, %[ldc]\n"
3455 "cbz %[oob_rows], 1f\n"
3456 "subs %[oob_rows], %[oob_rows], #0x1\n"
3457 "add c_ptr5, %[c_ptr0], #0x0\n"
3458 "add a_ptr5, %[a_ptr0], #0x0\n"
3460 "subs %[oob_rows], %[oob_rows], #0x1\n"
3461 "add c_ptr4, %[c_ptr0], #0x0\n"
3462 "add a_ptr4, %[a_ptr0], #0x0\n"
3464 "subs %[oob_rows], %[oob_rows], #0x1\n"
3465 "add c_ptr3, %[c_ptr0], #0x0\n"
3466 "add a_ptr3, %[a_ptr0], #0x0\n"
3468 "subs %[oob_rows], %[oob_rows], #0x1\n"
3469 "add c_ptr2, %[c_ptr0], #0x0\n"
3470 "add a_ptr2, %[a_ptr0], #0x0\n"
3472 "subs %[oob_rows], %[oob_rows], #0x1\n"
3473 "add c_ptr1, %[c_ptr0], #0x0\n"
3474 "add a_ptr1, %[a_ptr0], #0x0\n"
3476 "ldr q0, [%[a_ptr0]], #0x10\n"
3477 "ldr q4, [a_ptr1], #0x10\n"
3478 "ldr q8, [a_ptr2], #0x10\n"
3479 "ldr q12, [a_ptr3], #0x10\n"
3480 "ldr q16, [a_ptr4], #0x10\n"
3481 "ldr q20, [a_ptr5], #0x10\n"
3482 "ldr q1, [%[a_ptr0]], #0x10\n"
3483 "ldr q5, [a_ptr1], #0x10\n"
3484 "ldr q9, [a_ptr2], #0x10\n"
3485 "ldr q13, [a_ptr3], #0x10\n"
3486 "ldr q17, [a_ptr4], #0x10\n"
3487 "ldr q21, [a_ptr5], #0x10\n"
3488 "ldr q2, [%[a_ptr0]], #0x10\n"
3489 "ldr q6, [a_ptr1], #0x10\n"
3490 "ldr q10, [a_ptr2], #0x10\n"
3491 "ldr q14, [a_ptr3], #0x10\n"
3492 "ldr d3, [%[a_ptr0]], #0x8\n"
3493 "ldr q18, [a_ptr4], #0x10\n"
3494 "ldr d7, [a_ptr1], #0x8\n"
3495 "ldr q22, [a_ptr5], #0x10\n"
3496 "ldr d11, [a_ptr2], #0x8\n"
3497 "ldr d15, [a_ptr3], #0x8\n"
3498 "ldr d19, [a_ptr4], #0x8\n"
3499 "ldr d23, [a_ptr5], #0x8\n"
3500 "cbnz %[odds], 2f\n"
3501 "ld1 {v3.s}[2], [%[a_ptr0]]\n"
3502 "ld1 {v7.s}[2], [a_ptr1]\n"
3503 "ld1 {v11.s}[2], [a_ptr2]\n"
3504 "ld1 {v15.s}[2], [a_ptr3]\n"
3505 "ld1 {v19.s}[2], [a_ptr4]\n"
3506 "ld1 {v23.s}[2], [a_ptr5]\n"
3509 "subs %[odds], %[odds], #0x1\n"
3511 "ld1 {v3.b}[8], [%[a_ptr0]]\n"
3512 "ld1 {v7.b}[8], [a_ptr1]\n"
3513 "ld1 {v11.b}[8], [a_ptr2]\n"
3514 "ld1 {v15.b}[8], [a_ptr3]\n"
3515 "ld1 {v19.b}[8], [a_ptr4]\n"
3516 "ld1 {v23.b}[8], [a_ptr5]\n"
3519 "ld1 {v3.h}[4], [%[a_ptr0]], #2\n"
3520 "ld1 {v7.h}[4], [a_ptr1], #2\n"
3521 "ld1 {v11.h}[4], [a_ptr2], #2\n"
3522 "ld1 {v15.h}[4], [a_ptr3], #2\n"
3523 "ld1 {v19.h}[4], [a_ptr4], #2\n"
3524 "ld1 {v23.h}[4], [a_ptr5], #2\n"
3525 "subs %[odds], %[odds], #0x1\n"
3529 "ld1 {v3.b}[10], [%[a_ptr0]]\n"
3530 "ld1 {v7.b}[10], [a_ptr1]\n"
3531 "ld1 {v11.b}[10], [a_ptr2]\n"
3532 "ld1 {v15.b}[10], [a_ptr3]\n"
3533 "ld1 {v19.b}[10], [a_ptr4]\n"
3534 "ld1 {v23.b}[10], [a_ptr5]\n"
3536 "ldr q24, [%[b_ptr0]]\n"
3537 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
3538 "ldr q25, [%[b_ptr0], #0x10]\n"
3539 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
3540 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
3541 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
3542 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
3543 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
3544 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3545 "cbz %[loops], 6f\n"
3547 "subs %[loops], %[loops], #0x1\n"
3553 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3554 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3555 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3556 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3557 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3558 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3559 "ldr q24, [%[b_ptr0]]\n"
3560 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3561 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3562 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3563 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3564 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3565 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3566 "ldr q25, [%[b_ptr0], #0x10]\n"
3567 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3568 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3569 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3570 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3571 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3572 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3573 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3574 "ldr q24, [%[b_ptr0]]\n"
3575 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3576 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3577 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3578 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3579 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3580 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3581 "ldr q25, [%[b_ptr0], #0x10]\n"
3582 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3583 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3584 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3585 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3586 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3587 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3588 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3589 "ldr q24, [%[b_ptr0]]\n"
3590 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3591 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3592 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3593 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3594 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3595 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3596 "ldr q25, [%[b_ptr0], #0x10]\n"
3597 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3598 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3599 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3600 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3601 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3602 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3603 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3604 "ldr q24, [%[b_ptr0]]\n"
3605 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3606 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3607 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3608 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3609 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3610 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3611 "ldr q25, [%[b_ptr0], #0x10]\n"
3612 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3613 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3614 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3615 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3616 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3617 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3618 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3619 "ldr q24, [%[b_ptr0]]\n"
3620 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3621 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3622 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3623 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3624 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3625 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3626 "ldr q25, [%[b_ptr0], #0x10]\n"
3627 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3628 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3629 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3630 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3631 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3632 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3633 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3634 "ldr q24, [%[b_ptr0]]\n"
3635 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3636 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3637 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3638 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3639 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3640 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3641 "ldr q25, [%[b_ptr0], #0x10]\n"
3642 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3643 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3644 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3645 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3646 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3647 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3648 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3649 "ldr q24, [%[b_ptr0]]\n"
3650 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3651 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3652 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3653 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3654 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3655 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3656 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3657 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
3658 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
3659 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
3660 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
3661 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
3662 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
3665 "str q26, [%[c_ptr0]]\n"
3666 "subs %[loops], %[loops], #0x1\n"
3668 "ldr d24, [%[b_ptr0]]\n"
3669 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3670 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3671 "str q27, [c_ptr1]\n"
3672 "add c_ptr1, c_ptr1, #0x10\n"
3674 "ldr d25, [%[b_ptr0], #0x10]\n"
3675 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3676 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3677 "str q28, [c_ptr2]\n"
3678 "add c_ptr2, c_ptr2, #0x10\n"
3680 "ins v24.d[1], temploadreg0\n"
3681 "ins v25.d[1], temploadreg1\n"
3682 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
3683 "str q29, [c_ptr3]\n"
3684 "add c_ptr3, c_ptr3, #0x10\n"
3686 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3687 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3688 "str q30, [c_ptr4]\n"
3690 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3691 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3692 "add c_ptr4, c_ptr4, #0x10\n"
3693 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3694 "str q31, [c_ptr5]\n"
3696 "add c_ptr5, c_ptr5, #0x10\n"
3697 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3698 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
3699 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3700 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
3701 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3702 "ldr d24, [%[b_ptr0]]\n"
3703 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3704 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
3705 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3706 "ins v24.d[1], temploadreg0\n"
3707 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3708 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
3709 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3710 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
3711 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3712 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3713 "ldr d25, [%[b_ptr0], #0x10]\n"
3714 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3715 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3716 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3717 "ins v25.d[1], temploadreg1\n"
3718 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3719 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3720 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3721 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3722 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3723 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3724 "ldr d24, [%[b_ptr0]]\n"
3725 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3726 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3727 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3728 "ins v24.d[1], temploadreg0\n"
3729 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3730 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3731 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3732 "ldr d25, [%[b_ptr0], #0x10]\n"
3733 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3734 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3735 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3736 "ins v25.d[1], temploadreg1\n"
3737 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3738 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3739 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3740 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3741 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3742 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3743 "ldr d24, [%[b_ptr0]]\n"
3744 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3745 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3746 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3747 "ins v24.d[1], temploadreg0\n"
3748 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3749 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3750 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3751 "ldr d25, [%[b_ptr0], #0x10]\n"
3752 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3753 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3754 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3755 "ins v25.d[1], temploadreg1\n"
3756 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3757 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3758 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3759 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3760 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3761 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3762 "ldr d24, [%[b_ptr0]]\n"
3763 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3764 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3765 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3766 "ins v24.d[1], temploadreg0\n"
3767 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3768 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3769 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3770 "ldr d25, [%[b_ptr0], #0x10]\n"
3771 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3772 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3773 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3774 "ins v25.d[1], temploadreg1\n"
3775 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3776 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3777 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3778 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3779 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3780 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3781 "ldr d24, [%[b_ptr0]]\n"
3782 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3783 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3784 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3785 "ins v24.d[1], temploadreg0\n"
3786 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3787 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3788 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3789 "ldr d25, [%[b_ptr0], #0x10]\n"
3790 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3791 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3792 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3793 "ins v25.d[1], temploadreg1\n"
3794 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3795 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3796 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3797 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
3798 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3799 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3800 "ldr d24, [%[b_ptr0]]\n"
3801 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3802 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3803 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3804 "ins v24.d[1], temploadreg0\n"
3805 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3806 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3807 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3808 "ldr d25, [%[b_ptr0], #0x10]\n"
3809 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3810 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3811 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3812 "ins v25.d[1], temploadreg1\n"
3813 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3814 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
3815 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3816 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3817 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3818 "ldr d24, [%[b_ptr0]]\n"
3819 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3820 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3821 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3822 "ins v24.d[1], temploadreg0\n"
3823 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3824 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3825 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3826 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3827 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
3828 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
3829 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
3830 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
3831 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
3832 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
3835 "str q26, [%[c_ptr0]]\n"
3836 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3838 "ldr q24, [%[b_ptr0]]\n"
3839 "ldr q25, [%[b_ptr0], #0x10]\n"
3840 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3841 "str q27, [c_ptr1]\n"
3842 "add c_ptr1, c_ptr1, #0x10\n"
3844 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3845 "str q28, [c_ptr2]\n"
3847 "add c_ptr2, c_ptr2, #0x10\n"
3848 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3849 "str q29, [c_ptr3]\n"
3851 "add c_ptr3, c_ptr3, #0x10\n"
3852 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3853 "str q30, [c_ptr4]\n"
3855 "add c_ptr4, c_ptr4, #0x10\n"
3856 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3857 "str q31, [c_ptr5]\n"
3859 "add c_ptr5, c_ptr5, #0x10\n"
3860 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3861 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3862 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3863 "ldr q24, [%[b_ptr0]]\n"
3864 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3865 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3866 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3867 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3868 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3869 "ldr q25, [%[b_ptr0], #0x10]\n"
3870 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3871 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3872 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3873 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3874 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3875 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3876 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3877 "ldr q24, [%[b_ptr0]]\n"
3878 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3879 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3880 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3881 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3882 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3883 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3884 "ldr q25, [%[b_ptr0], #0x10]\n"
3885 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3886 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3887 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3888 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3889 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3890 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3891 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3892 "ldr q24, [%[b_ptr0]]\n"
3893 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3894 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3895 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3896 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3897 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3898 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3899 "ldr q25, [%[b_ptr0], #0x10]\n"
3900 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3901 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3902 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3903 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3904 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3905 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3906 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3907 "ldr q24, [%[b_ptr0]]\n"
3908 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3909 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3910 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3911 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3912 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3913 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3914 "ldr q25, [%[b_ptr0], #0x10]\n"
3915 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3916 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3917 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3918 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3919 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3920 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3921 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3922 "ldr q24, [%[b_ptr0]]\n"
3923 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3924 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3925 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3926 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3927 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3928 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3929 "ldr q25, [%[b_ptr0], #0x10]\n"
3930 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3931 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3932 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3933 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3934 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3935 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3936 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3937 "ldr q24, [%[b_ptr0]]\n"
3938 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3939 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3940 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3941 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3942 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3943 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3944 "ldr q25, [%[b_ptr0], #0x10]\n"
3945 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3946 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3947 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3948 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3949 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3950 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3951 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3952 "ldr q24, [%[b_ptr0]]\n"
3953 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3954 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3955 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3956 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3957 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3958 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3959 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3960 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
3961 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
3962 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
3963 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
3964 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
3965 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
3974 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3975 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3976 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3977 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3978 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3979 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3980 "ldr q24, [%[b_ptr0]]\n"
3981 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3982 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3983 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3984 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3985 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3986 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3987 "ldr q25, [%[b_ptr0], #0x10]\n"
3988 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3989 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3990 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3991 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3992 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3993 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3994 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3995 "ldr q24, [%[b_ptr0]]\n"
3996 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3997 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3998 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3999 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
4000 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
4001 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
4002 "ldr q25, [%[b_ptr0], #0x10]\n"
4003 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
4004 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4005 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
4006 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
4007 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
4008 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
4009 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
4010 "ldr q24, [%[b_ptr0]]\n"
4011 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
4012 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
4013 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
4014 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
4015 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
4016 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
4017 "ldr q25, [%[b_ptr0], #0x10]\n"
4018 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
4019 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4020 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
4021 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
4022 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
4023 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
4024 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
4025 "ldr q24, [%[b_ptr0]]\n"
4026 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
4027 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
4028 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
4029 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
4030 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
4031 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
4032 "ldr q25, [%[b_ptr0], #0x10]\n"
4033 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
4034 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4035 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
4036 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
4037 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
4038 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
4039 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
4040 "ldr q24, [%[b_ptr0]]\n"
4041 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
4042 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
4043 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
4044 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
4045 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
4046 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
4047 "ldr q25, [%[b_ptr0], #0x10]\n"
4048 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
4049 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4050 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
4051 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
4052 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
4053 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
4054 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
4055 "ldr q24, [%[b_ptr0]]\n"
4056 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
4057 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
4058 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
4059 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
4060 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
4061 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
4062 "ldr q25, [%[b_ptr0], #0x10]\n"
4063 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
4064 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4065 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
4066 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
4067 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
4068 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
4069 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
4070 "ldr q24, [%[b_ptr0]]\n"
4071 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
4072 "add %[b_ptr0], %[b_ptr0], #0x10\n"
4073 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
4074 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
4075 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
4076 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
4077 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
4078 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
4079 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
4080 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
4081 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
4082 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
4083 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
4085 "str q26, [%[c_ptr0]]\n"
4086 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4087 "str q27, [c_ptr1]\n"
4088 "str q28, [c_ptr2]\n"
4089 "str q29, [c_ptr3]\n"
4090 "str q30, [c_ptr4]\n"
4091 "str q31, [c_ptr5]\n"
4102 ".unreq temploadreg0\n"
4103 ".unreq temploadreg1\n"
4104 ".unreq temploadreg2\n"
4105 ".unreq temploadreg3\n"
4106 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
4107 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
4108 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
4124 "temploadreg0 .req X10\n"
4125 "temploadreg1 .req X11\n"
4126 "temploadreg2 .req X12\n"
4127 "temploadreg3 .req X13\n"
4128 "add a_ptr1, %[a_ptr0], %[lda]\n"
4129 "add c_ptr1, %[c_ptr0], %[ldc]\n"
4130 "add a_ptr2, a_ptr1, %[lda]\n"
4131 "add c_ptr2, c_ptr1, %[ldc]\n"
4132 "add a_ptr3, a_ptr2, %[lda]\n"
4133 "add c_ptr3, c_ptr2, %[ldc]\n"
4134 "add a_ptr4, a_ptr3, %[lda]\n"
4135 "add c_ptr4, c_ptr3, %[ldc]\n"
4136 "add a_ptr5, a_ptr4, %[lda]\n"
4137 "add c_ptr5, c_ptr4, %[ldc]\n"
4138 "cbz %[oob_rows], 1f\n"
4139 "subs %[oob_rows], %[oob_rows], #0x1\n"
4140 "add c_ptr5, %[c_ptr0], #0x0\n"
4141 "add a_ptr5, %[a_ptr0], #0x0\n"
4143 "subs %[oob_rows], %[oob_rows], #0x1\n"
4144 "add c_ptr4, %[c_ptr0], #0x0\n"
4145 "add a_ptr4, %[a_ptr0], #0x0\n"
4147 "subs %[oob_rows], %[oob_rows], #0x1\n"
4148 "add c_ptr3, %[c_ptr0], #0x0\n"
4149 "add a_ptr3, %[a_ptr0], #0x0\n"
4151 "subs %[oob_rows], %[oob_rows], #0x1\n"
4152 "add c_ptr2, %[c_ptr0], #0x0\n"
4153 "add a_ptr2, %[a_ptr0], #0x0\n"
4155 "subs %[oob_rows], %[oob_rows], #0x1\n"
4156 "add c_ptr1, %[c_ptr0], #0x0\n"
4157 "add a_ptr1, %[a_ptr0], #0x0\n"
4159 "cbnz %[odds], 2f\n"
4160 "ldr q0, [%[a_ptr0]], #0x10\n"
4161 "ldr q4, [a_ptr1], #0x10\n"
4162 "ldr q8, [a_ptr2], #0x10\n"
4163 "ldr q12, [a_ptr3], #0x10\n"
4164 "ldr q16, [a_ptr4], #0x10\n"
4165 "ldr q20, [a_ptr5], #0x10\n"
4166 "ldr q1, [%[a_ptr0]], #0x10\n"
4167 "ldr q5, [a_ptr1], #0x10\n"
4168 "ldr q9, [a_ptr2], #0x10\n"
4169 "ldr q13, [a_ptr3], #0x10\n"
4170 "ldr q17, [a_ptr4], #0x10\n"
4171 "ldr q21, [a_ptr5], #0x10\n"
4172 "ldr q2, [%[a_ptr0]], #0x10\n"
4173 "ldr q6, [a_ptr1], #0x10\n"
4174 "ldr q10, [a_ptr2], #0x10\n"
4175 "ldr q14, [a_ptr3], #0x10\n"
4176 "ldr q18, [a_ptr4], #0x10\n"
4177 "ldr q22, [a_ptr5], #0x10\n"
4178 "ldr q3, [%[a_ptr0]]\n"
4179 "ldr q7, [a_ptr1]\n"
4180 "ldr q11, [a_ptr2]\n"
4181 "ldr q15, [a_ptr3]\n"
4182 "ldr q19, [a_ptr4]\n"
4183 "ldr q23, [a_ptr5]\n"
4186 "ldr q0, [%[a_ptr0]], #0x10\n"
4187 "subs %[odds], %[odds], #0x1\n"
4188 "ldr q4, [a_ptr1], #0x10\n"
4189 "ldr q8, [a_ptr2], #0x10\n"
4190 "ldr q12, [a_ptr3], #0x10\n"
4191 "ldr q16, [a_ptr4], #0x10\n"
4192 "ldr q20, [a_ptr5], #0x10\n"
4193 "ldr q1, [%[a_ptr0]], #0x10\n"
4194 "ldr q5, [a_ptr1], #0x10\n"
4195 "ldr q9, [a_ptr2], #0x10\n"
4196 "ldr q13, [a_ptr3], #0x10\n"
4197 "ldr q17, [a_ptr4], #0x10\n"
4198 "ldr q21, [a_ptr5], #0x10\n"
4199 "ldr q2, [%[a_ptr0]], #0x10\n"
4200 "ldr q6, [a_ptr1], #0x10\n"
4201 "ldr q10, [a_ptr2], #0x10\n"
4202 "ldr q14, [a_ptr3], #0x10\n"
4203 "ldr d3, [%[a_ptr0]], #0x8\n"
4204 "ldr q18, [a_ptr4], #0x10\n"
4205 "ldr d7, [a_ptr1], #0x8\n"
4206 "ldr q22, [a_ptr5], #0x10\n"
4207 "ldr d11, [a_ptr2], #0x8\n"
4208 "ldr d15, [a_ptr3], #0x8\n"
4209 "ldr d19, [a_ptr4], #0x8\n"
4210 "ldr d23, [a_ptr5], #0x8\n"
4211 "ld1 {v3.s}[2], [%[a_ptr0]], #4\n"
4212 "ld1 {v7.s}[2], [a_ptr1], #4\n"
4213 "ld1 {v11.s}[2], [a_ptr2], #4\n"
4214 "ld1 {v15.s}[2], [a_ptr3], #4\n"
4215 "ld1 {v19.s}[2], [a_ptr4], #4\n"
4216 "ld1 {v23.s}[2], [a_ptr5], #4\n"
4218 "ld1 {v3.b}[12], [%[a_ptr0]]\n"
4219 "ld1 {v7.b}[12], [a_ptr1]\n"
4220 "ld1 {v11.b}[12], [a_ptr2]\n"
4221 "ld1 {v15.b}[12], [a_ptr3]\n"
4222 "ld1 {v19.b}[12], [a_ptr4]\n"
4223 "ld1 {v23.b}[12], [a_ptr5]\n"
4226 "ld1 {v3.h}[6], [%[a_ptr0]], #2\n"
4227 "ld1 {v7.h}[6], [a_ptr1], #2\n"
4228 "ld1 {v11.h}[6], [a_ptr2], #2\n"
4229 "ld1 {v15.h}[6], [a_ptr3], #2\n"
4230 "ld1 {v19.h}[6], [a_ptr4], #2\n"
4231 "ld1 {v23.h}[6], [a_ptr5], #2\n"
4232 "subs %[odds], %[odds], #0x1\n"
4236 "ld1 {v3.b}[14], [%[a_ptr0]]\n"
4237 "ld1 {v7.b}[14], [a_ptr1]\n"
4238 "ld1 {v11.b}[14], [a_ptr2]\n"
4239 "ld1 {v15.b}[14], [a_ptr3]\n"
4240 "ld1 {v19.b}[14], [a_ptr4]\n"
4241 "ld1 {v23.b}[14], [a_ptr5]\n"
4243 "ldr q24, [%[b_ptr0]]\n"
4244 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
4245 "ldr q25, [%[b_ptr0], #0x10]\n"
4246 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
4247 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
4248 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
4249 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
4250 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
4251 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4252 "cbz %[loops], 6f\n"
4254 "subs %[loops], %[loops], #0x1\n"
4260 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
4261 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
4262 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
4263 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
4264 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
4265 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
4266 "ldr q24, [%[b_ptr0]]\n"
4267 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
4268 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
4269 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
4270 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
4271 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
4272 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
4273 "ldr q25, [%[b_ptr0], #0x10]\n"
4274 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
4275 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4276 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
4277 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
4278 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
4279 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
4280 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
4281 "ldr q24, [%[b_ptr0]]\n"
4282 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
4283 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
4284 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
4285 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
4286 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
4287 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
4288 "ldr q25, [%[b_ptr0], #0x10]\n"
4289 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
4290 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4291 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
4292 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
4293 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
4294 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
4295 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
4296 "ldr q24, [%[b_ptr0]]\n"
4297 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
4298 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
4299 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
4300 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
4301 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
4302 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
4303 "ldr q25, [%[b_ptr0], #0x10]\n"
4304 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
4305 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4306 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
4307 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
4308 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
4309 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
4310 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
4311 "ldr q24, [%[b_ptr0]]\n"
4312 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
4313 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
4314 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
4315 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
4316 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
4317 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
4318 "ldr q25, [%[b_ptr0], #0x10]\n"
4319 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
4320 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4321 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
4322 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
4323 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
4324 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
4325 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
4326 "ldr q24, [%[b_ptr0]]\n"
4327 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
4328 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
4329 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
4330 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
4331 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
4332 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
4333 "ldr q25, [%[b_ptr0], #0x10]\n"
4334 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
4335 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4336 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
4337 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
4338 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
4339 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
4340 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
4341 "ldr q24, [%[b_ptr0]]\n"
4342 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
4343 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
4344 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
4345 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
4346 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
4347 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
4348 "ldr q25, [%[b_ptr0], #0x10]\n"
4349 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
4350 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4351 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
4352 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
4353 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
4354 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
4355 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
4356 "ldr q24, [%[b_ptr0]]\n"
4357 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
4358 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
4359 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
4360 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
4361 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
4362 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
4363 "ldr q25, [%[b_ptr0], #0x10]\n"
4364 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
4365 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4366 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
4367 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
4368 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
4369 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
4370 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
4371 ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
4372 ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
4373 ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
4374 ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
4375 ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
4376 ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
4379 "str q26, [%[c_ptr0]]\n"
4380 "subs %[loops], %[loops], #0x1\n"
4382 "ldr d24, [%[b_ptr0]]\n"
4383 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
4384 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4385 "str q27, [c_ptr1]\n"
4386 "add c_ptr1, c_ptr1, #0x10\n"
4388 "ldr d25, [%[b_ptr0], #0x10]\n"
4389 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
4390 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4391 "str q28, [c_ptr2]\n"
4392 "add c_ptr2, c_ptr2, #0x10\n"
4394 "ins v24.d[1], temploadreg0\n"
4395 "ins v25.d[1], temploadreg1\n"
4396 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
4397 "str q29, [c_ptr3]\n"
4398 "add c_ptr3, c_ptr3, #0x10\n"
4400 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
4401 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
4402 "str q30, [c_ptr4]\n"
4404 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
4405 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
4406 "add c_ptr4, c_ptr4, #0x10\n"
4407 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
4408 "str q31, [c_ptr5]\n"
4410 "add c_ptr5, c_ptr5, #0x10\n"
4411 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
4412 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
4413 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
4414 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
4415 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
4416 "ldr d24, [%[b_ptr0]]\n"
4417 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
4418 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
4419 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
4420 "ins v24.d[1], temploadreg0\n"
4421 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
4422 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
4423 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
4424 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
4425 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
4426 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
4427 "ldr d25, [%[b_ptr0], #0x10]\n"
4428 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
4429 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4430 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
4431 "ins v25.d[1], temploadreg1\n"
4432 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
4433 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
4434 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
4435 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
4436 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
4437 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
4438 "ldr d24, [%[b_ptr0]]\n"
4439 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
4440 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
4441 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
4442 "ins v24.d[1], temploadreg0\n"
4443 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
4444 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
4445 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
4446 "ldr d25, [%[b_ptr0], #0x10]\n"
4447 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
4448 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4449 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
4450 "ins v25.d[1], temploadreg1\n"
4451 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
4452 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
4453 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
4454 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
4455 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
4456 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
4457 "ldr d24, [%[b_ptr0]]\n"
4458 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
4459 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
4460 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
4461 "ins v24.d[1], temploadreg0\n"
4462 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
4463 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
4464 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
4465 "ldr d25, [%[b_ptr0], #0x10]\n"
4466 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
4467 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4468 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
4469 "ins v25.d[1], temploadreg1\n"
4470 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
4471 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
4472 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
4473 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
4474 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
4475 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
4476 "ldr d24, [%[b_ptr0]]\n"
4477 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
4478 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
4479 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
4480 "ins v24.d[1], temploadreg0\n"
4481 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
4482 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
4483 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
4484 "ldr d25, [%[b_ptr0], #0x10]\n"
4485 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
4486 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4487 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
4488 "ins v25.d[1], temploadreg1\n"
4489 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
4490 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
4491 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
4492 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
4493 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
4494 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
4495 "ldr d24, [%[b_ptr0]]\n"
4496 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
4497 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
4498 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
4499 "ins v24.d[1], temploadreg0\n"
4500 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
4501 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
4502 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
4503 "ldr d25, [%[b_ptr0], #0x10]\n"
4504 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
4505 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4506 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
4507 "ins v25.d[1], temploadreg1\n"
4508 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
4509 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
4510 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
4511 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
4512 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
4513 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
4514 "ldr d24, [%[b_ptr0]]\n"
4515 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
4516 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
4517 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
4518 "ins v24.d[1], temploadreg0\n"
4519 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
4520 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
4521 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
4522 "ldr d25, [%[b_ptr0], #0x10]\n"
4523 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
4524 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4525 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
4526 "ins v25.d[1], temploadreg1\n"
4527 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
4528 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
4529 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
4530 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
4531 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
4532 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
4533 "ldr d24, [%[b_ptr0]]\n"
4534 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
4535 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
4536 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
4537 "ins v24.d[1], temploadreg0\n"
4538 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
4539 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
4540 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
4541 "ldr d25, [%[b_ptr0], #0x10]\n"
4542 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
4543 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4544 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
4545 "ins v25.d[1], temploadreg1\n"
4546 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
4547 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
4548 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
4549 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
4550 ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
4551 ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
4552 ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
4553 ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
4554 ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
4555 ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
4558 "str q26, [%[c_ptr0]]\n"
4559 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4561 "ldr q24, [%[b_ptr0]]\n"
4562 "ldr q25, [%[b_ptr0], #0x10]\n"
4563 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4564 "str q27, [c_ptr1]\n"
4565 "add c_ptr1, c_ptr1, #0x10\n"
4567 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
4568 "str q28, [c_ptr2]\n"
4570 "add c_ptr2, c_ptr2, #0x10\n"
4571 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
4572 "str q29, [c_ptr3]\n"
4574 "add c_ptr3, c_ptr3, #0x10\n"
4575 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
4576 "str q30, [c_ptr4]\n"
4578 "add c_ptr4, c_ptr4, #0x10\n"
4579 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
4580 "str q31, [c_ptr5]\n"
4582 "add c_ptr5, c_ptr5, #0x10\n"
4583 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
4584 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
4585 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
4586 "ldr q24, [%[b_ptr0]]\n"
4587 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
4588 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
4589 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
4590 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
4591 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
4592 "ldr q25, [%[b_ptr0], #0x10]\n"
4593 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
4594 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4595 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
4596 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
4597 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
4598 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
4599 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
4600 "ldr q24, [%[b_ptr0]]\n"
4601 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
4602 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
4603 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
4604 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
4605 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
4606 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
4607 "ldr q25, [%[b_ptr0], #0x10]\n"
4608 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
4609 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4610 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
4611 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
4612 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
4613 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
4614 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
4615 "ldr q24, [%[b_ptr0]]\n"
4616 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
4617 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
4618 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
4619 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
4620 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
4621 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
4622 "ldr q25, [%[b_ptr0], #0x10]\n"
4623 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
4624 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4625 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
4626 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
4627 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
4628 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
4629 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
4630 "ldr q24, [%[b_ptr0]]\n"
4631 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
4632 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
4633 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
4634 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
4635 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
4636 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
4637 "ldr q25, [%[b_ptr0], #0x10]\n"
4638 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
4639 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4640 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
4641 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
4642 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
4643 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
4644 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
4645 "ldr q24, [%[b_ptr0]]\n"
4646 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
4647 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
4648 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
4649 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
4650 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
4651 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
4652 "ldr q25, [%[b_ptr0], #0x10]\n"
4653 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
4654 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4655 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
4656 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
4657 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
4658 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
4659 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
4660 "ldr q24, [%[b_ptr0]]\n"
4661 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
4662 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
4663 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
4664 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
4665 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
4666 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
4667 "ldr q25, [%[b_ptr0], #0x10]\n"
4668 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
4669 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4670 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
4671 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
4672 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
4673 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
4674 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
4675 "ldr q24, [%[b_ptr0]]\n"
4676 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
4677 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
4678 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
4679 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
4680 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
4681 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
4682 "ldr q25, [%[b_ptr0], #0x10]\n"
4683 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
4684 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4685 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
4686 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
4687 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
4688 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
4689 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
4690 ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
4691 ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
4692 ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
4693 ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
4694 ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
4695 ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
4704 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
4705 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
4706 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
4707 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
4708 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
4709 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
4710 "ldr q24, [%[b_ptr0]]\n"
4711 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
4712 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
4713 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
4714 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
4715 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
4716 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
4717 "ldr q25, [%[b_ptr0], #0x10]\n"
4718 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
4719 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4720 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
4721 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
4722 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
4723 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
4724 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
4725 "ldr q24, [%[b_ptr0]]\n"
4726 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
4727 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
4728 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
4729 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
4730 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
4731 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
4732 "ldr q25, [%[b_ptr0], #0x10]\n"
4733 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
4734 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4735 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
4736 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
4737 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
4738 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
4739 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
4740 "ldr q24, [%[b_ptr0]]\n"
4741 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
4742 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
4743 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
4744 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
4745 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
4746 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
4747 "ldr q25, [%[b_ptr0], #0x10]\n"
4748 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
4749 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4750 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
4751 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
4752 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
4753 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
4754 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
4755 "ldr q24, [%[b_ptr0]]\n"
4756 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
4757 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
4758 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
4759 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
4760 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
4761 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
4762 "ldr q25, [%[b_ptr0], #0x10]\n"
4763 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
4764 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4765 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
4766 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
4767 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
4768 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
4769 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
4770 "ldr q24, [%[b_ptr0]]\n"
4771 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
4772 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
4773 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
4774 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
4775 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
4776 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
4777 "ldr q25, [%[b_ptr0], #0x10]\n"
4778 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
4779 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4780 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
4781 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
4782 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
4783 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
4784 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
4785 "ldr q24, [%[b_ptr0]]\n"
4786 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
4787 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
4788 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
4789 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
4790 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
4791 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
4792 "ldr q25, [%[b_ptr0], #0x10]\n"
4793 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
4794 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4795 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
4796 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
4797 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
4798 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
4799 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
4800 "ldr q24, [%[b_ptr0]]\n"
4801 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
4802 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
4803 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
4804 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
4805 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
4806 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
4807 "ldr q25, [%[b_ptr0], #0x10]\n"
4808 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
4809 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4810 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
4811 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
4812 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
4813 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
4814 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
4815 ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
4816 ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
4817 ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
4818 ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
4819 ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
4820 ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
4822 "str q26, [%[c_ptr0]]\n"
4823 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4824 "str q27, [c_ptr1]\n"
4825 "str q28, [c_ptr2]\n"
4826 "str q29, [c_ptr3]\n"
4827 "str q30, [c_ptr4]\n"
4828 "str q31, [c_ptr5]\n"
4839 ".unreq temploadreg0\n"
4840 ".unreq temploadreg1\n"
4841 ".unreq temploadreg2\n"
4842 ".unreq temploadreg3\n"
4843 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
4844 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
4845 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
4854 #endif // __aarch64__