31 #include "../../asmlib.hpp"
32 #include "../../utils.hpp"
36 void a64_smallK_hybrid_u8u32_dot_6x4(
const uint8_t *A,
int lda,
const uint8_t *B, uint32_t *C,
int ldc,
int M,
int N,
int K,
const uint32_t *,
Activation,
bool) {
37 const long loops_count =
iceildiv(
N, (
int)4) - 1;
38 const long ldab = lda *
sizeof(uint8_t);
39 const long ldcb = ldc *
sizeof(uint32_t);
40 const long odds_count =
K % 4;
43 for (
int y0=0; y0<
M; y0+=6) {
44 long loops = loops_count;
45 long oob_rows = std::max(6 - (
M-y0), 0);
46 long odds = odds_count;
47 const uint8_t *b_ptr0 =
B;
48 const uint8_t *a_ptr0 =
A + (y0 * lda);
50 uint32_t *c_ptr0 = C + (y0 * ldc);
65 "add a_ptr1, %[a_ptr0], %[lda]\n"
66 "add c_ptr1, %[c_ptr0], %[ldc]\n"
67 "add a_ptr2, a_ptr1, %[lda]\n"
68 "add c_ptr2, c_ptr1, %[ldc]\n"
69 "add a_ptr3, a_ptr2, %[lda]\n"
70 "add c_ptr3, c_ptr2, %[ldc]\n"
71 "add a_ptr4, a_ptr3, %[lda]\n"
72 "add c_ptr4, c_ptr3, %[ldc]\n"
73 "add a_ptr5, a_ptr4, %[lda]\n"
74 "add c_ptr5, c_ptr4, %[ldc]\n"
75 "cbz %[oob_rows], 1f\n"
76 "subs %[oob_rows], %[oob_rows], #0x1\n"
77 "add c_ptr5, %[c_ptr0], #0x0\n"
78 "add a_ptr5, %[a_ptr0], #0x0\n"
80 "subs %[oob_rows], %[oob_rows], #0x1\n"
81 "add c_ptr4, %[c_ptr0], #0x0\n"
82 "add a_ptr4, %[a_ptr0], #0x0\n"
84 "subs %[oob_rows], %[oob_rows], #0x1\n"
85 "add c_ptr3, %[c_ptr0], #0x0\n"
86 "add a_ptr3, %[a_ptr0], #0x0\n"
88 "subs %[oob_rows], %[oob_rows], #0x1\n"
89 "add c_ptr2, %[c_ptr0], #0x0\n"
90 "add a_ptr2, %[a_ptr0], #0x0\n"
92 "subs %[oob_rows], %[oob_rows], #0x1\n"
93 "add c_ptr1, %[c_ptr0], #0x0\n"
94 "add a_ptr1, %[a_ptr0], #0x0\n"
97 "ldr q0, [%[a_ptr0]], #0x10\n"
98 "ldr q3, [a_ptr1], #0x10\n"
99 "ldr q6, [a_ptr2], #0x10\n"
100 "ldr q9, [a_ptr3], #0x10\n"
101 "ldr q12, [a_ptr4], #0x10\n"
102 "ldr q15, [a_ptr5], #0x10\n"
103 "ldr q1, [%[a_ptr0]], #0x10\n"
104 "ldr q4, [a_ptr1], #0x10\n"
105 "ldr q7, [a_ptr2], #0x10\n"
106 "ldr q10, [a_ptr3], #0x10\n"
107 "ldr s2, [%[a_ptr0]]\n"
108 "ldr q13, [a_ptr4], #0x10\n"
110 "ldr q16, [a_ptr5], #0x10\n"
112 "ldr s11, [a_ptr3]\n"
113 "ldr s14, [a_ptr4]\n"
114 "ldr s17, [a_ptr5]\n"
117 "ldr q0, [%[a_ptr0]], #0x10\n"
118 "subs %[odds], %[odds], #0x1\n"
119 "ldr q3, [a_ptr1], #0x10\n"
120 "ldr q6, [a_ptr2], #0x10\n"
121 "ldr q9, [a_ptr3], #0x10\n"
122 "ldr q12, [a_ptr4], #0x10\n"
123 "ldr q15, [a_ptr5], #0x10\n"
124 "ldr q1, [%[a_ptr0]], #0x10\n"
125 "ldr q4, [a_ptr1], #0x10\n"
126 "ldr q7, [a_ptr2], #0x10\n"
127 "ldr q10, [a_ptr3], #0x10\n"
128 "ldr q13, [a_ptr4], #0x10\n"
129 "ldr q16, [a_ptr5], #0x10\n"
131 "ldr b2, [%[a_ptr0]]\n"
134 "ldr b11, [a_ptr3]\n"
135 "ldr b14, [a_ptr4]\n"
136 "ldr b17, [a_ptr5]\n"
139 "ldr h2, [%[a_ptr0]], #0x2\n"
140 "ldr h5, [a_ptr1], #0x2\n"
141 "ldr h8, [a_ptr2], #0x2\n"
142 "ldr h11, [a_ptr3], #0x2\n"
143 "ldr h14, [a_ptr4], #0x2\n"
144 "ldr h17, [a_ptr5], #0x2\n"
145 "subs %[odds], %[odds], #0x1\n"
149 "ld1 {v2.b}[2], [%[a_ptr0]]\n"
150 "ld1 {v5.b}[2], [a_ptr1]\n"
151 "ld1 {v8.b}[2], [a_ptr2]\n"
152 "ld1 {v11.b}[2], [a_ptr3]\n"
153 "ld1 {v14.b}[2], [a_ptr4]\n"
154 "ld1 {v17.b}[2], [a_ptr5]\n"
156 "ldr q18, [%[b_ptr0]]\n"
157 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
158 "ldr q19, [%[b_ptr0], #0x10]\n"
159 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
160 "ldr q20, [%[b_ptr0], #0x20]\n"
161 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
162 "ldr q21, [%[b_ptr0], #0x30]\n"
163 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
164 "ldr q22, [%[b_ptr0], #0x40]\n"
165 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
166 "ldr q23, [%[b_ptr0], #0x50]\n"
167 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
168 "ldr q24, [%[b_ptr0], #0x60]\n"
169 "ldr q25, [%[b_ptr0], #0x70]\n"
170 "add %[b_ptr0], %[b_ptr0], #0x80\n"
173 "subs %[loops], %[loops], #0x1\n"
179 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
180 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
181 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
182 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
183 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
184 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
185 "ldr q18, [%[b_ptr0]]\n"
186 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
187 "add %[b_ptr0], %[b_ptr0], #0x10\n"
188 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
189 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
190 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
191 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
192 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
193 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
194 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
195 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
196 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
197 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
198 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
199 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
200 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
201 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
202 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
203 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
204 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
205 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
206 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
207 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
208 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
209 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
210 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
211 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
212 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
213 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
214 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
215 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
216 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
217 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
218 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
219 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
220 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
221 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
222 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
223 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
224 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
225 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
226 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
227 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
228 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
229 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
230 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
231 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
232 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
233 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
234 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
237 "str q26, [%[c_ptr0]]\n"
238 "subs %[loops], %[loops], #0x1\n"
240 "ldr q18, [%[b_ptr0]]\n"
241 "ldr q19, [%[b_ptr0], #0x10]\n"
242 "add %[c_ptr0], %[c_ptr0], #0x10\n"
243 "str q27, [c_ptr1]\n"
244 "add c_ptr1, c_ptr1, #0x10\n"
246 "ldr q20, [%[b_ptr0], #0x20]\n"
247 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
248 "str q28, [c_ptr2]\n"
250 "ldr q21, [%[b_ptr0], #0x30]\n"
251 "ldr q22, [%[b_ptr0], #0x40]\n"
252 "add c_ptr2, c_ptr2, #0x10\n"
253 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
254 "str q29, [c_ptr3]\n"
256 "ldr q23, [%[b_ptr0], #0x50]\n"
257 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
258 "ldr q24, [%[b_ptr0], #0x60]\n"
259 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
260 "str q30, [c_ptr4]\n"
262 "ldr q25, [%[b_ptr0], #0x70]\n"
263 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
264 "add c_ptr3, c_ptr3, #0x10\n"
265 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
266 "str q31, [c_ptr5]\n"
268 "add c_ptr4, c_ptr4, #0x10\n"
269 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
270 "add c_ptr5, c_ptr5, #0x10\n"
271 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
272 "add %[b_ptr0], %[b_ptr0], #0x80\n"
273 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
274 "ldr q18, [%[b_ptr0]]\n"
275 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
276 "add %[b_ptr0], %[b_ptr0], #0x10\n"
277 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
278 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
279 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
280 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
281 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
282 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
283 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
284 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
285 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
286 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
287 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
288 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
289 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
290 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
291 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
292 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
293 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
294 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
295 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
296 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
297 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
298 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
299 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
300 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
301 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
302 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
303 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
304 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
305 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
306 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
307 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
308 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
309 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
310 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
311 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
312 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
313 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
314 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
315 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
316 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
317 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
318 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
319 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
320 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
321 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
322 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
323 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
324 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
325 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
326 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
329 "str q26, [%[c_ptr0]]\n"
330 "add %[c_ptr0], %[c_ptr0], #0x10\n"
332 "ldr q18, [%[b_ptr0]]\n"
333 "ldr q19, [%[b_ptr0], #0x10]\n"
334 "str q27, [c_ptr1]\n"
335 "add c_ptr1, c_ptr1, #0x10\n"
337 "ldr q20, [%[b_ptr0], #0x20]\n"
338 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
339 "str q28, [c_ptr2]\n"
341 "ldr q21, [%[b_ptr0], #0x30]\n"
342 "ldr q22, [%[b_ptr0], #0x40]\n"
343 "add c_ptr2, c_ptr2, #0x10\n"
344 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
345 "str q29, [c_ptr3]\n"
347 "ldr q23, [%[b_ptr0], #0x50]\n"
348 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
349 "ldr q24, [%[b_ptr0], #0x60]\n"
350 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
351 "str q30, [c_ptr4]\n"
353 "ldr q25, [%[b_ptr0], #0x70]\n"
354 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
355 "add c_ptr3, c_ptr3, #0x10\n"
356 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
357 "str q31, [c_ptr5]\n"
359 "add c_ptr4, c_ptr4, #0x10\n"
360 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
361 "add c_ptr5, c_ptr5, #0x10\n"
362 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
363 "add %[b_ptr0], %[b_ptr0], #0x80\n"
364 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
365 "ldr q18, [%[b_ptr0]]\n"
366 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
367 "add %[b_ptr0], %[b_ptr0], #0x10\n"
368 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
369 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
370 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
371 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
372 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
373 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
374 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
375 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
376 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
377 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
378 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
379 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
380 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
381 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
382 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
383 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
384 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
385 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
386 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
387 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
388 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
389 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
390 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
391 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
392 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
393 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
394 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
395 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
396 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
397 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
398 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
399 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
400 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
401 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
402 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
403 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
404 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
405 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
406 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
407 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
408 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
409 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
410 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
411 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
420 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
421 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
422 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
423 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
424 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
425 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
426 "ldr q18, [%[b_ptr0]]\n"
427 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
428 "add %[b_ptr0], %[b_ptr0], #0x10\n"
429 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
430 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
431 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
432 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
433 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
434 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
435 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
436 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
437 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
438 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
439 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
440 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
441 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
442 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
443 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
444 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
445 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
446 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
447 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
448 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
449 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
450 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
451 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
452 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
453 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
454 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
455 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
456 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
457 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
458 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
459 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
460 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
461 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
462 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
463 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
464 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
465 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
466 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
467 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
468 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
469 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
470 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
471 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
472 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
473 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
474 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
475 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
477 "str q26, [%[c_ptr0]]\n"
478 "add %[c_ptr0], %[c_ptr0], #0x10\n"
479 "str q27, [c_ptr1]\n"
480 "str q28, [c_ptr2]\n"
481 "str q29, [c_ptr3]\n"
482 "str q30, [c_ptr4]\n"
483 "str q31, [c_ptr5]\n"
494 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
495 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
496 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
511 "add a_ptr1, %[a_ptr0], %[lda]\n"
512 "add c_ptr1, %[c_ptr0], %[ldc]\n"
513 "add a_ptr2, a_ptr1, %[lda]\n"
514 "add c_ptr2, c_ptr1, %[ldc]\n"
515 "add a_ptr3, a_ptr2, %[lda]\n"
516 "add c_ptr3, c_ptr2, %[ldc]\n"
517 "add a_ptr4, a_ptr3, %[lda]\n"
518 "add c_ptr4, c_ptr3, %[ldc]\n"
519 "add a_ptr5, a_ptr4, %[lda]\n"
520 "add c_ptr5, c_ptr4, %[ldc]\n"
521 "cbz %[oob_rows], 1f\n"
522 "subs %[oob_rows], %[oob_rows], #0x1\n"
523 "add c_ptr5, %[c_ptr0], #0x0\n"
524 "add a_ptr5, %[a_ptr0], #0x0\n"
526 "subs %[oob_rows], %[oob_rows], #0x1\n"
527 "add c_ptr4, %[c_ptr0], #0x0\n"
528 "add a_ptr4, %[a_ptr0], #0x0\n"
530 "subs %[oob_rows], %[oob_rows], #0x1\n"
531 "add c_ptr3, %[c_ptr0], #0x0\n"
532 "add a_ptr3, %[a_ptr0], #0x0\n"
534 "subs %[oob_rows], %[oob_rows], #0x1\n"
535 "add c_ptr2, %[c_ptr0], #0x0\n"
536 "add a_ptr2, %[a_ptr0], #0x0\n"
538 "subs %[oob_rows], %[oob_rows], #0x1\n"
539 "add c_ptr1, %[c_ptr0], #0x0\n"
540 "add a_ptr1, %[a_ptr0], #0x0\n"
543 "ldr q0, [%[a_ptr0]], #0x10\n"
544 "ldr q3, [a_ptr1], #0x10\n"
545 "ldr q6, [a_ptr2], #0x10\n"
546 "ldr q9, [a_ptr3], #0x10\n"
547 "ldr q12, [a_ptr4], #0x10\n"
548 "ldr q15, [a_ptr5], #0x10\n"
549 "ldr q1, [%[a_ptr0]], #0x10\n"
550 "ldr q4, [a_ptr1], #0x10\n"
551 "ldr q7, [a_ptr2], #0x10\n"
552 "ldr q10, [a_ptr3], #0x10\n"
553 "ldr d2, [%[a_ptr0]]\n"
554 "ldr q13, [a_ptr4], #0x10\n"
556 "ldr q16, [a_ptr5], #0x10\n"
558 "ldr d11, [a_ptr3]\n"
559 "ldr d14, [a_ptr4]\n"
560 "ldr d17, [a_ptr5]\n"
563 "ldr q0, [%[a_ptr0]], #0x10\n"
564 "subs %[odds], %[odds], #0x1\n"
565 "ldr q3, [a_ptr1], #0x10\n"
566 "ldr q6, [a_ptr2], #0x10\n"
567 "ldr q9, [a_ptr3], #0x10\n"
568 "ldr q12, [a_ptr4], #0x10\n"
569 "ldr q15, [a_ptr5], #0x10\n"
570 "ldr q1, [%[a_ptr0]], #0x10\n"
571 "ldr q4, [a_ptr1], #0x10\n"
572 "ldr q7, [a_ptr2], #0x10\n"
573 "ldr q10, [a_ptr3], #0x10\n"
574 "ldr s2, [%[a_ptr0]], #0x4\n"
575 "ldr q13, [a_ptr4], #0x10\n"
576 "ldr s5, [a_ptr1], #0x4\n"
577 "ldr q16, [a_ptr5], #0x10\n"
578 "ldr s8, [a_ptr2], #0x4\n"
579 "ldr s11, [a_ptr3], #0x4\n"
580 "ldr s14, [a_ptr4], #0x4\n"
581 "ldr s17, [a_ptr5], #0x4\n"
583 "ld1 {v2.b}[4], [%[a_ptr0]]\n"
584 "ld1 {v5.b}[4], [a_ptr1]\n"
585 "ld1 {v8.b}[4], [a_ptr2]\n"
586 "ld1 {v11.b}[4], [a_ptr3]\n"
587 "ld1 {v14.b}[4], [a_ptr4]\n"
588 "ld1 {v17.b}[4], [a_ptr5]\n"
591 "ld1 {v2.h}[2], [%[a_ptr0]], #2\n"
592 "ld1 {v5.h}[2], [a_ptr1], #2\n"
593 "ld1 {v8.h}[2], [a_ptr2], #2\n"
594 "ld1 {v11.h}[2], [a_ptr3], #2\n"
595 "ld1 {v14.h}[2], [a_ptr4], #2\n"
596 "ld1 {v17.h}[2], [a_ptr5], #2\n"
597 "subs %[odds], %[odds], #0x1\n"
601 "ld1 {v2.b}[6], [%[a_ptr0]]\n"
602 "ld1 {v5.b}[6], [a_ptr1]\n"
603 "ld1 {v8.b}[6], [a_ptr2]\n"
604 "ld1 {v11.b}[6], [a_ptr3]\n"
605 "ld1 {v14.b}[6], [a_ptr4]\n"
606 "ld1 {v17.b}[6], [a_ptr5]\n"
608 "ldr q18, [%[b_ptr0]]\n"
609 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
610 "ldr q19, [%[b_ptr0], #0x10]\n"
611 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
612 "ldr q20, [%[b_ptr0], #0x20]\n"
613 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
614 "ldr q21, [%[b_ptr0], #0x30]\n"
615 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
616 "ldr q22, [%[b_ptr0], #0x40]\n"
617 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
618 "ldr q23, [%[b_ptr0], #0x50]\n"
619 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
620 "ldr q24, [%[b_ptr0], #0x60]\n"
621 "ldr q25, [%[b_ptr0], #0x70]\n"
622 "add %[b_ptr0], %[b_ptr0], #0x80\n"
625 "subs %[loops], %[loops], #0x1\n"
631 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
632 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
633 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
634 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
635 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
636 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
637 "ldr q18, [%[b_ptr0]]\n"
638 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
639 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
640 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
641 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
642 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
643 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
644 "ldr q19, [%[b_ptr0], #0x10]\n"
645 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
646 "add %[b_ptr0], %[b_ptr0], #0x20\n"
647 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
648 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
649 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
650 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
651 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
652 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
653 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
654 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
655 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
656 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
657 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
658 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
659 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
660 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
661 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
662 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
663 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
664 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
665 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
666 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
667 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
668 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
669 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
670 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
671 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
672 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
673 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
674 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
675 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
676 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
677 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
678 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
679 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
680 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
681 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
682 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
683 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
684 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
685 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
686 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
687 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
688 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
689 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
690 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
691 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
692 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
693 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
696 "str q26, [%[c_ptr0]]\n"
697 "subs %[loops], %[loops], #0x1\n"
699 "ldr q18, [%[b_ptr0]]\n"
700 "ldr q19, [%[b_ptr0], #0x10]\n"
701 "add %[c_ptr0], %[c_ptr0], #0x10\n"
702 "str q27, [c_ptr1]\n"
703 "add c_ptr1, c_ptr1, #0x10\n"
705 "ldr q20, [%[b_ptr0], #0x20]\n"
706 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
707 "str q28, [c_ptr2]\n"
709 "ldr q21, [%[b_ptr0], #0x30]\n"
710 "ldr q22, [%[b_ptr0], #0x40]\n"
711 "add c_ptr2, c_ptr2, #0x10\n"
712 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
713 "str q29, [c_ptr3]\n"
715 "ldr q23, [%[b_ptr0], #0x50]\n"
716 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
717 "ldr q24, [%[b_ptr0], #0x60]\n"
718 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
719 "str q30, [c_ptr4]\n"
721 "ldr q25, [%[b_ptr0], #0x70]\n"
722 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
723 "add c_ptr3, c_ptr3, #0x10\n"
724 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
725 "str q31, [c_ptr5]\n"
727 "add c_ptr4, c_ptr4, #0x10\n"
728 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
729 "add c_ptr5, c_ptr5, #0x10\n"
730 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
731 "add %[b_ptr0], %[b_ptr0], #0x80\n"
732 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
733 "ldr q18, [%[b_ptr0]]\n"
734 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
735 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
736 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
737 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
738 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
739 "ldr q19, [%[b_ptr0], #0x10]\n"
740 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
741 "add %[b_ptr0], %[b_ptr0], #0x20\n"
742 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
743 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
744 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
745 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
746 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
747 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
748 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
749 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
750 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
751 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
752 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
753 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
754 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
755 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
756 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
757 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
758 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
759 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
760 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
761 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
762 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
763 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
764 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
765 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
766 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
767 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
768 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
769 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
770 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
771 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
772 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
773 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
774 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
775 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
776 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
777 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
778 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
779 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
780 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
781 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
782 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
783 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
784 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
785 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
786 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
787 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
788 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
789 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
790 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
791 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
792 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
795 "str q26, [%[c_ptr0]]\n"
796 "add %[c_ptr0], %[c_ptr0], #0x10\n"
798 "ldr q18, [%[b_ptr0]]\n"
799 "ldr q19, [%[b_ptr0], #0x10]\n"
800 "str q27, [c_ptr1]\n"
801 "add c_ptr1, c_ptr1, #0x10\n"
803 "ldr q20, [%[b_ptr0], #0x20]\n"
804 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
805 "str q28, [c_ptr2]\n"
807 "ldr q21, [%[b_ptr0], #0x30]\n"
808 "ldr q22, [%[b_ptr0], #0x40]\n"
809 "add c_ptr2, c_ptr2, #0x10\n"
810 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
811 "str q29, [c_ptr3]\n"
813 "ldr q23, [%[b_ptr0], #0x50]\n"
814 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
815 "ldr q24, [%[b_ptr0], #0x60]\n"
816 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
817 "str q30, [c_ptr4]\n"
819 "ldr q25, [%[b_ptr0], #0x70]\n"
820 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
821 "add c_ptr3, c_ptr3, #0x10\n"
822 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
823 "str q31, [c_ptr5]\n"
825 "add c_ptr4, c_ptr4, #0x10\n"
826 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
827 "add c_ptr5, c_ptr5, #0x10\n"
828 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
829 "add %[b_ptr0], %[b_ptr0], #0x80\n"
830 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
831 "ldr q18, [%[b_ptr0]]\n"
832 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
833 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
834 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
835 "ldr q19, [%[b_ptr0], #0x10]\n"
836 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
837 "add %[b_ptr0], %[b_ptr0], #0x20\n"
838 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
839 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
840 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
841 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
842 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
843 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
844 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
845 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
846 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
847 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
848 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
849 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
850 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
851 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
852 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
853 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
854 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
855 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
856 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
857 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
858 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
859 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
860 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
861 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
862 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
863 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
864 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
865 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
866 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
867 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
868 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
869 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
870 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
871 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
872 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
873 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
874 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
875 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
876 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
877 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
878 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
879 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
880 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
881 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
882 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
883 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
884 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
893 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
894 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
895 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
896 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
897 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
898 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
899 "ldr q18, [%[b_ptr0]]\n"
900 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
901 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
902 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
903 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
904 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
905 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
906 "ldr q19, [%[b_ptr0], #0x10]\n"
907 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
908 "add %[b_ptr0], %[b_ptr0], #0x20\n"
909 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
910 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
911 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
912 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
913 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
914 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
915 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
916 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
917 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
918 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
919 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
920 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
921 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
922 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
923 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
924 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
925 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
926 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
927 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
928 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
929 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
930 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
931 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
932 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
933 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
934 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
935 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
936 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
937 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
938 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
939 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
940 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
941 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
942 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
943 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
944 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
945 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
946 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
947 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
948 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
949 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
950 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
951 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
952 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
953 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
954 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
955 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
957 "str q26, [%[c_ptr0]]\n"
958 "add %[c_ptr0], %[c_ptr0], #0x10\n"
959 "str q27, [c_ptr1]\n"
960 "str q28, [c_ptr2]\n"
961 "str q29, [c_ptr3]\n"
962 "str q30, [c_ptr4]\n"
963 "str q31, [c_ptr5]\n"
974 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
975 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
976 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
991 "add a_ptr1, %[a_ptr0], %[lda]\n"
992 "add c_ptr1, %[c_ptr0], %[ldc]\n"
993 "add a_ptr2, a_ptr1, %[lda]\n"
994 "add c_ptr2, c_ptr1, %[ldc]\n"
995 "add a_ptr3, a_ptr2, %[lda]\n"
996 "add c_ptr3, c_ptr2, %[ldc]\n"
997 "add a_ptr4, a_ptr3, %[lda]\n"
998 "add c_ptr4, c_ptr3, %[ldc]\n"
999 "add a_ptr5, a_ptr4, %[lda]\n"
1000 "add c_ptr5, c_ptr4, %[ldc]\n"
1001 "cbz %[oob_rows], 1f\n"
1002 "subs %[oob_rows], %[oob_rows], #0x1\n"
1003 "add c_ptr5, %[c_ptr0], #0x0\n"
1004 "add a_ptr5, %[a_ptr0], #0x0\n"
1006 "subs %[oob_rows], %[oob_rows], #0x1\n"
1007 "add c_ptr4, %[c_ptr0], #0x0\n"
1008 "add a_ptr4, %[a_ptr0], #0x0\n"
1010 "subs %[oob_rows], %[oob_rows], #0x1\n"
1011 "add c_ptr3, %[c_ptr0], #0x0\n"
1012 "add a_ptr3, %[a_ptr0], #0x0\n"
1014 "subs %[oob_rows], %[oob_rows], #0x1\n"
1015 "add c_ptr2, %[c_ptr0], #0x0\n"
1016 "add a_ptr2, %[a_ptr0], #0x0\n"
1018 "subs %[oob_rows], %[oob_rows], #0x1\n"
1019 "add c_ptr1, %[c_ptr0], #0x0\n"
1020 "add a_ptr1, %[a_ptr0], #0x0\n"
1022 "ldr q0, [%[a_ptr0]], #0x10\n"
1023 "ldr q3, [a_ptr1], #0x10\n"
1024 "ldr q6, [a_ptr2], #0x10\n"
1025 "ldr q9, [a_ptr3], #0x10\n"
1026 "ldr q12, [a_ptr4], #0x10\n"
1027 "ldr q15, [a_ptr5], #0x10\n"
1028 "ldr q1, [%[a_ptr0]], #0x10\n"
1029 "ldr q4, [a_ptr1], #0x10\n"
1030 "ldr q7, [a_ptr2], #0x10\n"
1031 "ldr q10, [a_ptr3], #0x10\n"
1032 "ldr d2, [%[a_ptr0]], #0x8\n"
1033 "ldr q13, [a_ptr4], #0x10\n"
1034 "ldr d5, [a_ptr1], #0x8\n"
1035 "ldr q16, [a_ptr5], #0x10\n"
1036 "ldr d8, [a_ptr2], #0x8\n"
1037 "ldr d11, [a_ptr3], #0x8\n"
1038 "ldr d14, [a_ptr4], #0x8\n"
1039 "ldr d17, [a_ptr5], #0x8\n"
1040 "cbnz %[odds], 2f\n"
1041 "ld1 {v2.s}[2], [%[a_ptr0]]\n"
1042 "ld1 {v5.s}[2], [a_ptr1]\n"
1043 "ld1 {v8.s}[2], [a_ptr2]\n"
1044 "ld1 {v11.s}[2], [a_ptr3]\n"
1045 "ld1 {v14.s}[2], [a_ptr4]\n"
1046 "ld1 {v17.s}[2], [a_ptr5]\n"
1049 "subs %[odds], %[odds], #0x1\n"
1051 "ld1 {v2.b}[8], [%[a_ptr0]]\n"
1052 "ld1 {v5.b}[8], [a_ptr1]\n"
1053 "ld1 {v8.b}[8], [a_ptr2]\n"
1054 "ld1 {v11.b}[8], [a_ptr3]\n"
1055 "ld1 {v14.b}[8], [a_ptr4]\n"
1056 "ld1 {v17.b}[8], [a_ptr5]\n"
1059 "ld1 {v2.h}[4], [%[a_ptr0]], #2\n"
1060 "ld1 {v5.h}[4], [a_ptr1], #2\n"
1061 "ld1 {v8.h}[4], [a_ptr2], #2\n"
1062 "ld1 {v11.h}[4], [a_ptr3], #2\n"
1063 "ld1 {v14.h}[4], [a_ptr4], #2\n"
1064 "ld1 {v17.h}[4], [a_ptr5], #2\n"
1065 "subs %[odds], %[odds], #0x1\n"
1069 "ld1 {v2.b}[10], [%[a_ptr0]]\n"
1070 "ld1 {v5.b}[10], [a_ptr1]\n"
1071 "ld1 {v8.b}[10], [a_ptr2]\n"
1072 "ld1 {v11.b}[10], [a_ptr3]\n"
1073 "ld1 {v14.b}[10], [a_ptr4]\n"
1074 "ld1 {v17.b}[10], [a_ptr5]\n"
1076 "ldr q18, [%[b_ptr0]]\n"
1077 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
1078 "ldr q19, [%[b_ptr0], #0x10]\n"
1079 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
1080 "ldr q20, [%[b_ptr0], #0x20]\n"
1081 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
1082 "ldr q21, [%[b_ptr0], #0x30]\n"
1083 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
1084 "ldr q22, [%[b_ptr0], #0x40]\n"
1085 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
1086 "ldr q23, [%[b_ptr0], #0x50]\n"
1087 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
1088 "ldr q24, [%[b_ptr0], #0x60]\n"
1089 "ldr q25, [%[b_ptr0], #0x70]\n"
1090 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1091 "cbz %[loops], 6f\n"
1093 "subs %[loops], %[loops], #0x1\n"
1099 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1100 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1101 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1102 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1103 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1104 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1105 "ldr q18, [%[b_ptr0]]\n"
1106 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1107 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1108 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1109 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1110 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1111 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1112 "ldr q19, [%[b_ptr0], #0x10]\n"
1113 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1114 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1115 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1116 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1117 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1118 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1119 "ldr q20, [%[b_ptr0], #0x20]\n"
1120 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1121 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1122 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1123 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1124 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1125 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1126 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1127 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1128 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1129 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1130 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1131 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1132 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1133 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1134 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1135 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1136 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1137 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1138 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1139 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1140 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1141 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1142 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1143 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1144 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1145 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1146 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1147 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1148 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1149 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1150 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1151 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1152 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1153 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1154 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1155 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1156 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1157 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1158 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1159 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1160 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1161 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1162 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1163 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1164 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1165 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1166 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1167 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1168 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1171 "str q26, [%[c_ptr0]]\n"
1172 "subs %[loops], %[loops], #0x1\n"
1174 "ldr q18, [%[b_ptr0]]\n"
1175 "ldr q19, [%[b_ptr0], #0x10]\n"
1176 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1177 "str q27, [c_ptr1]\n"
1178 "add c_ptr1, c_ptr1, #0x10\n"
1180 "ldr q20, [%[b_ptr0], #0x20]\n"
1181 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1182 "str q28, [c_ptr2]\n"
1184 "ldr q21, [%[b_ptr0], #0x30]\n"
1185 "ldr q22, [%[b_ptr0], #0x40]\n"
1186 "add c_ptr2, c_ptr2, #0x10\n"
1187 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1188 "str q29, [c_ptr3]\n"
1190 "ldr q23, [%[b_ptr0], #0x50]\n"
1191 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1192 "ldr q24, [%[b_ptr0], #0x60]\n"
1193 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1194 "str q30, [c_ptr4]\n"
1196 "ldr q25, [%[b_ptr0], #0x70]\n"
1197 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1198 "add c_ptr3, c_ptr3, #0x10\n"
1199 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1200 "str q31, [c_ptr5]\n"
1202 "add c_ptr4, c_ptr4, #0x10\n"
1203 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1204 "add c_ptr5, c_ptr5, #0x10\n"
1205 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1206 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1207 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1208 "ldr q18, [%[b_ptr0]]\n"
1209 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1210 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1211 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1212 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1213 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1214 "ldr q19, [%[b_ptr0], #0x10]\n"
1215 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1216 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1217 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1218 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1219 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1220 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1221 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1222 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1223 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1224 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1225 "ldr q20, [%[b_ptr0], #0x20]\n"
1226 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1227 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1228 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1229 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1230 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1231 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1232 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1233 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1234 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1235 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1236 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1237 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1238 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1239 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1240 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1241 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1242 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1243 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1244 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1245 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1246 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1247 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1248 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1249 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1250 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1251 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1252 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1253 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1254 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1255 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1256 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1257 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1258 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1259 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1260 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1261 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1262 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1263 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1264 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1265 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1266 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1267 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1268 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1269 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1270 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1271 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1272 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1273 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1274 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1277 "str q26, [%[c_ptr0]]\n"
1278 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1280 "ldr q18, [%[b_ptr0]]\n"
1281 "ldr q19, [%[b_ptr0], #0x10]\n"
1282 "str q27, [c_ptr1]\n"
1283 "add c_ptr1, c_ptr1, #0x10\n"
1285 "ldr q20, [%[b_ptr0], #0x20]\n"
1286 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1287 "str q28, [c_ptr2]\n"
1289 "ldr q21, [%[b_ptr0], #0x30]\n"
1290 "ldr q22, [%[b_ptr0], #0x40]\n"
1291 "add c_ptr2, c_ptr2, #0x10\n"
1292 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1293 "str q29, [c_ptr3]\n"
1295 "ldr q23, [%[b_ptr0], #0x50]\n"
1296 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1297 "ldr q24, [%[b_ptr0], #0x60]\n"
1298 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1299 "str q30, [c_ptr4]\n"
1301 "ldr q25, [%[b_ptr0], #0x70]\n"
1302 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1303 "add c_ptr3, c_ptr3, #0x10\n"
1304 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1305 "str q31, [c_ptr5]\n"
1307 "add c_ptr4, c_ptr4, #0x10\n"
1308 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1309 "add c_ptr5, c_ptr5, #0x10\n"
1310 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1311 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1312 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1313 "ldr q18, [%[b_ptr0]]\n"
1314 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1315 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1316 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1317 "ldr q19, [%[b_ptr0], #0x10]\n"
1318 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1319 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1320 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1321 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1322 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1323 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1324 "ldr q20, [%[b_ptr0], #0x20]\n"
1325 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1326 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1327 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1328 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1329 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1330 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1331 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1332 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1333 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1334 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1335 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1336 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1337 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1338 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1339 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1340 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1341 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1342 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1343 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1344 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1345 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1346 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1347 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1348 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1349 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1350 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1351 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1352 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1353 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1354 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1355 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1356 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1357 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1358 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1359 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1360 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1361 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1362 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1363 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1364 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1365 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1366 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1367 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1368 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1369 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1370 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1371 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1372 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1373 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1382 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1383 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1384 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1385 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1386 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1387 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1388 "ldr q18, [%[b_ptr0]]\n"
1389 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1390 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1391 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1392 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1393 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1394 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1395 "ldr q19, [%[b_ptr0], #0x10]\n"
1396 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1397 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1398 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1399 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1400 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1401 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1402 "ldr q20, [%[b_ptr0], #0x20]\n"
1403 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1404 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1405 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1406 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1407 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1408 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1409 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1410 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1411 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1412 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1413 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1414 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1415 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1416 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1417 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1418 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1419 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1420 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1421 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1422 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1423 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1424 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1425 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1426 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1427 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1428 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1429 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1430 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1431 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1432 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1433 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1434 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1435 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1436 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1437 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1438 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1439 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1440 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1441 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1442 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1443 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1444 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1445 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1446 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1447 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1448 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1449 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1450 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1451 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1453 "str q26, [%[c_ptr0]]\n"
1454 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1455 "str q27, [c_ptr1]\n"
1456 "str q28, [c_ptr2]\n"
1457 "str q29, [c_ptr3]\n"
1458 "str q30, [c_ptr4]\n"
1459 "str q31, [c_ptr5]\n"
1470 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
1471 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
1472 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1487 "add a_ptr1, %[a_ptr0], %[lda]\n"
1488 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1489 "add a_ptr2, a_ptr1, %[lda]\n"
1490 "add c_ptr2, c_ptr1, %[ldc]\n"
1491 "add a_ptr3, a_ptr2, %[lda]\n"
1492 "add c_ptr3, c_ptr2, %[ldc]\n"
1493 "add a_ptr4, a_ptr3, %[lda]\n"
1494 "add c_ptr4, c_ptr3, %[ldc]\n"
1495 "add a_ptr5, a_ptr4, %[lda]\n"
1496 "add c_ptr5, c_ptr4, %[ldc]\n"
1497 "cbz %[oob_rows], 1f\n"
1498 "subs %[oob_rows], %[oob_rows], #0x1\n"
1499 "add c_ptr5, %[c_ptr0], #0x0\n"
1500 "add a_ptr5, %[a_ptr0], #0x0\n"
1502 "subs %[oob_rows], %[oob_rows], #0x1\n"
1503 "add c_ptr4, %[c_ptr0], #0x0\n"
1504 "add a_ptr4, %[a_ptr0], #0x0\n"
1506 "subs %[oob_rows], %[oob_rows], #0x1\n"
1507 "add c_ptr3, %[c_ptr0], #0x0\n"
1508 "add a_ptr3, %[a_ptr0], #0x0\n"
1510 "subs %[oob_rows], %[oob_rows], #0x1\n"
1511 "add c_ptr2, %[c_ptr0], #0x0\n"
1512 "add a_ptr2, %[a_ptr0], #0x0\n"
1514 "subs %[oob_rows], %[oob_rows], #0x1\n"
1515 "add c_ptr1, %[c_ptr0], #0x0\n"
1516 "add a_ptr1, %[a_ptr0], #0x0\n"
1518 "cbnz %[odds], 2f\n"
1519 "ldr q0, [%[a_ptr0]], #0x10\n"
1520 "ldr q3, [a_ptr1], #0x10\n"
1521 "ldr q6, [a_ptr2], #0x10\n"
1522 "ldr q9, [a_ptr3], #0x10\n"
1523 "ldr q12, [a_ptr4], #0x10\n"
1524 "ldr q15, [a_ptr5], #0x10\n"
1525 "ldr q1, [%[a_ptr0]], #0x10\n"
1526 "ldr q4, [a_ptr1], #0x10\n"
1527 "ldr q7, [a_ptr2], #0x10\n"
1528 "ldr q10, [a_ptr3], #0x10\n"
1529 "ldr q13, [a_ptr4], #0x10\n"
1530 "ldr q16, [a_ptr5], #0x10\n"
1531 "ldr q2, [%[a_ptr0]]\n"
1532 "ldr q5, [a_ptr1]\n"
1533 "ldr q8, [a_ptr2]\n"
1534 "ldr q11, [a_ptr3]\n"
1535 "ldr q14, [a_ptr4]\n"
1536 "ldr q17, [a_ptr5]\n"
1539 "ldr q0, [%[a_ptr0]], #0x10\n"
1540 "subs %[odds], %[odds], #0x1\n"
1541 "ldr q3, [a_ptr1], #0x10\n"
1542 "ldr q6, [a_ptr2], #0x10\n"
1543 "ldr q9, [a_ptr3], #0x10\n"
1544 "ldr q12, [a_ptr4], #0x10\n"
1545 "ldr q15, [a_ptr5], #0x10\n"
1546 "ldr q1, [%[a_ptr0]], #0x10\n"
1547 "ldr q4, [a_ptr1], #0x10\n"
1548 "ldr q7, [a_ptr2], #0x10\n"
1549 "ldr q10, [a_ptr3], #0x10\n"
1550 "ldr d2, [%[a_ptr0]], #0x8\n"
1551 "ldr q13, [a_ptr4], #0x10\n"
1552 "ldr d5, [a_ptr1], #0x8\n"
1553 "ldr q16, [a_ptr5], #0x10\n"
1554 "ldr d8, [a_ptr2], #0x8\n"
1555 "ldr d11, [a_ptr3], #0x8\n"
1556 "ldr d14, [a_ptr4], #0x8\n"
1557 "ldr d17, [a_ptr5], #0x8\n"
1558 "ld1 {v2.s}[2], [%[a_ptr0]], #4\n"
1559 "ld1 {v5.s}[2], [a_ptr1], #4\n"
1560 "ld1 {v8.s}[2], [a_ptr2], #4\n"
1561 "ld1 {v11.s}[2], [a_ptr3], #4\n"
1562 "ld1 {v14.s}[2], [a_ptr4], #4\n"
1563 "ld1 {v17.s}[2], [a_ptr5], #4\n"
1565 "ld1 {v2.b}[12], [%[a_ptr0]]\n"
1566 "ld1 {v5.b}[12], [a_ptr1]\n"
1567 "ld1 {v8.b}[12], [a_ptr2]\n"
1568 "ld1 {v11.b}[12], [a_ptr3]\n"
1569 "ld1 {v14.b}[12], [a_ptr4]\n"
1570 "ld1 {v17.b}[12], [a_ptr5]\n"
1573 "ld1 {v2.h}[6], [%[a_ptr0]], #2\n"
1574 "ld1 {v5.h}[6], [a_ptr1], #2\n"
1575 "ld1 {v8.h}[6], [a_ptr2], #2\n"
1576 "ld1 {v11.h}[6], [a_ptr3], #2\n"
1577 "ld1 {v14.h}[6], [a_ptr4], #2\n"
1578 "ld1 {v17.h}[6], [a_ptr5], #2\n"
1579 "subs %[odds], %[odds], #0x1\n"
1583 "ld1 {v2.b}[14], [%[a_ptr0]]\n"
1584 "ld1 {v5.b}[14], [a_ptr1]\n"
1585 "ld1 {v8.b}[14], [a_ptr2]\n"
1586 "ld1 {v11.b}[14], [a_ptr3]\n"
1587 "ld1 {v14.b}[14], [a_ptr4]\n"
1588 "ld1 {v17.b}[14], [a_ptr5]\n"
1590 "ldr q18, [%[b_ptr0]]\n"
1591 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
1592 "ldr q19, [%[b_ptr0], #0x10]\n"
1593 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
1594 "ldr q20, [%[b_ptr0], #0x20]\n"
1595 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
1596 "ldr q21, [%[b_ptr0], #0x30]\n"
1597 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
1598 "ldr q22, [%[b_ptr0], #0x40]\n"
1599 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
1600 "ldr q23, [%[b_ptr0], #0x50]\n"
1601 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
1602 "ldr q24, [%[b_ptr0], #0x60]\n"
1603 "ldr q25, [%[b_ptr0], #0x70]\n"
1604 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1605 "cbz %[loops], 6f\n"
1607 "subs %[loops], %[loops], #0x1\n"
1613 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1614 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1615 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1616 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1617 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1618 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1619 "ldr q18, [%[b_ptr0]]\n"
1620 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1621 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1622 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1623 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1624 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1625 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1626 "ldr q19, [%[b_ptr0], #0x10]\n"
1627 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1628 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1629 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1630 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1631 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1632 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1633 "ldr q20, [%[b_ptr0], #0x20]\n"
1634 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1635 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1636 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1637 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1638 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1639 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1640 "ldr q21, [%[b_ptr0], #0x30]\n"
1641 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1642 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1643 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1644 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1645 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1646 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1647 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1648 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1649 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1650 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1651 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1652 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1653 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1654 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1655 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1656 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1657 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1658 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1659 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1660 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1661 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1662 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1663 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1664 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1665 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1666 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1667 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1668 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1669 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1670 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1671 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1672 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1673 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1674 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1675 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1676 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1677 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1678 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1679 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1680 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1681 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1682 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1683 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1684 ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
1685 ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
1686 ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
1687 ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
1688 ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
1689 ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
1692 "str q26, [%[c_ptr0]]\n"
1693 "subs %[loops], %[loops], #0x1\n"
1695 "ldr q18, [%[b_ptr0]]\n"
1696 "ldr q19, [%[b_ptr0], #0x10]\n"
1697 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1698 "str q27, [c_ptr1]\n"
1699 "add c_ptr1, c_ptr1, #0x10\n"
1701 "ldr q20, [%[b_ptr0], #0x20]\n"
1702 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1703 "str q28, [c_ptr2]\n"
1705 "ldr q21, [%[b_ptr0], #0x30]\n"
1706 "ldr q22, [%[b_ptr0], #0x40]\n"
1707 "add c_ptr2, c_ptr2, #0x10\n"
1708 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1709 "str q29, [c_ptr3]\n"
1711 "ldr q23, [%[b_ptr0], #0x50]\n"
1712 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1713 "ldr q24, [%[b_ptr0], #0x60]\n"
1714 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1715 "str q30, [c_ptr4]\n"
1717 "ldr q25, [%[b_ptr0], #0x70]\n"
1718 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1719 "add c_ptr3, c_ptr3, #0x10\n"
1720 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1721 "str q31, [c_ptr5]\n"
1723 "add c_ptr4, c_ptr4, #0x10\n"
1724 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1725 "add c_ptr5, c_ptr5, #0x10\n"
1726 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1727 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1728 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1729 "ldr q18, [%[b_ptr0]]\n"
1730 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1731 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1732 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1733 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1734 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1735 "ldr q19, [%[b_ptr0], #0x10]\n"
1736 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1737 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1738 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1739 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1740 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1741 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1742 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1743 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1744 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1745 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1746 "ldr q20, [%[b_ptr0], #0x20]\n"
1747 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1748 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1749 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1750 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1751 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1752 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1753 "ldr q21, [%[b_ptr0], #0x30]\n"
1754 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1755 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1756 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1757 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1758 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1759 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1760 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1761 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1762 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1763 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1764 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1765 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1766 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1767 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1768 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1769 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1770 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1771 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1772 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1773 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1774 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1775 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1776 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1777 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1778 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1779 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1780 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1781 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1782 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1783 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1784 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1785 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1786 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1787 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1788 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1789 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1790 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1791 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1792 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1793 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1794 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1795 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1796 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1797 ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
1798 ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
1799 ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
1800 ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
1801 ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
1802 ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
1805 "str q26, [%[c_ptr0]]\n"
1806 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1808 "ldr q18, [%[b_ptr0]]\n"
1809 "ldr q19, [%[b_ptr0], #0x10]\n"
1810 "str q27, [c_ptr1]\n"
1811 "add c_ptr1, c_ptr1, #0x10\n"
1813 "ldr q20, [%[b_ptr0], #0x20]\n"
1814 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1815 "str q28, [c_ptr2]\n"
1817 "ldr q21, [%[b_ptr0], #0x30]\n"
1818 "ldr q22, [%[b_ptr0], #0x40]\n"
1819 "add c_ptr2, c_ptr2, #0x10\n"
1820 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1821 "str q29, [c_ptr3]\n"
1823 "ldr q23, [%[b_ptr0], #0x50]\n"
1824 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1825 "ldr q24, [%[b_ptr0], #0x60]\n"
1826 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1827 "str q30, [c_ptr4]\n"
1829 "ldr q25, [%[b_ptr0], #0x70]\n"
1830 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1831 "add c_ptr3, c_ptr3, #0x10\n"
1832 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1833 "str q31, [c_ptr5]\n"
1835 "add c_ptr4, c_ptr4, #0x10\n"
1836 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1837 "add c_ptr5, c_ptr5, #0x10\n"
1838 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1839 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1840 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1841 "ldr q18, [%[b_ptr0]]\n"
1842 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1843 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1844 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1845 "ldr q19, [%[b_ptr0], #0x10]\n"
1846 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1847 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1848 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1849 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1850 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1851 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1852 "ldr q20, [%[b_ptr0], #0x20]\n"
1853 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1854 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1855 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1856 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1857 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1858 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1859 "ldr q21, [%[b_ptr0], #0x30]\n"
1860 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1861 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1862 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1863 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1864 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1865 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1866 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1867 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1868 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1869 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1870 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1871 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1872 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1873 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1874 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1875 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1876 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1877 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1878 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1879 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1880 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1881 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1882 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1883 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1884 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1885 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1886 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1887 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1888 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1889 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1890 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1891 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1892 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1893 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1894 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1895 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1896 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1897 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1898 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1899 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1900 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1901 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1902 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1903 ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
1904 ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
1905 ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
1906 ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
1907 ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
1908 ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
1917 ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
1918 ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
1919 ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
1920 ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
1921 ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
1922 ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
1923 "ldr q18, [%[b_ptr0]]\n"
1924 ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
1925 ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
1926 ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
1927 ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
1928 ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
1929 ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
1930 "ldr q19, [%[b_ptr0], #0x10]\n"
1931 ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
1932 ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
1933 ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
1934 ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
1935 ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
1936 ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
1937 "ldr q20, [%[b_ptr0], #0x20]\n"
1938 ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
1939 ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
1940 ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
1941 ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
1942 ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
1943 ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
1944 "ldr q21, [%[b_ptr0], #0x30]\n"
1945 ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
1946 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1947 ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
1948 ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
1949 ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
1950 ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
1951 ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
1952 ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
1953 ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
1954 ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
1955 ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
1956 ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
1957 ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
1958 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
1959 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
1960 ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
1961 ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
1962 ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
1963 ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
1964 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
1965 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
1966 ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
1967 ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
1968 ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
1969 ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
1970 ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
1971 ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
1972 ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
1973 ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
1974 ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
1975 ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
1976 ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
1977 ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
1978 ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
1979 ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
1980 ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
1981 ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
1982 ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
1983 ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
1984 ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
1985 ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
1986 ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
1987 ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
1988 ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
1989 ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
1990 ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
1991 ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
1992 ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
1993 ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
1995 "str q26, [%[c_ptr0]]\n"
1996 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1997 "str q27, [c_ptr1]\n"
1998 "str q28, [c_ptr2]\n"
1999 "str q29, [c_ptr3]\n"
2000 "str q30, [c_ptr4]\n"
2001 "str q31, [c_ptr5]\n"
2012 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
2013 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
2014 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2029 "add a_ptr1, %[a_ptr0], %[lda]\n"
2030 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2031 "add a_ptr2, a_ptr1, %[lda]\n"
2032 "add c_ptr2, c_ptr1, %[ldc]\n"
2033 "add a_ptr3, a_ptr2, %[lda]\n"
2034 "add c_ptr3, c_ptr2, %[ldc]\n"
2035 "add a_ptr4, a_ptr3, %[lda]\n"
2036 "add c_ptr4, c_ptr3, %[ldc]\n"
2037 "add a_ptr5, a_ptr4, %[lda]\n"
2038 "add c_ptr5, c_ptr4, %[ldc]\n"
2039 "cbz %[oob_rows], 1f\n"
2040 "subs %[oob_rows], %[oob_rows], #0x1\n"
2041 "add c_ptr5, %[c_ptr0], #0x0\n"
2042 "add a_ptr5, %[a_ptr0], #0x0\n"
2044 "subs %[oob_rows], %[oob_rows], #0x1\n"
2045 "add c_ptr4, %[c_ptr0], #0x0\n"
2046 "add a_ptr4, %[a_ptr0], #0x0\n"
2048 "subs %[oob_rows], %[oob_rows], #0x1\n"
2049 "add c_ptr3, %[c_ptr0], #0x0\n"
2050 "add a_ptr3, %[a_ptr0], #0x0\n"
2052 "subs %[oob_rows], %[oob_rows], #0x1\n"
2053 "add c_ptr2, %[c_ptr0], #0x0\n"
2054 "add a_ptr2, %[a_ptr0], #0x0\n"
2056 "subs %[oob_rows], %[oob_rows], #0x1\n"
2057 "add c_ptr1, %[c_ptr0], #0x0\n"
2058 "add a_ptr1, %[a_ptr0], #0x0\n"
2060 "cbnz %[odds], 2f\n"
2061 "ldr q0, [%[a_ptr0]], #0x10\n"
2062 "ldr q4, [a_ptr1], #0x10\n"
2063 "ldr q8, [a_ptr2], #0x10\n"
2064 "ldr q12, [a_ptr3], #0x10\n"
2065 "ldr q16, [a_ptr4], #0x10\n"
2066 "ldr q20, [a_ptr5], #0x10\n"
2067 "ldr q1, [%[a_ptr0]], #0x10\n"
2068 "ldr q5, [a_ptr1], #0x10\n"
2069 "ldr q9, [a_ptr2], #0x10\n"
2070 "ldr q13, [a_ptr3], #0x10\n"
2071 "ldr q17, [a_ptr4], #0x10\n"
2072 "ldr q21, [a_ptr5], #0x10\n"
2073 "ldr q2, [%[a_ptr0]], #0x10\n"
2074 "ldr q6, [a_ptr1], #0x10\n"
2075 "ldr q10, [a_ptr2], #0x10\n"
2076 "ldr q14, [a_ptr3], #0x10\n"
2077 "ldr s3, [%[a_ptr0]]\n"
2078 "ldr q18, [a_ptr4], #0x10\n"
2079 "ldr s7, [a_ptr1]\n"
2080 "ldr q22, [a_ptr5], #0x10\n"
2081 "ldr s11, [a_ptr2]\n"
2082 "ldr s15, [a_ptr3]\n"
2083 "ldr s19, [a_ptr4]\n"
2084 "ldr s23, [a_ptr5]\n"
2087 "ldr q0, [%[a_ptr0]], #0x10\n"
2088 "subs %[odds], %[odds], #0x1\n"
2089 "ldr q4, [a_ptr1], #0x10\n"
2090 "ldr q8, [a_ptr2], #0x10\n"
2091 "ldr q12, [a_ptr3], #0x10\n"
2092 "ldr q16, [a_ptr4], #0x10\n"
2093 "ldr q20, [a_ptr5], #0x10\n"
2094 "ldr q1, [%[a_ptr0]], #0x10\n"
2095 "ldr q5, [a_ptr1], #0x10\n"
2096 "ldr q9, [a_ptr2], #0x10\n"
2097 "ldr q13, [a_ptr3], #0x10\n"
2098 "ldr q17, [a_ptr4], #0x10\n"
2099 "ldr q21, [a_ptr5], #0x10\n"
2100 "ldr q2, [%[a_ptr0]], #0x10\n"
2101 "ldr q6, [a_ptr1], #0x10\n"
2102 "ldr q10, [a_ptr2], #0x10\n"
2103 "ldr q14, [a_ptr3], #0x10\n"
2104 "ldr q18, [a_ptr4], #0x10\n"
2105 "ldr q22, [a_ptr5], #0x10\n"
2107 "ldr b3, [%[a_ptr0]]\n"
2108 "ldr b7, [a_ptr1]\n"
2109 "ldr b11, [a_ptr2]\n"
2110 "ldr b15, [a_ptr3]\n"
2111 "ldr b19, [a_ptr4]\n"
2112 "ldr b23, [a_ptr5]\n"
2115 "ldr h3, [%[a_ptr0]], #0x2\n"
2116 "ldr h7, [a_ptr1], #0x2\n"
2117 "ldr h11, [a_ptr2], #0x2\n"
2118 "ldr h15, [a_ptr3], #0x2\n"
2119 "ldr h19, [a_ptr4], #0x2\n"
2120 "ldr h23, [a_ptr5], #0x2\n"
2121 "subs %[odds], %[odds], #0x1\n"
2125 "ld1 {v3.b}[2], [%[a_ptr0]]\n"
2126 "ld1 {v7.b}[2], [a_ptr1]\n"
2127 "ld1 {v11.b}[2], [a_ptr2]\n"
2128 "ld1 {v15.b}[2], [a_ptr3]\n"
2129 "ld1 {v19.b}[2], [a_ptr4]\n"
2130 "ld1 {v23.b}[2], [a_ptr5]\n"
2132 "ldr q24, [%[b_ptr0]]\n"
2133 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
2134 "ldr q25, [%[b_ptr0], #0x10]\n"
2135 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
2136 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
2137 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
2138 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
2139 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
2140 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2141 "cbz %[loops], 6f\n"
2143 "subs %[loops], %[loops], #0x1\n"
2149 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2150 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2151 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2152 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2153 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2154 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2155 "ldr q24, [%[b_ptr0]]\n"
2156 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2157 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2158 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2159 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2160 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2161 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2162 "ldr q25, [%[b_ptr0], #0x10]\n"
2163 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2164 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2165 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2166 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2167 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2168 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2169 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2170 "ldr q24, [%[b_ptr0]]\n"
2171 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2172 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2173 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2174 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2175 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2176 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2177 "ldr q25, [%[b_ptr0], #0x10]\n"
2178 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2179 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2180 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2181 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2182 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2183 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2184 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2185 "ldr q24, [%[b_ptr0]]\n"
2186 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2187 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2188 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2189 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2190 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2191 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2192 "ldr q25, [%[b_ptr0], #0x10]\n"
2193 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2194 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2195 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2196 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2197 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2198 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2199 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2200 "ldr q24, [%[b_ptr0]]\n"
2201 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2202 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2203 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2204 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2205 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2206 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2207 "ldr q25, [%[b_ptr0], #0x10]\n"
2208 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2209 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2210 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2211 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2212 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2213 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2214 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2215 "ldr q24, [%[b_ptr0]]\n"
2216 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2217 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2218 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2219 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2220 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2221 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2222 "ldr q25, [%[b_ptr0], #0x10]\n"
2223 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2224 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2225 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2226 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2227 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2228 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2229 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2230 "ldr q24, [%[b_ptr0]]\n"
2231 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2232 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2233 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2234 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2235 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2236 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2237 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2238 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2239 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2240 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2241 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2242 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2243 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2246 "str q26, [%[c_ptr0]]\n"
2247 "subs %[loops], %[loops], #0x1\n"
2249 "ldr q24, [%[b_ptr0]]\n"
2250 "ldr q25, [%[b_ptr0], #0x10]\n"
2251 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2252 "str q27, [c_ptr1]\n"
2253 "add c_ptr1, c_ptr1, #0x10\n"
2255 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2256 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2257 "str q28, [c_ptr2]\n"
2259 "add c_ptr2, c_ptr2, #0x10\n"
2260 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2261 "str q29, [c_ptr3]\n"
2263 "add c_ptr3, c_ptr3, #0x10\n"
2264 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2265 "str q30, [c_ptr4]\n"
2267 "add c_ptr4, c_ptr4, #0x10\n"
2268 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2269 "str q31, [c_ptr5]\n"
2271 "add c_ptr5, c_ptr5, #0x10\n"
2272 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2273 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2274 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2275 "ldr q24, [%[b_ptr0]]\n"
2276 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2277 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2278 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2279 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2280 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2281 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2282 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2283 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2284 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2285 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2286 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2287 "ldr q25, [%[b_ptr0], #0x10]\n"
2288 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2289 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2290 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2291 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2292 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2293 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2294 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2295 "ldr q24, [%[b_ptr0]]\n"
2296 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2297 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2298 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2299 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2300 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2301 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2302 "ldr q25, [%[b_ptr0], #0x10]\n"
2303 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2304 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2305 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2306 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2307 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2308 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2309 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2310 "ldr q24, [%[b_ptr0]]\n"
2311 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2312 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2313 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2314 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2315 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2316 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2317 "ldr q25, [%[b_ptr0], #0x10]\n"
2318 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2319 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2320 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2321 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2322 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2323 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2324 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2325 "ldr q24, [%[b_ptr0]]\n"
2326 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2327 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2328 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2329 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2330 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2331 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2332 "ldr q25, [%[b_ptr0], #0x10]\n"
2333 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2334 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2335 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2336 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2337 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2338 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2339 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2340 "ldr q24, [%[b_ptr0]]\n"
2341 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2342 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2343 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2344 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2345 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2346 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2347 "ldr q25, [%[b_ptr0], #0x10]\n"
2348 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2349 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2350 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2351 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2352 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2353 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2354 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2355 "ldr q24, [%[b_ptr0]]\n"
2356 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2357 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2358 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2359 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2360 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2361 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2362 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2363 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2364 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2365 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2366 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2367 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2368 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2371 "str q26, [%[c_ptr0]]\n"
2372 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2374 "ldr q24, [%[b_ptr0]]\n"
2375 "ldr q25, [%[b_ptr0], #0x10]\n"
2376 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2377 "str q27, [c_ptr1]\n"
2378 "add c_ptr1, c_ptr1, #0x10\n"
2380 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2381 "str q28, [c_ptr2]\n"
2383 "add c_ptr2, c_ptr2, #0x10\n"
2384 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2385 "str q29, [c_ptr3]\n"
2387 "add c_ptr3, c_ptr3, #0x10\n"
2388 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2389 "str q30, [c_ptr4]\n"
2391 "add c_ptr4, c_ptr4, #0x10\n"
2392 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2393 "str q31, [c_ptr5]\n"
2395 "add c_ptr5, c_ptr5, #0x10\n"
2396 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2397 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2398 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2399 "ldr q24, [%[b_ptr0]]\n"
2400 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2401 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2402 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2403 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2404 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2405 "ldr q25, [%[b_ptr0], #0x10]\n"
2406 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2407 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2408 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2409 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2410 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2411 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2412 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2413 "ldr q24, [%[b_ptr0]]\n"
2414 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2415 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2416 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2417 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2418 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2419 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2420 "ldr q25, [%[b_ptr0], #0x10]\n"
2421 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2422 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2423 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2424 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2425 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2426 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2427 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2428 "ldr q24, [%[b_ptr0]]\n"
2429 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2430 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2431 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2432 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2433 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2434 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2435 "ldr q25, [%[b_ptr0], #0x10]\n"
2436 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2437 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2438 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2439 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2440 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2441 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2442 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2443 "ldr q24, [%[b_ptr0]]\n"
2444 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2445 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2446 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2447 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2448 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2449 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2450 "ldr q25, [%[b_ptr0], #0x10]\n"
2451 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2452 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2453 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2454 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2455 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2456 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2457 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2458 "ldr q24, [%[b_ptr0]]\n"
2459 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2460 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2461 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2462 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2463 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2464 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2465 "ldr q25, [%[b_ptr0], #0x10]\n"
2466 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2467 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2468 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2469 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2470 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2471 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2472 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2473 "ldr q24, [%[b_ptr0]]\n"
2474 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2475 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2476 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2477 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2478 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2479 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2480 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2481 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2482 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2483 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2484 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2485 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2486 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2495 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2496 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2497 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2498 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2499 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2500 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2501 "ldr q24, [%[b_ptr0]]\n"
2502 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2503 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2504 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2505 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2506 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2507 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2508 "ldr q25, [%[b_ptr0], #0x10]\n"
2509 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2510 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2511 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2512 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2513 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2514 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2515 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2516 "ldr q24, [%[b_ptr0]]\n"
2517 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2518 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2519 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2520 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2521 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2522 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2523 "ldr q25, [%[b_ptr0], #0x10]\n"
2524 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2525 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2526 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2527 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2528 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2529 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2530 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2531 "ldr q24, [%[b_ptr0]]\n"
2532 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2533 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2534 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2535 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2536 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2537 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2538 "ldr q25, [%[b_ptr0], #0x10]\n"
2539 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2540 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2541 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2542 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2543 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2544 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2545 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2546 "ldr q24, [%[b_ptr0]]\n"
2547 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2548 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2549 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2550 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2551 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2552 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2553 "ldr q25, [%[b_ptr0], #0x10]\n"
2554 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2555 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2556 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2557 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2558 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2559 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2560 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2561 "ldr q24, [%[b_ptr0]]\n"
2562 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2563 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2564 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2565 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2566 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2567 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2568 "ldr q25, [%[b_ptr0], #0x10]\n"
2569 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2570 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2571 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2572 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2573 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2574 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2575 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2576 "ldr q24, [%[b_ptr0]]\n"
2577 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2578 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2579 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2580 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2581 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2582 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2583 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2584 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2585 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2586 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2587 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2588 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2589 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2591 "str q26, [%[c_ptr0]]\n"
2592 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2593 "str q27, [c_ptr1]\n"
2594 "str q28, [c_ptr2]\n"
2595 "str q29, [c_ptr3]\n"
2596 "str q30, [c_ptr4]\n"
2597 "str q31, [c_ptr5]\n"
2608 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
2609 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
2610 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2625 "add a_ptr1, %[a_ptr0], %[lda]\n"
2626 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2627 "add a_ptr2, a_ptr1, %[lda]\n"
2628 "add c_ptr2, c_ptr1, %[ldc]\n"
2629 "add a_ptr3, a_ptr2, %[lda]\n"
2630 "add c_ptr3, c_ptr2, %[ldc]\n"
2631 "add a_ptr4, a_ptr3, %[lda]\n"
2632 "add c_ptr4, c_ptr3, %[ldc]\n"
2633 "add a_ptr5, a_ptr4, %[lda]\n"
2634 "add c_ptr5, c_ptr4, %[ldc]\n"
2635 "cbz %[oob_rows], 1f\n"
2636 "subs %[oob_rows], %[oob_rows], #0x1\n"
2637 "add c_ptr5, %[c_ptr0], #0x0\n"
2638 "add a_ptr5, %[a_ptr0], #0x0\n"
2640 "subs %[oob_rows], %[oob_rows], #0x1\n"
2641 "add c_ptr4, %[c_ptr0], #0x0\n"
2642 "add a_ptr4, %[a_ptr0], #0x0\n"
2644 "subs %[oob_rows], %[oob_rows], #0x1\n"
2645 "add c_ptr3, %[c_ptr0], #0x0\n"
2646 "add a_ptr3, %[a_ptr0], #0x0\n"
2648 "subs %[oob_rows], %[oob_rows], #0x1\n"
2649 "add c_ptr2, %[c_ptr0], #0x0\n"
2650 "add a_ptr2, %[a_ptr0], #0x0\n"
2652 "subs %[oob_rows], %[oob_rows], #0x1\n"
2653 "add c_ptr1, %[c_ptr0], #0x0\n"
2654 "add a_ptr1, %[a_ptr0], #0x0\n"
2656 "cbnz %[odds], 2f\n"
2657 "ldr q0, [%[a_ptr0]], #0x10\n"
2658 "ldr q4, [a_ptr1], #0x10\n"
2659 "ldr q8, [a_ptr2], #0x10\n"
2660 "ldr q12, [a_ptr3], #0x10\n"
2661 "ldr q16, [a_ptr4], #0x10\n"
2662 "ldr q20, [a_ptr5], #0x10\n"
2663 "ldr q1, [%[a_ptr0]], #0x10\n"
2664 "ldr q5, [a_ptr1], #0x10\n"
2665 "ldr q9, [a_ptr2], #0x10\n"
2666 "ldr q13, [a_ptr3], #0x10\n"
2667 "ldr q17, [a_ptr4], #0x10\n"
2668 "ldr q21, [a_ptr5], #0x10\n"
2669 "ldr q2, [%[a_ptr0]], #0x10\n"
2670 "ldr q6, [a_ptr1], #0x10\n"
2671 "ldr q10, [a_ptr2], #0x10\n"
2672 "ldr q14, [a_ptr3], #0x10\n"
2673 "ldr d3, [%[a_ptr0]]\n"
2674 "ldr q18, [a_ptr4], #0x10\n"
2675 "ldr d7, [a_ptr1]\n"
2676 "ldr q22, [a_ptr5], #0x10\n"
2677 "ldr d11, [a_ptr2]\n"
2678 "ldr d15, [a_ptr3]\n"
2679 "ldr d19, [a_ptr4]\n"
2680 "ldr d23, [a_ptr5]\n"
2683 "ldr q0, [%[a_ptr0]], #0x10\n"
2684 "subs %[odds], %[odds], #0x1\n"
2685 "ldr q4, [a_ptr1], #0x10\n"
2686 "ldr q8, [a_ptr2], #0x10\n"
2687 "ldr q12, [a_ptr3], #0x10\n"
2688 "ldr q16, [a_ptr4], #0x10\n"
2689 "ldr q20, [a_ptr5], #0x10\n"
2690 "ldr q1, [%[a_ptr0]], #0x10\n"
2691 "ldr q5, [a_ptr1], #0x10\n"
2692 "ldr q9, [a_ptr2], #0x10\n"
2693 "ldr q13, [a_ptr3], #0x10\n"
2694 "ldr q17, [a_ptr4], #0x10\n"
2695 "ldr q21, [a_ptr5], #0x10\n"
2696 "ldr q2, [%[a_ptr0]], #0x10\n"
2697 "ldr q6, [a_ptr1], #0x10\n"
2698 "ldr q10, [a_ptr2], #0x10\n"
2699 "ldr q14, [a_ptr3], #0x10\n"
2700 "ldr s3, [%[a_ptr0]], #0x4\n"
2701 "ldr q18, [a_ptr4], #0x10\n"
2702 "ldr s7, [a_ptr1], #0x4\n"
2703 "ldr q22, [a_ptr5], #0x10\n"
2704 "ldr s11, [a_ptr2], #0x4\n"
2705 "ldr s15, [a_ptr3], #0x4\n"
2706 "ldr s19, [a_ptr4], #0x4\n"
2707 "ldr s23, [a_ptr5], #0x4\n"
2709 "ld1 {v3.b}[4], [%[a_ptr0]]\n"
2710 "ld1 {v7.b}[4], [a_ptr1]\n"
2711 "ld1 {v11.b}[4], [a_ptr2]\n"
2712 "ld1 {v15.b}[4], [a_ptr3]\n"
2713 "ld1 {v19.b}[4], [a_ptr4]\n"
2714 "ld1 {v23.b}[4], [a_ptr5]\n"
2717 "ld1 {v3.h}[2], [%[a_ptr0]], #2\n"
2718 "ld1 {v7.h}[2], [a_ptr1], #2\n"
2719 "ld1 {v11.h}[2], [a_ptr2], #2\n"
2720 "ld1 {v15.h}[2], [a_ptr3], #2\n"
2721 "ld1 {v19.h}[2], [a_ptr4], #2\n"
2722 "ld1 {v23.h}[2], [a_ptr5], #2\n"
2723 "subs %[odds], %[odds], #0x1\n"
2727 "ld1 {v3.b}[6], [%[a_ptr0]]\n"
2728 "ld1 {v7.b}[6], [a_ptr1]\n"
2729 "ld1 {v11.b}[6], [a_ptr2]\n"
2730 "ld1 {v15.b}[6], [a_ptr3]\n"
2731 "ld1 {v19.b}[6], [a_ptr4]\n"
2732 "ld1 {v23.b}[6], [a_ptr5]\n"
2734 "ldr q24, [%[b_ptr0]]\n"
2735 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
2736 "ldr q25, [%[b_ptr0], #0x10]\n"
2737 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
2738 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
2739 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
2740 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
2741 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
2742 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2743 "cbz %[loops], 6f\n"
2745 "subs %[loops], %[loops], #0x1\n"
2751 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2752 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2753 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2754 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2755 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2756 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2757 "ldr q24, [%[b_ptr0]]\n"
2758 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2759 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2760 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2761 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2762 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2763 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2764 "ldr q25, [%[b_ptr0], #0x10]\n"
2765 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2766 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2767 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2768 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2769 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2770 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2771 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2772 "ldr q24, [%[b_ptr0]]\n"
2773 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2774 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2775 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2776 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2777 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2778 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2779 "ldr q25, [%[b_ptr0], #0x10]\n"
2780 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2781 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2782 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2783 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2784 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2785 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2786 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2787 "ldr q24, [%[b_ptr0]]\n"
2788 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2789 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2790 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2791 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2792 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2793 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2794 "ldr q25, [%[b_ptr0], #0x10]\n"
2795 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2796 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2797 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2798 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2799 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2800 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2801 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2802 "ldr q24, [%[b_ptr0]]\n"
2803 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2804 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2805 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2806 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2807 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2808 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2809 "ldr q25, [%[b_ptr0], #0x10]\n"
2810 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2811 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2812 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2813 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2814 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2815 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2816 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2817 "ldr q24, [%[b_ptr0]]\n"
2818 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2819 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2820 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2821 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2822 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2823 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2824 "ldr q25, [%[b_ptr0], #0x10]\n"
2825 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2826 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2827 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2828 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2829 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2830 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2831 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2832 "ldr q24, [%[b_ptr0]]\n"
2833 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2834 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2835 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2836 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2837 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2838 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2839 "ldr q25, [%[b_ptr0], #0x10]\n"
2840 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2841 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2842 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2843 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2844 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2845 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2846 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2847 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
2848 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
2849 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
2850 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
2851 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
2852 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
2855 "str q26, [%[c_ptr0]]\n"
2856 "subs %[loops], %[loops], #0x1\n"
2858 "ldr q24, [%[b_ptr0]]\n"
2859 "ldr q25, [%[b_ptr0], #0x10]\n"
2860 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2861 "str q27, [c_ptr1]\n"
2862 "add c_ptr1, c_ptr1, #0x10\n"
2864 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2865 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2866 "str q28, [c_ptr2]\n"
2868 "add c_ptr2, c_ptr2, #0x10\n"
2869 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
2870 "str q29, [c_ptr3]\n"
2872 "add c_ptr3, c_ptr3, #0x10\n"
2873 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
2874 "str q30, [c_ptr4]\n"
2876 "add c_ptr4, c_ptr4, #0x10\n"
2877 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
2878 "str q31, [c_ptr5]\n"
2880 "add c_ptr5, c_ptr5, #0x10\n"
2881 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
2882 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2883 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
2884 "ldr q24, [%[b_ptr0]]\n"
2885 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
2886 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2887 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
2888 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2889 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
2890 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2891 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
2892 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2893 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
2894 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2895 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
2896 "ldr q25, [%[b_ptr0], #0x10]\n"
2897 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
2898 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2899 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
2900 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
2901 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
2902 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
2903 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
2904 "ldr q24, [%[b_ptr0]]\n"
2905 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
2906 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
2907 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
2908 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
2909 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
2910 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
2911 "ldr q25, [%[b_ptr0], #0x10]\n"
2912 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
2913 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2914 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
2915 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
2916 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
2917 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
2918 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
2919 "ldr q24, [%[b_ptr0]]\n"
2920 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
2921 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
2922 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
2923 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
2924 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
2925 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
2926 "ldr q25, [%[b_ptr0], #0x10]\n"
2927 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
2928 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2929 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
2930 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
2931 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
2932 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
2933 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
2934 "ldr q24, [%[b_ptr0]]\n"
2935 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
2936 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
2937 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
2938 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
2939 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
2940 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
2941 "ldr q25, [%[b_ptr0], #0x10]\n"
2942 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
2943 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2944 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
2945 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
2946 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
2947 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
2948 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
2949 "ldr q24, [%[b_ptr0]]\n"
2950 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
2951 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
2952 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
2953 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
2954 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
2955 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
2956 "ldr q25, [%[b_ptr0], #0x10]\n"
2957 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
2958 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2959 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
2960 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
2961 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
2962 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
2963 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
2964 "ldr q24, [%[b_ptr0]]\n"
2965 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
2966 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
2967 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
2968 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
2969 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
2970 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
2971 "ldr q25, [%[b_ptr0], #0x10]\n"
2972 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
2973 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2974 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
2975 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
2976 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
2977 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
2978 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
2979 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
2980 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
2981 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
2982 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
2983 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
2984 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
2987 "str q26, [%[c_ptr0]]\n"
2988 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2990 "ldr q24, [%[b_ptr0]]\n"
2991 "ldr q25, [%[b_ptr0], #0x10]\n"
2992 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2993 "str q27, [c_ptr1]\n"
2994 "add c_ptr1, c_ptr1, #0x10\n"
2996 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
2997 "str q28, [c_ptr2]\n"
2999 "add c_ptr2, c_ptr2, #0x10\n"
3000 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3001 "str q29, [c_ptr3]\n"
3003 "add c_ptr3, c_ptr3, #0x10\n"
3004 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3005 "str q30, [c_ptr4]\n"
3007 "add c_ptr4, c_ptr4, #0x10\n"
3008 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3009 "str q31, [c_ptr5]\n"
3011 "add c_ptr5, c_ptr5, #0x10\n"
3012 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3013 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3014 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3015 "ldr q24, [%[b_ptr0]]\n"
3016 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3017 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3018 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3019 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3020 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3021 "ldr q25, [%[b_ptr0], #0x10]\n"
3022 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3023 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3024 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3025 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3026 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3027 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3028 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3029 "ldr q24, [%[b_ptr0]]\n"
3030 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3031 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3032 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3033 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3034 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3035 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3036 "ldr q25, [%[b_ptr0], #0x10]\n"
3037 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3038 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3039 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3040 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3041 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3042 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3043 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3044 "ldr q24, [%[b_ptr0]]\n"
3045 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3046 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3047 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3048 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3049 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3050 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3051 "ldr q25, [%[b_ptr0], #0x10]\n"
3052 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3053 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3054 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3055 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3056 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3057 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3058 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3059 "ldr q24, [%[b_ptr0]]\n"
3060 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3061 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3062 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3063 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3064 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3065 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3066 "ldr q25, [%[b_ptr0], #0x10]\n"
3067 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3068 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3069 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3070 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3071 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3072 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3073 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3074 "ldr q24, [%[b_ptr0]]\n"
3075 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3076 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3077 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3078 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3079 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3080 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3081 "ldr q25, [%[b_ptr0], #0x10]\n"
3082 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3083 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3084 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3085 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3086 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3087 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3088 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3089 "ldr q24, [%[b_ptr0]]\n"
3090 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3091 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3092 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3093 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3094 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3095 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3096 "ldr q25, [%[b_ptr0], #0x10]\n"
3097 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3098 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3099 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3100 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3101 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3102 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3103 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3104 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3105 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3106 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3107 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3108 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3109 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3118 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3119 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3120 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3121 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3122 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3123 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3124 "ldr q24, [%[b_ptr0]]\n"
3125 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3126 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3127 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3128 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3129 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3130 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3131 "ldr q25, [%[b_ptr0], #0x10]\n"
3132 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3133 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3134 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3135 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3136 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3137 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3138 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3139 "ldr q24, [%[b_ptr0]]\n"
3140 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3141 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3142 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3143 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3144 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3145 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3146 "ldr q25, [%[b_ptr0], #0x10]\n"
3147 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3148 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3149 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3150 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3151 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3152 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3153 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3154 "ldr q24, [%[b_ptr0]]\n"
3155 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3156 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3157 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3158 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3159 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3160 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3161 "ldr q25, [%[b_ptr0], #0x10]\n"
3162 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3163 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3164 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3165 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3166 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3167 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3168 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3169 "ldr q24, [%[b_ptr0]]\n"
3170 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3171 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3172 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3173 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3174 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3175 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3176 "ldr q25, [%[b_ptr0], #0x10]\n"
3177 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3178 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3179 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3180 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3181 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3182 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3183 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3184 "ldr q24, [%[b_ptr0]]\n"
3185 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3186 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3187 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3188 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3189 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3190 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3191 "ldr q25, [%[b_ptr0], #0x10]\n"
3192 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3193 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3194 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3195 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3196 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3197 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3198 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3199 "ldr q24, [%[b_ptr0]]\n"
3200 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3201 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3202 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3203 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3204 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3205 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3206 "ldr q25, [%[b_ptr0], #0x10]\n"
3207 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3208 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3209 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3210 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3211 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3212 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3213 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3214 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3215 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3216 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3217 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3218 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3219 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3221 "str q26, [%[c_ptr0]]\n"
3222 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3223 "str q27, [c_ptr1]\n"
3224 "str q28, [c_ptr2]\n"
3225 "str q29, [c_ptr3]\n"
3226 "str q30, [c_ptr4]\n"
3227 "str q31, [c_ptr5]\n"
3238 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
3239 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
3240 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
3255 "add a_ptr1, %[a_ptr0], %[lda]\n"
3256 "add c_ptr1, %[c_ptr0], %[ldc]\n"
3257 "add a_ptr2, a_ptr1, %[lda]\n"
3258 "add c_ptr2, c_ptr1, %[ldc]\n"
3259 "add a_ptr3, a_ptr2, %[lda]\n"
3260 "add c_ptr3, c_ptr2, %[ldc]\n"
3261 "add a_ptr4, a_ptr3, %[lda]\n"
3262 "add c_ptr4, c_ptr3, %[ldc]\n"
3263 "add a_ptr5, a_ptr4, %[lda]\n"
3264 "add c_ptr5, c_ptr4, %[ldc]\n"
3265 "cbz %[oob_rows], 1f\n"
3266 "subs %[oob_rows], %[oob_rows], #0x1\n"
3267 "add c_ptr5, %[c_ptr0], #0x0\n"
3268 "add a_ptr5, %[a_ptr0], #0x0\n"
3270 "subs %[oob_rows], %[oob_rows], #0x1\n"
3271 "add c_ptr4, %[c_ptr0], #0x0\n"
3272 "add a_ptr4, %[a_ptr0], #0x0\n"
3274 "subs %[oob_rows], %[oob_rows], #0x1\n"
3275 "add c_ptr3, %[c_ptr0], #0x0\n"
3276 "add a_ptr3, %[a_ptr0], #0x0\n"
3278 "subs %[oob_rows], %[oob_rows], #0x1\n"
3279 "add c_ptr2, %[c_ptr0], #0x0\n"
3280 "add a_ptr2, %[a_ptr0], #0x0\n"
3282 "subs %[oob_rows], %[oob_rows], #0x1\n"
3283 "add c_ptr1, %[c_ptr0], #0x0\n"
3284 "add a_ptr1, %[a_ptr0], #0x0\n"
3286 "ldr q0, [%[a_ptr0]], #0x10\n"
3287 "ldr q4, [a_ptr1], #0x10\n"
3288 "ldr q8, [a_ptr2], #0x10\n"
3289 "ldr q12, [a_ptr3], #0x10\n"
3290 "ldr q16, [a_ptr4], #0x10\n"
3291 "ldr q20, [a_ptr5], #0x10\n"
3292 "ldr q1, [%[a_ptr0]], #0x10\n"
3293 "ldr q5, [a_ptr1], #0x10\n"
3294 "ldr q9, [a_ptr2], #0x10\n"
3295 "ldr q13, [a_ptr3], #0x10\n"
3296 "ldr q17, [a_ptr4], #0x10\n"
3297 "ldr q21, [a_ptr5], #0x10\n"
3298 "ldr q2, [%[a_ptr0]], #0x10\n"
3299 "ldr q6, [a_ptr1], #0x10\n"
3300 "ldr q10, [a_ptr2], #0x10\n"
3301 "ldr q14, [a_ptr3], #0x10\n"
3302 "ldr d3, [%[a_ptr0]], #0x8\n"
3303 "ldr q18, [a_ptr4], #0x10\n"
3304 "ldr d7, [a_ptr1], #0x8\n"
3305 "ldr q22, [a_ptr5], #0x10\n"
3306 "ldr d11, [a_ptr2], #0x8\n"
3307 "ldr d15, [a_ptr3], #0x8\n"
3308 "ldr d19, [a_ptr4], #0x8\n"
3309 "ldr d23, [a_ptr5], #0x8\n"
3310 "cbnz %[odds], 2f\n"
3311 "ld1 {v3.s}[2], [%[a_ptr0]]\n"
3312 "ld1 {v7.s}[2], [a_ptr1]\n"
3313 "ld1 {v11.s}[2], [a_ptr2]\n"
3314 "ld1 {v15.s}[2], [a_ptr3]\n"
3315 "ld1 {v19.s}[2], [a_ptr4]\n"
3316 "ld1 {v23.s}[2], [a_ptr5]\n"
3319 "subs %[odds], %[odds], #0x1\n"
3321 "ld1 {v3.b}[8], [%[a_ptr0]]\n"
3322 "ld1 {v7.b}[8], [a_ptr1]\n"
3323 "ld1 {v11.b}[8], [a_ptr2]\n"
3324 "ld1 {v15.b}[8], [a_ptr3]\n"
3325 "ld1 {v19.b}[8], [a_ptr4]\n"
3326 "ld1 {v23.b}[8], [a_ptr5]\n"
3329 "ld1 {v3.h}[4], [%[a_ptr0]], #2\n"
3330 "ld1 {v7.h}[4], [a_ptr1], #2\n"
3331 "ld1 {v11.h}[4], [a_ptr2], #2\n"
3332 "ld1 {v15.h}[4], [a_ptr3], #2\n"
3333 "ld1 {v19.h}[4], [a_ptr4], #2\n"
3334 "ld1 {v23.h}[4], [a_ptr5], #2\n"
3335 "subs %[odds], %[odds], #0x1\n"
3339 "ld1 {v3.b}[10], [%[a_ptr0]]\n"
3340 "ld1 {v7.b}[10], [a_ptr1]\n"
3341 "ld1 {v11.b}[10], [a_ptr2]\n"
3342 "ld1 {v15.b}[10], [a_ptr3]\n"
3343 "ld1 {v19.b}[10], [a_ptr4]\n"
3344 "ld1 {v23.b}[10], [a_ptr5]\n"
3346 "ldr q24, [%[b_ptr0]]\n"
3347 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
3348 "ldr q25, [%[b_ptr0], #0x10]\n"
3349 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
3350 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
3351 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
3352 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
3353 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
3354 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3355 "cbz %[loops], 6f\n"
3357 "subs %[loops], %[loops], #0x1\n"
3363 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3364 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3365 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3366 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3367 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3368 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3369 "ldr q24, [%[b_ptr0]]\n"
3370 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3371 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3372 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3373 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3374 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3375 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3376 "ldr q25, [%[b_ptr0], #0x10]\n"
3377 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3378 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3379 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3380 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3381 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3382 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3383 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3384 "ldr q24, [%[b_ptr0]]\n"
3385 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3386 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3387 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3388 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3389 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3390 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3391 "ldr q25, [%[b_ptr0], #0x10]\n"
3392 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3393 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3394 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3395 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3396 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3397 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3398 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3399 "ldr q24, [%[b_ptr0]]\n"
3400 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3401 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3402 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3403 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3404 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3405 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3406 "ldr q25, [%[b_ptr0], #0x10]\n"
3407 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3408 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3409 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3410 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3411 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3412 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3413 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3414 "ldr q24, [%[b_ptr0]]\n"
3415 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3416 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3417 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3418 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3419 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3420 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3421 "ldr q25, [%[b_ptr0], #0x10]\n"
3422 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3423 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3424 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3425 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3426 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3427 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3428 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3429 "ldr q24, [%[b_ptr0]]\n"
3430 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3431 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3432 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3433 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3434 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3435 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3436 "ldr q25, [%[b_ptr0], #0x10]\n"
3437 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3438 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3439 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3440 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3441 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3442 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3443 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3444 "ldr q24, [%[b_ptr0]]\n"
3445 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3446 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3447 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3448 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3449 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3450 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3451 "ldr q25, [%[b_ptr0], #0x10]\n"
3452 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3453 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3454 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3455 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3456 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3457 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3458 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3459 "ldr q24, [%[b_ptr0]]\n"
3460 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3461 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3462 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3463 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3464 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3465 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3466 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3467 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
3468 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
3469 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
3470 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
3471 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
3472 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
3475 "str q26, [%[c_ptr0]]\n"
3476 "subs %[loops], %[loops], #0x1\n"
3478 "ldr q24, [%[b_ptr0]]\n"
3479 "ldr q25, [%[b_ptr0], #0x10]\n"
3480 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3481 "str q27, [c_ptr1]\n"
3482 "add c_ptr1, c_ptr1, #0x10\n"
3484 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3485 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3486 "str q28, [c_ptr2]\n"
3488 "add c_ptr2, c_ptr2, #0x10\n"
3489 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3490 "str q29, [c_ptr3]\n"
3492 "add c_ptr3, c_ptr3, #0x10\n"
3493 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3494 "str q30, [c_ptr4]\n"
3496 "add c_ptr4, c_ptr4, #0x10\n"
3497 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3498 "str q31, [c_ptr5]\n"
3500 "add c_ptr5, c_ptr5, #0x10\n"
3501 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3502 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
3503 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3504 "ldr q24, [%[b_ptr0]]\n"
3505 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3506 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
3507 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3508 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
3509 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3510 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
3511 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3512 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
3513 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3514 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
3515 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3516 "ldr q25, [%[b_ptr0], #0x10]\n"
3517 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3518 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3519 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3520 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3521 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3522 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3523 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3524 "ldr q24, [%[b_ptr0]]\n"
3525 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3526 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3527 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3528 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3529 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3530 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3531 "ldr q25, [%[b_ptr0], #0x10]\n"
3532 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3533 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3534 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3535 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3536 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3537 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3538 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3539 "ldr q24, [%[b_ptr0]]\n"
3540 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3541 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3542 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3543 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3544 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3545 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3546 "ldr q25, [%[b_ptr0], #0x10]\n"
3547 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3548 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3549 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3550 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3551 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3552 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3553 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3554 "ldr q24, [%[b_ptr0]]\n"
3555 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3556 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3557 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3558 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3559 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3560 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3561 "ldr q25, [%[b_ptr0], #0x10]\n"
3562 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3563 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3564 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3565 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3566 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3567 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3568 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3569 "ldr q24, [%[b_ptr0]]\n"
3570 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3571 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3572 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3573 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3574 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3575 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3576 "ldr q25, [%[b_ptr0], #0x10]\n"
3577 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3578 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3579 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3580 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3581 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3582 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3583 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3584 "ldr q24, [%[b_ptr0]]\n"
3585 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3586 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3587 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3588 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3589 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3590 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3591 "ldr q25, [%[b_ptr0], #0x10]\n"
3592 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3593 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3594 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3595 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3596 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3597 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3598 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3599 "ldr q24, [%[b_ptr0]]\n"
3600 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3601 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3602 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3603 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3604 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3605 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3606 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3607 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
3608 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
3609 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
3610 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
3611 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
3612 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
3615 "str q26, [%[c_ptr0]]\n"
3616 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3618 "ldr q24, [%[b_ptr0]]\n"
3619 "ldr q25, [%[b_ptr0], #0x10]\n"
3620 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3621 "str q27, [c_ptr1]\n"
3622 "add c_ptr1, c_ptr1, #0x10\n"
3624 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3625 "str q28, [c_ptr2]\n"
3627 "add c_ptr2, c_ptr2, #0x10\n"
3628 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3629 "str q29, [c_ptr3]\n"
3631 "add c_ptr3, c_ptr3, #0x10\n"
3632 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3633 "str q30, [c_ptr4]\n"
3635 "add c_ptr4, c_ptr4, #0x10\n"
3636 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3637 "str q31, [c_ptr5]\n"
3639 "add c_ptr5, c_ptr5, #0x10\n"
3640 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3641 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3642 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3643 "ldr q24, [%[b_ptr0]]\n"
3644 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3645 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3646 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3647 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3648 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3649 "ldr q25, [%[b_ptr0], #0x10]\n"
3650 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3651 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3652 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3653 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3654 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3655 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3656 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3657 "ldr q24, [%[b_ptr0]]\n"
3658 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3659 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3660 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3661 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3662 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3663 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3664 "ldr q25, [%[b_ptr0], #0x10]\n"
3665 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3666 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3667 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3668 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3669 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3670 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3671 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3672 "ldr q24, [%[b_ptr0]]\n"
3673 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3674 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3675 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3676 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3677 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3678 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3679 "ldr q25, [%[b_ptr0], #0x10]\n"
3680 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3681 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3682 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3683 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3684 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3685 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3686 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3687 "ldr q24, [%[b_ptr0]]\n"
3688 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3689 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3690 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3691 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3692 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3693 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3694 "ldr q25, [%[b_ptr0], #0x10]\n"
3695 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3696 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3697 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3698 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3699 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3700 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3701 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3702 "ldr q24, [%[b_ptr0]]\n"
3703 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3704 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3705 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3706 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3707 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3708 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3709 "ldr q25, [%[b_ptr0], #0x10]\n"
3710 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3711 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3712 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3713 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3714 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3715 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3716 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3717 "ldr q24, [%[b_ptr0]]\n"
3718 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3719 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3720 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3721 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3722 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3723 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3724 "ldr q25, [%[b_ptr0], #0x10]\n"
3725 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3726 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3727 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3728 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3729 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3730 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3731 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3732 "ldr q24, [%[b_ptr0]]\n"
3733 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3734 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3735 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3736 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3737 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3738 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3739 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3740 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
3741 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
3742 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
3743 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
3744 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
3745 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
3754 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
3755 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
3756 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
3757 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
3758 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
3759 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
3760 "ldr q24, [%[b_ptr0]]\n"
3761 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
3762 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
3763 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
3764 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
3765 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
3766 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
3767 "ldr q25, [%[b_ptr0], #0x10]\n"
3768 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
3769 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3770 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
3771 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
3772 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
3773 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
3774 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
3775 "ldr q24, [%[b_ptr0]]\n"
3776 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
3777 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
3778 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
3779 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
3780 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
3781 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
3782 "ldr q25, [%[b_ptr0], #0x10]\n"
3783 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
3784 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3785 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
3786 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
3787 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
3788 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
3789 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
3790 "ldr q24, [%[b_ptr0]]\n"
3791 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
3792 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
3793 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
3794 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
3795 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
3796 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
3797 "ldr q25, [%[b_ptr0], #0x10]\n"
3798 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
3799 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3800 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
3801 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
3802 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
3803 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
3804 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
3805 "ldr q24, [%[b_ptr0]]\n"
3806 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
3807 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
3808 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
3809 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
3810 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
3811 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
3812 "ldr q25, [%[b_ptr0], #0x10]\n"
3813 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
3814 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3815 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
3816 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
3817 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
3818 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
3819 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
3820 "ldr q24, [%[b_ptr0]]\n"
3821 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
3822 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
3823 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
3824 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
3825 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
3826 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
3827 "ldr q25, [%[b_ptr0], #0x10]\n"
3828 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
3829 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3830 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
3831 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
3832 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
3833 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
3834 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
3835 "ldr q24, [%[b_ptr0]]\n"
3836 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
3837 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
3838 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
3839 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
3840 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
3841 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
3842 "ldr q25, [%[b_ptr0], #0x10]\n"
3843 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
3844 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3845 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
3846 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
3847 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
3848 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
3849 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
3850 "ldr q24, [%[b_ptr0]]\n"
3851 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
3852 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3853 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
3854 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
3855 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
3856 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
3857 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
3858 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
3859 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
3860 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
3861 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
3862 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
3863 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
3865 "str q26, [%[c_ptr0]]\n"
3866 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3867 "str q27, [c_ptr1]\n"
3868 "str q28, [c_ptr2]\n"
3869 "str q29, [c_ptr3]\n"
3870 "str q30, [c_ptr4]\n"
3871 "str q31, [c_ptr5]\n"
3882 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
3883 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
3884 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
3900 "add a_ptr1, %[a_ptr0], %[lda]\n"
3901 "add c_ptr1, %[c_ptr0], %[ldc]\n"
3902 "add a_ptr2, a_ptr1, %[lda]\n"
3903 "add c_ptr2, c_ptr1, %[ldc]\n"
3904 "add a_ptr3, a_ptr2, %[lda]\n"
3905 "add c_ptr3, c_ptr2, %[ldc]\n"
3906 "add a_ptr4, a_ptr3, %[lda]\n"
3907 "add c_ptr4, c_ptr3, %[ldc]\n"
3908 "add a_ptr5, a_ptr4, %[lda]\n"
3909 "add c_ptr5, c_ptr4, %[ldc]\n"
3910 "cbz %[oob_rows], 1f\n"
3911 "subs %[oob_rows], %[oob_rows], #0x1\n"
3912 "add c_ptr5, %[c_ptr0], #0x0\n"
3913 "add a_ptr5, %[a_ptr0], #0x0\n"
3915 "subs %[oob_rows], %[oob_rows], #0x1\n"
3916 "add c_ptr4, %[c_ptr0], #0x0\n"
3917 "add a_ptr4, %[a_ptr0], #0x0\n"
3919 "subs %[oob_rows], %[oob_rows], #0x1\n"
3920 "add c_ptr3, %[c_ptr0], #0x0\n"
3921 "add a_ptr3, %[a_ptr0], #0x0\n"
3923 "subs %[oob_rows], %[oob_rows], #0x1\n"
3924 "add c_ptr2, %[c_ptr0], #0x0\n"
3925 "add a_ptr2, %[a_ptr0], #0x0\n"
3927 "subs %[oob_rows], %[oob_rows], #0x1\n"
3928 "add c_ptr1, %[c_ptr0], #0x0\n"
3929 "add a_ptr1, %[a_ptr0], #0x0\n"
3931 "cbnz %[odds], 2f\n"
3932 "ldr q0, [%[a_ptr0]], #0x10\n"
3933 "ldr q4, [a_ptr1], #0x10\n"
3934 "ldr q8, [a_ptr2], #0x10\n"
3935 "ldr q12, [a_ptr3], #0x10\n"
3936 "ldr q16, [a_ptr4], #0x10\n"
3937 "ldr q20, [a_ptr5], #0x10\n"
3938 "ldr q1, [%[a_ptr0]], #0x10\n"
3939 "ldr q5, [a_ptr1], #0x10\n"
3940 "ldr q9, [a_ptr2], #0x10\n"
3941 "ldr q13, [a_ptr3], #0x10\n"
3942 "ldr q17, [a_ptr4], #0x10\n"
3943 "ldr q21, [a_ptr5], #0x10\n"
3944 "ldr q2, [%[a_ptr0]], #0x10\n"
3945 "ldr q6, [a_ptr1], #0x10\n"
3946 "ldr q10, [a_ptr2], #0x10\n"
3947 "ldr q14, [a_ptr3], #0x10\n"
3948 "ldr q18, [a_ptr4], #0x10\n"
3949 "ldr q22, [a_ptr5], #0x10\n"
3950 "ldr q3, [%[a_ptr0]]\n"
3951 "ldr q7, [a_ptr1]\n"
3952 "ldr q11, [a_ptr2]\n"
3953 "ldr q15, [a_ptr3]\n"
3954 "ldr q19, [a_ptr4]\n"
3955 "ldr q23, [a_ptr5]\n"
3958 "ldr q0, [%[a_ptr0]], #0x10\n"
3959 "subs %[odds], %[odds], #0x1\n"
3960 "ldr q4, [a_ptr1], #0x10\n"
3961 "ldr q8, [a_ptr2], #0x10\n"
3962 "ldr q12, [a_ptr3], #0x10\n"
3963 "ldr q16, [a_ptr4], #0x10\n"
3964 "ldr q20, [a_ptr5], #0x10\n"
3965 "ldr q1, [%[a_ptr0]], #0x10\n"
3966 "ldr q5, [a_ptr1], #0x10\n"
3967 "ldr q9, [a_ptr2], #0x10\n"
3968 "ldr q13, [a_ptr3], #0x10\n"
3969 "ldr q17, [a_ptr4], #0x10\n"
3970 "ldr q21, [a_ptr5], #0x10\n"
3971 "ldr q2, [%[a_ptr0]], #0x10\n"
3972 "ldr q6, [a_ptr1], #0x10\n"
3973 "ldr q10, [a_ptr2], #0x10\n"
3974 "ldr q14, [a_ptr3], #0x10\n"
3975 "ldr d3, [%[a_ptr0]], #0x8\n"
3976 "ldr q18, [a_ptr4], #0x10\n"
3977 "ldr d7, [a_ptr1], #0x8\n"
3978 "ldr q22, [a_ptr5], #0x10\n"
3979 "ldr d11, [a_ptr2], #0x8\n"
3980 "ldr d15, [a_ptr3], #0x8\n"
3981 "ldr d19, [a_ptr4], #0x8\n"
3982 "ldr d23, [a_ptr5], #0x8\n"
3983 "ld1 {v3.s}[2], [%[a_ptr0]], #4\n"
3984 "ld1 {v7.s}[2], [a_ptr1], #4\n"
3985 "ld1 {v11.s}[2], [a_ptr2], #4\n"
3986 "ld1 {v15.s}[2], [a_ptr3], #4\n"
3987 "ld1 {v19.s}[2], [a_ptr4], #4\n"
3988 "ld1 {v23.s}[2], [a_ptr5], #4\n"
3990 "ld1 {v3.b}[12], [%[a_ptr0]]\n"
3991 "ld1 {v7.b}[12], [a_ptr1]\n"
3992 "ld1 {v11.b}[12], [a_ptr2]\n"
3993 "ld1 {v15.b}[12], [a_ptr3]\n"
3994 "ld1 {v19.b}[12], [a_ptr4]\n"
3995 "ld1 {v23.b}[12], [a_ptr5]\n"
3998 "ld1 {v3.h}[6], [%[a_ptr0]], #2\n"
3999 "ld1 {v7.h}[6], [a_ptr1], #2\n"
4000 "ld1 {v11.h}[6], [a_ptr2], #2\n"
4001 "ld1 {v15.h}[6], [a_ptr3], #2\n"
4002 "ld1 {v19.h}[6], [a_ptr4], #2\n"
4003 "ld1 {v23.h}[6], [a_ptr5], #2\n"
4004 "subs %[odds], %[odds], #0x1\n"
4008 "ld1 {v3.b}[14], [%[a_ptr0]]\n"
4009 "ld1 {v7.b}[14], [a_ptr1]\n"
4010 "ld1 {v11.b}[14], [a_ptr2]\n"
4011 "ld1 {v15.b}[14], [a_ptr3]\n"
4012 "ld1 {v19.b}[14], [a_ptr4]\n"
4013 "ld1 {v23.b}[14], [a_ptr5]\n"
4015 "ldr q24, [%[b_ptr0]]\n"
4016 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
4017 "ldr q25, [%[b_ptr0], #0x10]\n"
4018 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
4019 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
4020 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
4021 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
4022 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
4023 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4024 "cbz %[loops], 6f\n"
4026 "subs %[loops], %[loops], #0x1\n"
4032 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
4033 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
4034 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
4035 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
4036 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
4037 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
4038 "ldr q24, [%[b_ptr0]]\n"
4039 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
4040 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
4041 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
4042 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
4043 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
4044 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
4045 "ldr q25, [%[b_ptr0], #0x10]\n"
4046 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
4047 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4048 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
4049 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
4050 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
4051 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
4052 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
4053 "ldr q24, [%[b_ptr0]]\n"
4054 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
4055 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
4056 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
4057 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
4058 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
4059 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
4060 "ldr q25, [%[b_ptr0], #0x10]\n"
4061 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
4062 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4063 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
4064 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
4065 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
4066 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
4067 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
4068 "ldr q24, [%[b_ptr0]]\n"
4069 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
4070 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
4071 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
4072 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
4073 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
4074 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
4075 "ldr q25, [%[b_ptr0], #0x10]\n"
4076 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
4077 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4078 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
4079 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
4080 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
4081 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
4082 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
4083 "ldr q24, [%[b_ptr0]]\n"
4084 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
4085 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
4086 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
4087 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
4088 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
4089 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
4090 "ldr q25, [%[b_ptr0], #0x10]\n"
4091 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
4092 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4093 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
4094 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
4095 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
4096 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
4097 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
4098 "ldr q24, [%[b_ptr0]]\n"
4099 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
4100 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
4101 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
4102 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
4103 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
4104 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
4105 "ldr q25, [%[b_ptr0], #0x10]\n"
4106 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
4107 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4108 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
4109 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
4110 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
4111 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
4112 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
4113 "ldr q24, [%[b_ptr0]]\n"
4114 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
4115 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
4116 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
4117 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
4118 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
4119 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
4120 "ldr q25, [%[b_ptr0], #0x10]\n"
4121 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
4122 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4123 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
4124 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
4125 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
4126 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
4127 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
4128 "ldr q24, [%[b_ptr0]]\n"
4129 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
4130 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
4131 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
4132 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
4133 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
4134 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
4135 "ldr q25, [%[b_ptr0], #0x10]\n"
4136 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
4137 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4138 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
4139 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
4140 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
4141 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
4142 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
4143 ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
4144 ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
4145 ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
4146 ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
4147 ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
4148 ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
4151 "str q26, [%[c_ptr0]]\n"
4152 "subs %[loops], %[loops], #0x1\n"
4154 "ldr q24, [%[b_ptr0]]\n"
4155 "ldr q25, [%[b_ptr0], #0x10]\n"
4156 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4157 "str q27, [c_ptr1]\n"
4158 "add c_ptr1, c_ptr1, #0x10\n"
4160 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4161 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
4162 "str q28, [c_ptr2]\n"
4164 "add c_ptr2, c_ptr2, #0x10\n"
4165 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
4166 "str q29, [c_ptr3]\n"
4168 "add c_ptr3, c_ptr3, #0x10\n"
4169 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
4170 "str q30, [c_ptr4]\n"
4172 "add c_ptr4, c_ptr4, #0x10\n"
4173 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
4174 "str q31, [c_ptr5]\n"
4176 "add c_ptr5, c_ptr5, #0x10\n"
4177 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
4178 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
4179 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
4180 "ldr q24, [%[b_ptr0]]\n"
4181 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
4182 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
4183 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
4184 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
4185 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
4186 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
4187 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
4188 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
4189 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
4190 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
4191 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
4192 "ldr q25, [%[b_ptr0], #0x10]\n"
4193 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
4194 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4195 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
4196 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
4197 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
4198 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
4199 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
4200 "ldr q24, [%[b_ptr0]]\n"
4201 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
4202 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
4203 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
4204 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
4205 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
4206 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
4207 "ldr q25, [%[b_ptr0], #0x10]\n"
4208 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
4209 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4210 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
4211 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
4212 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
4213 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
4214 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
4215 "ldr q24, [%[b_ptr0]]\n"
4216 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
4217 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
4218 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
4219 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
4220 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
4221 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
4222 "ldr q25, [%[b_ptr0], #0x10]\n"
4223 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
4224 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4225 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
4226 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
4227 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
4228 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
4229 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
4230 "ldr q24, [%[b_ptr0]]\n"
4231 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
4232 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
4233 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
4234 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
4235 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
4236 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
4237 "ldr q25, [%[b_ptr0], #0x10]\n"
4238 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
4239 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4240 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
4241 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
4242 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
4243 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
4244 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
4245 "ldr q24, [%[b_ptr0]]\n"
4246 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
4247 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
4248 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
4249 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
4250 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
4251 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
4252 "ldr q25, [%[b_ptr0], #0x10]\n"
4253 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
4254 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4255 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
4256 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
4257 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
4258 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
4259 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
4260 "ldr q24, [%[b_ptr0]]\n"
4261 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
4262 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
4263 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
4264 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
4265 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
4266 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
4267 "ldr q25, [%[b_ptr0], #0x10]\n"
4268 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
4269 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4270 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
4271 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
4272 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
4273 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
4274 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
4275 "ldr q24, [%[b_ptr0]]\n"
4276 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
4277 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
4278 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
4279 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
4280 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
4281 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
4282 "ldr q25, [%[b_ptr0], #0x10]\n"
4283 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
4284 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4285 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
4286 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
4287 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
4288 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
4289 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
4290 ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
4291 ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
4292 ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
4293 ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
4294 ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
4295 ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
4298 "str q26, [%[c_ptr0]]\n"
4299 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4301 "ldr q24, [%[b_ptr0]]\n"
4302 "ldr q25, [%[b_ptr0], #0x10]\n"
4303 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4304 "str q27, [c_ptr1]\n"
4305 "add c_ptr1, c_ptr1, #0x10\n"
4307 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
4308 "str q28, [c_ptr2]\n"
4310 "add c_ptr2, c_ptr2, #0x10\n"
4311 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
4312 "str q29, [c_ptr3]\n"
4314 "add c_ptr3, c_ptr3, #0x10\n"
4315 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
4316 "str q30, [c_ptr4]\n"
4318 "add c_ptr4, c_ptr4, #0x10\n"
4319 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
4320 "str q31, [c_ptr5]\n"
4322 "add c_ptr5, c_ptr5, #0x10\n"
4323 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
4324 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
4325 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
4326 "ldr q24, [%[b_ptr0]]\n"
4327 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
4328 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
4329 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
4330 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
4331 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
4332 "ldr q25, [%[b_ptr0], #0x10]\n"
4333 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
4334 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4335 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
4336 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
4337 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
4338 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
4339 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
4340 "ldr q24, [%[b_ptr0]]\n"
4341 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
4342 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
4343 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
4344 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
4345 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
4346 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
4347 "ldr q25, [%[b_ptr0], #0x10]\n"
4348 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
4349 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4350 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
4351 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
4352 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
4353 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
4354 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
4355 "ldr q24, [%[b_ptr0]]\n"
4356 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
4357 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
4358 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
4359 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
4360 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
4361 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
4362 "ldr q25, [%[b_ptr0], #0x10]\n"
4363 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
4364 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4365 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
4366 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
4367 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
4368 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
4369 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
4370 "ldr q24, [%[b_ptr0]]\n"
4371 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
4372 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
4373 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
4374 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
4375 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
4376 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
4377 "ldr q25, [%[b_ptr0], #0x10]\n"
4378 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
4379 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4380 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
4381 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
4382 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
4383 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
4384 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
4385 "ldr q24, [%[b_ptr0]]\n"
4386 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
4387 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
4388 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
4389 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
4390 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
4391 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
4392 "ldr q25, [%[b_ptr0], #0x10]\n"
4393 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
4394 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4395 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
4396 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
4397 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
4398 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
4399 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
4400 "ldr q24, [%[b_ptr0]]\n"
4401 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
4402 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
4403 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
4404 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
4405 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
4406 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
4407 "ldr q25, [%[b_ptr0], #0x10]\n"
4408 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
4409 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4410 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
4411 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
4412 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
4413 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
4414 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
4415 "ldr q24, [%[b_ptr0]]\n"
4416 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
4417 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
4418 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
4419 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
4420 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
4421 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
4422 "ldr q25, [%[b_ptr0], #0x10]\n"
4423 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
4424 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4425 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
4426 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
4427 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
4428 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
4429 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
4430 ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
4431 ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
4432 ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
4433 ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
4434 ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
4435 ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
4444 ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
4445 ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
4446 ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
4447 ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
4448 ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
4449 ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
4450 "ldr q24, [%[b_ptr0]]\n"
4451 ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
4452 ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
4453 ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
4454 ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
4455 ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
4456 ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
4457 "ldr q25, [%[b_ptr0], #0x10]\n"
4458 ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
4459 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4460 ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
4461 ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
4462 ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
4463 ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
4464 ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
4465 "ldr q24, [%[b_ptr0]]\n"
4466 ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
4467 ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
4468 ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
4469 ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
4470 ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
4471 ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
4472 "ldr q25, [%[b_ptr0], #0x10]\n"
4473 ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
4474 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4475 ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
4476 ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
4477 ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
4478 ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
4479 ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
4480 "ldr q24, [%[b_ptr0]]\n"
4481 ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
4482 ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
4483 ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
4484 ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
4485 ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
4486 ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
4487 "ldr q25, [%[b_ptr0], #0x10]\n"
4488 ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
4489 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4490 ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
4491 ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
4492 ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
4493 ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
4494 ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
4495 "ldr q24, [%[b_ptr0]]\n"
4496 ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
4497 ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
4498 ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
4499 ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
4500 ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
4501 ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
4502 "ldr q25, [%[b_ptr0], #0x10]\n"
4503 ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
4504 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4505 ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
4506 ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
4507 ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
4508 ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
4509 ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
4510 "ldr q24, [%[b_ptr0]]\n"
4511 ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
4512 ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
4513 ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
4514 ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
4515 ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
4516 ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
4517 "ldr q25, [%[b_ptr0], #0x10]\n"
4518 ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
4519 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4520 ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
4521 ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
4522 ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
4523 ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
4524 ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
4525 "ldr q24, [%[b_ptr0]]\n"
4526 ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
4527 ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
4528 ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
4529 ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
4530 ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
4531 ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
4532 "ldr q25, [%[b_ptr0], #0x10]\n"
4533 ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
4534 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4535 ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
4536 ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
4537 ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
4538 ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
4539 ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
4540 "ldr q24, [%[b_ptr0]]\n"
4541 ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
4542 ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
4543 ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
4544 ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
4545 ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
4546 ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
4547 "ldr q25, [%[b_ptr0], #0x10]\n"
4548 ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
4549 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4550 ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
4551 ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
4552 ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
4553 ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
4554 ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
4555 ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
4556 ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
4557 ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
4558 ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
4559 ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
4560 ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
4562 "str q26, [%[c_ptr0]]\n"
4563 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4564 "str q27, [c_ptr1]\n"
4565 "str q28, [c_ptr2]\n"
4566 "str q29, [c_ptr3]\n"
4567 "str q30, [c_ptr4]\n"
4568 "str q31, [c_ptr5]\n"
4579 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
4580 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
4581 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
4590 #endif // __aarch64__