27 #if defined(__aarch64__)
// Converts an fp32 matrix to bfloat16 while interleaving rows, using AArch64
// inline assembly (ZIP1/ZIP2 for the interleave, BFCVTN/BFCVTN2 for the
// narrowing conversion).
//
//   out       - destination buffer of bfloat16 values.
//   in        - source buffer of floats.
//   width     - used as the per-row column counter (copied into x24 / x20).
//   in_stride - added to a row pointer to reach the next row; the visible
//               caller passes stride * sizeof(float), i.e. a byte count.
//   height    - number of rows; consumed 8 at a time, then in a 4-row tail.
//
// Output layout as established by the zips/stores below: every group of four
// consecutive bf16 values holds one column taken from four consecutive rows,
// and a 16-column x 4-row tile occupies 0x80 bytes.
//
// NOTE(review): several lines of the original (braces, the asm statement
// opener, labels and branch instructions, and the initialisation of x9/x21)
// are not visible in this view; comments that depend on them are hedged.
31 void a64_transpose_interleave_16_2x4_fp32bf16(
bfloat16 *out,
const float *in,
size_t width,
size_t in_stride,
size_t height)
// Stack-allocated scratch row of `width` floats, zero-filled below. The tail
// section substitutes it (via csel) for rows that lie past `height`.
33 float *pad_row =
reinterpret_cast<float *
>(alloca(width *
sizeof(
float)));
36 memset(pad_row, 0, width *
sizeof(
float));
// Byte distance added to the output cursor (x21) after each 16-column chunk:
// 16 columns * height rounded up to a multiple of 4 * sizeof(bfloat16).
39 size_t out_stride = 16 * roundup<size_t>(height, 4) *
sizeof(
bfloat16);
// ---- Main section: 8 rows per pass ----
// Compare height against 8; presumably followed by a branch to the tail
// section when fewer than 8 rows remain (branch not visible here).
42 "cmp %x[height], #0x8\n"
// Build eight row pointers x9, x28, x27, x26, x25, x23, x22, x20, each one
// in_stride past the previous. x9 is presumably loaded from `in` and x21 from
// `out` on lines not visible here. x24 is the remaining-column counter.
46 "add x28, x9, %x[in_stride]\n"
47 "add x27, x28, %x[in_stride]\n"
48 "add x26, x27, %x[in_stride]\n"
49 "add x25, x26, %x[in_stride]\n"
50 "mov x24, %x[width]\n"
51 "add x23, x25, %x[in_stride]\n"
52 "add x22, x23, %x[in_stride]\n"
53 "add x20, x22, %x[in_stride]\n"
// Advance `in` past the eight rows just addressed and account for them.
55 "add %x[in], x20, %x[in_stride]\n"
57 "sub %x[height], %x[height], #0x8\n"
// -- 16-column unrolled body: four 16-byte loads (4 floats each) per row. --
// First-level zips pair row k with row k+2 (x9/x27, x28/x26, x25/x22,
// x23/x20) so the second-level zips further down yield r0,r1,r2,r3 order.
60 "ldr q13, [x9], #0x10\n"
61 "ldr q12, [x28], #0x10\n"
62 "sub x24, x24, #0x10\n"
64 "ldr q1, [x27], #0x10\n"
65 "ldr q9, [x26], #0x10\n"
66 "zip1 v19.4s, v13.4s, v1.4s\n"
67 "zip1 v14.4s, v12.4s, v9.4s\n"
68 "ldr q15, [x25], #0x10\n"
69 "ldr q4, [x23], #0x10\n"
70 "zip2 v8.4s, v13.4s, v1.4s\n"
71 "zip2 v28.4s, v12.4s, v9.4s\n"
72 "ldr q0, [x22], #0x10\n"
73 "ldr q1, [x20], #0x10\n"
74 "zip1 v16.4s, v15.4s, v0.4s\n"
75 "zip1 v5.4s, v4.4s, v1.4s\n"
76 "ldr q25, [x9], #0x10\n"
77 "ldr q24, [x28], #0x10\n"
78 "zip2 v3.4s, v15.4s, v0.4s\n"
79 "zip2 v2.4s, v4.4s, v1.4s\n"
80 "ldr q21, [x27], #0x10\n"
81 "ldr q30, [x26], #0x10\n"
82 "zip1 v18.4s, v25.4s, v21.4s\n"
83 "zip1 v27.4s, v24.4s, v30.4s\n"
84 "ldr q22, [x25], #0x10\n"
85 "ldr q20, [x23], #0x10\n"
86 "zip2 v9.4s, v25.4s, v21.4s\n"
87 "zip2 v10.4s, v24.4s, v30.4s\n"
88 "ldr q1, [x22], #0x10\n"
89 "ldr q21, [x20], #0x10\n"
90 "zip1 v25.4s, v22.4s, v1.4s\n"
91 "zip1 v7.4s, v20.4s, v21.4s\n"
92 "ldr q31, [x9], #0x10\n"
93 "ldr q17, [x28], #0x10\n"
94 "zip2 v30.4s, v22.4s, v1.4s\n"
95 "zip2 v20.4s, v20.4s, v21.4s\n"
96 "ldr q15, [x27], #0x10\n"
97 "ldr q24, [x26], #0x10\n"
98 "zip1 v6.4s, v31.4s, v15.4s\n"
99 "zip1 v4.4s, v17.4s, v24.4s\n"
100 "ldr q12, [x25], #0x10\n"
101 "ldr q29, [x23], #0x10\n"
102 "zip2 v22.4s, v31.4s, v15.4s\n"
103 "zip2 v26.4s, v17.4s, v24.4s\n"
104 "ldr q0, [x22], #0x10\n"
105 "ldr q24, [x20], #0x10\n"
106 "zip1 v17.4s, v12.4s, v0.4s\n"
107 "zip1 v31.4s, v29.4s, v24.4s\n"
108 "ldr q21, [x9], #0x10\n"
109 "ldr q1, [x28], #0x10\n"
110 "zip2 v23.4s, v12.4s, v0.4s\n"
111 "zip2 v24.4s, v29.4s, v24.4s\n"
112 "ldr q11, [x27], #0x10\n"
113 "ldr q29, [x26], #0x10\n"
114 "zip1 v0.4s, v21.4s, v11.4s\n"
115 "zip1 v13.4s, v1.4s, v29.4s\n"
116 "ldr q15, [x25], #0x10\n"
117 "ldr q12, [x23], #0x10\n"
118 "zip2 v21.4s, v21.4s, v11.4s\n"
119 "zip2 v29.4s, v1.4s, v29.4s\n"
120 "ldr q1, [x22], #0x10\n"
121 "zip1 v11.4s, v15.4s, v1.4s\n"
122 "zip2 v1.4s, v15.4s, v1.4s\n"
// Second-level zips + conversion: zip1 of two first-level results gives
// {r0,r1,r2,r3} for one column; bfcvtn narrows it into the low half of the
// destination and bfcvtn2 fills the high half with the following column.
// The .inst words are raw encodings; the intended mnemonic is in the trailing
// text of each string.
123 "zip1 v15.4s, v19.4s, v14.4s\n"
124 ".inst 0x0ea169ef // bfcvtn v15.4h, v15.4s\n"
125 "zip2 v14.4s, v19.4s, v14.4s\n"
126 "ldr q19, [x20], #0x10\n"
127 ".inst 0x4ea169cf // bfcvtn2 v15.8h, v14.4s\n"
128 "str q15, [x21, #0x0]\n"
129 "zip1 v14.4s, v12.4s, v19.4s\n"
130 "zip2 v15.4s, v12.4s, v19.4s\n"
131 "zip1 v12.4s, v8.4s, v28.4s\n"
132 "zip1 v19.4s, v18.4s, v27.4s\n"
133 ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n"
134 "zip2 v28.4s, v8.4s, v28.4s\n"
135 "zip1 v8.4s, v9.4s, v10.4s\n"
136 ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
137 "zip2 v18.4s, v18.4s, v27.4s\n"
138 "zip1 v27.4s, v6.4s, v4.4s\n"
139 ".inst 0x0ea16908 // bfcvtn v8.4h, v8.4s\n"
140 "zip2 v10.4s, v9.4s, v10.4s\n"
141 "zip1 v9.4s, v22.4s, v26.4s\n"
142 ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
143 "zip2 v6.4s, v6.4s, v4.4s\n"
144 "zip1 v4.4s, v0.4s, v13.4s\n"
145 ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
146 "zip2 v22.4s, v22.4s, v26.4s\n"
147 "zip1 v26.4s, v21.4s, v29.4s\n"
148 ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
149 "zip2 v13.4s, v0.4s, v13.4s\n"
150 "zip1 v0.4s, v16.4s, v5.4s\n"
151 ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n"
152 "zip2 v21.4s, v21.4s, v29.4s\n"
153 "zip1 v29.4s, v3.4s, v2.4s\n"
154 ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
155 "zip2 v5.4s, v16.4s, v5.4s\n"
156 "zip1 v16.4s, v25.4s, v7.4s\n"
157 ".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n"
158 "zip2 v2.4s, v3.4s, v2.4s\n"
159 "zip1 v3.4s, v30.4s, v20.4s\n"
160 ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
161 "zip2 v7.4s, v25.4s, v7.4s\n"
162 "zip1 v25.4s, v17.4s, v31.4s\n"
163 ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
164 "zip2 v30.4s, v30.4s, v20.4s\n"
165 "zip1 v20.4s, v23.4s, v24.4s\n"
166 ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
167 "zip2 v17.4s, v17.4s, v31.4s\n"
168 "zip1 v31.4s, v11.4s, v14.4s\n"
169 ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
170 "zip2 v24.4s, v23.4s, v24.4s\n"
171 "zip1 v23.4s, v1.4s, v15.4s\n"
172 ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n"
173 "zip2 v14.4s, v11.4s, v14.4s\n"
174 ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
175 "zip2 v1.4s, v1.4s, v15.4s\n"
176 ".inst 0x4ea16b8c // bfcvtn2 v12.8h, v28.4s\n"
177 "str q12, [x21, #0x10]\n"
178 ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
179 ".inst 0x4ea16948 // bfcvtn2 v8.8h, v10.4s\n"
180 "str q19, [x21, #0x20]\n"
181 ".inst 0x4ea168db // bfcvtn2 v27.8h, v6.4s\n"
182 ".inst 0x4ea16ac9 // bfcvtn2 v9.8h, v22.4s\n"
183 "str q8, [x21, #0x30]\n"
184 ".inst 0x4ea169a4 // bfcvtn2 v4.8h, v13.4s\n"
185 ".inst 0x4ea16aba // bfcvtn2 v26.8h, v21.4s\n"
186 "str q27, [x21, #0x40]\n"
187 ".inst 0x4ea168a0 // bfcvtn2 v0.8h, v5.4s\n"
188 ".inst 0x4ea1685d // bfcvtn2 v29.8h, v2.4s\n"
189 "str q9, [x21, #0x50]\n"
190 ".inst 0x4ea168f0 // bfcvtn2 v16.8h, v7.4s\n"
191 ".inst 0x4ea16bc3 // bfcvtn2 v3.8h, v30.4s\n"
192 "str q4, [x21, #0x60]\n"
193 ".inst 0x4ea16a39 // bfcvtn2 v25.8h, v17.4s\n"
194 ".inst 0x4ea16b14 // bfcvtn2 v20.8h, v24.4s\n"
195 "str q26, [x21, #0x70]\n"
196 ".inst 0x4ea169df // bfcvtn2 v31.8h, v14.4s\n"
197 ".inst 0x4ea16837 // bfcvtn2 v23.8h, v1.4s\n"
// Offsets 0x00-0x70 above hold the first four rows (x9/x28/x27/x26);
// offsets 0x80-0xf0 below hold the second four (x25/x23/x22/x20).
198 "str q0, [x21, #0x80]\n"
199 "str q29, [x21, #0x90]\n"
200 "str q16, [x21, #0xa0]\n"
201 "str q3, [x21, #0xb0]\n"
202 "str q25, [x21, #0xc0]\n"
203 "str q20, [x21, #0xd0]\n"
204 "str q31, [x21, #0xe0]\n"
205 "str q23, [x21, #0xf0]\n"
// Move the output cursor on by out_stride for the next 16-column chunk.
206 "add x21, x21, %x[out_stride]\n"
// -- 4-column body (8 rows): one 16-byte load per row, counter -= 4. --
212 "ldr q23, [x9], #0x10\n"
213 "ldr q20, [x28], #0x10\n"
214 "sub x24, x24, #0x4\n"
216 "ldr q17, [x27], #0x10\n"
217 "ldr q16, [x26], #0x10\n"
218 "zip1 v22.4s, v23.4s, v17.4s\n"
219 "zip1 v21.4s, v20.4s, v16.4s\n"
220 "ldr q19, [x25], #0x10\n"
221 "ldr q18, [x23], #0x10\n"
222 "zip2 v28.4s, v23.4s, v17.4s\n"
223 "zip2 v20.4s, v20.4s, v16.4s\n"
224 "ldr q17, [x22], #0x10\n"
225 "ldr q16, [x20], #0x10\n"
226 "zip1 v27.4s, v19.4s, v17.4s\n"
227 "zip1 v26.4s, v18.4s, v16.4s\n"
228 "zip2 v25.4s, v19.4s, v17.4s\n"
229 "zip2 v24.4s, v18.4s, v16.4s\n"
230 "zip1 v19.4s, v22.4s, v21.4s\n"
231 "zip1 v18.4s, v28.4s, v20.4s\n"
232 "zip1 v17.4s, v27.4s, v26.4s\n"
233 "zip1 v16.4s, v25.4s, v24.4s\n"
234 ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
235 "zip2 v22.4s, v22.4s, v21.4s\n"
236 ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
237 "zip2 v20.4s, v28.4s, v20.4s\n"
238 ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
239 "zip2 v18.4s, v27.4s, v26.4s\n"
240 ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
241 "zip2 v16.4s, v25.4s, v24.4s\n"
242 ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
243 ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
244 "str q23, [x21, #0x0]\n"
245 ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
246 ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
247 "str q21, [x21, #0x10]\n"
// Second four rows land 0x80 bytes after the first four.
248 "str q19, [x21, #0x80]\n"
249 "str q17, [x21, #0x90]\n"
250 "add x21, x21, #0x20\n"
// -- 1-column body (8 rows): one float per row, counter -= 1. --
256 "ldr s19, [x9], #0x4\n"
257 "ldr s18, [x28], #0x4\n"
258 "sub x24, x24, #0x1\n"
260 "ldr s17, [x27], #0x4\n"
261 "ldr s16, [x26], #0x4\n"
262 "zip1 v17.4s, v19.4s, v17.4s\n"
263 "zip1 v16.4s, v18.4s, v16.4s\n"
264 "ldr s20, [x25], #0x4\n"
265 "ldr s19, [x23], #0x4\n"
266 "zip1 v16.4s, v17.4s, v16.4s\n"
267 ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
268 "ldr s17, [x22], #0x4\n"
269 "ldr s16, [x20], #0x4\n"
270 "zip1 v17.4s, v20.4s, v17.4s\n"
271 "zip1 v16.4s, v19.4s, v16.4s\n"
272 "zip1 v16.4s, v17.4s, v16.4s\n"
273 ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
// 8-byte stores: four bf16 (one column of four rows) per half.
274 "str d18, [x21, #0x0]\n"
275 "str d16, [x21, #0x80]\n"
276 "add x21, x21, #0x8\n"
// End of an 8-row pass: re-test height and step `out` by 0x100 bytes
// (16 columns * 8 rows * 2 bytes). The loop-back branch is not visible here.
279 "cmp %x[height], #0x8\n"
280 "add %x[out], %x[out], #0x100\n"
// Nothing left -> jump to label 16 (presumably the exit; label not visible).
282 "cbz %x[height], 16f\n"
// ---- Tail section: 4 rows per pass, padded with pad_row ----
// Row pointers x9, x28, x27, x26; x20 is the column counter here.
286 "add x28, x9, %x[in_stride]\n"
287 "add x27, x28, %x[in_stride]\n"
288 "mov x20, %x[width]\n"
289 "add x26, x27, %x[in_stride]\n"
// Rows beyond the remaining height read from pad_row instead:
//   4th row kept only if height > 3, 3rd if height >= 3, 2nd if height > 1.
290 "cmp %x[height], #0x3\n"
291 "add %x[in], x26, %x[in_stride]\n"
292 "csel x26, x26, %x[pad_row], GT\n"
293 "csel x27, x27, %x[pad_row], GE\n"
294 "cmp %x[height], #0x1\n"
295 "csel x28, x28, %x[pad_row], GT\n"
298 "sub %x[height], %x[height], #0x4\n"
// -- 16-column unrolled body (4 rows): four 16-byte loads per row. --
301 "ldr q21, [x9], #0x10\n"
302 "ldr q20, [x28], #0x10\n"
303 "sub x20, x20, #0x10\n"
305 "ldr q17, [x27], #0x10\n"
306 "ldr q16, [x26], #0x10\n"
307 "zip1 v30.4s, v21.4s, v17.4s\n"
308 "zip1 v29.4s, v20.4s, v16.4s\n"
309 "ldr q19, [x9], #0x10\n"
310 "ldr q18, [x28], #0x10\n"
311 "zip2 v28.4s, v21.4s, v17.4s\n"
312 "zip2 v27.4s, v20.4s, v16.4s\n"
313 "ldr q17, [x27], #0x10\n"
314 "ldr q16, [x26], #0x10\n"
315 "zip1 v26.4s, v19.4s, v17.4s\n"
316 "zip1 v25.4s, v18.4s, v16.4s\n"
317 "ldr q21, [x9], #0x10\n"
318 "ldr q20, [x28], #0x10\n"
319 "zip2 v8.4s, v19.4s, v17.4s\n"
320 "zip2 v24.4s, v18.4s, v16.4s\n"
321 "ldr q17, [x27], #0x10\n"
322 "ldr q16, [x26], #0x10\n"
323 "zip1 v7.4s, v21.4s, v17.4s\n"
324 "zip1 v6.4s, v20.4s, v16.4s\n"
325 "ldr q19, [x9], #0x10\n"
326 "ldr q18, [x28], #0x10\n"
327 "zip2 v5.4s, v21.4s, v17.4s\n"
328 "zip2 v4.4s, v20.4s, v16.4s\n"
329 "ldr q17, [x27], #0x10\n"
330 "ldr q16, [x26], #0x10\n"
331 "zip1 v3.4s, v19.4s, v17.4s\n"
332 "zip1 v2.4s, v18.4s, v16.4s\n"
333 "zip2 v1.4s, v19.4s, v17.4s\n"
334 "zip2 v0.4s, v18.4s, v16.4s\n"
335 "zip1 v23.4s, v30.4s, v29.4s\n"
336 "zip1 v22.4s, v28.4s, v27.4s\n"
337 "zip1 v21.4s, v26.4s, v25.4s\n"
338 "zip1 v20.4s, v8.4s, v24.4s\n"
339 "zip1 v19.4s, v7.4s, v6.4s\n"
340 "zip1 v18.4s, v5.4s, v4.4s\n"
341 "zip1 v17.4s, v3.4s, v2.4s\n"
342 "zip1 v16.4s, v1.4s, v0.4s\n"
343 ".inst 0x0ea16aff // bfcvtn v31.4h, v23.4s\n"
344 "zip2 v30.4s, v30.4s, v29.4s\n"
345 ".inst 0x0ea16add // bfcvtn v29.4h, v22.4s\n"
346 "zip2 v28.4s, v28.4s, v27.4s\n"
347 ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n"
348 "zip2 v26.4s, v26.4s, v25.4s\n"
349 ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n"
350 "zip2 v24.4s, v8.4s, v24.4s\n"
351 ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
352 "zip2 v22.4s, v7.4s, v6.4s\n"
353 ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
354 "zip2 v20.4s, v5.4s, v4.4s\n"
355 ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
356 "zip2 v18.4s, v3.4s, v2.4s\n"
357 ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
358 "zip2 v16.4s, v1.4s, v0.4s\n"
359 ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n"
360 ".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n"
361 "str q31, [x21, #0x0]\n"
362 ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
363 ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
364 "str q29, [x21, #0x10]\n"
365 ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
366 ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
367 "str q27, [x21, #0x20]\n"
368 ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
369 ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
370 "str q25, [x21, #0x30]\n"
371 "str q23, [x21, #0x40]\n"
372 "str q21, [x21, #0x50]\n"
373 "str q19, [x21, #0x60]\n"
374 "str q17, [x21, #0x70]\n"
375 "add x21, x21, %x[out_stride]\n"
// -- 4-column body (4 rows). --
381 "ldr q20, [x9], #0x10\n"
382 "ldr q19, [x28], #0x10\n"
383 "sub x20, x20, #0x4\n"
385 "ldr q17, [x27], #0x10\n"
386 "ldr q16, [x26], #0x10\n"
387 "zip1 v22.4s, v20.4s, v17.4s\n"
388 "zip1 v18.4s, v19.4s, v16.4s\n"
389 "zip2 v21.4s, v20.4s, v17.4s\n"
390 "zip2 v20.4s, v19.4s, v16.4s\n"
391 "zip1 v17.4s, v22.4s, v18.4s\n"
392 "zip1 v16.4s, v21.4s, v20.4s\n"
393 ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
394 "zip2 v18.4s, v22.4s, v18.4s\n"
395 ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
396 "zip2 v16.4s, v21.4s, v20.4s\n"
397 ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
398 ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
399 "str q19, [x21, #0x0]\n"
400 "str q17, [x21, #0x10]\n"
401 "add x21, x21, #0x20\n"
// -- 1-column body (4 rows). --
407 "ldr s19, [x9], #0x4\n"
408 "ldr s18, [x28], #0x4\n"
409 "sub x20, x20, #0x1\n"
411 "ldr s17, [x27], #0x4\n"
412 "ldr s16, [x26], #0x4\n"
413 "zip1 v17.4s, v19.4s, v17.4s\n"
414 "zip1 v16.4s, v18.4s, v16.4s\n"
415 "zip1 v16.4s, v17.4s, v16.4s\n"
416 ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
417 "str d16, [x21, #0x0]\n"
418 "add x21, x21, #0x8\n"
// End of a 4-row pass: re-test height and step `out` by 0x80 bytes
// (16 columns * 4 rows * 2 bytes). The loop-back branch is not visible here.
421 "cmp %x[height], #0x1\n"
422 "add %x[out], %x[out], #0x80\n"
// Read-write operands, marked early-clobber: the asm modifies them in place.
425 : [height]
"+&r" (height), [in]
"+&r" (in), [out]
"+&r" (out)
// Read-only inputs.
426 : [in_stride]
"r" (in_stride), [out_stride]
"r" (out_stride), [pad_row]
"r" (pad_row), [width]
"r" (width)
// Clobbers: condition flags, memory, every vector register and the scratch
// general-purpose registers used above.
427 :
"cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"x9",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28"
// Transform specialisation that forwards to
// a64_transpose_interleave_16_2x4_fp32bf16.
//   out       - destination bfloat16 buffer.
//   in        - source float matrix.
//   stride    - row pitch of `in` in float elements (it is multiplied by
//               sizeof(float) before being handed to the kernel).
//   x0 / k0   - starting column / row; used to offset the input pointer.
//   xmax/kmax - presumably the exclusive end column / row; their use is on
//               lines not visible in this view — TODO confirm.
// NOTE(review): some of the call's arguments and the closing lines of this
// definition are missing from this view.
433 void Transform<16, 4, true, VLType::None>(
434 bfloat16 *out,
const float *in,
int stride,
int x0,
int xmax,
int k0,
int kmax)
436 a64_transpose_interleave_16_2x4_fp32bf16(
// Input starts k0 rows down and x0 columns across.
438 in + k0 * stride + x0,
// Element stride converted to a byte stride.
440 stride *
sizeof(
float),
446 #endif // defined(__aarch64__)