27 #if defined(__aarch64__)
// Reads fp32 values from `in`, converts them to bfloat16 (bfcvtn/bfcvtn2) and
// writes them to `out` in a transposed, interleaved layout.
//
// Layout produced by the visible stores: for each group of 4 input columns,
// one output block is written at the output cursor (x27); inside the block
// each 16-byte vector holds two columns x four rows (column c rows r..r+3
// followed by column c+1 rows r..r+3). The cursor then moves on by
// `out_stride` bytes to the block for the next 4 columns.
//
// Parameters:
//   out        destination buffer; advanced by 0x40 bytes after an 8-row
//              group and by 0x20 bytes after a 4-row group.
//   in         source buffer; advanced past each processed group of rows.
//   width      number of fp32 elements per row (column counter is consumed
//              in steps of 8, 4 and 1).
//   in_stride  distance between consecutive input rows, in bytes (it is
//              added directly to the row address registers).
//   height     number of rows; consumed in steps of 8, then 4.
//
// NOTE(review): apart from one `cbz`, no labels or branch instructions appear
// among the asm lines below, and the initialisation of x9 (row-0 pointer) and
// x27 (output cursor) is not visible. The loop structure described in the
// comments is inferred from the counter updates -- confirm against the full
// listing.
31 void a64_transpose_interleave_4_2x4_fp32bf16(
bfloat16 *out,
const float *in,
size_t width,
size_t in_stride,
size_t height)
// Stack buffer (alloca) of `width` floats. The 4-row tail below substitutes
// it for row pointers that lie beyond the remaining height.
33 float *pad_row =
reinterpret_cast<float *
>(alloca(width *
sizeof(
float)));
// Zero-fill the pad row so substituted rows contribute zeros.
36 memset(pad_row, 0, width *
sizeof(
float));
// Byte distance between consecutive 4-column output blocks:
// 4 columns * roundup(height, 4) rows * sizeof(bfloat16).
// presumably roundup<size_t>(height, 4) rounds height up to a multiple of 4 -- TODO confirm
39 size_t out_stride = 4 * roundup<size_t>(height, 4) *
sizeof(
bfloat16);
// ---- 8-row path ----
// Compare height with 8 (decides whether the 8-row path is taken).
42 "cmp %x[height], #0x8\n"
// x28 = column counter for this group of rows.
46 "mov x28, %x[width]\n"
// Consume 8 rows.
48 "sub %x[height], %x[height], #0x8\n"
// Row pointers: x9 = row 0, x26..x20 = rows 1..7, each in_stride bytes after
// the previous one; `in` is moved to the row after this group.
49 "add x26, x9, %x[in_stride]\n"
50 "add x25, x26, %x[in_stride]\n"
51 "add x24, x25, %x[in_stride]\n"
53 "add x23, x24, %x[in_stride]\n"
54 "add x22, x23, %x[in_stride]\n"
55 "add x21, x22, %x[in_stride]\n"
56 "add x20, x21, %x[in_stride]\n"
57 "add %x[in], x20, %x[in_stride]\n"
// 8 rows x 8 columns: load columns 0-3 of every row (post-increment by 16
// bytes) and take 8 off the column counter.
60 "ldr q19, [x9], #0x10\n"
61 "ldr q18, [x26], #0x10\n"
62 "sub x28, x28, #0x8\n"
63 "ldr q17, [x25], #0x10\n"
64 "ldr q16, [x24], #0x10\n"
66 "ldr q1, [x23], #0x10\n"
67 "ldr q0, [x22], #0x10\n"
68 "ldr q31, [x21], #0x10\n"
69 "ldr q24, [x20], #0x10\n"
// Load columns 4-7 of every row, interleaved with the first zip stage.
// First stage pairs rows (0,2), (1,3), (4,6) and (5,7).
70 "ldr q23, [x9], #0x10\n"
71 "ldr q22, [x26], #0x10\n"
72 "zip1 v30.4s, v19.4s, v17.4s\n"
73 "zip1 v29.4s, v18.4s, v16.4s\n"
74 "ldr q21, [x25], #0x10\n"
75 "ldr q20, [x24], #0x10\n"
76 "zip2 v28.4s, v19.4s, v17.4s\n"
77 "zip2 v27.4s, v18.4s, v16.4s\n"
78 "ldr q19, [x23], #0x10\n"
79 "ldr q18, [x22], #0x10\n"
80 "zip1 v26.4s, v1.4s, v31.4s\n"
81 "zip1 v25.4s, v0.4s, v24.4s\n"
82 "ldr q17, [x21], #0x10\n"
83 "ldr q16, [x20], #0x10\n"
84 "zip2 v8.4s, v1.4s, v31.4s\n"
85 "zip2 v24.4s, v0.4s, v24.4s\n"
86 "zip1 v7.4s, v23.4s, v21.4s\n"
87 "zip1 v6.4s, v22.4s, v20.4s\n"
88 "zip2 v5.4s, v23.4s, v21.4s\n"
89 "zip2 v4.4s, v22.4s, v20.4s\n"
90 "zip1 v3.4s, v19.4s, v17.4s\n"
91 "zip1 v2.4s, v18.4s, v16.4s\n"
92 "zip2 v1.4s, v19.4s, v17.4s\n"
93 "zip2 v0.4s, v18.4s, v16.4s\n"
// Second zip stage (zip1 half): each result holds one even-numbered column
// across four consecutive rows (columns 0, 2, 4, 6 for rows 0-3 and 4-7).
94 "zip1 v23.4s, v30.4s, v29.4s\n"
95 "zip1 v22.4s, v28.4s, v27.4s\n"
96 "zip1 v21.4s, v26.4s, v25.4s\n"
97 "zip1 v20.4s, v8.4s, v24.4s\n"
98 "zip1 v19.4s, v7.4s, v6.4s\n"
99 "zip1 v18.4s, v5.4s, v4.4s\n"
100 "zip1 v17.4s, v3.4s, v2.4s\n"
101 "zip1 v16.4s, v1.4s, v0.4s\n"
// Narrow the even columns to bf16 into the low halves (bfcvtn) while the
// zip2 half of the second stage produces the odd columns. The BF16
// instructions are emitted as raw .inst words; the in-string comment gives
// the mnemonic.
102 ".inst 0x0ea16aff // bfcvtn v31.4h, v23.4s\n"
103 "zip2 v30.4s, v30.4s, v29.4s\n"
104 ".inst 0x0ea16add // bfcvtn v29.4h, v22.4s\n"
105 "zip2 v28.4s, v28.4s, v27.4s\n"
106 ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n"
107 "zip2 v26.4s, v26.4s, v25.4s\n"
108 ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n"
109 "zip2 v24.4s, v8.4s, v24.4s\n"
110 ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
111 "zip2 v22.4s, v7.4s, v6.4s\n"
112 ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
113 "zip2 v20.4s, v5.4s, v4.4s\n"
114 ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
115 "zip2 v18.4s, v3.4s, v2.4s\n"
116 ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
117 "zip2 v16.4s, v1.4s, v0.4s\n"
// Narrow the odd columns into the high halves (bfcvtn2), giving vectors of
// two columns x four rows, and store them. First block (columns 0-3):
// rows 0-3 at +0x0/+0x10, rows 4-7 at +0x20/+0x30.
118 ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n"
119 ".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n"
120 ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
121 ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
122 ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
123 ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
124 "str q31, [x27, #0x0]\n"
125 "str q29, [x27, #0x10]\n"
126 ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
127 ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
128 "str q27, [x27, #0x20]\n"
129 "str q25, [x27, #0x30]\n"
// Move to the next output block and store columns 4-7 in the same layout.
130 "add x27, x27, %x[out_stride]\n"
131 "str q23, [x27, #0x0]\n"
132 "str q21, [x27, #0x10]\n"
133 "str q19, [x27, #0x20]\n"
134 "str q17, [x27, #0x30]\n"
135 "add x27, x27, %x[out_stride]\n"
// 8 rows x 4 columns: same two-stage zip + narrow, one output block.
141 "ldr q25, [x9], #0x10\n"
142 "ldr q24, [x26], #0x10\n"
143 "sub x28, x28, #0x4\n"
144 "ldr q21, [x25], #0x10\n"
145 "ldr q20, [x24], #0x10\n"
147 "ldr q23, [x23], #0x10\n"
148 "ldr q19, [x22], #0x10\n"
149 "ldr q18, [x21], #0x10\n"
150 "ldr q17, [x20], #0x10\n"
151 "zip1 v22.4s, v25.4s, v21.4s\n"
152 "zip1 v16.4s, v24.4s, v20.4s\n"
153 "zip2 v21.4s, v25.4s, v21.4s\n"
154 "zip2 v20.4s, v24.4s, v20.4s\n"
155 "zip1 v27.4s, v23.4s, v18.4s\n"
156 "zip1 v26.4s, v19.4s, v17.4s\n"
157 "zip2 v25.4s, v23.4s, v18.4s\n"
158 "zip2 v24.4s, v19.4s, v17.4s\n"
159 "zip1 v19.4s, v22.4s, v16.4s\n"
160 "zip1 v18.4s, v21.4s, v20.4s\n"
161 "zip1 v17.4s, v27.4s, v26.4s\n"
162 "zip2 v23.4s, v22.4s, v16.4s\n"
163 "zip1 v16.4s, v25.4s, v24.4s\n"
164 "zip2 v22.4s, v21.4s, v20.4s\n"
165 ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
166 ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
167 ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
168 "zip2 v18.4s, v27.4s, v26.4s\n"
169 ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
170 "zip2 v16.4s, v25.4s, v24.4s\n"
171 ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
172 ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
173 ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
174 ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
175 "str q21, [x27, #0x0]\n"
176 "str q20, [x27, #0x10]\n"
177 "str q19, [x27, #0x20]\n"
178 "str q17, [x27, #0x30]\n"
179 "add x27, x27, %x[out_stride]\n"
// Leftover columns: zero the whole 64-byte output block first, because the
// single-column stores below only write 8 bytes at +0x0 and +0x20 each.
183 "movi v16.16b, #0x0\n"
184 "str q16, [x27, #0x0]\n"
185 "str q16, [x27, #0x10]\n"
186 "str q16, [x27, #0x20]\n"
187 "str q16, [x27, #0x30]\n"
// 8 rows x 1 column: load one float per row (post-increment by 4 bytes).
189 "ldr s23, [x9], #0x4\n"
190 "ldr s22, [x26], #0x4\n"
191 "sub x28, x28, #0x1\n"
192 "ldr s19, [x25], #0x4\n"
193 "ldr s17, [x24], #0x4\n"
195 "ldr s21, [x23], #0x4\n"
196 "ldr s20, [x22], #0x4\n"
197 "ldr s18, [x21], #0x4\n"
198 "ldr s16, [x20], #0x4\n"
// Gather the column into v17 = rows 0-3 and v16 = rows 4-7, narrow to bf16.
199 "zip1 v19.4s, v23.4s, v19.4s\n"
200 "zip1 v17.4s, v22.4s, v17.4s\n"
201 "zip1 v18.4s, v21.4s, v18.4s\n"
202 "zip1 v16.4s, v20.4s, v16.4s\n"
203 "zip1 v17.4s, v19.4s, v17.4s\n"
204 "zip1 v16.4s, v18.4s, v16.4s\n"
205 ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
206 ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
// Rows 0-3 go to +0x0, rows 4-7 to +0x20; step 8 bytes to the next column slot.
207 "str d17, [x27, #0x0]\n"
208 "str d16, [x27, #0x20]\n"
209 "add x27, x27, #0x8\n"
// End of an 8-row group: re-test the remaining height against 8 and move
// `out` past the 0x40 bytes (8 rows x 4 columns of bf16) this group occupies
// in each block.
212 "cmp %x[height], #0x8\n"
213 "add %x[out], %x[out], #0x40\n"
// Nothing left: jump to label 16.
215 "cbz %x[height], 16f\n"
// ---- 4-row tail path ----
// x20 is reused here as the column counter.
219 "mov x20, %x[width]\n"
220 "cmp %x[height], #0x3\n"
// Row pointers x26/x25/x24 = rows 1/2/3. Rows beyond the remaining height
// are redirected to pad_row: row 2 is kept only if height >= 3, row 3 only
// if height > 3, row 1 only if height > 1. The add/sub between a cmp and its
// csel do not modify the flags.
222 "add x26, x9, %x[in_stride]\n"
223 "add x25, x26, %x[in_stride]\n"
224 "add x24, x25, %x[in_stride]\n"
225 "csel x25, x25, %x[pad_row], GE\n"
226 "add %x[in], x24, %x[in_stride]\n"
227 "csel x24, x24, %x[pad_row], GT\n"
228 "cmp %x[height], #0x1\n"
229 "sub %x[height], %x[height], #0x4\n"
230 "csel x26, x26, %x[pad_row], GT\n"
// 4 rows x 8 columns: load columns 0-3 then 4-7 of each row.
234 "ldr q25, [x9], #0x10\n"
235 "ldr q24, [x26], #0x10\n"
236 "sub x20, x20, #0x8\n"
237 "ldr q21, [x25], #0x10\n"
238 "ldr q20, [x24], #0x10\n"
240 "ldr q23, [x9], #0x10\n"
241 "ldr q19, [x26], #0x10\n"
242 "ldr q18, [x25], #0x10\n"
243 "ldr q17, [x24], #0x10\n"
// Two-stage zip (rows (0,2) and (1,3), then the pair of results) followed by
// bf16 narrowing, as in the 8-row path.
244 "zip1 v22.4s, v25.4s, v21.4s\n"
245 "zip1 v16.4s, v24.4s, v20.4s\n"
246 "zip2 v21.4s, v25.4s, v21.4s\n"
247 "zip2 v20.4s, v24.4s, v20.4s\n"
248 "zip1 v27.4s, v23.4s, v18.4s\n"
249 "zip1 v26.4s, v19.4s, v17.4s\n"
250 "zip2 v25.4s, v23.4s, v18.4s\n"
251 "zip2 v24.4s, v19.4s, v17.4s\n"
252 "zip1 v19.4s, v22.4s, v16.4s\n"
253 "zip1 v18.4s, v21.4s, v20.4s\n"
254 "zip1 v17.4s, v27.4s, v26.4s\n"
255 "zip2 v23.4s, v22.4s, v16.4s\n"
256 "zip1 v16.4s, v25.4s, v24.4s\n"
257 "zip2 v22.4s, v21.4s, v20.4s\n"
258 ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
259 ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
260 ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
261 "zip2 v18.4s, v27.4s, v26.4s\n"
262 ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
263 "zip2 v16.4s, v25.4s, v24.4s\n"
264 ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
265 ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
266 ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
267 ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
// Columns 0-3 (32 bytes) go to the current block, columns 4-7 to the next.
268 "str q21, [x27, #0x0]\n"
269 "str q20, [x27, #0x10]\n"
270 "add x27, x27, %x[out_stride]\n"
271 "str q19, [x27, #0x0]\n"
272 "str q17, [x27, #0x10]\n"
273 "add x27, x27, %x[out_stride]\n"
// 4 rows x 4 columns.
279 "ldr q21, [x9], #0x10\n"
280 "ldr q20, [x26], #0x10\n"
281 "sub x20, x20, #0x4\n"
282 "ldr q19, [x25], #0x10\n"
283 "ldr q17, [x24], #0x10\n"
285 "zip1 v18.4s, v21.4s, v19.4s\n"
286 "zip1 v16.4s, v20.4s, v17.4s\n"
287 "zip2 v21.4s, v21.4s, v19.4s\n"
288 "zip2 v20.4s, v20.4s, v17.4s\n"
289 "zip1 v17.4s, v18.4s, v16.4s\n"
290 "zip2 v19.4s, v18.4s, v16.4s\n"
291 "zip1 v16.4s, v21.4s, v20.4s\n"
292 ".inst 0x0ea16a32 // bfcvtn v18.4h, v17.4s\n"
293 "zip2 v17.4s, v21.4s, v20.4s\n"
294 ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
295 ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
296 ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
297 "str q18, [x27, #0x0]\n"
298 "str q16, [x27, #0x10]\n"
299 "add x27, x27, %x[out_stride]\n"
// Leftover columns: zero the 32-byte block first; each single-column store
// below writes only 8 bytes.
303 "movi v16.16b, #0x0\n"
304 "str q16, [x27, #0x0]\n"
305 "str q16, [x27, #0x10]\n"
// 4 rows x 1 column: gather one float per row, narrow, store 4 bf16 values.
307 "ldr s19, [x9], #0x4\n"
308 "ldr s18, [x26], #0x4\n"
309 "sub x20, x20, #0x1\n"
310 "ldr s17, [x25], #0x4\n"
311 "ldr s16, [x24], #0x4\n"
313 "zip1 v17.4s, v19.4s, v17.4s\n"
314 "zip1 v16.4s, v18.4s, v16.4s\n"
315 "zip1 v16.4s, v17.4s, v16.4s\n"
316 ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
317 "str d16, [x27, #0x0]\n"
318 "add x27, x27, #0x8\n"
// End of a 4-row group: re-test the remaining height and move `out` past the
// 0x20 bytes (4 rows x 4 columns of bf16) this group occupies in each block.
321 "cmp %x[height], #0x1\n"
322 "add %x[out], %x[out], #0x20\n"
// Output operands: height, in and out are read-write ("+") and marked
// early-clobber ("&") because the asm updates them while inputs are live.
325 : [height]
"+&r" (height), [in]
"+&r" (in), [out]
"+&r" (out)
// Input operands: read-only register values.
326 : [in_stride]
"r" (in_stride), [out_stride]
"r" (out_stride), [pad_row]
"r" (pad_row), [width]
"r" (width)
// Clobbers: condition flags, memory, and every vector / general-purpose
// register named in the asm above.
327 :
"cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"x9",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28"
// Transform specialisation for <4, 4, true, VLType::None> with fp32 input and
// bfloat16 output; it forwards to a64_transpose_interleave_4_2x4_fp32bf16.
// NOTE(review): only some of the call's arguments are visible here; x0/xmax
// and k0/kmax presumably bound the column and row ranges -- confirm.
333 void Transform<4, 4, true, VLType::None>(
334 bfloat16 *out,
const float *in,
int stride,
int x0,
int xmax,
int k0,
int kmax)
336 a64_transpose_interleave_4_2x4_fp32bf16(
// Source pointer offset by k0 * stride + x0 elements (float pointer arithmetic).
338 in + k0 * stride + x0,
// `stride` counts floats; it is converted to a byte count for the kernel.
340 stride *
sizeof(
float),
346 #endif // defined(__aarch64__)