28 #include "../../asmlib.hpp"
// a64_gemm_u8_4x4: AArch64 NEON inline-assembly kernel that multiplies
// unsigned 8-bit operands and accumulates into unsigned 32-bit results.
//
// Parameters (as used by the visible code):
//   Apanel  - start of the A operand data; read through a_ptr, 64 bytes
//             (four 16-byte vectors) per step.
//   Bpanel  - start of the B operand data; b_ptr is reset to it at the top of
//             every yb iteration and is read 64 bytes per step.
//   Cpanel  - output; 64 bytes (sixteen uint32_t values) are stored through
//             c_ptr per execution of the assembly, then c_ptr advances by 64.
//   ablocks - trip count of the outer (yb) loop.
//   bblocks - trip count of the inner (xb) loop.
//   K       - not referenced in the visible lines; presumably the source of
//             the %w[k] counter used by the assembly -- TODO confirm.
//
// NOTE(review): this view of the function looks incomplete. The statement that
// opens the asm block, any labels/branches, the zeroing of the accumulators
// v16-v31, the [k] operand definition and the closing braces are not visible
// here, so comments below describe only what the visible instructions do.
32 void a64_gemm_u8_4x4(
const uint8_t *Apanel,
const uint8_t *Bpanel, uint32_t *Cpanel,
int ablocks,
int bblocks,
int K) {
33 const uint8_t *a_ptr = Apanel;
34 uint32_t *c_ptr = Cpanel;
37 for (
int yb=0; yb<ablocks; yb++) {
// Remember where this yb iteration's A data starts; the use of a_ptr0 is not
// visible in this view (presumably a_ptr is rewound to it per xb -- verify).
38 const uint8_t *a_ptr0 = a_ptr;
// B is re-read from the beginning for every yb iteration.
39 const uint8_t *b_ptr = Bpanel;
41 for (
int xb=0; xb<bblocks; xb++) {
// The four B vectors are pinned to v4-v7 so they can be passed to the
// assembly as named "+w" operands.
46 register uint8x16_t b0
asm(
"v4");
47 register uint8x16_t b1
asm(
"v5");
48 register uint8x16_t b2
asm(
"v6");
49 register uint8x16_t b3
asm(
"v7");
// Initial loads: four 16-byte A vectors into v0-v3 and four 16-byte B
// vectors into b0-b3.
53 "ldr q0, [%[a_ptr]]\n"
55 "ldr %q[b0], [%[b_ptr]]\n"
57 "ldr %q[b1], [%[b_ptr], #16]\n"
59 "ldr %q[b2], [%[b_ptr], #32]\n"
61 "ldr %q[b3], [%[b_ptr], #48]\n"
63 "ldr q1, [%[a_ptr], #16]\n"
65 "ldr q2, [%[a_ptr], #32]\n"
67 "ldr q3, [%[a_ptr], #48]\n"
// First set of products: low 8 bytes of v0 times low 8 bytes of b0-b3,
// widened to 16-bit lanes in the temporaries v12-v15. Both pointers are
// advanced by 64 bytes in between, so later loads fetch the next step's data.
85 "umull v12.8h, v0.8b, %[b0].8b\n"
86 "add %[a_ptr], %[a_ptr], #64\n"
87 "umull v13.8h, v0.8b, %[b1].8b\n"
88 "umull v14.8h, v0.8b, %[b2].8b\n"
89 "add %[b_ptr], %[b_ptr], #64\n"
90 "umull v15.8h, v0.8b, %[b3].8b\n"
// Pattern used throughout: uadalp folds adjacent 16-bit product pairs into
// the 32-bit accumulator lanes, and the temporary is immediately reused for
// the next umull/umull2 so that accumulation and multiplication interleave.
// Accumulators v16-v19 belong to A vector v0 (one per B vector b0-b3).
96 "uadalp v16.4s, v12.8h\n"
97 "umull2 v12.8h, v0.16b, %[b0].16b\n"
98 "uadalp v17.4s, v13.8h\n"
99 "umull2 v13.8h, v0.16b, %[b1].16b\n"
100 "uadalp v18.4s, v14.8h\n"
101 "umull2 v14.8h, v0.16b, %[b2].16b\n"
102 "uadalp v19.4s, v15.8h\n"
103 "umull2 v15.8h, v0.16b, %[b3].16b\n"
// v0 is fully consumed; reload it with the next step's first A vector
// (a_ptr was already advanced above).
104 "ldr q0, [%[a_ptr]]\n"
106 "uadalp v16.4s, v12.8h\n"
107 "umull v12.8h, v1.8b, %[b0].8b\n"
108 "uadalp v17.4s, v13.8h\n"
109 "umull v13.8h, v1.8b, %[b1].8b\n"
// Decrement the k counter and set the condition flags; the branch that
// consumes the flags is not visible in this view.
110 "subs %w[k], %w[k], #1\n"
111 "uadalp v18.4s, v14.8h\n"
112 "umull v14.8h, v1.8b, %[b2].8b\n"
113 "uadalp v19.4s, v15.8h\n"
114 "umull v15.8h, v1.8b, %[b3].8b\n"
// Accumulators v20-v23 belong to A vector v1.
116 "uadalp v20.4s, v12.8h\n"
117 "umull2 v12.8h, v1.16b, %[b0].16b\n"
118 "uadalp v21.4s, v13.8h\n"
119 "umull2 v13.8h, v1.16b, %[b1].16b\n"
121 "uadalp v22.4s, v14.8h\n"
122 "umull2 v14.8h, v1.16b, %[b2].16b\n"
123 "uadalp v23.4s, v15.8h\n"
124 "umull2 v15.8h, v1.16b, %[b3].16b\n"
// v1 is fully consumed; reload it for the next step.
125 "ldr q1, [%[a_ptr], #16]\n"
127 "uadalp v20.4s, v12.8h\n"
128 "umull v12.8h, v2.8b, %[b0].8b\n"
129 "uadalp v21.4s, v13.8h\n"
130 "umull v13.8h, v2.8b, %[b1].8b\n"
132 "uadalp v22.4s, v14.8h\n"
133 "umull v14.8h, v2.8b, %[b2].8b\n"
134 "uadalp v23.4s, v15.8h\n"
135 "umull v15.8h, v2.8b, %[b3].8b\n"
// Accumulators v24-v27 belong to A vector v2.
137 "uadalp v24.4s, v12.8h\n"
138 "umull2 v12.8h, v2.16b, %[b0].16b\n"
139 "uadalp v25.4s, v13.8h\n"
140 "umull2 v13.8h, v2.16b, %[b1].16b\n"
141 "uadalp v26.4s, v14.8h\n"
142 "umull2 v14.8h, v2.16b, %[b2].16b\n"
143 "uadalp v27.4s, v15.8h\n"
144 "umull2 v15.8h, v2.16b, %[b3].16b\n"
// v2 is fully consumed; reload it for the next step.
145 "ldr q2, [%[a_ptr], #32]\n"
147 "uadalp v24.4s, v12.8h\n"
148 "umull v12.8h, v3.8b, %[b0].8b\n"
149 "uadalp v25.4s, v13.8h\n"
150 "umull v13.8h, v3.8b, %[b1].8b\n"
151 "uadalp v26.4s, v14.8h\n"
152 "umull v14.8h, v3.8b, %[b2].8b\n"
153 "uadalp v27.4s, v15.8h\n"
154 "umull v15.8h, v3.8b, %[b3].8b\n"
// Accumulators v28-v31 belong to A vector v3. Each B vector is reloaded from
// b_ptr right after its last use in this step.
156 "uadalp v28.4s, v12.8h\n"
157 "umull2 v12.8h, v3.16b, %[b0].16b\n"
158 "ldr %q[b0], [%[b_ptr]]\n"
159 "uadalp v29.4s, v13.8h\n"
160 "umull2 v13.8h, v3.16b, %[b1].16b\n"
161 "ldr %q[b1], [%[b_ptr], #16]\n"
162 "uadalp v30.4s, v14.8h\n"
163 "umull2 v14.8h, v3.16b, %[b2].16b\n"
164 "ldr %q[b2], [%[b_ptr], #32]\n"
165 "uadalp v31.4s, v15.8h\n"
166 "umull2 v15.8h, v3.16b, %[b3].16b\n"
167 "ldr %q[b3], [%[b_ptr], #48]\n"
// Finish the v3 accumulation while already issuing the first products of the
// next step (reloaded v0 times reloaded b0-b3), reloading v3 and advancing
// both pointers by another 64 bytes.
169 "uadalp v28.4s, v12.8h\n"
170 "umull v12.8h, v0.8b, %[b0].8b\n"
171 "add %[b_ptr], %[b_ptr], #64\n"
172 "uadalp v29.4s, v13.8h\n"
173 "umull v13.8h, v0.8b, %[b1].8b\n"
174 "ldr q3, [%[a_ptr], #48]\n"
175 "uadalp v30.4s, v14.8h\n"
176 "umull v14.8h, v0.8b, %[b2].8b\n"
177 "add %[a_ptr], %[a_ptr], #64\n"
178 "uadalp v31.4s, v15.8h\n"
179 "umull v15.8h, v0.8b, %[b3].8b\n"
// From here on the same multiply/accumulate sequence repeats once more but
// with no loads and no pointer updates. NOTE(review): presumably this is the
// final step executed after the loop exits -- the loop label and branch are
// not visible in this view, confirm against the full file.
184 "uadalp v16.4s, v12.8h\n"
185 "umull2 v12.8h, v0.16b, %[b0].16b\n"
186 "uadalp v17.4s, v13.8h\n"
187 "umull2 v13.8h, v0.16b, %[b1].16b\n"
188 "uadalp v18.4s, v14.8h\n"
189 "umull2 v14.8h, v0.16b, %[b2].16b\n"
190 "uadalp v19.4s, v15.8h\n"
191 "umull2 v15.8h, v0.16b, %[b3].16b\n"
193 "uadalp v16.4s, v12.8h\n"
194 "umull v12.8h, v1.8b, %[b0].8b\n"
195 "uadalp v17.4s, v13.8h\n"
196 "umull v13.8h, v1.8b, %[b1].8b\n"
197 "uadalp v18.4s, v14.8h\n"
198 "umull v14.8h, v1.8b, %[b2].8b\n"
199 "uadalp v19.4s, v15.8h\n"
200 "umull v15.8h, v1.8b, %[b3].8b\n"
202 "uadalp v20.4s, v12.8h\n"
203 "umull2 v12.8h, v1.16b, %[b0].16b\n"
204 "uadalp v21.4s, v13.8h\n"
205 "umull2 v13.8h, v1.16b, %[b1].16b\n"
206 "uadalp v22.4s, v14.8h\n"
207 "umull2 v14.8h, v1.16b, %[b2].16b\n"
208 "uadalp v23.4s, v15.8h\n"
209 "umull2 v15.8h, v1.16b, %[b3].16b\n"
211 "uadalp v20.4s, v12.8h\n"
212 "umull v12.8h, v2.8b, %[b0].8b\n"
213 "uadalp v21.4s, v13.8h\n"
214 "umull v13.8h, v2.8b, %[b1].8b\n"
215 "uadalp v22.4s, v14.8h\n"
216 "umull v14.8h, v2.8b, %[b2].8b\n"
217 "uadalp v23.4s, v15.8h\n"
218 "umull v15.8h, v2.8b, %[b3].8b\n"
220 "uadalp v24.4s, v12.8h\n"
221 "umull2 v12.8h, v2.16b, %[b0].16b\n"
222 "uadalp v25.4s, v13.8h\n"
223 "umull2 v13.8h, v2.16b, %[b1].16b\n"
224 "uadalp v26.4s, v14.8h\n"
225 "umull2 v14.8h, v2.16b, %[b2].16b\n"
226 "uadalp v27.4s, v15.8h\n"
227 "umull2 v15.8h, v2.16b, %[b3].16b\n"
229 "uadalp v24.4s, v12.8h\n"
230 "umull v12.8h, v3.8b, %[b0].8b\n"
231 "uadalp v25.4s, v13.8h\n"
232 "umull v13.8h, v3.8b, %[b1].8b\n"
233 "uadalp v26.4s, v14.8h\n"
234 "umull v14.8h, v3.8b, %[b2].8b\n"
235 "uadalp v27.4s, v15.8h\n"
236 "umull v15.8h, v3.8b, %[b3].8b\n"
238 "uadalp v28.4s, v12.8h\n"
239 "umull2 v12.8h, v3.16b, %[b0].16b\n"
240 "uadalp v29.4s, v13.8h\n"
241 "umull2 v13.8h, v3.16b, %[b1].16b\n"
242 "uadalp v30.4s, v14.8h\n"
243 "umull2 v14.8h, v3.16b, %[b2].16b\n"
244 "uadalp v31.4s, v15.8h\n"
245 "umull2 v15.8h, v3.16b, %[b3].16b\n"
// Drain the last products still held in the temporaries.
247 "uadalp v28.4s, v12.8h\n"
248 "uadalp v29.4s, v13.8h\n"
249 "uadalp v30.4s, v14.8h\n"
250 "uadalp v31.4s, v15.8h\n"
// Horizontal reduction, round 1: pairwise-add neighbouring accumulators,
// halving sixteen 4-lane vectors to eight (v16-v23).
252 "addp v16.4s, v16.4s, v17.4s\n"
253 "addp v17.4s, v18.4s, v19.4s\n"
254 "addp v18.4s, v20.4s, v21.4s\n"
255 "addp v19.4s, v22.4s, v23.4s\n"
256 "addp v20.4s, v24.4s, v25.4s\n"
257 "addp v21.4s, v26.4s, v27.4s\n"
258 "addp v22.4s, v28.4s, v29.4s\n"
259 "addp v23.4s, v30.4s, v31.4s\n"
// Round 2: after this each lane of v16-v19 is the full lane-sum of one
// original accumulator; v16 holds the four sums for A vector v0 (against
// b0..b3), v17 those for v1, v18 for v2 and v19 for v3.
261 "addp v16.4s, v16.4s, v17.4s\n"
262 "addp v17.4s, v18.4s, v19.4s\n"
263 "addp v18.4s, v20.4s, v21.4s\n"
264 "addp v19.4s, v22.4s, v23.4s\n"
// Store the sixteen 32-bit results contiguously and step c_ptr past them.
266 "str q16, [%[c_ptr]]\n"
267 "str q17, [%[c_ptr], #16]\n"
268 "str q18, [%[c_ptr], #32]\n"
269 "str q19, [%[c_ptr], #48]\n"
270 "add %[c_ptr], %[c_ptr], #64\n"
// Operands: the three pointers are read-write general registers ("+r") and
// the four B vectors are read-write SIMD registers ("+w").
273 [a_ptr]
"+r" (a_ptr), [b_ptr]
"+r" (b_ptr), [c_ptr]
"+r" (c_ptr),
274 [b0]
"+w" (b0), [b1]
"+w" (b1), [b2]
"+w" (b2), [b3]
"+w" (b3),
// NOTE(review): the [k] operand referenced by "subs %w[k]" is not visible in
// this view; it presumably sits between the operands above and the clobbers.
// Clobbers: the A vectors v0-v3, the temporaries v12-v15, the accumulators
// v16-v31 and the condition flags. x20/x21 are listed as clobbered although no
// visible instruction references them.
277 :
"x20",
"x21",
"v0",
"v1",
"v2",
"v3",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
278 "v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc");
285 #endif // __aarch64__