24.04
generic.cpp
Go to the documentation of this file.
1
/*
2
* Copyright (c) 2017-2021 Arm Limited.
3
*
4
* SPDX-License-Identifier: MIT
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to
8
* deal in the Software without restriction, including without limitation the
9
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10
* sell copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in all
14
* copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#ifdef __aarch64__
25
26
#include <arm_neon.h>
27
28
#include "../../asmlib.hpp"
29
30
// Kernel implementation.
31
//
32
// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
// Assume that "Bpanel" points to a chunk of B blocks (each size 6xK) in read-order.
// Assume that "Cpanel" points to a chunk of C output blocks (each size
// 8x6), the chunks being arranged in a row major fashion.
36
//
37
// Note that the intent of this is that either ablocks or bblocks will be 1
38
// - this construction allows the output loop to proceed in either order.
39
40
namespace arm_gemm
{

// Single-precision GEMM inner kernel for AArch64 Advanced SIMD (NEON).
//
// Multiplies packed panels of A by packed panels of B, accumulating into
// 8x6 blocks of C.  All arithmetic runs in hand-scheduled inline assembly
// on 64-bit (.2s, two-float) vector lanes: v8-v31 hold the 24 two-lane
// accumulators (8 rows x 6 columns), while v0-v6 (bound to a0-a3 / b0-b2
// below) stream in operands.  Per K step the asm consumes 8 A scalars
// (one ld1r each, broadcast across a lane pair) and 6 B values (three
// 64-bit pairs), and the main loop processes two K steps per iteration.
//
// Parameters:
//   Apanel  - packed A blocks, consumed sequentially via a_ptr.
//   Bpanel  - packed B blocks; b_ptr restarts at Bpanel for each A block.
//   Cpanel  - output; one 8x6 block (192 bytes) written per inner iteration,
//             blocks laid out consecutively (row major over blocks).
//   ablocks - number of A blocks (outer loop count).
//   bblocks - number of B blocks (inner loop count).  As the file comment
//             notes, the intent is that one of ablocks/bblocks is 1.
//   K       - depth of the multiply; odd K is handled by a separate tail.
void a64_sgemm_asimd_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
    const float *a_ptr = Apanel;  // advanced by the asm as A is consumed
    float       *c_ptr = Cpanel;  // advanced by 192 bytes per output block

    for (int yb=0; yb<ablocks; yb++) {
        const float *a_ptr0 = a_ptr;   // start of this A block; re-read for every B block
        const float *b_ptr  = Bpanel;  // B panel restarts for each A block

        for (int xb=0; xb<bblocks; xb++) {
            a_ptr = a_ptr0;
            // Fix up for odd lengths - set a flag if K is odd, but make
            // sure we round up the iteration count.
            int oddk = (K & 1);
            // Main-loop trip count: each iteration handles two K steps, and
            // the final (one- or two-step) iteration is detached below.
            int k = ((K+1)/2) - 1;

            // Operand variables pinned to specific vector registers so the
            // asm template can mix %[..] operands with the hard-coded
            // accumulator names v8-v31 without overlap.
            register float32x4_t a0 asm("v0");
            register float32x4_t a1 asm("v1");
            register float32x4_t a2 asm("v2");
            register float32x4_t a3 asm("v3");
            register float32x4_t b0 asm("v4");
            register float32x4_t b1 asm("v5");
            register float32x4_t b2 asm("v6");

            __asm __volatile (
                // Initialize result registers, load initial operands, prime prefetches.
                "movi v8.2s, #0x0\n"
                "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
                "movi v9.2s, #0x0\n"
                "movi v10.2s, #0x0\n"
                "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
                "movi v11.2s, #0x0\n"
                "movi v12.2s, #0x0\n"
                "movi v13.2s, #0x0\n"
                "movi v14.2s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #64]")
                ASM_PREFETCHU("[%[a_ptr], #52]")
                ASM_PREFETCHU("[%[a_ptr], #116]")
                ASM_PREFETCH("[%[b_ptr], #128]")
                "movi v15.2s, #0x0\n"
                "movi v16.2s, #0x0\n"
                "movi v17.2s, #0x0\n"
                "movi v18.2s, #0x0\n"
                "movi v19.2s, #0x0\n"
                "movi v20.2s, #0x0\n"
                "movi v21.2s, #0x0\n"
                "movi v22.2s, #0x0\n"
                "movi v23.2s, #0x0\n"
                "movi v24.2s, #0x0\n"
                "movi v25.2s, #0x0\n"
                "movi v26.2s, #0x0\n"
                "movi v27.2s, #0x0\n"
                "movi v28.2s, #0x0\n"
                "movi v29.2s, #0x0\n"
                "movi v30.2s, #0x0\n"
                "movi v31.2s, #0x0\n"

                // Skip loop if we are doing zero iterations of it.
                "cbz %w[k], 4f\n"

                // Loop proper.  Accumulator layout: v8-v15 pair with b0,
                // v16-v23 with b1, v24-v31 with b2; within each group the
                // eight registers are the eight A rows.  Each ld1r
                // broadcasts one A scalar; each fmla is one row x one
                // B-column-pair multiply-accumulate.  First K step:
                "1:\n"
                "ldr %d[b0], [%[b_ptr], #0]\n"
                "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
                "ldr %d[b1], [%[b_ptr], #8]\n"
                "fmla v8.2s , %[b0].2s, %[a0].2s\n"
                "fmla v9.2s , %[b0].2s, %[a1].2s\n"
                "fmla v10.2s, %[b0].2s, %[a2].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v16.2s, %[b1].2s, %[a0].2s\n"
                "fmla v17.2s, %[b1].2s, %[a1].2s\n"
                "fmla v11.2s, %[b0].2s, %[a3].2s\n"

                "ldr %d[b2], [%[b_ptr], #16]\n"
                "fmla v18.2s, %[b1].2s, %[a2].2s\n"
                "fmla v19.2s, %[b1].2s, %[a3].2s\n"
                "fmla v24.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
                "fmla v25.2s, %[b2].2s, %[a1].2s\n"
                "fmla v26.2s, %[b2].2s, %[a2].2s\n"
                "fmla v27.2s, %[b2].2s, %[a3].2s\n"

                "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
                "fmla v12.2s, %[b0].2s, %[a0].2s\n"
                "fmla v20.2s, %[b1].2s, %[a0].2s\n"
                "fmla v28.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
                "fmla v13.2s, %[b0].2s, %[a1].2s\n"
                "fmla v21.2s, %[b1].2s, %[a1].2s\n"
                "fmla v29.2s, %[b2].2s, %[a1].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v14.2s, %[b0].2s, %[a2].2s\n"
                "fmla v22.2s, %[b1].2s, %[a2].2s\n"
                "fmla v30.2s, %[b2].2s, %[a2].2s\n"

                "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
                "fmla v15.2s, %[b0].2s, %[a3].2s\n"
                "fmla v23.2s, %[b1].2s, %[a3].2s\n"
                "fmla v31.2s, %[b2].2s, %[a3].2s\n"

                // Second K step of the unrolled pair (B read at +24..+40).
                "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
                ASM_PREFETCH("[%[b_ptr], #128]")
                "subs %w[k], %w[k], #1\n"
                ASM_PREFETCHU("[%[a_ptr], #156]")
                "ldr %d[b0], [%[b_ptr], #24]\n"
                "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"

                "ldr %d[b1], [%[b_ptr], #32]\n"
                "fmla v8.2s , %[b0].2s, %[a0].2s\n"
                "fmla v9.2s , %[b0].2s, %[a1].2s\n"
                "fmla v10.2s, %[b0].2s, %[a2].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v16.2s, %[b1].2s, %[a0].2s\n"
                "fmla v17.2s, %[b1].2s, %[a1].2s\n"
                "fmla v11.2s, %[b0].2s, %[a3].2s\n"

                "ldr %d[b2], [%[b_ptr], #40]\n"
                "fmla v18.2s, %[b1].2s, %[a2].2s\n"
                "fmla v19.2s, %[b1].2s, %[a3].2s\n"
                "fmla v24.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
                "fmla v25.2s, %[b2].2s, %[a1].2s\n"
                "fmla v26.2s, %[b2].2s, %[a2].2s\n"
                "fmla v27.2s, %[b2].2s, %[a3].2s\n"

                "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
                "fmla v12.2s, %[b0].2s, %[a0].2s\n"
                "fmla v20.2s, %[b1].2s, %[a0].2s\n"
                "fmla v28.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
                "fmla v13.2s, %[b0].2s, %[a1].2s\n"
                "fmla v21.2s, %[b1].2s, %[a1].2s\n"
                "fmla v29.2s, %[b2].2s, %[a1].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v14.2s, %[b0].2s, %[a2].2s\n"
                "fmla v22.2s, %[b1].2s, %[a2].2s\n"
                "fmla v30.2s, %[b2].2s, %[a2].2s\n"

                "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
                "fmla v15.2s, %[b0].2s, %[a3].2s\n"
                "fmla v23.2s, %[b1].2s, %[a3].2s\n"
                "fmla v31.2s, %[b2].2s, %[a3].2s\n"

                "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
                "add %[b_ptr], %[b_ptr], #48\n"
                ASM_PREFETCHU("[%[a_ptr], #188]")
                "bne 1b\n"

                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
                "4:\n"
                ASM_PREFETCH("[%[c_ptr]]")
                ASM_PREFETCH("[%[c_ptr], #64]")

                "ldr %d[b0], [%[b_ptr]]\n"
                "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"

                // Branch to alternative tail for odd K
                "cbnz %w[oddk], 2f\n"

                // Detached final iteration (even K): two K steps, then store.
                "ldr %d[b1], [%[b_ptr], #8]\n"
                "fmla v8.2s , %[b0].2s, %[a0].2s\n"
                "fmla v9.2s , %[b0].2s, %[a1].2s\n"
                "fmla v10.2s, %[b0].2s, %[a2].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v16.2s, %[b1].2s, %[a0].2s\n"
                "fmla v17.2s, %[b1].2s, %[a1].2s\n"
                "fmla v11.2s, %[b0].2s, %[a3].2s\n"

                "ldr %d[b2], [%[b_ptr], #16]\n"
                "fmla v18.2s, %[b1].2s, %[a2].2s\n"
                "fmla v19.2s, %[b1].2s, %[a3].2s\n"
                "fmla v24.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
                "fmla v25.2s, %[b2].2s, %[a1].2s\n"
                "fmla v26.2s, %[b2].2s, %[a2].2s\n"
                "fmla v27.2s, %[b2].2s, %[a3].2s\n"

                "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
                "fmla v12.2s, %[b0].2s, %[a0].2s\n"
                "fmla v20.2s, %[b1].2s, %[a0].2s\n"
                "fmla v28.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
                "fmla v13.2s, %[b0].2s, %[a1].2s\n"
                "fmla v21.2s, %[b1].2s, %[a1].2s\n"
                "fmla v29.2s, %[b1].2s, %[a1].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v14.2s, %[b0].2s, %[a2].2s\n"
                "fmla v22.2s, %[b1].2s, %[a2].2s\n"
                "fmla v30.2s, %[b2].2s, %[a2].2s\n"

                "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
                "fmla v15.2s, %[b0].2s, %[a3].2s\n"
                "fmla v23.2s, %[b1].2s, %[a3].2s\n"
                "fmla v31.2s, %[b2].2s, %[a3].2s\n"

                // b_ptr is advanced before the second step, so B is read
                // back at negative offsets (-24/-16/-8) below.
                "ldr %d[b0], [%[b_ptr], #24]\n"
                "add %[b_ptr], %[b_ptr], #48\n"
                ASM_PREFETCH("[%[b_ptr], #128]")
                "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
                "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"

                "ldr %d[b1], [%[b_ptr], #-16]\n"
                "fmla v8.2s , %[b0].2s, %[a0].2s\n"
                "fmla v9.2s , %[b0].2s, %[a1].2s\n"
                "fmla v10.2s, %[b0].2s, %[a2].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v16.2s, %[b1].2s, %[a0].2s\n"
                "fmla v17.2s, %[b1].2s, %[a1].2s\n"
                "fmla v11.2s, %[b0].2s, %[a3].2s\n"

                "ldr %d[b2], [%[b_ptr], #-8]\n"
                "fmla v18.2s, %[b1].2s, %[a2].2s\n"
                "fmla v19.2s, %[b1].2s, %[a3].2s\n"
                "fmla v24.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
                "fmla v25.2s, %[b2].2s, %[a1].2s\n"
                "fmla v26.2s, %[b2].2s, %[a2].2s\n"
                "fmla v27.2s, %[b2].2s, %[a3].2s\n"

                "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
                "fmla v12.2s, %[b0].2s, %[a0].2s\n"
                "fmla v20.2s, %[b1].2s, %[a0].2s\n"
                "fmla v28.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
                "fmla v13.2s, %[b0].2s, %[a1].2s\n"
                "fmla v21.2s, %[b1].2s, %[a1].2s\n"
                "fmla v29.2s, %[b2].2s, %[a1].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v14.2s, %[b0].2s, %[a2].2s\n"
                "fmla v22.2s, %[b1].2s, %[a2].2s\n"
                "fmla v30.2s, %[b2].2s, %[a2].2s\n"

                "fmla v15.2s, %[b0].2s, %[a3].2s\n"
                "fmla v23.2s, %[b1].2s, %[a3].2s\n"
                "fmla v31.2s, %[b2].2s, %[a3].2s\n"

                "b 3f\n"

                // Detached final iteration (odd K): a single K step.
                "2:\n"
                "ldr %d[b1], [%[b_ptr], #8]\n"
                "fmla v8.2s , %[b0].2s, %[a0].2s\n"
                "fmla v9.2s , %[b0].2s, %[a1].2s\n"
                "fmla v10.2s, %[b0].2s, %[a2].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v16.2s, %[b1].2s, %[a0].2s\n"
                "fmla v17.2s, %[b1].2s, %[a1].2s\n"
                "fmla v11.2s, %[b0].2s, %[a3].2s\n"

                "ldr %d[b2], [%[b_ptr], #16]\n"
                "fmla v18.2s, %[b1].2s, %[a2].2s\n"
                "fmla v19.2s, %[b1].2s, %[a3].2s\n"
                "fmla v24.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a0].2s }, [%[a_ptr]], #4\n"
                "fmla v25.2s, %[b2].2s, %[a1].2s\n"
                "fmla v26.2s, %[b2].2s, %[a2].2s\n"
                "fmla v27.2s, %[b2].2s, %[a3].2s\n"

                "ld1r { %[a1].2s }, [%[a_ptr]], #4\n"
                "fmla v12.2s, %[b0].2s, %[a0].2s\n"
                "fmla v20.2s, %[b1].2s, %[a0].2s\n"
                "fmla v28.2s, %[b2].2s, %[a0].2s\n"

                "ld1r { %[a2].2s }, [%[a_ptr]], #4\n"
                "fmla v13.2s, %[b0].2s, %[a1].2s\n"
                "fmla v21.2s, %[b1].2s, %[a1].2s\n"
                "fmla v29.2s, %[b2].2s, %[a1].2s\n"

                "ld1r { %[a3].2s }, [%[a_ptr]], #4\n"
                "fmla v14.2s, %[b0].2s, %[a2].2s\n"
                "fmla v22.2s, %[b1].2s, %[a2].2s\n"
                "fmla v30.2s, %[b2].2s, %[a2].2s\n"

                "fmla v15.2s, %[b0].2s, %[a3].2s\n"
                "fmla v23.2s, %[b1].2s, %[a3].2s\n"
                "fmla v31.2s, %[b2].2s, %[a3].2s\n"

                "add %[b_ptr], %[b_ptr], #24\n"

                // Common tail: store the 8x6 result block row-major.  Each
                // "dN" store writes the low 64 bits (two floats) of vN, so
                // row r of C is (v(8+r), v(16+r), v(24+r)) — one pair from
                // each B-column group.
                "3:\n"
                "str d8, [%[c_ptr], #0]\n"
                "str d16, [%[c_ptr], #8]\n"
                "str d24, [%[c_ptr], #16]\n"
                "str d9, [%[c_ptr], #24]\n"
                "str d17, [%[c_ptr], #32]\n"
                "str d25, [%[c_ptr], #40]\n"
                "str d10, [%[c_ptr], #48]\n"
                "str d18, [%[c_ptr], #56]\n"
                "str d26, [%[c_ptr], #64]\n"
                "str d11, [%[c_ptr], #72]\n"
                "str d19, [%[c_ptr], #80]\n"
                "str d27, [%[c_ptr], #88]\n"
                "str d12, [%[c_ptr], #96]\n"
                "str d20, [%[c_ptr], #104]\n"
                "str d28, [%[c_ptr], #112]\n"
                "str d13, [%[c_ptr], #120]\n"
                "str d21, [%[c_ptr], #128]\n"
                "str d29, [%[c_ptr], #136]\n"
                "str d14, [%[c_ptr], #144]\n"
                "str d22, [%[c_ptr], #152]\n"
                "str d30, [%[c_ptr], #160]\n"
                "str d15, [%[c_ptr], #168]\n"
                "str d23, [%[c_ptr], #176]\n"
                "str d31, [%[c_ptr], #184]\n"
                "add %[c_ptr], %[c_ptr], #192\n"

                // Outputs: pointers and pinned operand registers are all
                // read-write ("+") since the asm advances/overwrites them.
                :
                [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
                [a0] "+w" (a0), [a1] "+w" (a1), [a2] "+w" (a2), [a3] "+w" (a3),
                [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
                : [oddk] "r" (oddk)
                // x20/x21 are presumably scratch used inside the
                // ASM_PREFETCH/ASM_PREFETCHU macros (asmlib.hpp) — they do
                // not appear in the visible asm text; TODO confirm.
                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
                  "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
            );
        }
    }
}

} // namespace arm_gemm
380
381
#endif
ASM_PREFETCH
#define ASM_PREFETCH(address)
Definition:
asmlib.hpp:45
arm_gemm
Definition:
barrier.hpp:30
K
unsigned int K
Definition:
CpuGemmAssemblyDispatch.cpp:106
src
core
NEON
kernels
arm_gemm
kernels
a64_sgemm_8x6
generic.cpp
Generated on Mon Apr 29 2024 10:53:55 for Compute Library by
1.8.17