// Specialisation of MergeResults for template arguments <4, 4, false> on
// uint32_t data. The visible statements walk the output in steps of 4 rows
// (y loop) and 4 columns (i loop). For each step one of several fragments
// combines values read through inptr with either the existing output values
// (the += / load-add-store fragments) or with a bias vector (the
// bias-plus-input fragments) and writes the result to up to four output rows.
//
// Parameters, as used by the visible lines:
//   out        - base of the output matrix; row y starts at out + y * ldout.
//   in         - source values, read sequentially through inptr.
//   ldout      - distance between consecutive output rows, in elements.
//   y0, ymax   - half-open row range iterated by the y loop.
//   x0, xmax   - half-open column range iterated by the i loop.
//   bias       - optional per-column values; when null, nullbias is used instead.
//   Activation - unnamed parameter, so it cannot be referenced in the body.
//   append     - not referenced by any line visible in this listing.
//
// NOTE(review): this listing is sparse. Braces, conditionals, the asm statement
// openers and some operand lines are not shown, and the integers at the start
// of some lines look like line numbers from the original file. Presumably
// append chooses between the accumulate and the bias fragments, height chooses
// how many rows a fragment touches, and the scalar loops handle a partial final
// column block - confirm all of this against the complete file.
29 void MergeResults<4, 4, false>(uint32_t *out,
const uint32_t *in,
const int ldout,
const int y0,
const int ymax,
const int x0,
const int xmax,
const uint32_t *
bias,
Activation ,
bool append)
// Read cursor over the input buffer.
31 const uint32_t *inptr = in;
// Zero the 4-element fallback bias (the declaration of nullbias is not shown).
37 memset(nullbias, 0, (4 *
sizeof(uint32_t)));
// Outer loop: advance over the output rows four at a time.
40 for (
int y=y0; y<ymax; y+=4)
// One write pointer per row of the current 4-row band, starting at column x0.
42 uint32_t *outptr0 = out + (y * ldout) + x0;
43 uint32_t *outptr1 = outptr0 + ldout;
44 uint32_t *outptr2 = outptr1 + ldout;
45 uint32_t *outptr3 = outptr2 + ldout;
// Number of rows from y up to ymax; may exceed 4 when more bands follow.
47 const int height = ymax - y;
// Inner loop: advance over the output columns four at a time.
49 for (
int i=x0; i<xmax; i+=4)
// ---- Accumulate fragment, one row: scalar form ----
// Adds input elements to the value at outptr0. Any bounds check and pointer
// increments belonging to this loop are not visible here.
59 for (
int xi=0; xi<3; xi++)
63 *outptr0 += inptr[xi];
// ---- Accumulate fragment, one row: vector form ----
// Loads four existing outputs (q2) and four inputs (q10), adds them lane-wise
// and stores the sum back. outptr0 advances by 0x10 bytes (4 elements) and
// inptr by 0x40 bytes (16 elements). The prfm lines prefetch upcoming input
// and output data for loading.
71 "ldr q2, [%[outptr0]]\n"
72 "prfm PLDL1KEEP, [%[inptr], #0x40]\n"
73 "ldr q10, [%[inptr]]\n"
74 "prfm PLDL1KEEP, [%[outptr0], #0x20]\n"
75 "add %[inptr], %[inptr], #0x40\n"
76 "add v10.4s, v10.4s, v2.4s\n"
77 "str q10, [%[outptr0]]\n"
78 "add %[outptr0], %[outptr0], #0x10\n"
// Read-write register operands for the four row pointers. The list ends with a
// comma, so at least one further operand follows on a line that is not shown
// (presumably inptr, which the asm text references - confirm).
79 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
// Clobber list: v0-v18 and memory are declared, although this fragment only
// names q2 and q10/v10 in its instructions.
82 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"memory"
// ---- Accumulate fragment, two rows: scalar form ----
// The second row reads its inputs 4 elements after those of the first row.
92 for (
int xi=0; xi<3; xi++)
96 *outptr0 += inptr[xi];
98 *outptr1 += inptr[xi + 4];
// ---- Accumulate fragment, two rows: vector form ----
// Row 0: q10 (input at offset 0) += q2 (existing output). Row 1: q11 (input at
// offset 0x10) += q3 (existing output). Both row pointers advance by 0x10 bytes
// and inptr by 0x40 bytes.
106 "ldr q2, [%[outptr0]]\n"
107 "prfm PLDL1KEEP, [%[inptr], #0x40]\n"
108 "ldr q10, [%[inptr]]\n"
109 "prfm PLDL1KEEP, [%[outptr0], #0x20]\n"
110 "ldr q3, [%[outptr1]]\n"
111 "prfm PLDL1KEEP, [%[outptr1], #0x20]\n"
112 "add v10.4s, v10.4s, v2.4s\n"
113 "ldr q11, [%[inptr], #0x10]\n"
114 "add %[inptr], %[inptr], #0x40\n"
115 "add v11.4s, v11.4s, v3.4s\n"
116 "str q10, [%[outptr0]]\n"
117 "add %[outptr0], %[outptr0], #0x10\n"
118 "str q11, [%[outptr1]]\n"
119 "add %[outptr1], %[outptr1], #0x10\n"
// Read-write row pointer operands; the remaining operand line is not shown.
120 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
// Clobber list (v0-v18 and memory).
123 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"memory"
// ---- Accumulate fragment, three rows: scalar form ----
// Rows 0, 1 and 2 read their inputs at element offsets 0, 4 and 8.
133 for (
int xi=0; xi<3; xi++)
137 *outptr0 += inptr[xi];
139 *outptr1 += inptr[xi + 4];
141 *outptr2 += inptr[xi + 8];
// ---- Accumulate fragment, three rows: vector form ----
// Same pattern as the two-row form with a third row added: q12 (input at
// offset 0x20) += q4 (existing output of row 2).
149 "ldr q2, [%[outptr0]]\n"
150 "prfm PLDL1KEEP, [%[inptr], #0x40]\n"
151 "ldr q10, [%[inptr]]\n"
152 "prfm PLDL1KEEP, [%[outptr0], #0x20]\n"
153 "ldr q3, [%[outptr1]]\n"
154 "prfm PLDL1KEEP, [%[outptr1], #0x20]\n"
155 "add v10.4s, v10.4s, v2.4s\n"
156 "ldr q11, [%[inptr], #0x10]\n"
157 "ldr q4, [%[outptr2]]\n"
158 "prfm PLDL1KEEP, [%[outptr2], #0x20]\n"
159 "ldr q12, [%[inptr], #0x20]\n"
160 "add %[inptr], %[inptr], #0x40\n"
161 "add v11.4s, v11.4s, v3.4s\n"
162 "str q10, [%[outptr0]]\n"
163 "add %[outptr0], %[outptr0], #0x10\n"
164 "add v12.4s, v12.4s, v4.4s\n"
165 "str q11, [%[outptr1]]\n"
166 "add %[outptr1], %[outptr1], #0x10\n"
167 "str q12, [%[outptr2]]\n"
168 "add %[outptr2], %[outptr2], #0x10\n"
// Read-write row pointer operands; the remaining operand line is not shown.
169 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
// Clobber list (v0-v18 and memory).
172 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"memory"
// ---- Accumulate fragment, four rows: scalar form ----
// Rows 0 to 3 read their inputs at element offsets 0, 4, 8 and 12.
183 for (
int xi=0; xi<3; xi++)
187 *outptr0 += inptr[xi];
189 *outptr1 += inptr[xi + 4];
191 *outptr2 += inptr[xi + 8];
193 *outptr3 += inptr[xi + 12];
// ---- Accumulate fragment, four rows: vector form ----
// All four rows are processed: q10..q13 hold the inputs at byte offsets 0x00,
// 0x10, 0x20 and 0x30, and q2..q5 hold the existing outputs of rows 0..3. The
// input at 0x30 is loaded before inptr is advanced by 0x40.
201 "ldr q2, [%[outptr0]]\n"
202 "prfm PLDL1KEEP, [%[inptr], #0x40]\n"
203 "ldr q10, [%[inptr]]\n"
204 "prfm PLDL1KEEP, [%[outptr0], #0x20]\n"
205 "ldr q3, [%[outptr1]]\n"
206 "prfm PLDL1KEEP, [%[outptr1], #0x20]\n"
207 "add v10.4s, v10.4s, v2.4s\n"
208 "ldr q11, [%[inptr], #0x10]\n"
209 "ldr q4, [%[outptr2]]\n"
210 "prfm PLDL1KEEP, [%[outptr2], #0x20]\n"
211 "ldr q12, [%[inptr], #0x20]\n"
212 "prfm PLDL1KEEP, [%[outptr3], #0x20]\n"
213 "add v11.4s, v11.4s, v3.4s\n"
214 "str q10, [%[outptr0]]\n"
215 "ldr q5, [%[outptr3]]\n"
216 "add %[outptr0], %[outptr0], #0x10\n"
217 "add v12.4s, v12.4s, v4.4s\n"
218 "str q11, [%[outptr1]]\n"
219 "ldr q13, [%[inptr], #0x30]\n"
220 "add %[outptr1], %[outptr1], #0x10\n"
221 "add %[inptr], %[inptr], #0x40\n"
222 "str q12, [%[outptr2]]\n"
223 "add %[outptr2], %[outptr2], #0x10\n"
224 "add v13.4s, v13.4s, v5.4s\n"
225 "str q13, [%[outptr3]]\n"
226 "add %[outptr3], %[outptr3], #0x10\n"
// Read-write row pointer operands; the remaining operand line is not shown.
227 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
// Clobber list (v0-v18 and memory).
230 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"memory"
// ---- Bias fragments ----
// Select the bias values for the current column block: the caller-supplied
// bias offset by the absolute column index i, or the zeroed nullbias when no
// bias pointer was passed.
241 const uint32_t *biasptr =
bias ?
bias + i : nullbias;
// ---- Bias fragment, one row: scalar form ----
// Overwrites the output with bias plus input; the previous output value is
// not read.
249 for (
int xi=0; xi<3; xi++)
253 *outptr0 = biasptr[xi] + inptr[xi];
// ---- Bias fragment, one row: vector form ----
// q2 holds four bias values and q11 four inputs; their lane-wise sum is stored
// to row 0. The output row is prefetched with a store hint (PSTL1KEEP), and it
// is never loaded in this fragment.
261 "ldr q2, [%[biasptr]]\n"
262 "prfm PLDL1KEEP, [%[inptr], #0x40]\n"
263 "ldr q11, [%[inptr]]\n"
264 "prfm PSTL1KEEP, [%[outptr0], #0x20]\n"
265 "add %[inptr], %[inptr], #0x40\n"
266 "add v11.4s, v11.4s, v2.4s\n"
267 "str q11, [%[outptr0]]\n"
268 "add %[outptr0], %[outptr0], #0x10\n"
// Read-write row pointer operands; the remaining operand line is not shown.
269 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
// biasptr is a read-only input operand: the asm loads through it but does not
// modify the pointer.
271 : [biasptr]
"r" (biasptr)
// Clobber list (v0-v18 and memory).
272 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"memory"
// ---- Bias fragment, two rows: scalar form ----
// The same bias element biasptr[xi] is applied to both rows.
282 for (
int xi=0; xi<3; xi++)
286 *outptr0 = biasptr[xi] + inptr[xi];
288 *outptr1 = biasptr[xi] + inptr[xi + 4];
// ---- Bias fragment, two rows: vector form ----
// The single bias vector q2 is added to the inputs of row 0 (q11, offset 0)
// and row 1 (q12, offset 0x10) before each is stored.
296 "ldr q2, [%[biasptr]]\n"
297 "prfm PLDL1KEEP, [%[inptr], #0x40]\n"
298 "ldr q11, [%[inptr]]\n"
299 "prfm PSTL1KEEP, [%[outptr0], #0x20]\n"
300 "ldr q12, [%[inptr], #0x10]\n"
301 "prfm PSTL1KEEP, [%[outptr1], #0x20]\n"
302 "add v11.4s, v11.4s, v2.4s\n"
303 "add %[inptr], %[inptr], #0x40\n"
304 "add v12.4s, v12.4s, v2.4s\n"
305 "str q11, [%[outptr0]]\n"
306 "add %[outptr0], %[outptr0], #0x10\n"
307 "str q12, [%[outptr1]]\n"
308 "add %[outptr1], %[outptr1], #0x10\n"
// Read-write row pointer operands; the remaining operand line is not shown.
309 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
// Read-only bias pointer operand.
311 : [biasptr]
"r" (biasptr)
// Clobber list (v0-v18 and memory).
312 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"memory"
// ---- Bias fragment, three rows: scalar form ----
322 for (
int xi=0; xi<3; xi++)
326 *outptr0 = biasptr[xi] + inptr[xi];
328 *outptr1 = biasptr[xi] + inptr[xi + 4];
330 *outptr2 = biasptr[xi] + inptr[xi + 8];
// ---- Bias fragment, three rows: vector form ----
// Bias vector q2 is added to the inputs at offsets 0x00, 0x10 and 0x20
// (q11, q12, q13), which are stored to rows 0, 1 and 2.
338 "ldr q2, [%[biasptr]]\n"
339 "prfm PLDL1KEEP, [%[inptr], #0x40]\n"
340 "ldr q11, [%[inptr]]\n"
341 "prfm PSTL1KEEP, [%[outptr0], #0x20]\n"
342 "ldr q12, [%[inptr], #0x10]\n"
343 "prfm PSTL1KEEP, [%[outptr1], #0x20]\n"
344 "add v11.4s, v11.4s, v2.4s\n"
345 "ldr q13, [%[inptr], #0x20]\n"
346 "prfm PSTL1KEEP, [%[outptr2], #0x20]\n"
347 "add v12.4s, v12.4s, v2.4s\n"
348 "add %[inptr], %[inptr], #0x40\n"
349 "add v13.4s, v13.4s, v2.4s\n"
350 "str q11, [%[outptr0]]\n"
351 "add %[outptr0], %[outptr0], #0x10\n"
352 "str q12, [%[outptr1]]\n"
353 "add %[outptr1], %[outptr1], #0x10\n"
354 "str q13, [%[outptr2]]\n"
355 "add %[outptr2], %[outptr2], #0x10\n"
// Read-write row pointer operands; the remaining operand line is not shown.
356 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
// Read-only bias pointer operand.
358 : [biasptr]
"r" (biasptr)
// Clobber list (v0-v18 and memory).
359 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"memory"
// ---- Bias fragment, four rows: scalar form ----
370 for (
int xi=0; xi<3; xi++)
374 *outptr0 = biasptr[xi] + inptr[xi];
376 *outptr1 = biasptr[xi] + inptr[xi + 4];
378 *outptr2 = biasptr[xi] + inptr[xi + 8];
380 *outptr3 = biasptr[xi] + inptr[xi + 12];
// ---- Bias fragment, four rows: vector form ----
// Bias vector q2 is added to all four input vectors (q11..q14 at offsets 0x00
// to 0x30), which are stored to rows 0..3. All input loads are issued before
// inptr is advanced by 0x40.
388 "ldr q2, [%[biasptr]]\n"
389 "prfm PLDL1KEEP, [%[inptr], #0x40]\n"
390 "ldr q11, [%[inptr]]\n"
391 "prfm PSTL1KEEP, [%[outptr0], #0x20]\n"
392 "ldr q12, [%[inptr], #0x10]\n"
393 "prfm PSTL1KEEP, [%[outptr1], #0x20]\n"
394 "add v11.4s, v11.4s, v2.4s\n"
395 "ldr q13, [%[inptr], #0x20]\n"
396 "ldr q14, [%[inptr], #0x30]\n"
397 "prfm PSTL1KEEP, [%[outptr2], #0x20]\n"
398 "add v12.4s, v12.4s, v2.4s\n"
399 "str q11, [%[outptr0]]\n"
400 "add v13.4s, v13.4s, v2.4s\n"
401 "add %[outptr0], %[outptr0], #0x10\n"
402 "add v14.4s, v14.4s, v2.4s\n"
403 "str q12, [%[outptr1]]\n"
404 "add %[outptr1], %[outptr1], #0x10\n"
405 "prfm PSTL1KEEP, [%[outptr3], #0x20]\n"
406 "add %[inptr], %[inptr], #0x40\n"
407 "str q13, [%[outptr2]]\n"
408 "add %[outptr2], %[outptr2], #0x10\n"
409 "str q14, [%[outptr3]]\n"
410 "add %[outptr3], %[outptr3], #0x10\n"
// Read-write row pointer operands; the remaining operand line is not shown.
411 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
// Read-only bias pointer operand.
413 : [biasptr]
"r" (biasptr)
// Clobber list (v0-v18 and memory).
414 :
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"memory"
427 #endif // __aarch64__