30 #include "../../asmlib.hpp"
31 #include "../../utils.hpp"
35 void a64_sgemv_pretransposed(
const float *A,
int lda,
const float *X,
float *Y,
float beta,
int M,
int N) {
36 const bool beta0 = (beta==0.0f);
37 const bool beta1 = (beta==1.0f);
39 for (
int x=0; x<
N; x+=32) {
43 int l = std::min(
N - x, 32);
45 register float32x4_t r0
asm(
"v24");
46 register float32x4_t r1
asm(
"v25");
47 register float32x4_t r2
asm(
"v26");
48 register float32x4_t r3
asm(
"v27");
49 register float32x4_t r4
asm(
"v28");
50 register float32x4_t r5
asm(
"v29");
51 register float32x4_t r6
asm(
"v30");
52 register float32x4_t r7
asm(
"v31");
54 register float32x4_t x0
asm(
"v0");
55 register float32x4_t x0a
asm(
"v1");
57 const float *x_ptr =
X;
58 const float *a_ptr =
A + ((x/32) * lda);
61 r0=r1=r2=r3=r4=r5=r6=r7=vdupq_n_f32(0.0f);
65 r0 = vld1q_f32(y_ptr);
66 r1 = vld1q_f32(y_ptr + 4);
67 r2 = vld1q_f32(y_ptr + 8);
68 r3 = vld1q_f32(y_ptr + 12);
69 r4 = vld1q_f32(y_ptr + 16);
70 r5 = vld1q_f32(y_ptr + 20);
71 r6 = vld1q_f32(y_ptr + 24);
72 r7 = vld1q_f32(y_ptr + 28);
82 float32x4_t oddvec = vdupq_n_f32(0.0f);
83 float *oddbase = y_ptr + l - oddbits;
87 oddvec = vld1q_lane_f32(oddbase + 2, oddvec, 2);
90 oddvec = vld1q_lane_f32(oddbase + 1, oddvec, 1);
93 oddvec = vld1q_lane_f32(oddbase, oddvec, 0);
102 if (vecs==0) { r0 = oddvec;
break; }
104 r0 = vld1q_f32(y_ptr);
105 if (--vecs==0) { r1 = oddvec;
break; }
107 r1 = vld1q_f32(y_ptr + 4);
108 if (--vecs==0) { r2 = oddvec;
break; }
110 r2 = vld1q_f32(y_ptr + 8);
111 if (--vecs==0) { r3 = oddvec;
break; }
113 r3 = vld1q_f32(y_ptr + 12);
114 if (--vecs==0) { r4 = oddvec;
break; }
116 r4 = vld1q_f32(y_ptr + 16);
117 if (--vecs==0) { r5 = oddvec;
break; }
119 r5 = vld1q_f32(y_ptr + 20);
120 if (--vecs==0) { r6 = oddvec;
break; }
122 r6 = vld1q_f32(y_ptr + 24);
129 if (vecs==0) {
UNREACHABLE(
"Impossible lack of work to do"); }
131 r0 = vld1q_f32(y_ptr);
132 if (--vecs==0) {
break; }
134 r1 = vld1q_f32(y_ptr + 4);
135 if (--vecs==0) {
break; }
137 r2 = vld1q_f32(y_ptr + 8);
138 if (--vecs==0) {
break; }
140 r3 = vld1q_f32(y_ptr + 12);
141 if (--vecs==0) {
break; }
143 r4 = vld1q_f32(y_ptr + 16);
144 if (--vecs==0) {
break; }
146 r5 = vld1q_f32(y_ptr + 20);
147 if (--vecs==0) {
break; }
149 r6 = vld1q_f32(y_ptr + 24);
155 const float32x4_t vb = vdupq_n_f32(beta);
157 r0 = vmulq_f32(r0, vb);
158 r1 = vmulq_f32(r1, vb);
159 r2 = vmulq_f32(r2, vb);
160 r3 = vmulq_f32(r3, vb);
161 r4 = vmulq_f32(r4, vb);
162 r5 = vmulq_f32(r5, vb);
163 r6 = vmulq_f32(r6, vb);
164 r7 = vmulq_f32(r7, vb);
170 x0 = vld1q_f32(x_ptr);
173 "ldr q2, [%[a_ptr], #0]\n"
174 "ldr q3, [%[a_ptr], #16]\n"
175 "ldr q4, [%[a_ptr], #32]\n"
176 "ldr q5, [%[a_ptr], #48]\n"
177 "ldr q6, [%[a_ptr], #64]\n"
178 "ldr q7, [%[a_ptr], #80]\n"
179 "ldr q8, [%[a_ptr], #96]\n"
180 "ldr q9, [%[a_ptr], #112]\n"
181 "ldr q10, [%[a_ptr], #128]\n"
182 "ldr q11, [%[a_ptr], #144]\n"
183 "ldr q12, [%[a_ptr], #160]\n"
184 "ldr q13, [%[a_ptr], #176]\n"
185 "ldr q14, [%[a_ptr], #192]\n"
186 "ldr q15, [%[a_ptr], #208]\n"
187 "ldr q16, [%[a_ptr], #224]\n"
188 "ldr q17, [%[a_ptr], #240]\n"
189 "ldr q18, [%[a_ptr], #256]\n"
190 "ldr q19, [%[a_ptr], #272]\n"
191 "ldr q20, [%[a_ptr], #288]\n"
192 "ldr q21, [%[a_ptr], #304]\n"
193 "ldr q22, [%[a_ptr], #320]\n"
194 "ldr q23, [%[a_ptr], #336]\n"
221 "add %[a_ptr], %[a_ptr], #352\n"
227 "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
228 "ldr %q[x0a], [%[x_ptr], #16]\n"
229 "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
230 "ldr q3, [%[a_ptr], #0]\n"
231 "subs %w[k], %w[k], #1\n"
232 "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
233 "ldr q4, [%[a_ptr], #16]\n"
234 "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
235 "ldr q5, [%[a_ptr], #32]\n"
236 "add %[x_ptr], %[x_ptr], #32\n"
238 "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
239 "ldr q6, [%[a_ptr], #48]\n"
240 "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
241 "ldr q7, [%[a_ptr], #64]\n"
242 "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
243 "ldr q8, [%[a_ptr], #80]\n"
244 "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
245 "ldr q9, [%[a_ptr], #96]\n"
249 "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
250 "ldr q10, [%[a_ptr], #112]\n"
251 "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
252 "ldr q11, [%[a_ptr], #128]\n"
253 "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
254 "ldr q12, [%[a_ptr], #144]\n"
255 "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
256 "ldr q13, [%[a_ptr], #160]\n"
258 "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
259 "ldr q14, [%[a_ptr], #176]\n"
260 "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
261 "ldr q15, [%[a_ptr], #192]\n"
262 "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
263 "ldr q16, [%[a_ptr], #208]\n"
264 "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
265 "ldr q17, [%[a_ptr], #224]\n"
269 "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
270 "ldr q18, [%[a_ptr], #240]\n"
271 "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
272 "ldr q19, [%[a_ptr], #256]\n"
273 "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
274 "ldr q20, [%[a_ptr], #272]\n"
275 "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
276 "ldr q21, [%[a_ptr], #288]\n"
278 "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
279 "ldr q22, [%[a_ptr], #304]\n"
280 "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
281 "ldr q23, [%[a_ptr], #320]\n"
282 "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
283 "ldr q2, [%[a_ptr], #336]\n"
284 "ldr q3, [%[a_ptr], #352]\n"
285 "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
286 "ldr q4, [%[a_ptr], #368]\n"
290 "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
291 "ldr q5, [%[a_ptr], #384]\n"
292 "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
293 "ldr q6, [%[a_ptr], #400]\n"
294 "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
295 "ldr q7, [%[a_ptr], #416]\n"
296 "fmla %[r3].4s, v8.4s, %[x0].s[3]\n"
298 "ldr q8, [%[a_ptr], #432]\n"
299 "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
300 "ldr q9, [%[a_ptr], #448]\n"
301 "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
302 "ldr q10, [%[a_ptr], #464]\n"
303 "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
304 "ldr q11, [%[a_ptr], #480]\n"
305 "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
306 "ldr q12, [%[a_ptr], #496]\n"
310 "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
311 "ldr %q[x0], [%[x_ptr]]\n"
312 "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
313 "ldr q14, [%[a_ptr], #512]\n"
314 "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
315 "ldr q15, [%[a_ptr], #528]\n"
316 "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n"
318 "ldr q16, [%[a_ptr], #544]\n"
319 "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
320 "ldr q17, [%[a_ptr], #560]\n"
321 "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
322 "ldr q18, [%[a_ptr], #576]\n"
323 "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
324 "ldr q19, [%[a_ptr], #592]\n"
325 "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
326 "ldr q20, [%[a_ptr], #608]\n"
330 "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
331 "ldr q21, [%[a_ptr], #624]\n"
332 "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
333 "ldr q22, [%[a_ptr], #640]\n"
334 "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
335 "ldr q23, [%[a_ptr], #656]\n"
336 "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
337 "ldr q2, [%[a_ptr], #672]\n"
339 "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
340 "ldr q3, [%[a_ptr], #688]\n"
341 "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
342 "ldr q4, [%[a_ptr], #704]\n"
343 "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
344 "ldr q5, [%[a_ptr], #720]\n"
345 "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
346 "ldr q6, [%[a_ptr], #736]\n"
350 "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
351 "ldr q7, [%[a_ptr], #752]\n"
352 "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
353 "ldr q8, [%[a_ptr], #768]\n"
354 "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
355 "ldr q9, [%[a_ptr], #784]\n"
356 "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
357 "ldr q10, [%[a_ptr], #800]\n"
359 "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
360 "ldr q11, [%[a_ptr], #816]\n"
361 "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
362 "ldr q12, [%[a_ptr], #832]\n"
363 "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
364 "ldr q13, [%[a_ptr], #848]\n"
365 "ldr q14, [%[a_ptr], #864]\n"
366 "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
367 "ldr q15, [%[a_ptr], #880]\n"
371 "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
372 "ldr q16, [%[a_ptr], #896]\n"
373 "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
374 "ldr q17, [%[a_ptr], #912]\n"
375 "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
376 "ldr q18, [%[a_ptr], #928]\n"
377 "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n"
379 "ldr q19, [%[a_ptr], #944]\n"
380 "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
381 "ldr q20, [%[a_ptr], #960]\n"
382 "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
383 "ldr q21, [%[a_ptr], #976]\n"
384 "add %[a_ptr], %[a_ptr], #1024\n"
385 "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
386 "ldr q22, [%[a_ptr], #-32]\n"
387 "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
388 "ldr q23, [%[a_ptr], #-16]\n"
396 "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
397 "ldr %q[x0a], [%[x_ptr], #16]\n"
398 "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
399 "ldr q3, [%[a_ptr], #0]\n"
400 "subs %w[k], %w[k], #1\n"
401 "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
402 "ldr q4, [%[a_ptr], #16]\n"
403 "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
404 "ldr q5, [%[a_ptr], #32]\n"
405 "add %[x_ptr], %[x_ptr], #32\n"
406 "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
407 "ldr q6, [%[a_ptr], #48]\n"
408 "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
409 "ldr q7, [%[a_ptr], #64]\n"
410 "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
411 "ldr q8, [%[a_ptr], #80]\n"
412 "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
413 "ldr q9, [%[a_ptr], #96]\n"
416 "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
417 "ldr q10, [%[a_ptr], #112]\n"
418 "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
419 "ldr q11, [%[a_ptr], #128]\n"
420 "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
421 "ldr q12, [%[a_ptr], #144]\n"
422 "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
423 "ldr q13, [%[a_ptr], #160]\n"
424 "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
425 "ldr q14, [%[a_ptr], #176]\n"
426 "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
427 "ldr q15, [%[a_ptr], #192]\n"
428 "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
429 "ldr q16, [%[a_ptr], #208]\n"
430 "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
431 "ldr q17, [%[a_ptr], #224]\n"
434 "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
435 "ldr q18, [%[a_ptr], #240]\n"
436 "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
437 "ldr q19, [%[a_ptr], #256]\n"
438 "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
439 "ldr q20, [%[a_ptr], #272]\n"
440 "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
441 "ldr q21, [%[a_ptr], #288]\n"
442 "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
443 "ldr q22, [%[a_ptr], #304]\n"
444 "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
445 "ldr q23, [%[a_ptr], #320]\n"
446 "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
447 "ldr q2, [%[a_ptr], #336]\n"
448 "ldr q3, [%[a_ptr], #352]\n"
449 "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
450 "ldr q4, [%[a_ptr], #368]\n"
453 "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
454 "ldr q5, [%[a_ptr], #384]\n"
455 "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
456 "ldr q6, [%[a_ptr], #400]\n"
457 "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
458 "ldr q7, [%[a_ptr], #416]\n"
459 "fmla %[r3].4s, v8.4s, %[x0].s[3]\n"
460 "ldr q8, [%[a_ptr], #432]\n"
461 "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
462 "ldr q9, [%[a_ptr], #448]\n"
463 "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
464 "ldr q10, [%[a_ptr], #464]\n"
465 "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
466 "ldr q11, [%[a_ptr], #480]\n"
467 "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
468 "ldr q12, [%[a_ptr], #496]\n"
471 "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
472 "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
473 "ldr q14, [%[a_ptr], #512]\n"
474 "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
475 "ldr q15, [%[a_ptr], #528]\n"
476 "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n"
477 "ldr q16, [%[a_ptr], #544]\n"
478 "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
479 "ldr q17, [%[a_ptr], #560]\n"
480 "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
481 "ldr q18, [%[a_ptr], #576]\n"
482 "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
483 "ldr q19, [%[a_ptr], #592]\n"
484 "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
485 "ldr q20, [%[a_ptr], #608]\n"
488 "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
489 "ldr q21, [%[a_ptr], #624]\n"
490 "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
491 "ldr q22, [%[a_ptr], #640]\n"
492 "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
493 "ldr q23, [%[a_ptr], #656]\n"
494 "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
495 "add %[a_ptr], %[a_ptr], #672\n"
496 "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
497 "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
498 "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
499 "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
502 "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
503 "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
504 "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
505 "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
506 "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
507 "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
508 "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
509 "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
512 "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
513 "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
514 "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
515 "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n"
516 "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
517 "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
518 "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
519 "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
521 [a_ptr]
"+r" (a_ptr), [x_ptr]
"+r" (x_ptr),
522 [x0]
"+w" (x0), [x0a]
"+w" (x0a), [k]
"+r" (k),
523 [r0]
"+w" (r0), [r1]
"+w" (r1), [r2]
"+w" (r2), [r3]
"+w" (r3),
524 [r4]
"+w" (r4), [r5]
"+w" (r5), [r6]
"+w" (r6), [r7]
"+w" (r7)
526 :
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
527 "v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"x20",
"x21",
"cc",
"memory");
535 "ldr q2, [%[a_ptr], #0]\n"
536 "ldr q3, [%[a_ptr], #16]\n"
537 "ldr q4, [%[a_ptr], #32]\n"
538 "ldr q5, [%[a_ptr], #48]\n"
539 "ldr q6, [%[a_ptr], #64]\n"
540 "ldr q7, [%[a_ptr], #80]\n"
541 "ldr q8, [%[a_ptr], #96]\n"
542 "ldr q9, [%[a_ptr], #112]\n"
543 "ldr %s[x0], [%[x_ptr]]\n"
544 "add %[a_ptr], %[a_ptr], #128\n"
545 "add %[x_ptr], %[x_ptr], #4\n"
550 "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
551 "ldr q2, [%[a_ptr], #0]\n"
552 "subs %w[l], %w[l], #1\n"
553 "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
554 "ldr q3, [%[a_ptr], #16]\n"
555 "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
556 "ldr q4, [%[a_ptr], #32]\n"
557 "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
558 "ldr q5, [%[a_ptr], #48]\n"
559 "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
560 "ldr q6, [%[a_ptr], #64]\n"
561 "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
562 "ldr q7, [%[a_ptr], #80]\n"
563 "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
564 "ldr q8, [%[a_ptr], #96]\n"
565 "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
566 "ldr q9, [%[a_ptr], #112]\n"
567 "ldr %s[x0], [%[x_ptr]]\n"
568 "add %[a_ptr], %[a_ptr], #128\n"
569 "add %[x_ptr], %[x_ptr], #4\n"
574 "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
575 "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
576 "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
577 "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
578 "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
579 "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
580 "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
581 "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
583 [a_ptr]
"+r" (a_ptr), [x_ptr]
"+r" (x_ptr),
584 [x0]
"+w" (x0), [l]
"+r" (l),
585 [r0]
"+w" (r0), [r1]
"+w" (r1), [r2]
"+w" (r2), [r3]
"+w" (r3),
586 [r4]
"+w" (r4), [r5]
"+w" (r5), [r6]
"+w" (r6), [r7]
"+w" (r7)
588 :
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"cc",
"memory");
593 vst1q_f32(y_ptr, r0);
594 vst1q_f32(y_ptr + 4, r1);
595 vst1q_f32(y_ptr + 8, r2);
596 vst1q_f32(y_ptr + 12, r3);
597 vst1q_f32(y_ptr + 16, r4);
598 vst1q_f32(y_ptr + 20, r5);
599 vst1q_f32(y_ptr + 24, r6);
600 vst1q_f32(y_ptr + 28, r7);
610 if (vecs==0) { oddvec=r0;
break; }
612 vst1q_f32(y_ptr, r0);
613 if (--vecs==0) { oddvec=r1;
break; }
615 vst1q_f32(y_ptr + 4, r1);
616 if (--vecs==0) { oddvec=r2;
break; }
618 vst1q_f32(y_ptr + 8, r2);
619 if (--vecs==0) { oddvec=r3;
break; }
621 vst1q_f32(y_ptr + 12, r3);
622 if (--vecs==0) { oddvec=r4;
break; }
624 vst1q_f32(y_ptr + 16, r4);
625 if (--vecs==0) { oddvec=r5;
break; }
627 vst1q_f32(y_ptr + 20, r5);
628 if (--vecs==0) { oddvec=r6;
break; }
630 vst1q_f32(y_ptr + 24, r6);
634 float *oddbase = y_ptr + l - oddbits;
638 vst1q_lane_f32(oddbase + 2, oddvec, 2);
641 vst1q_lane_f32(oddbase + 1, oddvec, 1);
644 vst1q_lane_f32(oddbase, oddvec, 0);
654 if (vecs==0) {
UNREACHABLE(
"vecs and oddbits can't both be 0"); }
656 vst1q_f32(y_ptr, r0);
657 if (--vecs==0) {
break; }
659 vst1q_f32(y_ptr + 4, r1);
660 if (--vecs==0) {
break; }
662 vst1q_f32(y_ptr + 8, r2);
663 if (--vecs==0) {
break; }
665 vst1q_f32(y_ptr + 12, r3);
666 if (--vecs==0) {
break; }
668 vst1q_f32(y_ptr + 16, r4);
669 if (--vecs==0) {
break; }
671 vst1q_f32(y_ptr + 20, r5);
672 if (--vecs==0) {
break; }
674 vst1q_f32(y_ptr + 24, r6);