28 #if defined(ARM_COMPUTE_ENABLE_SME2)
// Inline-assembly kernel, compiled only when ARM_COMPUTE_ENABLE_SME2 is defined.
//
// What the visible code does:
//   * packs the caller's arguments into a local Args object, permuting the 16
//     input row pointers into the order in which the assembly reads them;
//   * loads one vector (z23) plus nine vectors (z0-z8) from `params`, and
//     broadcast-loads the min/max bounds into z22/z15;
//   * accumulates into z28-z31 with predicated fmla, clamps the four
//     accumulators with fclamp, and stores them through the four pointers
//     fetched from `outptrs` (x13, x12, x10, x9).
//
// NOTE(review): presumably z23 is the bias and z0-z8 are the 3x3 filter taps
// (the function name suggests fp32 / NHWC / 3x3 / stride 1 / 2x2 output) -
// confirm against the parameter packing code.
// NOTE(review): the embedded line numbering skips, so parts of the original
// (e.g. the remaining parameters, the Args struct header, local labels and
// branches, and whatever initialises x11, x28, p0 and p3) are not visible
// here. Nothing in this listing sets those registers; verify against the
// complete file before relying on this block.
33 void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
34 const float *
const *
const input_ptrs,
35 float *
const *
const outptrs,
37 unsigned int n_channels,
// Members of the local argument block that the assembly addresses via offsetof().
44 float *
const *outptrs;
// Sixteen input pointers, stored in the order the assembly consumes them.
47 const float *inptrs[16];
// Constructor of the argument block: copies the scalar members and permutes
// the caller-supplied input pointers into inptrs[].
50 const float *
const *
const input_ptrs,
51 float *
const *
const outptrs,
52 const void *
const params,
55 ) : outptrs(outptrs), params(params), min(min), max(max)
// inptrs[i] is read by the assembly at byte offset 8*i from x15.
57 inptrs[0] = input_ptrs[5];
58 inptrs[1] = input_ptrs[0];
59 inptrs[2] = input_ptrs[3];
60 inptrs[3] = input_ptrs[6];
61 inptrs[4] = input_ptrs[9];
62 inptrs[5] = input_ptrs[12];
63 inptrs[6] = input_ptrs[15];
64 inptrs[7] = input_ptrs[1];
65 inptrs[8] = input_ptrs[2];
66 inptrs[9] = input_ptrs[10];
67 inptrs[10] = input_ptrs[4];
68 inptrs[11] = input_ptrs[7];
69 inptrs[12] = input_ptrs[8];
70 inptrs[13] = input_ptrs[11];
71 inptrs[14] = input_ptrs[13];
72 inptrs[15] = input_ptrs[14];
// Instantiate the argument block that is handed to the assembly by address.
77 Args params_struct(input_ptrs, outptrs, params,
// ---- Setup: x20 -> outptrs array, x15 -> inptrs array, x14 -> params blob.
81 "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
82 ".inst 0xd503477f // SMSTART ZA\n"
83 "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
85 "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
86 ".inst 0x25207810 // ptrue pn8.b\n"
// z23 seeds every accumulator below (see the movprfx lines).
87 "ld1w { z23.s }, p3/Z, [x14]\n"
88 "addvl x14, x14, #1\n"
// x13/x12 and (below) x10/x9 are the four output pointers.
89 "ldp x13, x12, [x20, #0x0]\n"
91 ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
92 "addvl x14, x14, #4\n"
93 "ldp x10, x9, [x20, #0x10]\n"
// p2 predicates the input loads of the current pass.
95 "whilelt p2.s, XZR, %x[n_channels]\n"
96 ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
97 "ldp x24, x23, [x15, #0x0]\n"
98 "addvl x14, x14, #4\n"
99 "cmp x11, %x[n_channels]\n"
// Broadcast the clamp bounds: z22 <- min, z15 <- max.
100 "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
101 "ldp x22, x21, [x15, #0x10]\n"
102 "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
// x27 = -x11; it is the element offset used by the stores.
103 "sub x27, XZR, x11\n"
104 "ldr x20, [x15, #0x20]\n"
105 "ld1w { z8.s }, p3/Z, [x14]\n"
106 "addvl x14, x14, #1\n"
// Load the first five inputs (inptrs[0..4]) at element offset x28.
107 "ld1w { z9.s }, p2/Z, [x24, x28, LSL #2]\n"
108 "ld1w { z10.s }, p2/Z, [x23, x28, LSL #2]\n"
109 "ld1w { z11.s }, p2/Z, [x22, x28, LSL #2]\n"
110 "ld1w { z12.s }, p2/Z, [x21, x28, LSL #2]\n"
111 "ld1w { z13.s }, p2/Z, [x20, x28, LSL #2]\n"
// ---- First compute sequence. Besides accumulating into z28-z31 it also
// reloads z23 and z0-z8 from x14 and fetches inputs at offset x11 under p1.
// NOTE(review): this looks like a steady-state loop body, but its label and
// back-branch are not visible in this listing - confirm.
114 "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
115 "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
116 "ldr x20, [x15, #0x28]\n"
117 "whilelt p1.s, x11, %x[n_channels]\n"
118 "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
119 "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
120 "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
121 "ldr x20, [x15, #0x30]\n"
122 "fmla z28.s, p3/M, z0.s, z10.s\n"
123 "fmla z29.s, p3/M, z2.s, z11.s\n"
124 "ldr x21, [x15, #0x38]\n"
125 "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
126 "fmla z30.s, p3/M, z2.s, z12.s\n"
127 "fmla z31.s, p3/M, z1.s, z12.s\n"
128 "ldr x20, [x15, #0x48]\n"
129 "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
130 "fmla z28.s, p3/M, z5.s, z12.s\n"
131 "fmla z29.s, p3/M, z4.s, z12.s\n"
132 "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
133 "ldr x20, [x15, #0x40]\n"
134 "fmla z30.s, p3/M, z6.s, z19.s\n"
135 "fmla z31.s, p3/M, z3.s, z13.s\n"
136 "ld1w { z25.s }, p2/Z, [x20, x28, LSL #2]\n"
137 "ldr x21, [x15, #0x50]\n"
138 "fmla z28.s, p3/M, z7.s, z13.s\n"
139 "fmla z29.s, p3/M, z6.s, z13.s\n"
140 "ldr x20, [x15, #0x58]\n"
// z23 is reloaded from the params stream here; the accumulators were already
// seeded from the previous value above.
141 "ld1w { z23.s }, p3/Z, [x14]\n"
142 "fmla z30.s, p3/M, z4.s, z13.s\n"
143 "fmla z31.s, p3/M, z8.s, z18.s\n"
144 "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
145 "ldr x21, [x15, #0x60]\n"
146 "fmla z28.s, p3/M, z1.s, z16.s\n"
147 "fmla z29.s, p3/M, z0.s, z16.s\n"
148 "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
149 "ldr x20, [x15, #0x68]\n"
150 "fmla z30.s, p3/M, z5.s, z17.s\n"
151 "fmla z31.s, p3/M, z4.s, z17.s\n"
152 "ldr x26, [x15, #0x70]\n"
153 "addvl x14, x14, #1\n"
154 "fmla z28.s, p3/M, z2.s, z25.s\n"
155 "fmla z29.s, p3/M, z1.s, z25.s\n"
156 "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
157 "ldr x25, [x15, #0x78]\n"
158 "fmla z30.s, p3/M, z0.s, z11.s\n"
159 "fmla z31.s, p3/M, z2.s, z19.s\n"
160 "ldp x24, x23, [x15, #0x0]\n"
162 "fmla z28.s, p3/M, z8.s, z17.s\n"
163 "fmla z29.s, p3/M, z7.s, z17.s\n"
164 "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
165 "ldp x22, x21, [x15, #0x10]\n"
166 "fmla z30.s, p3/M, z3.s, z18.s\n"
167 "fmla z31.s, p3/M, z5.s, z17.s\n"
168 "ldr x20, [x15, #0x20]\n"
// Loads predicated on p1 at offset x11 refill z9-z13 from inptrs[0..4].
169 "ld1w { z13.s }, p1/Z, [x20, x11, LSL #2]\n"
170 "fmla z28.s, p3/M, z3.s, z11.s\n"
171 "ld1w { z16.s }, p2/Z, [x26, x28, LSL #2]\n"
172 "fmla z29.s, p3/M, z5.s, z19.s\n"
174 "fmla z30.s, p3/M, z7.s, z16.s\n"
175 "fmla z31.s, p3/M, z6.s, z16.s\n"
176 "ld1w { z16.s }, p2/Z, [x25, x28, LSL #2]\n"
178 "fmla z28.s, p3/M, z6.s, z18.s\n"
179 "fmla z29.s, p3/M, z8.s, z17.s\n"
180 "ld1w { z9.s }, p1/Z, [x24, x11, LSL #2]\n"
181 "whilelt p2.s, x28, %x[n_channels]\n"
182 "fmla z30.s, p3/M, z8.s, z16.s\n"
183 "fmla z31.s, p3/M, z7.s, z16.s\n"
184 "ld1w { z10.s }, p1/Z, [x23, x11, LSL #2]\n"
185 "ld1w { z11.s }, p1/Z, [x22, x11, LSL #2]\n"
// Clamp all four accumulators to [z22, z15] (min, max), then store one vector
// per output pointer at element offset x27.
186 ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
187 "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
188 "ld1w { z12.s }, p1/Z, [x21, x11, LSL #2]\n"
190 "cmp x11, %x[n_channels]\n"
191 "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
// Reload z0-z8 from the params stream, interleaved with the remaining stores.
192 ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
193 "addvl x14, x14, #4\n"
194 "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
195 ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
196 "addvl x14, x14, #4\n"
197 "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
198 "ld1w { z8.s }, p3/Z, [x14]\n"
199 "addvl x14, x14, #1\n"
// ---- Second compute sequence: the same fmla pattern on z28-z31, but with no
// reload of z23/z0-z8 and no p1-predicated input loads.
// NOTE(review): presumably the final (tail) pass - its label is not visible.
202 "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
203 "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
204 "ldr x20, [x15, #0x28]\n"
206 "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
207 "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
208 "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
209 "ldr x20, [x15, #0x30]\n"
210 "fmla z28.s, p3/M, z0.s, z10.s\n"
211 "fmla z29.s, p3/M, z2.s, z11.s\n"
212 "ldr x21, [x15, #0x38]\n"
213 "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
214 "fmla z30.s, p3/M, z2.s, z12.s\n"
215 "fmla z31.s, p3/M, z1.s, z12.s\n"
216 "ldr x20, [x15, #0x48]\n"
217 "ld1w { z20.s }, p2/Z, [x20, x28, LSL #2]\n"
218 "fmla z28.s, p3/M, z5.s, z12.s\n"
219 "fmla z29.s, p3/M, z4.s, z12.s\n"
220 "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
221 "ldr x20, [x15, #0x40]\n"
222 "fmla z30.s, p3/M, z6.s, z17.s\n"
223 "fmla z31.s, p3/M, z3.s, z13.s\n"
224 "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
225 "ldr x20, [x15, #0x50]\n"
226 "fmla z28.s, p3/M, z7.s, z13.s\n"
227 "fmla z29.s, p3/M, z6.s, z13.s\n"
228 "ldr x21, [x15, #0x58]\n"
230 "fmla z30.s, p3/M, z4.s, z13.s\n"
231 "fmla z31.s, p3/M, z8.s, z16.s\n"
232 "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
233 "ldr x20, [x15, #0x60]\n"
234 "fmla z28.s, p3/M, z1.s, z18.s\n"
235 "fmla z29.s, p3/M, z0.s, z18.s\n"
236 "ld1w { z19.s }, p2/Z, [x21, x28, LSL #2]\n"
237 "ldr x22, [x15, #0x68]\n"
238 "fmla z30.s, p3/M, z5.s, z20.s\n"
239 "fmla z31.s, p3/M, z4.s, z20.s\n"
240 "ldr x21, [x15, #0x70]\n"
241 "fmla z28.s, p3/M, z2.s, z17.s\n"
242 "fmla z29.s, p3/M, z1.s, z17.s\n"
243 "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
244 "ldr x20, [x15, #0x78]\n"
245 "fmla z30.s, p3/M, z0.s, z16.s\n"
246 "fmla z31.s, p3/M, z2.s, z19.s\n"
247 "fmla z28.s, p3/M, z8.s, z20.s\n"
248 "fmla z29.s, p3/M, z7.s, z20.s\n"
249 "ld1w { z17.s }, p2/Z, [x22, x28, LSL #2]\n"
250 "fmla z30.s, p3/M, z3.s, z18.s\n"
251 "fmla z31.s, p3/M, z5.s, z17.s\n"
252 "fmla z28.s, p3/M, z3.s, z16.s\n"
253 "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
254 "fmla z29.s, p3/M, z5.s, z19.s\n"
255 "fmla z30.s, p3/M, z7.s, z16.s\n"
256 "fmla z31.s, p3/M, z6.s, z16.s\n"
257 "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
258 "fmla z28.s, p3/M, z6.s, z18.s\n"
259 "fmla z29.s, p3/M, z8.s, z17.s\n"
260 "fmla z30.s, p3/M, z8.s, z16.s\n"
261 "fmla z31.s, p3/M, z7.s, z16.s\n"
// Final clamp and stores, then leave the mode entered by SMSTART above.
262 ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
263 "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
264 "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
265 "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
266 "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
267 ".inst 0xd503467f // SMSTOP\n"
// Input operands: the channel count in a register, the Args member offsets as
// immediates, and the address of the local Args object as the base register.
269 : [n_channels]
"r" ((
unsigned long) n_channels), [offsetof_Args_inptrs]
"I" (offsetof(Args, inptrs)), [offsetof_args_max]
"I" (offsetof(Args, max)), [offsetof_args_min]
"I" (offsetof(Args, min)), [offsetof_args_outptrs]
"I" (offsetof(Args, outptrs)), [offsetof_args_params]
"I" (offsetof(Args, params)), [params_struct]
"r" (&params_struct)
// Clobbers: flags, memory, all predicate registers, the scratch x-registers
// used above and every z-register.
270 :
"cc",
"memory",
"p0",
"p1",
"p2",
"p3",
"p4",
"p5",
"p6",
"p7",
"p8",
"p9",
"p10",
"p11",
"p12",
"p13",
"p14",
"p15",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"z21",
"z22",
"z23",
"z24",
"z25",
"z26",
"z27",
"z28",
"z29",
"z30",
"z31"
277 #endif // defined(ARM_COMPUTE_ENABLE_SME2)