25 #include "pooling.hpp"
32 #if defined(ARM_COMPUTE_ENABLE_SME)
// Multiplier/shift pair. Presumably the members of RescaleParams; the struct
// header is not visible in this view - TODO confirm.
40 int32_t multiplier, shift;
// Table of eight precomputed multiplier/shift entries. The kernel below reads
// rescale_params[window_cells - 2] when 2 <= window_cells <= 9. The
// initialiser entries themselves are not visible in this view.
43 constexpr RescaleParams rescale_params[8] = {
// Average-pooling kernel entry point for uint8 input (name: u8q, nhwc, avg).
//
// Visible parameters:
//   window_cells  - selects the averaging multiplier/shift (table lookup for
//                   2..9, computed otherwise).
//   n_valid_cells - number of input pointers that are accumulated; also
//                   scales the accumulator start value.
//   inptrs        - array of pointers to uint8 input data, one per cell.
//   qp            - requantisation parameters (input_offset, per_layer_mul,
//                   per_layer_left_shift, per_layer_right_shift and
//                   output_offset are read below).
// NOTE(review): n_channels and outptr are used by the body but their
// declarations are on lines not visible in this view.
55 void sme_u8q_nhwc_avg_generic_depthfirst_impl(
56 const uint64_t window_cells,
57 const uint64_t n_valid_cells,
59 const uint8_t *
const *
const inptrs,
61 const Requantize32 &qp
// Fast path: a 1x1 window with one valid cell needs no averaging, so the
// n_channels bytes of the first input are copied to outptr unchanged.
// NOTE(review): the early return after the copy is presumably on a line not
// visible here - confirm.
64 if (n_valid_cells == 1 && window_cells == 1)
67 std::memcpy(outptr, *inptrs, n_channels);
// Derive the multiplier (rescale_value) and shift (shift_value) that together
// implement the division by window_cells in fixed point.
72 int32_t shift_value = 0, rescale_value = 0;
// Window sizes 2..9 map onto the eight precomputed rescale_params entries.
73 if (2 <= window_cells && window_cells <= 9)
// Bind the table entry by reference; index 0 corresponds to window_cells == 2.
75 auto &params = rescale_params[window_cells - 2];
76 rescale_value = params.multiplier;
77 shift_value = params.shift;
// Other window sizes: compute the reciprocal at run time.
// NOTE(review): the `else` and the braces separating this path from the
// lookup above are on lines not visible in this view.
81 auto f_rescale_value = 1.0f /
static_cast<float>(window_cells);
// Double the reciprocal until it is at least 0.5. The matching adjustment of
// shift_value is presumably on a line not visible here - TODO confirm.
84 while (f_rescale_value < 0.5f)
87 f_rescale_value *= 2.0f;
// Scale by 2^31 and round to obtain a fixed-point multiplier.
90 int64_t long_rescale_value =
round(f_rescale_value *
static_cast<float>(1ll << 31));
// A value that rounded up to exactly 2^31 does not fit in int32_t, so it is
// halved. Any compensating change to shift_value is not visible here.
91 if (long_rescale_value == (1ll << 31))
94 long_rescale_value >>= 1;
96 rescale_value =
static_cast<int32_t
>(long_rescale_value);
// Start value for the accumulators: the negated input offset multiplied by the
// number of valid cells. The main kernel broadcasts it with ld1rw.
102 const int32_t accumulator_init = -qp.input_offset * n_valid_cells;
// Net shift = per-layer left shift - per-layer right shift + averaging shift.
106 const int32_t shift = qp.per_layer_left_shift - qp.per_layer_right_shift + shift_value;
// Split the net shift by sign: left_shift is >= 0 and right_shift is <= 0, so
// exactly one of them is non-zero. The kernel applies both with srshl.
107 const int32_t left_shift = shift > 0 ? shift : 0;
108 const int32_t right_shift = shift <= 0 ? shift : 0;
// Fold the per-layer multiplier and the averaging multiplier into a single
// value with a scalar sqrdmulh, the same instruction the kernel uses on the
// vector accumulators.
110 int32_t combined_rescale_value = 0;
111 __asm__ __volatile__ (
// Place both multipliers in lane 0 of v16 / v17.
112 "mov v16.s[0], %w[per_layer_mul]\n"
113 "mov v17.s[0], %w[rescale_value]\n"
// s18 = sqrdmulh(s16, s17), then copy the result back to the C variable.
114 "sqrdmulh s18, s16, s17\n"
115 "mov %w[combined_rescale_value], v18.s[0]\n"
// Output operand.
116 : [combined_rescale_value]
"=r" (combined_rescale_value)
// Input operands.
117 : [per_layer_mul]
"r" (qp.per_layer_mul), [rescale_value]
"r" (rescale_value)
// Clobbers: the three vector registers used as scratch.
118 :
"v16",
"v17",
"v18"
// Main pooling kernel, bracketed by the SMSTART / SMSTOP encodings.
// NOTE(review): labels, branches and some register set-up are on lines not
// visible in this view, so the comments below describe what each visible group
// of instructions does; the control flow linking the groups is not shown.
121 __asm__ __volatile__(
// Encoded SMSTART (see the inline annotation).
122 ".inst 0xd503477f // SMSTART ZA\n"
// x25 / x24 = 2x and 3x the vector length in bytes, used as channel offsets
// for the third and fourth vectors. x27 / x26 are presumably set up for the
// first two vectors on lines not visible here - TODO confirm.
125 "cntb x25, ALL, MUL #2\n"
126 "cntb x24, ALL, MUL #3\n"
// p4..p1: byte predicates for the channels at offsets x27, x26, x25, x24;
// lanes are active while their index is below n_channels.
127 "whilelt p4.b, x27, %x[n_channels]\n"
128 "whilelt p3.b, x26, %x[n_channels]\n"
129 "whilelt p2.b, x25, %x[n_channels]\n"
130 "whilelt p1.b, x24, %x[n_channels]\n"
// --- Four-vector path ---
// Broadcast accumulator_init (passed by address) to every 32-bit lane of z15.
// The remaining accumulators z14..z0 are presumably initialised on lines not
// visible here.
134 "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
// x23 = n_valid_cells / 2: number of input-pointer pairs to process.
135 "lsr x23, %x[n_valid_cells], #0x1\n"
// x22 walks the array of input pointers.
140 "mov x22, %x[inptrs]\n"
// Fetch the first pair of input pointers and load four byte vectors from each.
153 "ldp x21, x20, [x22, #0x0]\n"
154 "subs x23, x23, #0x1\n"
155 "add x22, x22, #0x10\n"
156 "ld1b { z31.b }, p4/Z, [x21, x27]\n"
157 "ld1b { z30.b }, p4/Z, [x20, x27]\n"
158 "ld1b { z29.b }, p3/Z, [x21, x26]\n"
159 "ld1b { z28.b }, p3/Z, [x20, x26]\n"
160 "ld1b { z27.b }, p2/Z, [x21, x25]\n"
161 "ld1b { z26.b }, p2/Z, [x20, x25]\n"
162 "ld1b { z25.b }, p1/Z, [x21, x24]\n"
163 "ld1b { z24.b }, p1/Z, [x20, x24]\n"
// Pair step with loads for the next pair interleaved: uaddlb/uaddlt add the
// two inputs into 16-bit lanes (bottom/top elements), then uaddwb/uaddwt add
// those sums into the sixteen 32-bit accumulators z15..z0.
166 ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
167 ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
168 "ldp x21, x20, [x22, #0x0]\n"
169 "subs x23, x23, #0x1\n"
170 ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
171 ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
172 "add x22, x22, #0x10\n"
173 "ld1b { z31.b }, p4/Z, [x21, x27]\n"
174 ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
175 ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
176 "ld1b { z30.b }, p4/Z, [x20, x27]\n"
177 ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
178 ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
179 "ld1b { z29.b }, p3/Z, [x21, x26]\n"
180 ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
181 ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
182 "ld1b { z28.b }, p3/Z, [x20, x26]\n"
183 ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
184 ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
185 "ld1b { z27.b }, p2/Z, [x21, x25]\n"
186 ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
187 ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
188 "ld1b { z26.b }, p2/Z, [x20, x25]\n"
189 ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
190 ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
191 "ld1b { z25.b }, p1/Z, [x21, x24]\n"
192 ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
193 ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
194 "ld1b { z24.b }, p1/Z, [x20, x24]\n"
195 ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
196 ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
197 ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
198 ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
199 ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
200 ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
// Same widen-and-accumulate for the pair already in z31..z24, with no
// further loads.
203 ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
204 ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
205 ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
206 ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
207 ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
208 ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
209 ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
210 ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
211 ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
212 ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
213 ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
214 ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
215 ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
216 ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
217 ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
218 ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
219 ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
220 ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
221 ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
222 ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
223 ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
224 ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
225 ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
226 ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
// x21 = n_valid_cells & 1: non-zero when one unpaired input remains.
228 "ands x21, %x[n_valid_cells], #0x1\n"
// Single-input step: load one pointer (post-incrementing x22), widen its
// bytes to 16 bits with ushllb/ushllt #0, then accumulate as above.
231 "ldr x20, [x22], #0x8\n"
232 "ld1b { z16.b }, p4/Z, [x20, x27]\n"
233 ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
234 ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
235 "ld1b { z16.b }, p3/Z, [x20, x26]\n"
236 ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
237 ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
238 "subs x21, x21, #0x1\n"
239 "ld1b { z16.b }, p2/Z, [x20, x25]\n"
240 ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
241 ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
242 "ld1b { z16.b }, p1/Z, [x20, x24]\n"
243 ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
244 ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
245 ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
246 ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
247 ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
248 ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
249 ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
250 ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
251 ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
252 ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
253 ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
254 ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
255 ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
256 ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
257 ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
258 ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
259 ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
260 ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
// Requantise the sixteen accumulators. Constants are broadcast from memory:
// z19 = left_shift, z18 = combined_rescale_value, z17 = right_shift and
// z16 = qp.output_offset (address formed in x20). Step 1: srshl by left_shift.
263 "ld1rw { z19.s }, p0/Z, [%x[left_shift]]\n"
264 ".inst 0x4482826f // srshl z15.s, p0/M, z15.s, z19.s\n"
265 ".inst 0x4482826e // srshl z14.s, p0/M, z14.s, z19.s\n"
266 "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
267 ".inst 0x4482826d // srshl z13.s, p0/M, z13.s, z19.s\n"
268 ".inst 0x4482826c // srshl z12.s, p0/M, z12.s, z19.s\n"
269 "ld1rw { z18.s }, p0/Z, [%x[combined_rescale_value]]\n"
270 ".inst 0x4482826b // srshl z11.s, p0/M, z11.s, z19.s\n"
271 ".inst 0x4482826a // srshl z10.s, p0/M, z10.s, z19.s\n"
272 "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
273 ".inst 0x44828269 // srshl z9.s, p0/M, z9.s, z19.s\n"
274 ".inst 0x44828268 // srshl z8.s, p0/M, z8.s, z19.s\n"
275 "ld1rw { z16.s }, p0/Z, [x20]\n"
276 ".inst 0x44828267 // srshl z7.s, p0/M, z7.s, z19.s\n"
277 ".inst 0x44828266 // srshl z6.s, p0/M, z6.s, z19.s\n"
278 ".inst 0x44828265 // srshl z5.s, p0/M, z5.s, z19.s\n"
279 ".inst 0x44828264 // srshl z4.s, p0/M, z4.s, z19.s\n"
280 ".inst 0x44828263 // srshl z3.s, p0/M, z3.s, z19.s\n"
281 ".inst 0x44828262 // srshl z2.s, p0/M, z2.s, z19.s\n"
282 ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
283 ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
// Step 2: sqrdmulh by the combined multiplier.
284 ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
285 ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
286 ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
287 ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
288 ".inst 0x04b2756b // sqrdmulh z11.s, z11.s, z18.s\n"
289 ".inst 0x04b2754a // sqrdmulh z10.s, z10.s, z18.s\n"
290 ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
291 ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
292 ".inst 0x04b274e7 // sqrdmulh z7.s, z7.s, z18.s\n"
293 ".inst 0x04b274c6 // sqrdmulh z6.s, z6.s, z18.s\n"
294 ".inst 0x04b274a5 // sqrdmulh z5.s, z5.s, z18.s\n"
295 ".inst 0x04b27484 // sqrdmulh z4.s, z4.s, z18.s\n"
296 ".inst 0x04b27463 // sqrdmulh z3.s, z3.s, z18.s\n"
297 ".inst 0x04b27442 // sqrdmulh z2.s, z2.s, z18.s\n"
298 ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
299 ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
// Step 3: srshl by right_shift (computed as <= 0 by the C code above).
300 ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
301 ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
302 ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
303 ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
304 ".inst 0x4482822b // srshl z11.s, p0/M, z11.s, z17.s\n"
305 ".inst 0x4482822a // srshl z10.s, p0/M, z10.s, z17.s\n"
306 ".inst 0x44828229 // srshl z9.s, p0/M, z9.s, z17.s\n"
307 ".inst 0x44828228 // srshl z8.s, p0/M, z8.s, z17.s\n"
308 ".inst 0x44828227 // srshl z7.s, p0/M, z7.s, z17.s\n"
309 ".inst 0x44828226 // srshl z6.s, p0/M, z6.s, z17.s\n"
310 ".inst 0x44828225 // srshl z5.s, p0/M, z5.s, z17.s\n"
311 ".inst 0x44828224 // srshl z4.s, p0/M, z4.s, z17.s\n"
312 ".inst 0x44828223 // srshl z3.s, p0/M, z3.s, z17.s\n"
313 ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
314 ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
315 ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
// Step 4: add the output offset held in z16.
316 "add z15.s, z15.s, z16.s\n"
317 "add z14.s, z14.s, z16.s\n"
318 "add z13.s, z13.s, z16.s\n"
319 "add z12.s, z12.s, z16.s\n"
320 "add z11.s, z11.s, z16.s\n"
321 "add z10.s, z10.s, z16.s\n"
322 "add z9.s, z9.s, z16.s\n"
323 "add z8.s, z8.s, z16.s\n"
324 "add z7.s, z7.s, z16.s\n"
325 "add z6.s, z6.s, z16.s\n"
326 "add z5.s, z5.s, z16.s\n"
327 "add z4.s, z4.s, z16.s\n"
328 "add z3.s, z3.s, z16.s\n"
329 "add z2.s, z2.s, z16.s\n"
330 "add z1.s, z1.s, z16.s\n"
331 "add z0.s, z0.s, z16.s\n"
// Clamp: lower bound via smax with z16, upper bound via smin with z19.
// NOTE(review): the instructions that load the clamp bounds into z16 / z19
// are not visible in this view (z16 last held the output offset and z19 the
// left shift) - confirm they are reloaded before this point.
334 "smax z15.s, p0/M, z15.s, z16.s\n"
335 "smax z14.s, p0/M, z14.s, z16.s\n"
336 "smax z13.s, p0/M, z13.s, z16.s\n"
337 "smax z12.s, p0/M, z12.s, z16.s\n"
338 "smax z11.s, p0/M, z11.s, z16.s\n"
339 "smax z10.s, p0/M, z10.s, z16.s\n"
340 "smax z9.s, p0/M, z9.s, z16.s\n"
341 "smax z8.s, p0/M, z8.s, z16.s\n"
342 "smax z7.s, p0/M, z7.s, z16.s\n"
343 "smax z6.s, p0/M, z6.s, z16.s\n"
344 "smax z5.s, p0/M, z5.s, z16.s\n"
345 "smax z4.s, p0/M, z4.s, z16.s\n"
346 "smax z3.s, p0/M, z3.s, z16.s\n"
347 "smax z2.s, p0/M, z2.s, z16.s\n"
348 "smax z1.s, p0/M, z1.s, z16.s\n"
349 "smax z0.s, p0/M, z0.s, z16.s\n"
// smin interleaved with the first narrowing stage: trn1 on .h lanes keeps the
// low 16 bits of each 32-bit lane from a pair of accumulators.
350 "smin z15.s, p0/M, z15.s, z19.s\n"
351 "smin z14.s, p0/M, z14.s, z19.s\n"
352 "trn1 z23.h, z15.h, z14.h\n"
353 "smin z13.s, p0/M, z13.s, z19.s\n"
354 "smin z12.s, p0/M, z12.s, z19.s\n"
355 "trn1 z16.h, z13.h, z12.h\n"
356 "smin z11.s, p0/M, z11.s, z19.s\n"
357 "smin z10.s, p0/M, z10.s, z19.s\n"
358 "trn1 z22.h, z11.h, z10.h\n"
359 "smin z9.s, p0/M, z9.s, z19.s\n"
360 "smin z8.s, p0/M, z8.s, z19.s\n"
361 "trn1 z18.h, z9.h, z8.h\n"
362 "smin z7.s, p0/M, z7.s, z19.s\n"
363 "smin z6.s, p0/M, z6.s, z19.s\n"
364 "trn1 z21.h, z7.h, z6.h\n"
365 "smin z5.s, p0/M, z5.s, z19.s\n"
366 "smin z4.s, p0/M, z4.s, z19.s\n"
367 "trn1 z17.h, z5.h, z4.h\n"
368 "smin z3.s, p0/M, z3.s, z19.s\n"
369 "smin z2.s, p0/M, z2.s, z19.s\n"
370 "trn1 z20.h, z3.h, z2.h\n"
371 "smin z1.s, p0/M, z1.s, z19.s\n"
372 "smin z0.s, p0/M, z0.s, z19.s\n"
373 "trn1 z19.h, z1.h, z0.h\n"
// Second narrowing stage: trn1 on .b lanes packs the low byte of every result
// into one byte vector per group of four accumulators. Each vector is stored
// under its channel predicate and its offset advanced by four vector lengths.
374 "trn1 z16.b, z23.b, z16.b\n"
375 "trn1 z18.b, z22.b, z18.b\n"
376 "st1b { z16.b }, p4, [%x[outptr], x27]\n"
377 "incb x27, ALL, MUL #4\n"
378 "trn1 z17.b, z21.b, z17.b\n"
379 "trn1 z16.b, z20.b, z19.b\n"
380 "st1b { z18.b }, p3, [%x[outptr], x26]\n"
381 "incb x26, ALL, MUL #4\n"
382 "st1b { z17.b }, p2, [%x[outptr], x25]\n"
383 "incb x25, ALL, MUL #4\n"
384 "st1b { z16.b }, p1, [%x[outptr], x24]\n"
385 "incb x24, ALL, MUL #4\n"
// Recompute channel predicates at the advanced offsets. The branches that
// consume the resulting flags are not visible in this view.
386 "whilelt p1.b, x24, %x[n_channels]\n"
389 "whilelt p4.b, x27, %x[n_channels]\n"
// --- Single-vector path ---
// Same sequence as above for one vector of channels at offset x27, using
// accumulators z15..z12 only.
392 "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
393 "lsr x23, %x[n_valid_cells], #0x1\n"
397 "mov x22, %x[inptrs]\n"
// Load the first pair of inputs.
399 "ldp x21, x20, [x22, #0x0]\n"
400 "subs x23, x23, #0x1\n"
401 "add x22, x22, #0x10\n"
402 "ld1b { z31.b }, p4/Z, [x21, x27]\n"
403 "ld1b { z30.b }, p4/Z, [x20, x27]\n"
// Pair step with the loads for the next pair interleaved.
406 ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
407 ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
408 "ldp x21, x20, [x22, #0x0]\n"
409 "subs x23, x23, #0x1\n"
410 ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
411 ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
412 "add x22, x22, #0x10\n"
413 "ld1b { z31.b }, p4/Z, [x21, x27]\n"
414 ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
415 ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
416 "ld1b { z30.b }, p4/Z, [x20, x27]\n"
// Pair step for the pair already loaded, with no further loads.
419 ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
420 ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
421 ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
422 ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
423 ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
424 ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
// Unpaired input, if n_valid_cells is odd.
426 "ands x21, %x[n_valid_cells], #0x1\n"
429 "ldr x20, [x22], #0x8\n"
430 "ld1b { z16.b }, p4/Z, [x20, x27]\n"
431 ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
432 ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
433 "subs x21, x21, #0x1\n"
434 ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
435 ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
436 ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
437 ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
// Requantise: srshl by left_shift, sqrdmulh by combined_rescale_value, srshl
// by right_shift, then add qp.output_offset. z16 is reused for each constant
// in turn.
440 "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
441 ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
442 ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
443 "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
444 ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
445 ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
446 "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
447 ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
448 ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
449 "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
450 ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
451 ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
452 "ld1rw { z16.s }, p0/Z, [x20]\n"
453 ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
454 ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
455 ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
456 ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
457 "add z15.s, z15.s, z16.s\n"
458 "add z14.s, z14.s, z16.s\n"
459 "add z13.s, z13.s, z16.s\n"
460 "add z12.s, z12.s, z16.s\n"
// Clamp with smax (z17) and smin (z16).
// NOTE(review): the loads of the clamp bounds into z17 / z16 are not visible
// in this view - confirm.
463 "smax z15.s, p0/M, z15.s, z17.s\n"
464 "smax z14.s, p0/M, z14.s, z17.s\n"
465 "smax z13.s, p0/M, z13.s, z17.s\n"
466 "smax z12.s, p0/M, z12.s, z17.s\n"
467 "smin z15.s, p0/M, z15.s, z16.s\n"
468 "smin z14.s, p0/M, z14.s, z16.s\n"
469 "trn1 z17.h, z15.h, z14.h\n"
470 "smin z13.s, p0/M, z13.s, z16.s\n"
471 "smin z12.s, p0/M, z12.s, z16.s\n"
// Narrow to bytes and store one vector of results.
472 "trn1 z16.h, z13.h, z12.h\n"
473 "trn1 z16.b, z17.b, z16.b\n"
474 "st1b { z16.b }, p4, [%x[outptr], x27]\n"
// Predicate for the next vector of channels. The increment of x27 and the
// loop branch are not visible in this view.
476 "whilelt p4.b, x27, %x[n_channels]\n"
// Encoded SMSTOP (see the inline annotation).
479 ".inst 0xd503467f // SMSTOP\n"
// Input operands. Values read with ld1rw (accumulator_init,
// combined_rescale_value, left_shift, right_shift) and qp are passed by
// address. The offset of output_offset within Requantize32 is an immediate.
481 : [accumulator_init]
"r" (&accumulator_init), [combined_rescale_value]
"r" (&combined_rescale_value), [inptrs]
"r" (inptrs), [left_shift]
"r" (&left_shift), [n_channels]
"r" (n_channels), [n_valid_cells]
"r" (n_valid_cells), [offsetof_qp_output_offset]
"I" (offsetof(Requantize32, output_offset)), [outptr]
"r" (outptr), [quant_params]
"r" (&qp), [right_shift]
"r" (&right_shift)
// Clobbers: condition flags, memory, all predicate registers, the scratch
// general-purpose registers x20..x27 and all z registers.
482 :
"cc",
"memory",
"p0",
"p1",
"p2",
"p3",
"p4",
"p5",
"p6",
"p7",
"p8",
"p9",
"p10",
"p11",
"p12",
"p13",
"p14",
"p15",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"z21",
"z22",
"z23",
"z24",
"z25",
"z26",
"z27",
"z28",
"z29",
"z30",
"z31"
489 #endif // defined(ARM_COMPUTE_ENABLE_SME)