27 #if defined(ARM_COMPUTE_ENABLE_SME)
// sme_transpose_interleave_16VL
//
// Moves 32-bit words from a strided input into `out`, sixteen vector registers
// (z31 down to z16) at a time, using predicated SVE loads (ld1w) and stores
// (st1w) executed between an SMSTART and an SMSTOP.
//
// NOTE(review): this listing has gaps (the embedded original line numbers
// skip), so the loop labels, branches and the setup of x20, x22, x23 and p7
// are not visible here. The comments below describe only the instructions
// that are visible.
//
// Parameters:
//   out       - destination buffer; advanced by 16 vector lengths per pass
//               (addvl) inside the asm.
//   in        - source buffer; rewritten inside the asm as x23 + in_stride.
//   width     - copied into x21, which is then reduced by 16 vectors' worth
//               of 32-bit elements (decw ... MUL #16).
//   in_stride - added to the row pointer x23 to produce the next `in`; the
//               Transform wrapper in this file passes stride * sizeof(float).
//   height    - decremented by one inside the asm and compared against 1;
//               also scales out_stride.
31 void sme_transpose_interleave_16VL(uint32_t *out,
const uint32_t *in,
size_t width,
size_t in_stride,
size_t height)
// Step added to the output pointer x22: 16 * height * the value returned by
// sme::get_vector_length<uint8_t>() (presumably the vector length in bytes -
// TODO confirm).
33 size_t out_stride = 16 * height * sme::get_vector_length<uint8_t>();
// NOTE(review): the in-string comment says "SMSTART ZA", but encoding
// 0xd503477f looks like plain SMSTART (streaming mode + ZA) - verify against
// the Arm ARM.
36 ".inst 0xd503477f // SMSTART ZA\n"
// Next-row input pointer = current row pointer (x23) + in_stride.
40 "add %x[in], x23, %x[in_stride]\n"
// One fewer row left to process.
42 "sub %x[height], %x[height], #0x1\n"
// x21 holds the width value that decw reduces further down.
43 "mov x21, %x[width]\n"
// First eight loads: each whilelt rebuilds p0 so that 32-bit lanes with index
// below x20 are active, then ld1w loads one vector from x23 at vector offsets
// 0..7, zeroing inactive lanes. Updates to x20 between the loads are not
// visible in this listing.
46 "whilelt p0.s, XZR, x20\n"
47 "ld1w { z31.s }, p0/Z, [x23]\n"
49 "whilelt p0.s, XZR, x20\n"
50 "ld1w { z30.s }, p0/Z, [x23, #1, MUL VL]\n"
52 "whilelt p0.s, XZR, x20\n"
53 "ld1w { z29.s }, p0/Z, [x23, #2, MUL VL]\n"
55 "whilelt p0.s, XZR, x20\n"
56 "ld1w { z28.s }, p0/Z, [x23, #3, MUL VL]\n"
58 "whilelt p0.s, XZR, x20\n"
59 "ld1w { z27.s }, p0/Z, [x23, #4, MUL VL]\n"
61 "whilelt p0.s, XZR, x20\n"
62 "ld1w { z26.s }, p0/Z, [x23, #5, MUL VL]\n"
64 "whilelt p0.s, XZR, x20\n"
65 "ld1w { z25.s }, p0/Z, [x23, #6, MUL VL]\n"
67 "whilelt p0.s, XZR, x20\n"
68 "ld1w { z24.s }, p0/Z, [x23, #7, MUL VL]\n"
// Predicates for the second group of eight vectors are built up front in
// p0 and p6..p1 before any of those loads are issued.
70 "whilelt p0.s, XZR, x20\n"
72 "whilelt p6.s, XZR, x20\n"
74 "whilelt p5.s, XZR, x20\n"
76 "whilelt p4.s, XZR, x20\n"
78 "whilelt p3.s, XZR, x20\n"
80 "whilelt p2.s, XZR, x20\n"
82 "whilelt p1.s, XZR, x20\n"
// Advance the row pointer by 16 vector lengths; the remaining eight loads
// therefore use negative vector offsets -8..-1.
84 "addvl x23, x23, #16\n"
85 "ld1w { z23.s }, p0/Z, [x23, #-8, MUL VL]\n"
// p0 is rebuilt here for the final load (z16) after being consumed by z23.
86 "whilelt p0.s, XZR, x20\n"
88 "ld1w { z22.s }, p6/Z, [x23, #-7, MUL VL]\n"
// Reduce x21 by 16 x (number of 32-bit elements per vector).
89 "decw x21, ALL, MUL #16\n"
90 "ld1w { z21.s }, p5/Z, [x23, #-6, MUL VL]\n"
92 "ld1w { z20.s }, p4/Z, [x23, #-5, MUL VL]\n"
// Step the output pointer x22 by out_stride.
93 "add x22, x22, %x[out_stride]\n"
94 "ld1w { z19.s }, p3/Z, [x23, #-4, MUL VL]\n"
95 "ld1w { z18.s }, p2/Z, [x23, #-3, MUL VL]\n"
96 "ld1w { z17.s }, p1/Z, [x23, #-2, MUL VL]\n"
97 "ld1w { z16.s }, p0/Z, [x23, #-1, MUL VL]\n"
// Store the sixteen loaded vectors contiguously at x20 under predicate p7.
// NOTE(review): x20 is used as a store address here, so it must be reloaded
// with an output pointer on a line not shown; p7 is presumably all-true -
// confirm against the full file.
98 "st1w { z31.s }, p7, [x20]\n"
99 "st1w { z30.s }, p7, [x20, #1, MUL VL]\n"
100 "st1w { z29.s }, p7, [x20, #2, MUL VL]\n"
101 "st1w { z28.s }, p7, [x20, #3, MUL VL]\n"
102 "st1w { z27.s }, p7, [x20, #4, MUL VL]\n"
103 "st1w { z26.s }, p7, [x20, #5, MUL VL]\n"
104 "st1w { z25.s }, p7, [x20, #6, MUL VL]\n"
105 "st1w { z24.s }, p7, [x20, #7, MUL VL]\n"
// Same addressing trick as for the loads: bump the base by 16 vector lengths
// and address the remaining eight stores with offsets -8..-1.
106 "addvl x20, x20, #16\n"
107 "st1w { z23.s }, p7, [x20, #-8, MUL VL]\n"
108 "st1w { z22.s }, p7, [x20, #-7, MUL VL]\n"
109 "st1w { z21.s }, p7, [x20, #-6, MUL VL]\n"
110 "st1w { z20.s }, p7, [x20, #-5, MUL VL]\n"
111 "st1w { z19.s }, p7, [x20, #-4, MUL VL]\n"
112 "st1w { z18.s }, p7, [x20, #-3, MUL VL]\n"
113 "st1w { z17.s }, p7, [x20, #-2, MUL VL]\n"
114 "st1w { z16.s }, p7, [x20, #-1, MUL VL]\n"
// Compare the remaining height against 1 (the branch consuming these flags is
// not visible) and advance `out` by 16 vector lengths.
117 "cmp %x[height], #0x1\n"
118 "addvl %x[out], %x[out], #16\n"
// Leave the mode entered by the SMSTART above.
120 ".inst 0xd503467f // SMSTOP\n"
// Output operands: height, in and out are read-write ("+") and marked
// early-clobber ("&") because the asm modifies them.
121 : [height]
"+&r" (height), [in]
"+&r" (in), [out]
"+&r" (out)
// Input-only operands.
122 : [in_stride]
"r" (in_stride), [out_stride]
"r" (out_stride), [width]
"r" (width)
// Clobbers: condition flags, memory, all predicate registers p0-p15, the
// scratch registers x20-x23, and all vector registers z0-z31.
123 :
"cc",
"memory",
"p0",
"p1",
"p2",
"p3",
"p4",
"p5",
"p6",
"p7",
"p8",
"p9",
"p10",
"p11",
"p12",
"p13",
"p14",
"p15",
"x20",
"x21",
"x22",
"x23",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"z21",
"z22",
"z23",
"z24",
"z25",
"z26",
"z27",
"z28",
"z29",
"z30",
"z31"
// Transform<16, 1, true, VLType::SME> for float data.
//
// Reinterprets the float buffers as uint32_t and forwards to
// sme_transpose_interleave_16VL with:
//   out       = out
//   in        = in + k0 * stride + x0   (offset applied in float elements)
//   width     = (xmax - x0) * sizeof(float) / 4
//   in_stride = stride * sizeof(float)
//
// NOTE(review): the final (height) argument and the closing of the call are
// not visible in this listing; presumably the height is kmax - k0 - confirm
// against the full file.
130 void Transform<16, 1, true, VLType::SME>(
131 float *out,
const float *in,
int stride,
int x0,
int xmax,
int k0,
int kmax)
133 sme_transpose_interleave_16VL(
134 reinterpret_cast<uint32_t *
>(out),
135 reinterpret_cast<const uint32_t *
>(in + k0 * stride + x0),
136 (xmax-x0) *
sizeof(
float) / 4,
137 stride *
sizeof(
float),
143 #endif // defined(ARM_COMPUTE_ENABLE_SME)