24.04
generic.cpp
Go to the documentation of this file.
1
/*
2
* Copyright (c) 2021, 2023 Arm Limited.
3
*
4
* SPDX-License-Identifier: MIT
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to
8
* deal in the Software without restriction, including without limitation the
9
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10
* sell copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in all
14
* copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
25
#include <cstddef>
26
#include <cstdint>
27
28
#if defined(ARM_COMPUTE_ENABLE_SVE)
29
30
namespace
arm_conv
{
31
namespace
depthwise
{
32
33
void
sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
34
const
float
*
const
*
const
inptrs,
35
float
*
const
*
const
outptrs,
36
const
void
*params,
37
const
void
*
bias
,
38
const
unsigned
int
n_points,
39
const
unsigned
int
n_channels,
40
const
float
activation_min
,
41
const
float
activation_max
42
)
43
{
44
const
float
minmax_vals[2] = {
activation_min
,
activation_max
};
45
46
__asm__ __volatile__(
47
"ptrue p1.b\n"
48
"mov x11, #0x0\n"
49
"ld1rw { z2.s }, p1/Z, [%x[minmax_vals]]\n"
50
"ld1rw { z1.s }, p1/Z, [%x[minmax_vals], #4]\n"
51
"whilelt p0.s, x11, %x[n_channels]\n"
52
"1:"
// Channel loop
53
"mov z23.b, #0x0\n"
54
"cbz %x[bias], 2f\n"
55
"ld1w { z23.s }, p0/Z, [%x[bias], x11, LSL #2]\n"
56
"2:"
// Channel loop: Load bias: Done
57
"mov x10, %x[inptrs]\n"
58
"ldp x28, x27, [x10], #0x10\n"
59
"ldp x26, x25, [x10], #0x10\n"
60
"subs x9, %x[n_points], #0x1\n"
61
"ldp x24, x23, [x10], #0x10\n"
62
"ldp x22, x21, [x10], #0x10\n"
63
"mov z24.d, z23.d\n"
64
"mov z25.d, z23.d\n"
65
"ldr x20, [x10], #0x8\n"
66
"mov z26.d, z23.d\n"
67
"mov z27.d, z23.d\n"
68
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
69
"mov z28.d, z23.d\n"
70
"mov z29.d, z23.d\n"
71
"ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
72
"ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
73
"mov z30.d, z23.d\n"
74
"mov z31.d, z23.d\n"
75
"ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
76
"ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
77
"ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
78
"ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
79
"addvl %x[params], %x[params], #1\n"
80
"ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
81
"ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
82
"ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
83
"ble 4f\n"
84
"3:"
// Channel loop: Planar loop
85
"ldp x28, x27, [x10], #0x10\n"
86
"ldp x26, x25, [x10], #0x10\n"
87
"subs x9, x9, #0x1\n"
88
"fmla z23.s, p1/M, z14.s, z0.s\n"
89
"ldp x24, x23, [x10], #0x10\n"
90
"ldp x22, x21, [x10], #0x10\n"
91
"fmla z24.s, p1/M, z15.s, z0.s\n"
92
"fmla z25.s, p1/M, z16.s, z0.s\n"
93
"ldr x20, [x10], #0x8\n"
94
"fmla z26.s, p1/M, z17.s, z0.s\n"
95
"fmla z27.s, p1/M, z18.s, z0.s\n"
96
"ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
97
"fmla z28.s, p1/M, z19.s, z0.s\n"
98
"fmla z29.s, p1/M, z20.s, z0.s\n"
99
"ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
100
"ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
101
"fmla z30.s, p1/M, z21.s, z0.s\n"
102
"fmla z31.s, p1/M, z22.s, z0.s\n"
103
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
104
"ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
105
"ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
106
"ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
107
"addvl %x[params], %x[params], #1\n"
108
"ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
109
"ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
110
"ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
111
"bgt 3b\n"
112
"4:"
// Channel loop: Planar tail
113
"fmla z23.s, p1/M, z14.s, z0.s\n"
114
"fmla z24.s, p1/M, z15.s, z0.s\n"
115
"fmax z23.s, p1/M, z23.s, z2.s\n"
116
"fmax z24.s, p1/M, z24.s, z2.s\n"
117
"fmla z25.s, p1/M, z16.s, z0.s\n"
118
"fmla z26.s, p1/M, z17.s, z0.s\n"
119
"fmax z25.s, p1/M, z25.s, z2.s\n"
120
"fmax z26.s, p1/M, z26.s, z2.s\n"
121
"fmla z27.s, p1/M, z18.s, z0.s\n"
122
"fmla z28.s, p1/M, z19.s, z0.s\n"
123
"fmax z27.s, p1/M, z27.s, z2.s\n"
124
"fmax z28.s, p1/M, z28.s, z2.s\n"
125
"fmla z29.s, p1/M, z20.s, z0.s\n"
126
"fmla z30.s, p1/M, z21.s, z0.s\n"
127
"fmax z29.s, p1/M, z29.s, z2.s\n"
128
"fmax z30.s, p1/M, z30.s, z2.s\n"
129
"fmla z31.s, p1/M, z22.s, z0.s\n"
130
"fmax z31.s, p1/M, z31.s, z2.s\n"
131
"ldp x28, x27, [%x[outptrs], #0x0]\n"
132
"ldp x26, x25, [%x[outptrs], #0x10]\n"
133
"ldp x24, x23, [%x[outptrs], #0x20]\n"
134
"ldp x22, x21, [%x[outptrs], #0x30]\n"
135
"fmin z23.s, p1/M, z23.s, z1.s\n"
136
"fmin z24.s, p1/M, z24.s, z1.s\n"
137
"ldr x20, [%x[outptrs], #0x40]\n"
138
"fmin z25.s, p1/M, z25.s, z1.s\n"
139
"fmin z26.s, p1/M, z26.s, z1.s\n"
140
"st1w { z23.s }, p0, [x28, x11, LSL #2]\n"
141
"fmin z27.s, p1/M, z27.s, z1.s\n"
142
"fmin z28.s, p1/M, z28.s, z1.s\n"
143
"st1w { z24.s }, p0, [x27, x11, LSL #2]\n"
144
"fmin z29.s, p1/M, z29.s, z1.s\n"
145
"fmin z30.s, p1/M, z30.s, z1.s\n"
146
"st1w { z25.s }, p0, [x26, x11, LSL #2]\n"
147
"fmin z31.s, p1/M, z31.s, z1.s\n"
148
"st1w { z26.s }, p0, [x25, x11, LSL #2]\n"
149
"st1w { z27.s }, p0, [x24, x11, LSL #2]\n"
150
"st1w { z28.s }, p0, [x23, x11, LSL #2]\n"
151
"st1w { z29.s }, p0, [x22, x11, LSL #2]\n"
152
"st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
153
"st1w { z31.s }, p0, [x20, x11, LSL #2]\n"
154
"incw x11\n"
155
"whilelt p0.s, x11, %x[n_channels]\n"
156
"b.any 1b\n"
157
: [params]
"+&r"
(params)
158
: [
bias
]
"r"
(
bias
), [inptrs]
"r"
(inptrs), [minmax_vals]
"r"
(minmax_vals), [n_channels]
"r"
((uint64_t) n_channels), [n_points]
"r"
((uint64_t) n_points), [outptrs]
"r"
(outptrs)
159
:
"cc"
,
"memory"
,
"p0"
,
"p1"
,
"x9"
,
"x10"
,
"x11"
,
"x20"
,
"x21"
,
"x22"
,
"x23"
,
"x24"
,
"x25"
,
"x26"
,
"x27"
,
"x28"
,
"z0"
,
"z1"
,
"z2"
,
"z14"
,
"z15"
,
"z16"
,
"z17"
,
"z18"
,
"z19"
,
"z20"
,
"z21"
,
"z22"
,
"z23"
,
"z24"
,
"z25"
,
"z26"
,
"z27"
,
"z28"
,
"z29"
,
"z30"
,
"z31"
160
);
161
}
162
163
}
// namespace depthwise
164
}
// namespace arm_conv
165
166
#endif // defined(ARM_COMPUTE_ENABLE_SVE)
arm_conv::depthwise::depthwise
template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)
activation_min
T activation_min
Definition:
working_space.hpp:145
bias
const int32_t * bias
Definition:
working_space.hpp:322
activation_max
T activation_max
Definition:
working_space.hpp:145
arm_conv
Definition:
addressing.cpp:30
src
core
NEON
kernels
arm_conv
depthwise
kernels
sve_fp32_nhwc_generic_output9_mla_depthfirst
generic.cpp
Generated on Mon Apr 29 2024 10:53:54 for Compute Library by
1.8.17