31 void MergeResults<8, 6, false>(
float *out,
const float *in,
const int ldout,
const int y0,
const int ymax,
const int x0,
const int xmax,
const float *
bias,
Activation act,
bool append) {
32 const float *inptr = in;
34 prefetch_6x(inptr + 96);
37 float minval = - std::numeric_limits<float>::infinity();
38 float maxval = std::numeric_limits<float>::infinity();
45 case Activation::Type::BoundedReLU:
46 maxval =
static_cast<float>(act.param1);
48 case Activation::Type::ReLU:
53 float32x4_t minv = vdupq_n_f32(minval);
54 float32x4_t maxv = vdupq_n_f32(maxval);
58 memset(nullbias, 0, (8 *
sizeof(
float)));
61 for (
int y=y0; y<ymax; y+=8) {
62 float *outptr0 = out + (y * ldout) + x0;
63 float *outptr1 = outptr0 + ldout;
64 float *outptr2 = outptr1 + ldout;
65 float *outptr3 = outptr2 + ldout;
66 float *outptr4 = outptr3 + ldout;
67 float *outptr5 = outptr4 + ldout;
76 for (
int i=x0; i<xmax; i+=8) {
83 switch ((y + 5) - ymax) {
110 for (
int xi=0; xi<8; xi++) {
112 *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
114 *outptr1 = std::min(std::max(minval, inptr[xi + 8] + *outptr1), maxval);
116 *outptr2 = std::min(std::max(minval, inptr[xi + 16] + *outptr2), maxval);
118 *outptr3 = std::min(std::max(minval, inptr[xi + 24] + *outptr3), maxval);
120 *outptr4 = std::min(std::max(minval, inptr[xi + 32] + *outptr4), maxval);
122 *outptr5 = std::min(std::max(minval, inptr[xi + 40] + *outptr5), maxval);
131 "VLD1.32 {d0-d3}, [%[inptr]]!\n"
132 "VLD1.32 {d8-d11}, [%[outptr0]]\n"
133 "VLD1.32 {d4-d7}, [%[inptr]]!\n"
134 "VLD1.32 {d12-d15}, [%[outptr1]]\n"
136 "VADD.f32 q4, q4, q0\n"
138 "VADD.f32 q5, q5, q1\n"
139 "VADD.f32 q6, q6, q2\n"
140 "VADD.f32 q7, q7, q3\n"
142 "VMAX.f32 q4, q4, %q[minv]\n"
143 "VMAX.f32 q5, q5, %q[minv]\n"
144 "VMAX.f32 q6, q6, %q[minv]\n"
146 "VMAX.f32 q7, q7, %q[minv]\n"
147 "VMIN.f32 q4, q4, %q[maxv]\n"
148 "VMIN.f32 q5, q5, %q[maxv]\n"
149 "VST1.32 {d8-d11}, [%[outptr0]]!\n"
150 "VMIN.f32 q6, q6, %q[maxv]\n"
151 "VMIN.f32 q7, q7, %q[maxv]\n"
152 "VST1.32 {d12-d15}, [%[outptr1]]!\n"
155 "VLD1.32 {d0-d3}, [%[inptr]]!\n"
156 "VLD1.32 {d8-d11}, [%[outptr2]]\n"
157 "VLD1.32 {d4-d7}, [%[inptr]]!\n"
158 "VLD1.32 {d12-d15}, [%[outptr3]]\n"
160 "VADD.f32 q4, q4, q0\n"
162 "VADD.f32 q5, q5, q1\n"
163 "VADD.f32 q6, q6, q2\n"
164 "VADD.f32 q7, q7, q3\n"
166 "VMAX.f32 q4, q4, %q[minv]\n"
167 "VMAX.f32 q5, q5, %q[minv]\n"
168 "VMAX.f32 q6, q6, %q[minv]\n"
170 "VMAX.f32 q7, q7, %q[minv]\n"
171 "VMIN.f32 q4, q4, %q[maxv]\n"
172 "VMIN.f32 q5, q5, %q[maxv]\n"
173 "VST1.32 {d8-d11}, [%[outptr2]]!\n"
174 "VMIN.f32 q6, q6, %q[maxv]\n"
175 "VMIN.f32 q7, q7, %q[maxv]\n"
176 "VST1.32 {d12-d15}, [%[outptr3]]!\n"
179 "VLD1.32 {d0-d3}, [%[inptr]]!\n"
180 "VLD1.32 {d8-d11}, [%[outptr4]]\n"
181 "VLD1.32 {d4-d7}, [%[inptr]]!\n"
182 "VLD1.32 {d12-d15}, [%[outptr5]]\n"
184 "VADD.f32 q4, q4, q0\n"
186 "VADD.f32 q5, q5, q1\n"
187 "VADD.f32 q6, q6, q2\n"
188 "VADD.f32 q7, q7, q3\n"
190 "VMAX.f32 q4, q4, %q[minv]\n"
191 "VMAX.f32 q5, q5, %q[minv]\n"
192 "VMAX.f32 q6, q6, %q[minv]\n"
194 "VMAX.f32 q7, q7, %q[minv]\n"
195 "VMIN.f32 q4, q4, %q[maxv]\n"
196 "VMIN.f32 q5, q5, %q[maxv]\n"
197 "VST1.32 {d8-d11}, [%[outptr4]]!\n"
198 "VMIN.f32 q6, q6, %q[maxv]\n"
199 "VMIN.f32 q7, q7, %q[maxv]\n"
200 "VST1.32 {d12-d15}, [%[outptr5]]!\n"
201 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
202 [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [inptr]
"+r" (inptr)
203 : [minv]
"w" (minv), [maxv]
"w" (maxv)
204 :
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"memory"
209 const float *biasptr =
bias ?
bias + i : nullbias;
213 for (
int xi=0; xi<7; xi++) {
215 *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
217 *outptr1 = std::min(std::max(minval, inptr[xi + 8] + biasptr[xi]), maxval);
219 *outptr2 = std::min(std::max(minval, inptr[xi + 16] + biasptr[xi]), maxval);
221 *outptr3 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
223 *outptr4 = std::min(std::max(minval, inptr[xi + 32] + biasptr[xi]), maxval);
225 *outptr5 = std::min(std::max(minval, inptr[xi + 40] + biasptr[xi]), maxval);
234 "VLD1.32 {d8-d11}, [%[inptr]]!\n"
235 "VLD1.32 {d0-d3}, [%[biasptr]]\n"
236 "VLD1.32 {d12-d15}, [%[inptr]]!\n"
238 "VADD.f32 q4, q4, q0\n"
240 "VADD.f32 q5, q5, q1\n"
241 "VADD.f32 q6, q6, q0\n"
242 "VADD.f32 q7, q7, q1\n"
244 "VMAX.f32 q4, q4, %q[minv]\n"
245 "VMAX.f32 q5, q5, %q[minv]\n"
246 "VMAX.f32 q6, q6, %q[minv]\n"
248 "VMAX.f32 q7, q7, %q[minv]\n"
249 "VMIN.f32 q4, q4, %q[maxv]\n"
250 "VMIN.f32 q5, q5, %q[maxv]\n"
251 "VST1.32 {d8-d11}, [%[outptr0]]!\n"
252 "VMIN.f32 q6, q6, %q[maxv]\n"
253 "VMIN.f32 q7, q7, %q[maxv]\n"
254 "VST1.32 {d12-d15}, [%[outptr1]]!\n"
257 "VLD1.32 {d8-d11}, [%[inptr]]!\n"
258 "VLD1.32 {d12-d15}, [%[inptr]]!\n"
260 "VADD.f32 q4, q4, q0\n"
262 "VADD.f32 q5, q5, q1\n"
263 "VADD.f32 q6, q6, q0\n"
264 "VADD.f32 q7, q7, q1\n"
266 "VMAX.f32 q4, q4, %q[minv]\n"
267 "VMAX.f32 q5, q5, %q[minv]\n"
268 "VMAX.f32 q6, q6, %q[minv]\n"
270 "VMAX.f32 q7, q7, %q[minv]\n"
271 "VMIN.f32 q4, q4, %q[maxv]\n"
272 "VMIN.f32 q5, q5, %q[maxv]\n"
273 "VST1.32 {d8-d11}, [%[outptr2]]!\n"
274 "VMIN.f32 q6, q6, %q[maxv]\n"
275 "VMIN.f32 q7, q7, %q[maxv]\n"
276 "VST1.32 {d12-d15}, [%[outptr3]]!\n"
277 : [outptr0]
"+r" (outptr0), [outptr1]
"+r" (outptr1), [outptr2]
"+r" (outptr2), [outptr3]
"+r" (outptr3),
279 : [minv]
"w" (minv), [maxv]
"w" (maxv), [biasptr]
"r" (biasptr)
280 :
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"memory"
285 "VLD1.32 {d8-d11}, [%[inptr]]!\n"
286 "VLD1.32 {d12-d15}, [%[inptr]]!\n"
288 "VADD.f32 q4, q4, q0\n"
290 "VADD.f32 q5, q5, q1\n"
291 "VADD.f32 q6, q6, q0\n"
292 "VADD.f32 q7, q7, q1\n"
294 "VMAX.f32 q4, q4, %q[minv]\n"
295 "VMAX.f32 q5, q5, %q[minv]\n"
296 "VMAX.f32 q6, q6, %q[minv]\n"
298 "VMAX.f32 q7, q7, %q[minv]\n"
299 "VMIN.f32 q4, q4, %q[maxv]\n"
300 "VMIN.f32 q5, q5, %q[maxv]\n"
301 "VST1.32 {d8-d11}, [%[outptr4]]!\n"
302 "VMIN.f32 q6, q6, %q[maxv]\n"
303 "VMIN.f32 q7, q7, %q[maxv]\n"
304 "VST1.32 {d12-d15}, [%[outptr5]]!\n"
305 : [outptr3]
"+r" (outptr3),
306 [outptr4]
"+r" (outptr4), [outptr5]
"+r" (outptr5), [inptr]
"+r" (inptr)
307 : [minv]
"w" (minv), [maxv]
"w" (maxv), [biasptr]
"r" (biasptr)
308 :
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"memory"