// Mathematical constants used by the radix-N FFT butterflies below.
constexpr float kPi = float(M_PI);

// sqrt(3)/2 — imaginary magnitude of the cube roots of unity (radix-3 stage).
// Fix: 'f' suffix added so the literal is a float like every sibling constant
// (it was a double literal implicitly narrowed to float).
constexpr float kSqrt3Div2 = 0.866025403784438f;

// Radix-5 twiddle magnitudes: kW5_0 = cos(2*pi/5), kW5_1 = sin(2*pi/5),
// kW5_2 = |cos(4*pi/5)|, kW5_3 = sin(4*pi/5). Signs are applied at the use
// sites to form the five 5th roots of unity exp(-2*pi*i*k/5).
constexpr float kW5_0 = 0.30901699437494f;
constexpr float kW5_1 = 0.95105651629515f;
constexpr float kW5_2 = 0.80901699437494f;
constexpr float kW5_3 = 0.58778525229247f;

// Radix-7 twiddle magnitudes: kW7_0 = cos(2*pi/7), kW7_1 = sin(2*pi/7),
// kW7_2 = |cos(4*pi/7)|, kW7_3 = sin(4*pi/7), kW7_4 = |cos(6*pi/7)|,
// kW7_5 = sin(6*pi/7). Signs are applied at the use sites.
constexpr float kW7_0 = 0.62348980185873f;
constexpr float kW7_1 = 0.78183148246802f;
constexpr float kW7_2 = 0.22252093395631f;
constexpr float kW7_3 = 0.97492791218182f;
constexpr float kW7_4 = 0.90096886790241f;
constexpr float kW7_5 = 0.43388373911755f;

// sqrt(2)/2 = cos(pi/4) = sin(pi/4) — radix-8 twiddles.
// Fix: 'f' suffix added (was a double literal narrowed to float).
constexpr float kSqrt2Div2 = 0.707106781186548f;
// Complex multiply of two complex numbers packed as float32x2_t { re, im }.
// NOTE(review): extraction fragment — only the tag-type alias and the sign
// mask are visible here; the multiply/accumulate and return lines are missing.
68 float32x2_t c_mul_neon(float32x2_t a, float32x2_t
b)
70 using ExactTagType =
typename wrapper::traits::neon_vector<float, 2>::tag_type;
// { -1, 1 } sign mask — presumably flips the sign of the a_i*b_i cross term
// so that re = a_r*b_r - a_i*b_i; TODO confirm against the full source.
72 const float32x2_t mask = { -1.0, 1.0 };
// Multiply complex a = { a_r, a_i } by the purely imaginary constant
// i*img_constant: (a_r + i*a_i) * (i*c) = (-a_i*c) + i*(a_r*c), which is
// exactly the { -a_i, a_r } * { c, c } product computed below.
// NOTE(review): extraction fragment — the a_r/a_i extraction and the return
// statement are not visible here.
85 float32x2_t c_mul_neon_img(float32x2_t a,
float img_constant)
90 const auto out =
wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
// Element-wise sum of five packed complex values (body not visible in this
// extraction fragment).
94 float32x2_t reduce_sum_5(float32x2_t a, float32x2_t
b, float32x2_t c, float32x2_t d, float32x2_t e)
// Element-wise sum of seven packed complex values (body not visible in this
// extraction fragment).
102 float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
// Element-wise sum of eight packed complex values (body not visible in this
// extraction fragment).
113 float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
// Radix-2 DFT butterfly on two complex points, in place. y is pre-multiplied
// by the stage twiddle w; the combine (presumably x = a + b, y = a - b) is
// not visible in this extraction fragment — TODO confirm.
125 void fft_2(float32x2_t &x, float32x2_t &y, float32x2_t &
w)
128 float32x2_t
b = c_mul_neon(
w, y);
// Radix-3 DFT butterfly on three complex points, in place. y and z are
// pre-multiplied by the stage twiddles w and w2.
// NOTE(review): extraction fragment — the definition of `a` and the final
// stores to x/y/z are not visible here.
134 void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z,
const float32x2_t &
w,
const float32x2_t &w2)
137 float32x2_t
b = c_mul_neon(
w, y);
138 float32x2_t c = c_mul_neon(w2, z);
// (b - c) rotated by -i*sqrt(3)/2 — the imaginary part of the cube roots of
// unity used by the radix-3 recombination.
144 const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 },
wrapper::vsub(
b, c));
// Radix-4 DFT butterfly on four complex points, in place. x2..x4 are
// pre-multiplied by the stage twiddles w, w2, w3.
// NOTE(review): extraction fragment — the definition of `a` and the
// recombination/stores are not visible here.
151 void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4,
const float32x2_t &
w,
const float32x2_t &w2,
const float32x2_t &w3)
154 float32x2_t
b = c_mul_neon(
w, x2);
155 float32x2_t c = c_mul_neon(w2, x3);
156 float32x2_t d = c_mul_neon(w3, x4);
// Radix-5 DFT butterfly on five complex points, in place. x2..x5 are first
// multiplied by the stage twiddles w..w4; then each pre-twiddled input is
// rotated by the 5th roots of unity exp(-2*pi*i*k/5), k = 1..4, whose
// cos/sin magnitudes are kW5_0..kW5_3, and the outputs are the row sums.
// NOTE(review): extraction fragment — the line defining `a` (the untwiddled
// first input) is not visible here.
175 void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5,
const float32x2_t &
w,
const float32x2_t &w2,
const float32x2_t &w3,
const float32x2_t &w4)
178 const auto b = c_mul_neon(
w, x2);
179 const auto c = c_mul_neon(w2, x3);
180 const auto d = c_mul_neon(w3, x4);
181 const auto e = c_mul_neon(w4, x5);
// b rotated by exp(-2*pi*i*k/5) for k = 1..4.
183 const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 },
b);
184 const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 },
b);
185 const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 },
b);
186 const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 },
b);
// c rotated by exp(-2*pi*i*2k/5) (indices taken mod 5).
188 const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
189 const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
190 const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
191 const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);
193 const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
194 const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
195 const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
196 const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);
198 const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
199 const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
200 const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
201 const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);
// Each output bin is the sum of one rotated term per input.
203 x1 = reduce_sum_5(a,
b, c, d, e);
204 x2 = reduce_sum_5(a, b0, c0, d0, e0);
205 x3 = reduce_sum_5(a, b1, c1, d1, e1);
206 x4 = reduce_sum_5(a, b2, c2, d2, e2);
207 x5 = reduce_sum_5(a, b3, c3, d3, e3);
// Radix-7 DFT butterfly on seven complex points, in place. x2..x7 are first
// multiplied by the stage twiddles w..w6; each pre-twiddled input is then
// rotated by the 7th roots of unity exp(-2*pi*i*k/7), k = 1..6 (cos/sin
// magnitudes kW7_0..kW7_5), and the outputs are the row sums.
// NOTE(review): extraction fragment — the line defining `a` (the untwiddled
// first input) is not visible here.
210 void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7,
const float32x2_t &
w,
const float32x2_t &w2,
const float32x2_t &w3,
211 const float32x2_t &w4,
212 const float32x2_t &w5,
const float32x2_t &w6)
215 const auto b = c_mul_neon(
w, x2);
216 const auto c = c_mul_neon(w2, x3);
217 const auto d = c_mul_neon(w3, x4);
218 const auto e = c_mul_neon(w4, x5);
219 const auto f = c_mul_neon(w5, x6);
220 const auto g = c_mul_neon(w6, x7);
// b rotated by exp(-2*pi*i*k/7) for k = 1..6.
222 const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 },
b);
223 const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 },
b);
224 const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 },
b);
225 const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 },
b);
226 const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 },
b);
227 const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 },
b);
// Remaining inputs rotated by exp(-2*pi*i*m*k/7), indices mod 7.
229 const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
230 const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
231 const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
232 const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
233 const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
234 const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);
236 const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
237 const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
238 const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
239 const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d);
240 const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
241 const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);
243 const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
244 const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
245 const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
246 const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
247 const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
248 const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);
250 const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
251 const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
252 const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
253 const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
254 const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
255 const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);
257 const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
258 const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
259 const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
260 const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
261 const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
262 const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);
// Each output bin is the sum of one rotated term per input.
264 x1 = reduce_sum_7(a,
b, c, d, e, f, g);
265 x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
266 x3 = reduce_sum_7(a, b1, c1, d1, e1, f1, g1);
267 x4 = reduce_sum_7(a, b2, c2, d2, e2, f2, g2);
268 x5 = reduce_sum_7(a, b3, c3, d3, e3, f3, g3);
269 x6 = reduce_sum_7(a, b4, c4, d4, e4, f4, g4);
270 x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
// Radix-8 DFT butterfly on eight complex points, in place. x2..x8 are first
// multiplied by the stage twiddles w..w7; each pre-twiddled input is then
// rotated by the 8th roots of unity exp(-2*pi*i*k/8), whose components are
// 0, +/-1 and +/-sqrt(2)/2 (kSqrt2Div2), and the outputs are the row sums.
// NOTE(review): extraction fragment — the line defining `a` (the untwiddled
// first input) is not visible here.
273 void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8,
const float32x2_t &
w,
const float32x2_t &w2,
274 const float32x2_t &w3,
275 const float32x2_t &w4,
const float32x2_t &w5,
const float32x2_t &w6,
276 const float32x2_t &w7)
279 const auto b = c_mul_neon(
w, x2);
280 const auto c = c_mul_neon(w2, x3);
281 const auto d = c_mul_neon(w3, x4);
282 const auto e = c_mul_neon(w4, x5);
283 const auto f = c_mul_neon(w5, x6);
284 const auto g = c_mul_neon(w6, x7);
285 const auto h = c_mul_neon(w7, x8);
// b rotated by exp(-2*pi*i*k/8) for k = 1..7.
287 const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 },
b);
288 const auto b1 = c_mul_neon(float32x2_t{ 0, -1 },
b);
289 const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 },
b);
290 const auto b3 = c_mul_neon(float32x2_t{ -1, 0 },
b);
291 const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 },
b);
292 const auto b5 = c_mul_neon(float32x2_t{ 0, 1 },
b);
293 const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 },
b);
// Remaining inputs rotated by exp(-2*pi*i*m*k/8), indices mod 8.
295 const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
296 const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
297 const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
298 const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
299 const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
300 const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
301 const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
303 const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
304 const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
305 const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
306 const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
307 const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
308 const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
309 const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);
311 const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
312 const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
313 const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
314 const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
315 const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
316 const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
317 const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
319 const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
320 const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
321 const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
322 const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
323 const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
324 const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
325 const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);
327 const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
328 const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
329 const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
330 const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
331 const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
332 const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
333 const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
335 const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
336 const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
337 const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
338 const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
339 const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
340 const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
341 const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);
// Each output bin is the sum of one rotated term per input.
343 x1 = reduce_sum_8(a,
b, c, d, e, f, g, h);
344 x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
345 x3 = reduce_sum_8(a, b1, c1, d1, e1, f1, g1, h1);
346 x4 = reduce_sum_8(a, b2, c2, d2, e2, f2, g2, h2);
347 x5 = reduce_sum_8(a, b3, c3, d3, e3, f3, g3, h3);
348 x6 = reduce_sum_8(a, b4, c4, d4, e4, f4, g4, h4);
349 x7 = reduce_sum_8(a, b5, c5, d5, e5, f5, g5, h5);
350 x8 = reduce_sum_8(a, b6, c6, d6, e6, f6, g6, h6);
// Applies one radix-2 FFT stage along axis 0: X is the output buffer, x the
// input, Nx the sub-transform size, NxRadix = 2*Nx, w_m the per-column
// twiddle increment and N the transform length. The running twiddle w starts
// at 1+0i and is multiplied by w_m once per outer iteration.
// NOTE(review): extraction fragment — the load/fft_2/store body of the inner
// loop is not visible here.
353 template <
bool first_stage>
354 void fft_radix_2_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
356 float32x2_t
w{ 1.0f, 0.0f };
357 for(
unsigned int j = 0; j < Nx; j++)
359 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
361 auto a = float32x2_t{ 0, 0 };
362 auto b = float32x2_t{ 0, 0 };
// Advance the twiddle for the next butterfly column.
392 w = c_mul_neon(
w, w_m);
// Radix-2 FFT stage along axis 1 (rows of an MxN plane); same twiddle
// bookkeeping as the axis-0 variant, with M as the row stride parameter.
// NOTE(review): extraction fragment — the inner-loop body is not visible.
396 void fft_radix_2_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
398 float32x2_t
w{ 1.0f, 0.0f };
399 for(
unsigned int j = 0; j < Nx; j++)
401 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
// Advance the twiddle for the next butterfly column.
415 w = c_mul_neon(
w, w_m);
// Radix-3 FFT stage along axis 0. w2 = w^2 is recomputed per outer iteration;
// the inner loop runs the 3-point butterfly fft_3 on each triplet.
// NOTE(review): extraction fragment — the loads into a/b/c and the stores
// back to X are not visible here.
419 template <
bool first_stage>
420 void fft_radix_3_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
422 float32x2_t
w{ 1.0f, 0.0f };
423 for(
unsigned int j = 0; j < Nx; j++)
425 const auto w2 = c_mul_neon(
w,
w);
427 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
430 float32x2_t a = { 0, 0 };
431 float32x2_t
b = { 0, 0 };
432 float32x2_t c = { 0, 0 };
447 fft_3(a,
b, c,
w, w2);
// Advance the twiddle for the next butterfly column.
460 w = c_mul_neon(
w, w_m);
// Radix-3 FFT stage along axis 1; same structure as the axis-0 variant with
// M as the row stride parameter.
// NOTE(review): extraction fragment — a/b/c declarations, loads and stores
// are not visible here.
464 void fft_radix_3_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
466 float32x2_t
w{ 1.0f, 0.0f };
467 for(
unsigned int j = 0; j < Nx; j++)
469 const auto w2 = c_mul_neon(
w,
w);
471 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
479 fft_3(a,
b, c,
w, w2);
// Advance the twiddle for the next butterfly column.
486 w = c_mul_neon(
w, w_m);
// Radix-4 FFT stage along axis 0. w2 = w^2 and w3 = w^3 are recomputed per
// outer iteration; the inner loop runs the 4-point butterfly fft_4.
// NOTE(review): extraction fragment — the loads and stores are not visible.
490 template <
bool first_stage>
491 void fft_radix_4_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
493 float32x2_t
w{ 1.0f, 0.0f };
494 for(
unsigned int j = 0; j < Nx; j++)
496 const auto w2 = c_mul_neon(
w,
w);
497 const auto w3 = c_mul_neon(w2,
w);
499 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
501 float32x2_t a = { 0, 0 };
502 float32x2_t
b = { 0, 0 };
503 float32x2_t c = { 0, 0 };
504 float32x2_t d = { 0, 0 };
524 fft_4(a,
b, c, d,
w, w2, w3);
// Advance the twiddle for the next butterfly column.
540 w = c_mul_neon(
w, w_m);
// Radix-4 FFT stage along axis 1; same structure as the axis-0 variant with
// M as the row stride parameter.
// NOTE(review): extraction fragment — a..d declarations, loads and stores
// are not visible here.
544 void fft_radix_4_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
546 float32x2_t
w{ 1.0f, 0.0f };
547 for(
unsigned int j = 0; j < Nx; j++)
549 const auto w2 = c_mul_neon(
w,
w);
550 const auto w3 = c_mul_neon(w2,
w);
552 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
561 fft_4(a,
b, c, d,
w, w2, w3);
// Advance the twiddle for the next butterfly column.
569 w = c_mul_neon(
w, w_m);
// Radix-5 FFT stage along axis 0. Powers w2..w4 of the running twiddle are
// recomputed per outer iteration; the inner loop runs the 5-point butterfly.
// NOTE(review): extraction fragment — the loads and stores are not visible.
573 template <
bool first_stage>
574 void fft_radix_5_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
576 float32x2_t
w{ 1.0f, 0.0f };
577 for(
unsigned int j = 0; j < Nx; j++)
579 const float32x2_t w2 = c_mul_neon(
w,
w);
580 const float32x2_t w3 = c_mul_neon(w2,
w);
581 const float32x2_t w4 = c_mul_neon(w3,
w);
583 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
585 float32x2_t a = { 0, 0 };
586 float32x2_t
b = { 0, 0 };
587 float32x2_t c = { 0, 0 };
588 float32x2_t d = { 0, 0 };
589 float32x2_t e = { 0, 0 };
612 fft_5(a,
b, c, d, e,
w, w2, w3, w4);
// Advance the twiddle for the next butterfly column.
630 w = c_mul_neon(
w, w_m);
// Radix-5 FFT stage along axis 1; same structure as the axis-0 variant with
// M as the row stride parameter.
// NOTE(review): extraction fragment — a..e declarations, loads and stores
// are not visible here.
634 void fft_radix_5_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
636 float32x2_t
w{ 1.0f, 0.0f };
637 for(
unsigned int j = 0; j < Nx; j++)
639 const float32x2_t w2 = c_mul_neon(
w,
w);
640 const float32x2_t w3 = c_mul_neon(w2,
w);
641 const float32x2_t w4 = c_mul_neon(w3,
w);
643 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
653 fft_5(a,
b, c, d, e,
w, w2, w3, w4);
// Advance the twiddle for the next butterfly column.
663 w = c_mul_neon(
w, w_m);
// Radix-7 FFT stage along axis 0. Powers w2..w6 of the running twiddle are
// recomputed per outer iteration; the inner loop runs the 7-point butterfly.
// NOTE(review): extraction fragment — the loads and stores are not visible.
667 template <
bool first_stage>
668 void fft_radix_7_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
670 float32x2_t
w{ 1.0f, 0.0f };
671 for(
unsigned int j = 0; j < Nx; j++)
673 const float32x2_t w2 = c_mul_neon(
w,
w);
674 const float32x2_t w3 = c_mul_neon(w2,
w);
675 const float32x2_t w4 = c_mul_neon(w3,
w);
676 const float32x2_t w5 = c_mul_neon(w4,
w);
677 const float32x2_t w6 = c_mul_neon(w5,
w);
679 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
681 float32x2_t a = { 0, 0 };
682 float32x2_t
b = { 0, 0 };
683 float32x2_t c = { 0, 0 };
684 float32x2_t d = { 0, 0 };
685 float32x2_t e = { 0, 0 };
686 float32x2_t f = { 0, 0 };
687 float32x2_t g = { 0, 0 };
715 fft_7(a,
b, c, d, e, f, g,
w, w2, w3, w4, w5, w6);
// Advance the twiddle for the next butterfly column.
735 w = c_mul_neon(
w, w_m);
// Radix-7 FFT stage along axis 1; same structure as the axis-0 variant with
// M as the row stride parameter.
// NOTE(review): extraction fragment — a..g declarations, loads and stores
// are not visible here.
739 void fft_radix_7_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
741 float32x2_t
w{ 1.0f, 0.0f };
742 for(
unsigned int j = 0; j < Nx; j++)
744 const float32x2_t w2 = c_mul_neon(
w,
w);
745 const float32x2_t w3 = c_mul_neon(w2,
w);
746 const float32x2_t w4 = c_mul_neon(w3,
w);
747 const float32x2_t w5 = c_mul_neon(w4,
w);
748 const float32x2_t w6 = c_mul_neon(w5,
w);
750 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
762 fft_7(a,
b, c, d, e, f, g,
w, w2, w3, w4, w5, w6);
// Advance the twiddle for the next butterfly column.
774 w = c_mul_neon(
w, w_m);
// Radix-8 FFT stage along axis 0. Powers w2..w7 of the running twiddle are
// recomputed per outer iteration; the inner loop runs the 8-point butterfly.
// NOTE(review): extraction fragment — the loads and stores are not visible.
778 template <
bool first_stage>
779 void fft_radix_8_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
781 float32x2_t
w{ 1.0f, 0.0f };
782 for(
unsigned int j = 0; j < Nx; j++)
784 const float32x2_t w2 = c_mul_neon(
w,
w);
785 const float32x2_t w3 = c_mul_neon(w2,
w);
786 const float32x2_t w4 = c_mul_neon(w3,
w);
787 const float32x2_t w5 = c_mul_neon(w4,
w);
788 const float32x2_t w6 = c_mul_neon(w5,
w);
789 const float32x2_t w7 = c_mul_neon(w6,
w);
791 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
794 float32x2_t a = { 0, 0 };
795 float32x2_t
b = { 0, 0 };
796 float32x2_t c = { 0, 0 };
797 float32x2_t d = { 0, 0 };
798 float32x2_t e = { 0, 0 };
799 float32x2_t f = { 0, 0 };
800 float32x2_t g = { 0, 0 };
801 float32x2_t h = { 0, 0 };
833 fft_8(a,
b, c, d, e, f, g, h,
w, w2, w3, w4, w5, w6, w7);
// Advance the twiddle for the next butterfly column.
856 w = c_mul_neon(
w, w_m);
// Radix-8 FFT stage along axis 1; same structure as the axis-0 variant with
// M as the row stride parameter.
// NOTE(review): extraction fragment — a..h declarations, loads and stores
// are not visible here.
860 void fft_radix_8_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
862 float32x2_t
w{ 1.0f, 0.0f };
863 for(
unsigned int j = 0; j < Nx; j++)
865 const float32x2_t w2 = c_mul_neon(
w,
w);
866 const float32x2_t w3 = c_mul_neon(w2,
w);
867 const float32x2_t w4 = c_mul_neon(w3,
w);
868 const float32x2_t w5 = c_mul_neon(w4,
w);
869 const float32x2_t w6 = c_mul_neon(w5,
w);
870 const float32x2_t w7 = c_mul_neon(w6,
w);
872 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
885 fft_8(a,
b, c, d, e, f, g, h,
w, w2, w3, w4, w5, w6, w7);
// Advance the twiddle for the next butterfly column.
898 w = c_mul_neon(
w, w_m);
// Validates the input/output tensor infos and the FFT stage configuration,
// returning a Status. Output checks only apply when an output tensor is
// actually configured (non-null and allocated).
// NOTE(review): extraction fragment — the individual RETURN_ERROR checks are
// not visible here.
902 Status
validate_arguments(
const ITensorInfo *
input,
const ITensorInfo *output,
const FFTRadixStageKernelInfo &config)
910 if((output !=
nullptr) && (output->total_size() != 0))
// Computes the execution window for the kernel and, when an output is given,
// initializes its metadata and marks its full shape as the valid region.
// Returns {Status, Window}.
// NOTE(review): extraction fragment — the window calculation line is not
// visible here.
919 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *
input, ITensorInfo *output,
const FFTRadixStageKernelInfo &config)
923 if(output !=
nullptr)
929 if(output !=
nullptr)
931 output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
934 return std::make_pair(Status{}, win);
// Default constructor: all members zero/null-initialized; the kernel is
// unusable until configure() is called.
939 : _input(nullptr), _output(nullptr), _run_in_place(false), _Nx(0), _axis(0), _radix(0), _func_0(), _func_1()
// Selects the axis-0 stage function for the configured radix. The lazily
// built static table maps radix -> (is_first_stage -> function pointer),
// covering the supported radices {2, 3, 4, 5, 7, 8} for both template
// instantiations of fft_radix_*_axes_0.
// NOTE(review): function-local static map is built without synchronization
// beyond C++ static-init rules; the empty() re-population guard is not
// thread-safe if configure() can race — TODO confirm calling context.
946 static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
948 if(fft_table_axis0.empty())
950 fft_table_axis0[2][
false] = &fft_radix_2_axes_0<false>;
951 fft_table_axis0[3][
false] = &fft_radix_3_axes_0<false>;
952 fft_table_axis0[4][
false] = &fft_radix_4_axes_0<false>;
953 fft_table_axis0[5][
false] = &fft_radix_5_axes_0<false>;
954 fft_table_axis0[7][
false] = &fft_radix_7_axes_0<false>;
955 fft_table_axis0[8][
false] = &fft_radix_8_axes_0<false>;
957 fft_table_axis0[2][
true] = &fft_radix_2_axes_0<true>;
958 fft_table_axis0[3][
true] = &fft_radix_3_axes_0<true>;
959 fft_table_axis0[4][
true] = &fft_radix_4_axes_0<true>;
960 fft_table_axis0[5][
true] = &fft_radix_5_axes_0<true>;
961 fft_table_axis0[7][
true] = &fft_radix_7_axes_0<true>;
962 fft_table_axis0[8][
true] = &fft_radix_8_axes_0<true>;
// Selects the axis-1 stage function for the configured radix from a lazily
// built static radix -> function-pointer table (no first-stage variant on
// axis 1, unlike axis 0).
968 void NEFFTRadixStageKernel::set_radix_stage_axis1(
const FFTRadixStageKernelInfo &config)
971 static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
973 if(fft_table_axis1.empty())
975 fft_table_axis1[2] = &fft_radix_2_axes_1;
976 fft_table_axis1[3] = &fft_radix_3_axes_1;
977 fft_table_axis1[4] = &fft_radix_4_axes_1;
978 fft_table_axis1[5] = &fft_radix_5_axes_1;
979 fft_table_axis1[7] = &fft_radix_7_axes_1;
980 fft_table_axis1[8] = &fft_radix_8_axes_1;
983 _func_1 = fft_table_axis1[config.radix];
// configure(): caches the tensors and config, decides in-place operation
// (output null or identical to input), picks the stage function for the
// configured axis, and sets the kernel window.
// NOTE(review): extraction fragment — validation calls, member assignments
// for _input/_output/_Nx and the axis switch are not fully visible here.
991 if(output !=
nullptr)
1000 _run_in_place = (output ==
nullptr) || (output ==
input);
1002 _axis = config.
axis;
1003 _radix = config.
radix;
// Axis selects which of the two function-pointer tables is used.
1008 set_radix_stage_axis0(config);
1011 set_radix_stage_axis1(config);
// In-place runs validate the window against a null output info.
1019 auto win_config = validate_and_configure_window(
input->info(), (_run_in_place) ?
nullptr : output->
info(), config);
1021 INEKernel::configure(win_config.second);
// Static validate(): mirrors configure() without side effects — recomputes
// the in-place decision and validates the window on cloned tensor infos.
// NOTE(review): extraction fragment — the surrounding RETURN_ON_ERROR calls
// are not visible here.
1026 const bool run_in_place = (output ==
nullptr) || (output ==
input);
1029 (run_in_place) ?
nullptr : output->
clone().get(),
// The radix values this kernel implements (must match both stage tables).
1038 return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
// run(): collapses the FFT axis out of the iteration window, computes this
// stage's twiddle increment w_m = exp(-2*pi*i/NxRadix) from alpha, and
// dispatches to the axis-0 or axis-1 stage function over the window.
// NOTE(review): extraction fragment — the window loop, iterator setup for
// `in`, and the N/M dimension reads are not fully visible here.
1048 input_window.
set(_axis, 0);
1051 Iterator out(_run_in_place ? _input : _output, input_window);
1054 const unsigned int NxRadix = _radix * _Nx;
1055 const float alpha = 2.0f * kPi / float(NxRadix);
// Negative sine: forward-FFT twiddle convention exp(-i*alpha).
1056 const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
1063 _func_0(reinterpret_cast<float *>(out.
ptr()), reinterpret_cast<float *>(in.
ptr()), _Nx, NxRadix, w_m,
N);
1073 _func_1(reinterpret_cast<float *>(out.
ptr()), reinterpret_cast<float *>(in.
ptr()), _Nx, NxRadix, w_m,
N,
M);
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Traits defined on NEON vectors.
const Window & window() const
The maximum window the kernel can be executed on.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
Static function to check if given info will lead to a valid configuration of NEFFTRadixStageKernel.
1 channel, 1 F32 per channel
void configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config)
Set the input and output tensors.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
unsigned int axis
Axis to run the kernel on.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Interface for NEON tensor.
Copyright (c) 2017-2020 Arm Limited.
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
NEFFTRadixStageKernel()
Constructor.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
static std::set< unsigned int > supported_radix()
Returns the radix that are support by the FFT kernel.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
Descriptor used by the FFT core kernels.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
uint8x8_t vgetlow(const uint8x16_t val)
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
int8x8_t vneg(const int8x8_t &a)
uint8x8_t vgethigh(const uint8x16_t val)
unsigned int radix
Radix to use.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
Information about executing thread and CPU.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
bool is_first_stage
Flags if the FFT kernels is the first stage of a decomposed FFT.
unsigned int Nx
Nx coefficient.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
uint8x8_t vrev64(const uint8x8_t &a)
uint8x8_t vload(const uint8_t *ptr)
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Iterator updated by execute_window_loop for each window element.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)