// Single-precision mathematical constants used by the fixed-radix FFT
// routines in this file. Every literal carries the `f` suffix so that no
// implicit double -> float conversion takes place.

/// pi, narrowed from the double-precision M_PI macro.
constexpr float kPi = static_cast<float>(M_PI);

/// sqrt(3) / 2 == sin(2*pi/3).
constexpr float kSqrt3Div2 = 0.866025403784438f;

// Radix-5 twiddle components (angles are multiples of 2*pi/5).
constexpr float kW5_0 = 0.30901699437494f; ///< cos(2*pi/5)
constexpr float kW5_1 = 0.95105651629515f; ///< sin(2*pi/5)
constexpr float kW5_2 = 0.80901699437494f; ///< -cos(4*pi/5) == cos(pi/5)
constexpr float kW5_3 = 0.58778525229247f; ///< sin(4*pi/5)

// Radix-7 twiddle components (angles are multiples of 2*pi/7).
constexpr float kW7_0 = 0.62348980185873f; ///< cos(2*pi/7)
constexpr float kW7_1 = 0.78183148246802f; ///< sin(2*pi/7)
constexpr float kW7_2 = 0.22252093395631f; ///< -cos(4*pi/7)
constexpr float kW7_3 = 0.97492791218182f; ///< sin(4*pi/7)
constexpr float kW7_4 = 0.90096886790241f; ///< -cos(6*pi/7)
constexpr float kW7_5 = 0.43388373911755f; ///< sin(6*pi/7)

/// sqrt(2) / 2 == cos(pi/4) == sin(pi/4).
constexpr float kSqrt2Div2 = 0.707106781186548f;
// c_mul_neon: takes two float32x2_t values (by value) and returns a
// float32x2_t.
// NOTE(review): only part of the body is visible in this excerpt. Callers in
// this file pass {cos, -sin} pairs and fixed twiddle constants, so this
// presumably multiplies two complex numbers stored as {real, imag} lanes -
// confirm against the full body.
70 float32x2_t c_mul_neon(float32x2_t a, float32x2_t
b)
72 using ExactTagType =
typename wrapper::traits::neon_vector<float, 2>::tag_type;
// Per-lane sign pattern {-1, +1}; where it is applied is not visible here.
74 const float32x2_t mask = {-1.0, 1.0};
// c_mul_neon_img: builds the pair {-a_i, a_r} and passes it to wrapper::vmul
// together with {img_constant, img_constant}.
// NOTE(review): a_r / a_i are defined on lines not shown here; presumably
// they are lanes 0 and 1 of `a`, which would make the result `a` multiplied
// by the purely imaginary value i * img_constant - confirm.
87 float32x2_t c_mul_neon_img(float32x2_t a,
float img_constant)
92 const auto out =
wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant});
// reduce_sum_5: takes five float32x2_t operands and returns one float32x2_t.
// NOTE(review): the body is not visible in this excerpt; the name suggests a
// lane-wise sum of the five operands - confirm.
96 float32x2_t reduce_sum_5(float32x2_t a, float32x2_t
b, float32x2_t c, float32x2_t d, float32x2_t e)
// reduce_sum_7: takes seven float32x2_t operands and returns one float32x2_t.
// NOTE(review): the body is not visible in this excerpt; the name suggests a
// lane-wise sum of the seven operands - confirm.
104 float32x2_t reduce_sum_7(
105 float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
// reduce_sum_8: returns a float32x2_t; call sites in this file pass eight
// float32x2_t operands.
// NOTE(review): the rest of the signature and the body are not visible in
// this excerpt; the name suggests a lane-wise sum - confirm.
116 float32x2_t reduce_sum_8(float32x2_t x1,
// fft_2: operates on x and y (both taken by non-const reference) using the
// factor w.
// NOTE(review): only the first statement of the body is visible in this
// excerpt; presumably a 2-point butterfly that writes back to x and y -
// confirm.
135 void fft_2(float32x2_t &x, float32x2_t &y, float32x2_t &
w)
// Second input pre-multiplied by w.
138 float32x2_t
b = c_mul_neon(
w, y);
// fft_3: operates on x, y and z (taken by non-const reference) using the
// factors w and w2.
// NOTE(review): only part of the body is visible in this excerpt; presumably
// a 3-point butterfly that writes back to x, y and z - confirm.
144 void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z,
const float32x2_t &
w,
const float32x2_t &w2)
// Second and third inputs pre-multiplied by w and w2 respectively.
147 float32x2_t
b = c_mul_neon(
w, y);
148 float32x2_t c = c_mul_neon(w2, z);
// (b - c) combined with the constant pair {0, -kSqrt3Div2}. If c_mul_neon is
// a complex multiply on {real, imag} lanes (assumption - confirm), this is
// -i * kSqrt3Div2 * (b - c).
154 const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2},
wrapper::vsub(
b, c));
// fft_4: operates on x1..x4 using the factors w, w2 and w3.
// NOTE(review): part of the signature and most of the body are not visible
// in this excerpt; presumably a 4-point butterfly - confirm.
161 void fft_4(float32x2_t &x1,
165 const float32x2_t &
w,
166 const float32x2_t &w2,
167 const float32x2_t &w3)
// Inputs 2..4 pre-multiplied by w, w2 and w3 respectively.
170 float32x2_t
b = c_mul_neon(
w, x2);
171 float32x2_t c = c_mul_neon(w2, x3);
172 float32x2_t d = c_mul_neon(w3, x4);
// fft_5: 5-point transform step. Reads x2..x5 (and `a`, defined on a line not
// shown here), and writes its five results back to x1..x5.
// NOTE(review): the comments below assume c_mul_neon is a complex multiply on
// {real, imag} lanes and reduce_sum_5 a lane-wise sum - confirm.
191 void fft_5(float32x2_t &x1,
196 const float32x2_t &
w,
197 const float32x2_t &w2,
198 const float32x2_t &w3,
199 const float32x2_t &w4)
// Inputs 2..5 pre-multiplied by the caller-supplied factors w..w4.
202 const auto b = c_mul_neon(
w, x2);
203 const auto c = c_mul_neon(w2, x3);
204 const auto d = c_mul_neon(w3, x4);
205 const auto e = c_mul_neon(w4, x5);
// Constant factors. Each {re, im} pair below is numerically
// (cos(2*pi*m/5), -sin(2*pi*m/5)) with m = (row * column) mod 5, where
// row = 1..4 for b..e and column = 1..4 for the suffixes 0..3.
207 const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1},
b);
208 const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3},
b);
209 const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3},
b);
210 const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1},
b);
212 const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c);
213 const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c);
214 const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c);
215 const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c);
217 const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d);
218 const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d);
219 const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d);
220 const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d);
222 const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e);
223 const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e);
224 const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e);
225 const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e);
// x1 combines the pre-multiplied inputs with no extra factor; x2..x5 combine
// `a` with suffix 0..3 of every row.
227 x1 = reduce_sum_5(a,
b, c, d, e);
228 x2 = reduce_sum_5(a, b0, c0, d0, e0);
229 x3 = reduce_sum_5(a, b1, c1, d1, e1);
230 x4 = reduce_sum_5(a, b2, c2, d2, e2);
231 x5 = reduce_sum_5(a, b3, c3, d3, e3);
// fft_7: 7-point transform step. Reads x2..x7 (and `a`, defined on a line not
// shown here), and writes its seven results back to x1..x7.
// NOTE(review): the comments below assume c_mul_neon is a complex multiply on
// {real, imag} lanes and reduce_sum_7 a lane-wise sum - confirm.
234 void fft_7(float32x2_t &x1,
241 const float32x2_t &
w,
242 const float32x2_t &w2,
243 const float32x2_t &w3,
244 const float32x2_t &w4,
245 const float32x2_t &w5,
246 const float32x2_t &w6)
// Inputs 2..7 pre-multiplied by the caller-supplied factors w..w6.
249 const auto b = c_mul_neon(
w, x2);
250 const auto c = c_mul_neon(w2, x3);
251 const auto d = c_mul_neon(w3, x4);
252 const auto e = c_mul_neon(w4, x5);
253 const auto f = c_mul_neon(w5, x6);
254 const auto g = c_mul_neon(w6, x7);
// Constant factors. Each {re, im} pair below is numerically
// (cos(2*pi*m/7), -sin(2*pi*m/7)) with m = (row * column) mod 7, where
// row = 1..6 for b..g and column = 1..6 for the suffixes 0..5.
256 const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1},
b);
257 const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3},
b);
258 const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5},
b);
259 const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5},
b);
260 const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3},
b);
261 const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1},
b);
263 const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c);
264 const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c);
265 const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c);
266 const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c);
267 const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c);
268 const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c);
270 const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d);
271 const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d);
272 const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d);
273 const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d);
274 const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d);
275 const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d);
277 const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e);
278 const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e);
279 const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e);
280 const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e);
281 const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e);
282 const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e);
284 const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f);
285 const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f);
286 const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f);
287 const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f);
288 const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f);
289 const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f);
291 const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g);
292 const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g);
293 const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g);
294 const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g);
295 const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g);
296 const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g);
// x1 combines the pre-multiplied inputs with no extra factor; x2..x7 combine
// `a` with suffix 0..5 of every row.
298 x1 = reduce_sum_7(a,
b, c, d, e, f, g);
299 x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
300 x3 = reduce_sum_7(a, b1, c1, d1, e1, f1, g1);
301 x4 = reduce_sum_7(a, b2, c2, d2, e2, f2, g2);
302 x5 = reduce_sum_7(a, b3, c3, d3, e3, f3, g3);
303 x6 = reduce_sum_7(a, b4, c4, d4, e4, f4, g4);
304 x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
// fft_8: 8-point transform step. Reads x2..x8 (and `a`, defined on a line not
// shown here), and writes its eight results back to x1..x8.
// NOTE(review): the comments below assume c_mul_neon is a complex multiply on
// {real, imag} lanes and reduce_sum_8 a lane-wise sum - confirm.
307 void fft_8(float32x2_t &x1,
315 const float32x2_t &
w,
316 const float32x2_t &w2,
317 const float32x2_t &w3,
318 const float32x2_t &w4,
319 const float32x2_t &w5,
320 const float32x2_t &w6,
321 const float32x2_t &w7)
// Inputs 2..8 pre-multiplied by the caller-supplied factors w..w7.
324 const auto b = c_mul_neon(
w, x2);
325 const auto c = c_mul_neon(w2, x3);
326 const auto d = c_mul_neon(w3, x4);
327 const auto e = c_mul_neon(w4, x5);
328 const auto f = c_mul_neon(w5, x6);
329 const auto g = c_mul_neon(w6, x7);
330 const auto h = c_mul_neon(w7, x8);
// Constant factors. Each {re, im} pair below is numerically
// (cos(2*pi*m/8), -sin(2*pi*m/8)) with m = (row * column) mod 8, where
// row = 1..7 for b..h and column = 1..7 for the suffixes 0..6. All components
// are therefore 0, +/-1 or +/-kSqrt2Div2.
332 const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2},
b);
333 const auto b1 = c_mul_neon(float32x2_t{0, -1},
b);
334 const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2},
b);
335 const auto b3 = c_mul_neon(float32x2_t{-1, 0},
b);
336 const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2},
b);
337 const auto b5 = c_mul_neon(float32x2_t{0, 1},
b);
338 const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2},
b);
340 const auto c0 = c_mul_neon(float32x2_t{0, -1}, c);
341 const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c);
342 const auto c2 = c_mul_neon(float32x2_t{0, 1}, c);
343 const auto c3 = c_mul_neon(float32x2_t{1, 0}, c);
344 const auto c4 = c_mul_neon(float32x2_t{0, -1}, c);
345 const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c);
346 const auto c6 = c_mul_neon(float32x2_t{0, 1}, c);
348 const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d);
349 const auto d1 = c_mul_neon(float32x2_t{0, 1}, d);
350 const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d);
351 const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d);
352 const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d);
353 const auto d5 = c_mul_neon(float32x2_t{0, -1}, d);
354 const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d);
356 const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e);
357 const auto e1 = c_mul_neon(float32x2_t{1, 0}, e);
358 const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e);
359 const auto e3 = c_mul_neon(float32x2_t{1, 0}, e);
360 const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e);
361 const auto e5 = c_mul_neon(float32x2_t{1, 0}, e);
362 const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e);
364 const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f);
365 const auto f1 = c_mul_neon(float32x2_t{0, -1}, f);
366 const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f);
367 const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f);
368 const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f);
369 const auto f5 = c_mul_neon(float32x2_t{0, 1}, f);
370 const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f);
372 const auto g0 = c_mul_neon(float32x2_t{0, 1}, g);
373 const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g);
374 const auto g2 = c_mul_neon(float32x2_t{0, -1}, g);
375 const auto g3 = c_mul_neon(float32x2_t{1, 0}, g);
376 const auto g4 = c_mul_neon(float32x2_t{0, 1}, g);
377 const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g);
378 const auto g6 = c_mul_neon(float32x2_t{0, -1}, g);
380 const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h);
381 const auto h1 = c_mul_neon(float32x2_t{0, 1}, h);
382 const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h);
383 const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h);
384 const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h);
385 const auto h5 = c_mul_neon(float32x2_t{0, -1}, h);
386 const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h);
// x1 combines the pre-multiplied inputs with no extra factor; x2..x8 combine
// `a` with suffix 0..6 of every row.
388 x1 = reduce_sum_8(a,
b, c, d, e, f, g, h);
389 x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
390 x3 = reduce_sum_8(a, b1, c1, d1, e1, f1, g1, h1);
391 x4 = reduce_sum_8(a, b2, c2, d2, e2, f2, g2, h2);
392 x5 = reduce_sum_8(a, b3, c3, d3, e3, f3, g3, h3);
393 x6 = reduce_sum_8(a, b4, c4, d4, e4, f4, g4, h4);
394 x7 = reduce_sum_8(a, b5, c5, d5, e5, f5, g5, h5);
395 x8 = reduce_sum_8(a, b6, c6, d6, e6, f6, g6, h6);
// fft_radix_2_axes_0<first_stage>: visible skeleton of the radix-2 stage
// taking out/in float pointers, Nx, NxRadix, the step factor w_m and N.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * N.
//  - w starts at {1, 0} and is updated with w = c_mul_neon(w, w_m).
// NOTE(review): loads/stores, the use of first_stage, the brace structure and
// the meaning of the factor 2 (likely two floats per complex element) are on
// lines not visible in this excerpt - confirm against the full body.
398 template <
bool first_stage>
399 void fft_radix_2_axes_0(
400 float *out,
float *in,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
402 float32x2_t
w{1.0f, 0.0f};
403 for (
unsigned int j = 0; j < Nx; j++)
405 for (
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
// Working values start zeroed; how they are filled is not shown here.
407 auto a = float32x2_t{0, 0};
408 auto b = float32x2_t{0, 0};
// Advance w by w_m (presumably once per j iteration - confirm).
438 w = c_mul_neon(
w, w_m);
// fft_radix_2_axes_1: visible skeleton of the radix-2 stage for axis 1.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * M.
//  - w starts at {1, 0} and is updated with w = c_mul_neon(w, w_m).
// NOTE(review): several parameters (e.g. Nx, M), the loads/stores and the use
// of in_pad_x / out_pad_x are on lines not visible in this excerpt - confirm
// against the full body.
442 void fft_radix_2_axes_1(
float *out,
445 unsigned int NxRadix,
446 const float32x2_t &w_m,
449 unsigned int in_pad_x,
450 unsigned int out_pad_x)
452 float32x2_t
w{1.0f, 0.0f};
453 for (
unsigned int j = 0; j < Nx; j++)
455 for (
unsigned int k = 2 * j; k < 2 *
M; k += 2 * NxRadix)
// Advance w by w_m (presumably once per j iteration - confirm).
469 w = c_mul_neon(
w, w_m);
// fft_radix_3_axes_0<first_stage>: visible skeleton of the radix-3 stage
// taking out/in float pointers, Nx, NxRadix, the step factor w_m and N.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * N.
//  - w starts at {1, 0}; w2 = c_mul_neon(w, w) is derived from it and both are
//    handed to fft_3; w is then updated with w = c_mul_neon(w, w_m).
// NOTE(review): loads/stores, the use of first_stage, the brace structure and
// the meaning of the factor 2 (likely two floats per complex element) are on
// lines not visible in this excerpt - confirm against the full body.
473 template <
bool first_stage>
474 void fft_radix_3_axes_0(
475 float *out,
float *in,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
477 float32x2_t
w{1.0f, 0.0f};
478 for (
unsigned int j = 0; j < Nx; j++)
480 const auto w2 = c_mul_neon(
w,
w);
482 for (
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
// Working values start zeroed; how they are filled is not shown here.
485 float32x2_t a = {0, 0};
486 float32x2_t
b = {0, 0};
487 float32x2_t c = {0, 0};
// fft_3 takes a, b, c by reference and updates them in place.
502 fft_3(a,
b, c,
w, w2);
// Advance w by w_m (presumably once per j iteration - confirm).
515 w = c_mul_neon(
w, w_m);
// fft_radix_3_axes_1: visible skeleton of the radix-3 stage for axis 1.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * M.
//  - w starts at {1, 0}; w2 = c_mul_neon(w, w) is derived from it and both are
//    handed to fft_3; w is then updated with w = c_mul_neon(w, w_m).
// NOTE(review): several parameters (e.g. Nx, M), the definitions of a, b, c,
// the loads/stores and the use of in_pad_x / out_pad_x are on lines not
// visible in this excerpt - confirm against the full body.
519 void fft_radix_3_axes_1(
float *out,
522 unsigned int NxRadix,
523 const float32x2_t &w_m,
526 unsigned int in_pad_x,
527 unsigned int out_pad_x)
529 float32x2_t
w{1.0f, 0.0f};
530 for (
unsigned int j = 0; j < Nx; j++)
532 const auto w2 = c_mul_neon(
w,
w);
534 for (
unsigned int k = 2 * j; k < 2 *
M; k += 2 * NxRadix)
542 fft_3(a,
b, c,
w, w2);
// Advance w by w_m (presumably once per j iteration - confirm).
549 w = c_mul_neon(
w, w_m);
// fft_radix_4_axes_0<first_stage>: visible skeleton of the radix-4 stage
// taking out/in float pointers, Nx, NxRadix, the step factor w_m and N.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * N.
//  - w starts at {1, 0}; w2 and w3 are built by repeatedly combining with w
//    through c_mul_neon and handed to fft_4; w is then updated with
//    w = c_mul_neon(w, w_m).
// NOTE(review): loads/stores, the use of first_stage, the brace structure and
// the meaning of the factor 2 (likely two floats per complex element) are on
// lines not visible in this excerpt - confirm against the full body.
553 template <
bool first_stage>
554 void fft_radix_4_axes_0(
555 float *out,
float *in,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
557 float32x2_t
w{1.0f, 0.0f};
558 for (
unsigned int j = 0; j < Nx; j++)
560 const auto w2 = c_mul_neon(
w,
w);
561 const auto w3 = c_mul_neon(w2,
w);
563 for (
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
// Working values start zeroed; how they are filled is not shown here.
565 float32x2_t a = {0, 0};
566 float32x2_t
b = {0, 0};
567 float32x2_t c = {0, 0};
568 float32x2_t d = {0, 0};
588 fft_4(a,
b, c, d,
w, w2, w3);
// Advance w by w_m (presumably once per j iteration - confirm).
604 w = c_mul_neon(
w, w_m);
// fft_radix_4_axes_1: visible skeleton of the radix-4 stage for axis 1.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * M.
//  - w starts at {1, 0}; w2 and w3 are built by repeatedly combining with w
//    through c_mul_neon and handed to fft_4; w is then updated with
//    w = c_mul_neon(w, w_m).
// NOTE(review): several parameters (e.g. Nx, M), the definitions of a..d,
// the loads/stores and the use of in_pad_x / out_pad_x are on lines not
// visible in this excerpt - confirm against the full body.
608 void fft_radix_4_axes_1(
float *out,
611 unsigned int NxRadix,
612 const float32x2_t &w_m,
615 unsigned int in_pad_x,
616 unsigned int out_pad_x)
618 float32x2_t
w{1.0f, 0.0f};
619 for (
unsigned int j = 0; j < Nx; j++)
621 const auto w2 = c_mul_neon(
w,
w);
622 const auto w3 = c_mul_neon(w2,
w);
624 for (
unsigned int k = 2 * j; k < 2 *
M; k += 2 * NxRadix)
633 fft_4(a,
b, c, d,
w, w2, w3);
// Advance w by w_m (presumably once per j iteration - confirm).
641 w = c_mul_neon(
w, w_m);
// fft_radix_5_axes_0<first_stage>: visible skeleton of the radix-5 stage
// taking out/in float pointers, Nx, NxRadix, the step factor w_m and N.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * N.
//  - w starts at {1, 0}; w2..w4 are built by repeatedly combining with w
//    through c_mul_neon and handed to fft_5; w is then updated with
//    w = c_mul_neon(w, w_m).
// NOTE(review): loads/stores, the use of first_stage, the brace structure and
// the meaning of the factor 2 (likely two floats per complex element) are on
// lines not visible in this excerpt - confirm against the full body.
645 template <
bool first_stage>
646 void fft_radix_5_axes_0(
647 float *out,
float *in,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
649 float32x2_t
w{1.0f, 0.0f};
650 for (
unsigned int j = 0; j < Nx; j++)
652 const float32x2_t w2 = c_mul_neon(
w,
w);
653 const float32x2_t w3 = c_mul_neon(w2,
w);
654 const float32x2_t w4 = c_mul_neon(w3,
w);
656 for (
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
// Working values start zeroed; how they are filled is not shown here.
658 float32x2_t a = {0, 0};
659 float32x2_t
b = {0, 0};
660 float32x2_t c = {0, 0};
661 float32x2_t d = {0, 0};
662 float32x2_t e = {0, 0};
685 fft_5(a,
b, c, d, e,
w, w2, w3, w4);
// Advance w by w_m (presumably once per j iteration - confirm).
703 w = c_mul_neon(
w, w_m);
// fft_radix_5_axes_1: visible skeleton of the radix-5 stage for axis 1.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * M.
//  - w starts at {1, 0}; w2..w4 are built by repeatedly combining with w
//    through c_mul_neon and handed to fft_5; w is then updated with
//    w = c_mul_neon(w, w_m).
// NOTE(review): several parameters (e.g. Nx, M), the definitions of a..e,
// the loads/stores and the use of in_pad_x / out_pad_x are on lines not
// visible in this excerpt - confirm against the full body.
707 void fft_radix_5_axes_1(
float *out,
710 unsigned int NxRadix,
711 const float32x2_t &w_m,
714 unsigned int in_pad_x,
715 unsigned int out_pad_x)
717 float32x2_t
w{1.0f, 0.0f};
718 for (
unsigned int j = 0; j < Nx; j++)
720 const float32x2_t w2 = c_mul_neon(
w,
w);
721 const float32x2_t w3 = c_mul_neon(w2,
w);
722 const float32x2_t w4 = c_mul_neon(w3,
w);
724 for (
unsigned int k = 2 * j; k < 2 *
M; k += 2 * NxRadix)
734 fft_5(a,
b, c, d, e,
w, w2, w3, w4);
// Advance w by w_m (presumably once per j iteration - confirm).
744 w = c_mul_neon(
w, w_m);
// fft_radix_7_axes_0<first_stage>: visible skeleton of the radix-7 stage
// taking out/in float pointers, Nx, NxRadix, the step factor w_m and N.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * N.
//  - w starts at {1, 0}; w2..w6 are built by repeatedly combining with w
//    through c_mul_neon and handed to fft_7; w is then updated with
//    w = c_mul_neon(w, w_m).
// NOTE(review): loads/stores, the use of first_stage, the brace structure and
// the meaning of the factor 2 (likely two floats per complex element) are on
// lines not visible in this excerpt - confirm against the full body.
748 template <
bool first_stage>
749 void fft_radix_7_axes_0(
750 float *out,
float *in,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
752 float32x2_t
w{1.0f, 0.0f};
753 for (
unsigned int j = 0; j < Nx; j++)
755 const float32x2_t w2 = c_mul_neon(
w,
w);
756 const float32x2_t w3 = c_mul_neon(w2,
w);
757 const float32x2_t w4 = c_mul_neon(w3,
w);
758 const float32x2_t w5 = c_mul_neon(w4,
w);
759 const float32x2_t w6 = c_mul_neon(w5,
w);
761 for (
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
// Working values start zeroed; how they are filled is not shown here.
763 float32x2_t a = {0, 0};
764 float32x2_t
b = {0, 0};
765 float32x2_t c = {0, 0};
766 float32x2_t d = {0, 0};
767 float32x2_t e = {0, 0};
768 float32x2_t f = {0, 0};
769 float32x2_t g = {0, 0};
797 fft_7(a,
b, c, d, e, f, g,
w, w2, w3, w4, w5, w6);
// Advance w by w_m (presumably once per j iteration - confirm).
817 w = c_mul_neon(
w, w_m);
// fft_radix_7_axes_1: visible skeleton of the radix-7 stage for axis 1.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * M.
//  - w starts at {1, 0}; w2..w6 are built by repeatedly combining with w
//    through c_mul_neon and handed to fft_7; w is then updated with
//    w = c_mul_neon(w, w_m).
// NOTE(review): several parameters (e.g. Nx, M), the definitions of a..g,
// the loads/stores and the use of in_pad_x / out_pad_x are on lines not
// visible in this excerpt - confirm against the full body.
821 void fft_radix_7_axes_1(
float *out,
824 unsigned int NxRadix,
825 const float32x2_t &w_m,
828 unsigned int in_pad_x,
829 unsigned int out_pad_x)
831 float32x2_t
w{1.0f, 0.0f};
832 for (
unsigned int j = 0; j < Nx; j++)
834 const float32x2_t w2 = c_mul_neon(
w,
w);
835 const float32x2_t w3 = c_mul_neon(w2,
w);
836 const float32x2_t w4 = c_mul_neon(w3,
w);
837 const float32x2_t w5 = c_mul_neon(w4,
w);
838 const float32x2_t w6 = c_mul_neon(w5,
w);
840 for (
unsigned int k = 2 * j; k < 2 *
M; k += 2 * NxRadix)
852 fft_7(a,
b, c, d, e, f, g,
w, w2, w3, w4, w5, w6);
// Advance w by w_m (presumably once per j iteration - confirm).
864 w = c_mul_neon(
w, w_m);
// fft_radix_8_axes_0<first_stage>: visible skeleton of the radix-8 stage
// taking out/in float pointers, Nx, NxRadix, the step factor w_m and N.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * N.
//  - w starts at {1, 0}; w2..w7 are built by repeatedly combining with w
//    through c_mul_neon and handed to fft_8; w is then updated with
//    w = c_mul_neon(w, w_m).
// NOTE(review): loads/stores, the use of first_stage, the brace structure and
// the meaning of the factor 2 (likely two floats per complex element) are on
// lines not visible in this excerpt - confirm against the full body.
868 template <
bool first_stage>
869 void fft_radix_8_axes_0(
870 float *out,
float *in,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
872 float32x2_t
w{1.0f, 0.0f};
873 for (
unsigned int j = 0; j < Nx; j++)
875 const float32x2_t w2 = c_mul_neon(
w,
w);
876 const float32x2_t w3 = c_mul_neon(w2,
w);
877 const float32x2_t w4 = c_mul_neon(w3,
w);
878 const float32x2_t w5 = c_mul_neon(w4,
w);
879 const float32x2_t w6 = c_mul_neon(w5,
w);
880 const float32x2_t w7 = c_mul_neon(w6,
w);
882 for (
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
// Working values start zeroed; how they are filled is not shown here.
885 float32x2_t a = {0, 0};
886 float32x2_t
b = {0, 0};
887 float32x2_t c = {0, 0};
888 float32x2_t d = {0, 0};
889 float32x2_t e = {0, 0};
890 float32x2_t f = {0, 0};
891 float32x2_t g = {0, 0};
892 float32x2_t h = {0, 0};
924 fft_8(a,
b, c, d, e, f, g, h,
w, w2, w3, w4, w5, w6, w7);
// Advance w by w_m (presumably once per j iteration - confirm).
947 w = c_mul_neon(
w, w_m);
// fft_radix_8_axes_1: visible skeleton of the radix-8 stage for axis 1.
//  - j runs over [0, Nx); for each j, k starts at 2 * j and advances by
//    2 * NxRadix while k < 2 * M.
//  - w starts at {1, 0}; w2..w7 are built by repeatedly combining with w
//    through c_mul_neon and handed to fft_8; w is then updated with
//    w = c_mul_neon(w, w_m).
// NOTE(review): several parameters (e.g. Nx, M), the definitions of a..h,
// the loads/stores and the use of in_pad_x / out_pad_x are on lines not
// visible in this excerpt - confirm against the full body.
951 void fft_radix_8_axes_1(
float *out,
954 unsigned int NxRadix,
955 const float32x2_t &w_m,
958 unsigned int in_pad_x,
959 unsigned int out_pad_x)
961 float32x2_t
w{1.0f, 0.0f};
962 for (
unsigned int j = 0; j < Nx; j++)
964 const float32x2_t w2 = c_mul_neon(
w,
w);
965 const float32x2_t w3 = c_mul_neon(w2,
w);
966 const float32x2_t w4 = c_mul_neon(w3,
w);
967 const float32x2_t w5 = c_mul_neon(w4,
w);
968 const float32x2_t w6 = c_mul_neon(w5,
w);
969 const float32x2_t w7 = c_mul_neon(w6,
w);
971 for (
unsigned int k = 2 * j; k < 2 *
M; k += 2 * NxRadix)
984 fft_8(a,
b, c, d, e, f, g, h,
w, w2, w3, w4, w5, w6, w7);
// Advance w by w_m (presumably once per j iteration - confirm).
997 w = c_mul_neon(
w, w_m);
// validate_arguments: returns a Status for the given input/output tensor
// infos and radix-stage configuration.
// NOTE(review): the individual checks are on lines not visible in this
// excerpt.
1001 Status
validate_arguments(
const ITensorInfo *
input,
const ITensorInfo *output,
const FFTRadixStageKernelInfo &config)
// Guarded section: entered only when an output info is supplied and its
// total_size() is non-zero.
1009 if ((output !=
nullptr) && (output->total_size() != 0))
1018 std::pair<Status, Window>
1023 if (output !=
nullptr)
1030 return std::make_pair(Status{}, win);
1035 : _input(nullptr), _output(nullptr), _Nx(0), _axis(0), _radix(0), _func_0(), _func_1()
// Static lookup table: fft_table_axis0[radix][flag] -> axis-0 stage function.
// The bool key selects between the fft_radix_*_axes_0<false> and <true>
// instantiations. Entries exist for radix 2, 3, 4, 5, 7 and 8.
// NOTE(review): the enclosing function header is not visible in this excerpt.
1042 static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
// Filled on first use, i.e. only while the table is still empty.
// NOTE(review): the fill is guarded by empty() alone; if this code can be
// reached from several threads concurrently it is unsynchronized - confirm.
1044 if (fft_table_axis0.empty())
1046 fft_table_axis0[2][
false] = &fft_radix_2_axes_0<false>;
1047 fft_table_axis0[3][
false] = &fft_radix_3_axes_0<false>;
1048 fft_table_axis0[4][
false] = &fft_radix_4_axes_0<false>;
1049 fft_table_axis0[5][
false] = &fft_radix_5_axes_0<false>;
1050 fft_table_axis0[7][
false] = &fft_radix_7_axes_0<false>;
1051 fft_table_axis0[8][
false] = &fft_radix_8_axes_0<false>;
1053 fft_table_axis0[2][
true] = &fft_radix_2_axes_0<true>;
1054 fft_table_axis0[3][
true] = &fft_radix_3_axes_0<true>;
1055 fft_table_axis0[4][
true] = &fft_radix_4_axes_0<true>;
1056 fft_table_axis0[5][
true] = &fft_radix_5_axes_0<true>;
1057 fft_table_axis0[7][
true] = &fft_radix_7_axes_0<true>;
1058 fft_table_axis0[8][
true] = &fft_radix_8_axes_0<true>;
// set_radix_stage_axis1: looks up the axis-1 stage function for config.radix
// and stores it in _func_1.
1064 void NEFFTRadixStageKernel::set_radix_stage_axis1(
const FFTRadixStageKernelInfo &config)
// Static radix -> function table, filled on first use (while still empty).
// NOTE(review): the fill is guarded by empty() alone; if this can be called
// from several threads concurrently it is unsynchronized - confirm.
1067 static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
1069 if (fft_table_axis1.empty())
1071 fft_table_axis1[2] = &fft_radix_2_axes_1;
1072 fft_table_axis1[3] = &fft_radix_3_axes_1;
1073 fft_table_axis1[4] = &fft_radix_4_axes_1;
1074 fft_table_axis1[5] = &fft_radix_5_axes_1;
1075 fft_table_axis1[7] = &fft_radix_7_axes_1;
1076 fft_table_axis1[8] = &fft_radix_8_axes_1;
// NOTE(review): std::map::operator[] default-inserts an entry when
// config.radix is not one of {2, 3, 4, 5, 7, 8}; _func_1 would then receive a
// value-initialized FFTFunctionPointerAxis1. Presumably the radix is
// validated before this point - confirm.
1079 _func_1 = fft_table_axis1[config.radix];
1087 if (output !=
nullptr)
1096 _output = (output ==
nullptr) ?
input : output;
1098 _axis = config.
axis;
1099 _radix = config.
radix;
1101 switch (config.
axis)
1104 set_radix_stage_axis0(config);
1107 set_radix_stage_axis1(config);
1118 INEKernel::configure(win_config.second);
1125 const bool run_in_place = (output ==
nullptr) || (output ==
input);
1136 return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
1146 input_window.
set(_axis, 0);
1149 Iterator out(_output, input_window);
1152 const unsigned int NxRadix = _radix * _Nx;
1153 const float alpha = 2.0f * kPi / float(NxRadix);
1154 const float32x2_t w_m{cosf(alpha), -sinf(alpha)};
1162 _func_0(
reinterpret_cast<float *
>(out.
ptr()),
reinterpret_cast<float *
>(in.
ptr()), _Nx, NxRadix, w_m,
1175 _func_1(
reinterpret_cast<float *
>(out.
ptr()),
reinterpret_cast<float *
>(in.
ptr()), _Nx, NxRadix, w_m,
N,