// Trigonometric constants used by the radix-N FFT butterflies below.
// All values are stored as float to match the float32x2_t NEON lanes.

// pi, narrowed to float for single-precision twiddle computation.
constexpr float kPi = float(M_PI);

// sqrt(3) / 2: imaginary component of the radix-3 butterfly rotation.
// 'f' suffix added for consistency with the other constants (avoids an
// implicit double -> float narrowing of the literal).
constexpr float kSqrt3Div2 = 0.866025403784438f;

// Radix-5 twiddle factors: cos/sin of 2*pi/5 (kW5_0 / kW5_1) and of
// 4*pi/5 (kW5_2 / kW5_3). Stored as magnitudes; the signs are applied
// at the use sites in fft_5 (e.g. { -kW5_2, kW5_3 }).
constexpr float kW5_0 = 0.30901699437494f;
constexpr float kW5_1 = 0.95105651629515f;
constexpr float kW5_2 = 0.80901699437494f;
constexpr float kW5_3 = 0.58778525229247f;

// Radix-7 twiddle factors: cos/sin of 2*pi/7 (kW7_0 / kW7_1), 4*pi/7
// (kW7_2 / kW7_3) and 6*pi/7 (kW7_4 / kW7_5). Stored as magnitudes;
// signs are applied at the use sites in fft_7.
constexpr float kW7_0 = 0.62348980185873f;
constexpr float kW7_1 = 0.78183148246802f;
constexpr float kW7_2 = 0.22252093395631f;
constexpr float kW7_3 = 0.97492791218182f;
constexpr float kW7_4 = 0.90096886790241f;
constexpr float kW7_5 = 0.43388373911755f;

// sqrt(2) / 2 = cos(pi/4) = sin(pi/4), used by the radix-8 butterfly.
// 'f' suffix added for consistency with the other constants.
constexpr float kSqrt2Div2 = 0.707106781186548f;
69 float32x2_t c_mul_neon(float32x2_t a, float32x2_t
b)
71 using ExactTagType =
typename wrapper::traits::neon_vector<float, 2>::tag_type;
73 const float32x2_t mask = { -1.0, 1.0 };
86 float32x2_t c_mul_neon_img(float32x2_t a,
float img_constant)
91 const auto out =
wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
95 float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_t d, float32x2_t e)
103 float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
114 float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
126 void fft_2(float32x2_t &x, float32x2_t &y, float32x2_t &
w)
129 float32x2_t b = c_mul_neon(w, y);
135 void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z,
const float32x2_t &
w,
const float32x2_t &w2)
138 float32x2_t b = c_mul_neon(w, y);
139 float32x2_t c = c_mul_neon(w2, z);
145 const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 },
wrapper::vsub(b, c));
152 void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4,
const float32x2_t &w,
const float32x2_t &w2,
const float32x2_t &w3)
155 float32x2_t b = c_mul_neon(w, x2);
156 float32x2_t c = c_mul_neon(w2, x3);
157 float32x2_t d = c_mul_neon(w3, x4);
176 void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5,
const float32x2_t &w,
const float32x2_t &w2,
const float32x2_t &w3,
const float32x2_t &w4)
179 const auto b = c_mul_neon(w, x2);
180 const auto c = c_mul_neon(w2, x3);
181 const auto d = c_mul_neon(w3, x4);
182 const auto e = c_mul_neon(w4, x5);
184 const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 },
b);
185 const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 },
b);
186 const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 },
b);
187 const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 },
b);
189 const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
190 const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
191 const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
192 const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);
194 const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
195 const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
196 const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
197 const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);
199 const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
200 const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
201 const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
202 const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);
204 x1 = reduce_sum_5(a, b, c, d, e);
205 x2 = reduce_sum_5(a, b0, c0, d0, e0);
206 x3 = reduce_sum_5(a, b1, c1, d1, e1);
207 x4 = reduce_sum_5(a, b2, c2, d2, e2);
208 x5 = reduce_sum_5(a, b3, c3, d3, e3);
211 void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7,
const float32x2_t &w,
const float32x2_t &w2,
const float32x2_t &w3,
212 const float32x2_t &w4,
213 const float32x2_t &w5,
const float32x2_t &w6)
216 const auto b = c_mul_neon(w, x2);
217 const auto c = c_mul_neon(w2, x3);
218 const auto d = c_mul_neon(w3, x4);
219 const auto e = c_mul_neon(w4, x5);
220 const auto f = c_mul_neon(w5, x6);
221 const auto g = c_mul_neon(w6, x7);
223 const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 },
b);
224 const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 },
b);
225 const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 },
b);
226 const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 },
b);
227 const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 },
b);
228 const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 },
b);
230 const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
231 const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
232 const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
233 const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
234 const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
235 const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);
237 const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
238 const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
239 const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
240 const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d);
241 const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
242 const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);
244 const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
245 const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
246 const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
247 const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
248 const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
249 const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);
251 const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
252 const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
253 const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
254 const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
255 const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
256 const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);
258 const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
259 const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
260 const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
261 const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
262 const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
263 const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);
265 x1 = reduce_sum_7(a, b, c, d, e, f, g);
266 x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
267 x3 = reduce_sum_7(a, b1, c1, d1, e1, f1, g1);
268 x4 = reduce_sum_7(a, b2, c2, d2, e2, f2, g2);
269 x5 = reduce_sum_7(a, b3, c3, d3, e3, f3, g3);
270 x6 = reduce_sum_7(a, b4, c4, d4, e4, f4, g4);
271 x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
274 void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8,
const float32x2_t &w,
const float32x2_t &w2,
275 const float32x2_t &w3,
276 const float32x2_t &w4,
const float32x2_t &w5,
const float32x2_t &w6,
277 const float32x2_t &w7)
280 const auto b = c_mul_neon(w, x2);
281 const auto c = c_mul_neon(w2, x3);
282 const auto d = c_mul_neon(w3, x4);
283 const auto e = c_mul_neon(w4, x5);
284 const auto f = c_mul_neon(w5, x6);
285 const auto g = c_mul_neon(w6, x7);
286 const auto h = c_mul_neon(w7, x8);
288 const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 },
b);
289 const auto b1 = c_mul_neon(float32x2_t{ 0, -1 },
b);
290 const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 },
b);
291 const auto b3 = c_mul_neon(float32x2_t{ -1, 0 },
b);
292 const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 },
b);
293 const auto b5 = c_mul_neon(float32x2_t{ 0, 1 },
b);
294 const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 },
b);
296 const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
297 const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
298 const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
299 const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
300 const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
301 const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
302 const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
304 const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
305 const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
306 const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
307 const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
308 const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
309 const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
310 const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);
312 const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
313 const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
314 const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
315 const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
316 const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
317 const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
318 const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
320 const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
321 const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
322 const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
323 const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
324 const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
325 const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
326 const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);
328 const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
329 const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
330 const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
331 const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
332 const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
333 const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
334 const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
336 const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
337 const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
338 const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
339 const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
340 const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
341 const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
342 const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);
344 x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
345 x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
346 x3 = reduce_sum_8(a, b1, c1, d1, e1, f1, g1, h1);
347 x4 = reduce_sum_8(a, b2, c2, d2, e2, f2, g2, h2);
348 x5 = reduce_sum_8(a, b3, c3, d3, e3, f3, g3, h3);
349 x6 = reduce_sum_8(a, b4, c4, d4, e4, f4, g4, h4);
350 x7 = reduce_sum_8(a, b5, c5, d5, e5, f5, g5, h5);
351 x8 = reduce_sum_8(a, b6, c6, d6, e6, f6, g6, h6);
354 template <
bool first_stage>
355 void fft_radix_2_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
357 float32x2_t w{ 1.0f, 0.0f };
358 for(
unsigned int j = 0; j < Nx; j++)
360 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
362 auto a = float32x2_t{ 0, 0 };
363 auto b = float32x2_t{ 0, 0 };
393 w = c_mul_neon(w, w_m);
397 void fft_radix_2_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
399 float32x2_t w{ 1.0f, 0.0f };
400 for(
unsigned int j = 0; j < Nx; j++)
402 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
416 w = c_mul_neon(w, w_m);
420 template <
bool first_stage>
421 void fft_radix_3_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
423 float32x2_t w{ 1.0f, 0.0f };
424 for(
unsigned int j = 0; j < Nx; j++)
426 const auto w2 = c_mul_neon(w, w);
428 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
431 float32x2_t a = { 0, 0 };
432 float32x2_t b = { 0, 0 };
433 float32x2_t c = { 0, 0 };
448 fft_3(a, b, c, w, w2);
461 w = c_mul_neon(w, w_m);
465 void fft_radix_3_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
467 float32x2_t w{ 1.0f, 0.0f };
468 for(
unsigned int j = 0; j < Nx; j++)
470 const auto w2 = c_mul_neon(w, w);
472 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
480 fft_3(a, b, c, w, w2);
487 w = c_mul_neon(w, w_m);
491 template <
bool first_stage>
492 void fft_radix_4_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
494 float32x2_t w{ 1.0f, 0.0f };
495 for(
unsigned int j = 0; j < Nx; j++)
497 const auto w2 = c_mul_neon(w, w);
498 const auto w3 = c_mul_neon(w2, w);
500 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
502 float32x2_t a = { 0, 0 };
503 float32x2_t b = { 0, 0 };
504 float32x2_t c = { 0, 0 };
505 float32x2_t d = { 0, 0 };
525 fft_4(a, b, c, d, w, w2, w3);
541 w = c_mul_neon(w, w_m);
545 void fft_radix_4_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
547 float32x2_t w{ 1.0f, 0.0f };
548 for(
unsigned int j = 0; j < Nx; j++)
550 const auto w2 = c_mul_neon(w, w);
551 const auto w3 = c_mul_neon(w2, w);
553 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
562 fft_4(a, b, c, d, w, w2, w3);
570 w = c_mul_neon(w, w_m);
574 template <
bool first_stage>
575 void fft_radix_5_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
577 float32x2_t w{ 1.0f, 0.0f };
578 for(
unsigned int j = 0; j < Nx; j++)
580 const float32x2_t w2 = c_mul_neon(w, w);
581 const float32x2_t w3 = c_mul_neon(w2, w);
582 const float32x2_t w4 = c_mul_neon(w3, w);
584 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
586 float32x2_t a = { 0, 0 };
587 float32x2_t b = { 0, 0 };
588 float32x2_t c = { 0, 0 };
589 float32x2_t d = { 0, 0 };
590 float32x2_t e = { 0, 0 };
613 fft_5(a, b, c, d, e, w, w2, w3, w4);
631 w = c_mul_neon(w, w_m);
635 void fft_radix_5_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
637 float32x2_t w{ 1.0f, 0.0f };
638 for(
unsigned int j = 0; j < Nx; j++)
640 const float32x2_t w2 = c_mul_neon(w, w);
641 const float32x2_t w3 = c_mul_neon(w2, w);
642 const float32x2_t w4 = c_mul_neon(w3, w);
644 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
654 fft_5(a, b, c, d, e, w, w2, w3, w4);
664 w = c_mul_neon(w, w_m);
668 template <
bool first_stage>
669 void fft_radix_7_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
671 float32x2_t w{ 1.0f, 0.0f };
672 for(
unsigned int j = 0; j < Nx; j++)
674 const float32x2_t w2 = c_mul_neon(w, w);
675 const float32x2_t w3 = c_mul_neon(w2, w);
676 const float32x2_t w4 = c_mul_neon(w3, w);
677 const float32x2_t w5 = c_mul_neon(w4, w);
678 const float32x2_t w6 = c_mul_neon(w5, w);
680 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
682 float32x2_t a = { 0, 0 };
683 float32x2_t b = { 0, 0 };
684 float32x2_t c = { 0, 0 };
685 float32x2_t d = { 0, 0 };
686 float32x2_t e = { 0, 0 };
687 float32x2_t f = { 0, 0 };
688 float32x2_t g = { 0, 0 };
716 fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
736 w = c_mul_neon(w, w_m);
740 void fft_radix_7_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
742 float32x2_t w{ 1.0f, 0.0f };
743 for(
unsigned int j = 0; j < Nx; j++)
745 const float32x2_t w2 = c_mul_neon(w, w);
746 const float32x2_t w3 = c_mul_neon(w2, w);
747 const float32x2_t w4 = c_mul_neon(w3, w);
748 const float32x2_t w5 = c_mul_neon(w4, w);
749 const float32x2_t w6 = c_mul_neon(w5, w);
751 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
763 fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
775 w = c_mul_neon(w, w_m);
779 template <
bool first_stage>
780 void fft_radix_8_axes_0(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int N)
782 float32x2_t w{ 1.0f, 0.0f };
783 for(
unsigned int j = 0; j < Nx; j++)
785 const float32x2_t w2 = c_mul_neon(w, w);
786 const float32x2_t w3 = c_mul_neon(w2, w);
787 const float32x2_t w4 = c_mul_neon(w3, w);
788 const float32x2_t w5 = c_mul_neon(w4, w);
789 const float32x2_t w6 = c_mul_neon(w5, w);
790 const float32x2_t w7 = c_mul_neon(w6, w);
792 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
795 float32x2_t a = { 0, 0 };
796 float32x2_t b = { 0, 0 };
797 float32x2_t c = { 0, 0 };
798 float32x2_t d = { 0, 0 };
799 float32x2_t e = { 0, 0 };
800 float32x2_t f = { 0, 0 };
801 float32x2_t g = { 0, 0 };
802 float32x2_t h = { 0, 0 };
834 fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
857 w = c_mul_neon(w, w_m);
861 void fft_radix_8_axes_1(
float *X,
float *x,
unsigned int Nx,
unsigned int NxRadix,
const float32x2_t &w_m,
unsigned int M,
unsigned int N)
863 float32x2_t w{ 1.0f, 0.0f };
864 for(
unsigned int j = 0; j < Nx; j++)
866 const float32x2_t w2 = c_mul_neon(w, w);
867 const float32x2_t w3 = c_mul_neon(w2, w);
868 const float32x2_t w4 = c_mul_neon(w3, w);
869 const float32x2_t w5 = c_mul_neon(w4, w);
870 const float32x2_t w6 = c_mul_neon(w5, w);
871 const float32x2_t w7 = c_mul_neon(w6, w);
873 for(
unsigned int k = 2 * j; k < 2 *
N; k += 2 * NxRadix)
886 fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
899 w = c_mul_neon(w, w_m);
903 Status
validate_arguments(
const ITensorInfo *
input,
const ITensorInfo *output,
const FFTRadixStageKernelInfo &config)
911 if((output !=
nullptr) && (output->total_size() != 0))
920 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
const FFTRadixStageKernelInfo &config)
924 if(output !=
nullptr)
930 if(output !=
nullptr)
932 output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
935 return std::make_pair(Status{}, win);
940 : _input(nullptr), _output(nullptr), _run_in_place(false), _Nx(0), _axis(0), _radix(0), _func_0(), _func_1()
947 static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
949 if(fft_table_axis0.empty())
951 fft_table_axis0[2][
false] = &fft_radix_2_axes_0<false>;
952 fft_table_axis0[3][
false] = &fft_radix_3_axes_0<false>;
953 fft_table_axis0[4][
false] = &fft_radix_4_axes_0<false>;
954 fft_table_axis0[5][
false] = &fft_radix_5_axes_0<false>;
955 fft_table_axis0[7][
false] = &fft_radix_7_axes_0<false>;
956 fft_table_axis0[8][
false] = &fft_radix_8_axes_0<false>;
958 fft_table_axis0[2][
true] = &fft_radix_2_axes_0<true>;
959 fft_table_axis0[3][
true] = &fft_radix_3_axes_0<true>;
960 fft_table_axis0[4][
true] = &fft_radix_4_axes_0<true>;
961 fft_table_axis0[5][
true] = &fft_radix_5_axes_0<true>;
962 fft_table_axis0[7][
true] = &fft_radix_7_axes_0<true>;
963 fft_table_axis0[8][
true] = &fft_radix_8_axes_0<true>;
972 static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
974 if(fft_table_axis1.empty())
976 fft_table_axis1[2] = &fft_radix_2_axes_1;
977 fft_table_axis1[3] = &fft_radix_3_axes_1;
978 fft_table_axis1[4] = &fft_radix_4_axes_1;
979 fft_table_axis1[5] = &fft_radix_5_axes_1;
980 fft_table_axis1[7] = &fft_radix_7_axes_1;
981 fft_table_axis1[8] = &fft_radix_8_axes_1;
984 _func_1 = fft_table_axis1[config.
radix];
992 if(output !=
nullptr)
1001 _run_in_place = (output ==
nullptr) || (output == input);
1003 _axis = config.
axis;
1004 _radix = config.
radix;
1009 set_radix_stage_axis0(config);
1012 set_radix_stage_axis1(config);
1020 auto win_config = validate_and_configure_window(input->
info(), (_run_in_place) ?
nullptr : output->
info(), config);
1022 INEKernel::configure(win_config.second);
1027 const bool run_in_place = (output ==
nullptr) || (output == input);
1030 (run_in_place) ?
nullptr : output->
clone().get(),
1039 return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
1049 input_window.
set(_axis, 0);
1052 Iterator out(_run_in_place ? _input : _output, input_window);
1055 const unsigned int NxRadix = _radix * _Nx;
1056 const float alpha = 2.0f * kPi / float(NxRadix);
1057 const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
1064 _func_0(reinterpret_cast<float *>(out.
ptr()), reinterpret_cast<float *>(in.
ptr()), _Nx, NxRadix, w_m, N);
1074 _func_1(reinterpret_cast<float *>(out.
ptr()), reinterpret_cast<float *>(in.
ptr()), _Nx, NxRadix, w_m, N, M);
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Traits defined on Neon vectors.
const Window & window() const
The maximum window the kernel can be executed on.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
Static function to check if given info will lead to a valid configuration of NEFFTRadixStageKernel.
1 channel, 1 F32 per channel
void configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config)
Set the input and output tensors.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
unsigned int axis
Axis to run the kernel on.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
NEFFTRadixStageKernel()
Constructor.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
static std::set< unsigned int > supported_radix()
Returns the radix that are support by the FFT kernel.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
Descriptor used by the FFT core kernels.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
uint8x8_t vgetlow(const uint8x16_t val)
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
int8x8_t vneg(const int8x8_t &a)
uint8x8_t vgethigh(const uint8x16_t val)
unsigned int radix
Radix to use.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
Information about executing thread and CPU.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
bool is_first_stage
Flags if the FFT kernels is the first stage of a decomposed FFT.
unsigned int Nx
Nx coefficient.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
uint8x8_t vrev64(const uint8x8_t &a)
uint8x8_t vload(const uint8_t *ptr)
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Iterator updated by execute_window_loop for each window element.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)