Compute Library
 21.11
NEFFTRadixStageKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
28 #include "arm_compute/core/Types.h"
29 #include "arm_compute/core/Utils.h"
36 
37 #include <arm_neon.h>
38 #include <cmath>
39 #include <complex>
40 #include <map>
41 
42 namespace arm_compute
43 {
44 namespace
45 {
// Pi. Defined as an exact literal because M_PI is POSIX, not ISO C++;
// the literal rounds to the same float value as float(M_PI).
constexpr float kPi = 3.14159265358979323846f;

// sqrt(3) / 2, used in the fft_3 kernel
constexpr float kSqrt3Div2 = 0.866025403784438f;

// Constants used in the fft_5 kernel: cos/sin magnitudes of the 5th roots of unity
constexpr float kW5_0 = 0.30901699437494f; // cos(2*pi/5)
constexpr float kW5_1 = 0.95105651629515f; // sin(2*pi/5)
constexpr float kW5_2 = 0.80901699437494f; // |cos(4*pi/5)|
constexpr float kW5_3 = 0.58778525229247f; // sin(4*pi/5)

// Constants used in the fft_7 kernel: cos/sin magnitudes of the 7th roots of unity
constexpr float kW7_0 = 0.62348980185873f; // cos(2*pi/7)
constexpr float kW7_1 = 0.78183148246802f; // sin(2*pi/7)
constexpr float kW7_2 = 0.22252093395631f; // |cos(4*pi/7)|
constexpr float kW7_3 = 0.97492791218182f; // sin(4*pi/7)
constexpr float kW7_4 = 0.90096886790241f; // |cos(6*pi/7)|
constexpr float kW7_5 = 0.43388373911755f; // sin(6*pi/7)

// sqrt(2) / 2, used in the fft_8 kernel
constexpr float kSqrt2Div2 = 0.707106781186548f;
68 
69 float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
70 {
71  using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
72 
73  const float32x2_t mask = { -1.0, 1.0 };
74  const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
75  const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
76 
77  float32x2_t res = wrapper::vmul(tmp0, b);
78 
79  b = wrapper::vrev64(b);
80  b = wrapper::vmul(b, mask);
81  res = wrapper::vmla(res, tmp1, b);
82 
83  return res;
84 }
85 
86 float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
87 {
88  const float a_r = wrapper::vgetlane(a, 0);
89  const float a_i = wrapper::vgetlane(a, 1);
90 
91  const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
92  return out;
93 }
94 
95 float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_t d, float32x2_t e)
96 {
97  const auto t0 = wrapper::vadd(a, b);
98  const auto t1 = wrapper::vadd(c, d);
99  const auto t2 = wrapper::vadd(t0, t1);
100  return wrapper::vadd(t2, e);
101 }
102 
103 float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
104 {
105  const auto t0 = wrapper::vadd(x1, x2);
106  const auto t1 = wrapper::vadd(x3, x4);
107  const auto t2 = wrapper::vadd(x5, x6);
108  const auto t00 = wrapper::vadd(t0, t1);
109  const auto t01 = wrapper::vadd(t2, x7);
110 
111  return wrapper::vadd(t00, t01);
112 }
113 
114 float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
115 {
116  const auto t0 = wrapper::vadd(x1, x2);
117  const auto t1 = wrapper::vadd(x3, x4);
118  const auto t2 = wrapper::vadd(x5, x6);
119  const auto t3 = wrapper::vadd(x7, x8);
120  const auto t00 = wrapper::vadd(t0, t1);
121  const auto t01 = wrapper::vadd(t2, t3);
122 
123  return wrapper::vadd(t00, t01);
124 }
125 
126 void fft_2(float32x2_t &x, float32x2_t &y, float32x2_t &w)
127 {
128  float32x2_t a = x;
129  float32x2_t b = c_mul_neon(w, y);
130 
131  x = wrapper::vadd(a, b);
132  y = wrapper::vsub(a, b);
133 }
134 
135 void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, const float32x2_t &w2)
136 {
137  float32x2_t a = x;
138  float32x2_t b = c_mul_neon(w, y);
139  float32x2_t c = c_mul_neon(w2, z);
140 
141  x = wrapper::vadd(a, b);
142  x = wrapper::vadd(x, c);
143 
144  const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c));
145  const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c));
146 
147  y = z = wrapper::vsub(a, v1);
148  y = wrapper::vadd(y, v2);
149  z = wrapper::vsub(z, v2);
150 }
151 
152 void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
153 {
154  float32x2_t a = x1;
155  float32x2_t b = c_mul_neon(w, x2);
156  float32x2_t c = c_mul_neon(w2, x3);
157  float32x2_t d = c_mul_neon(w3, x4);
158 
159  const auto x11 = wrapper::vadd(a, b);
160  const auto x12 = wrapper::vadd(c, d);
161  x1 = wrapper::vadd(x11, x12);
162 
163  const auto x21 = wrapper::vadd(a, c_mul_neon_img(b, -1));
164  const auto x22 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, 1.f));
165  x2 = wrapper::vadd(x21, x22);
166 
167  const auto x31 = wrapper::vadd(a, wrapper::vneg(b));
168  const auto x32 = wrapper::vadd(c, wrapper::vneg(d));
169  x3 = wrapper::vadd(x31, x32);
170 
171  const auto x41 = wrapper::vadd(a, c_mul_neon_img(b, 1));
172  const auto x42 = wrapper::vadd(wrapper::vneg(c), c_mul_neon_img(d, -1));
173  x4 = wrapper::vadd(x41, x42);
174 }
175 
// In-place radix-5 butterfly on five packed (real, imag) complex values.
// w..w4 are the stage twiddle factors. The hard-coded {+-kW5_*, +-kW5_*}
// pairs are the fixed 5th roots of unity e^(-2*pi*i*k/5): output bin m sums
// each twiddled input rotated by the root for (input index * m) mod 5.
void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
{
    // Apply the stage twiddle factors.
    const auto a = x1;
    const auto b = c_mul_neon(w, x2);
    const auto c = c_mul_neon(w2, x3);
    const auto d = c_mul_neon(w3, x4);
    const auto e = c_mul_neon(w4, x5);

    // bK/cK/dK/eK: input rotated by the 5th root of unity used for bin K+1.
    const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b);
    const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b);
    const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b);
    const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b);

    const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
    const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
    const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
    const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);

    const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
    const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
    const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
    const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);

    const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
    const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
    const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
    const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);

    // Combine into the five output bins (bin 0 is the unrotated sum).
    x1 = reduce_sum_5(a, b, c, d, e);
    x2 = reduce_sum_5(a, b0, c0, d0, e0);
    x3 = reduce_sum_5(a, b1, c1, d1, e1);
    x4 = reduce_sum_5(a, b2, c2, d2, e2);
    x5 = reduce_sum_5(a, b3, c3, d3, e3);
}
210 
// In-place radix-7 butterfly on seven packed (real, imag) complex values.
// w..w6 are the stage twiddle factors. The hard-coded {+-kW7_*, +-kW7_*}
// pairs are the fixed 7th roots of unity e^(-2*pi*i*k/7): output bin m sums
// each twiddled input rotated by the root for (input index * m) mod 7.
void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3,
           const float32x2_t &w4,
           const float32x2_t &w5, const float32x2_t &w6)
{
    // Apply the stage twiddle factors.
    const auto a = x1;
    const auto b = c_mul_neon(w, x2);
    const auto c = c_mul_neon(w2, x3);
    const auto d = c_mul_neon(w3, x4);
    const auto e = c_mul_neon(w4, x5);
    const auto f = c_mul_neon(w5, x6);
    const auto g = c_mul_neon(w6, x7);

    // bK/cK/.../gK: input rotated by the 7th root of unity used for bin K+1.
    const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b);
    const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b);
    const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b);
    const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b);
    const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b);
    const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b);

    const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
    const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
    const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
    const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
    const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
    const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);

    const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
    const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
    const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
    const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d);
    const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
    const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);

    const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
    const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
    const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
    const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
    const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
    const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);

    const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
    const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
    const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
    const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
    const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
    const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);

    const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
    const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
    const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
    const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
    const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
    const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);

    // Combine into the seven output bins (bin 0 is the unrotated sum).
    x1 = reduce_sum_7(a, b, c, d, e, f, g);
    x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
    x3 = reduce_sum_7(a, b1, c1, d1, e1, f1, g1);
    x4 = reduce_sum_7(a, b2, c2, d2, e2, f2, g2);
    x5 = reduce_sum_7(a, b3, c3, d3, e3, f3, g3);
    x6 = reduce_sum_7(a, b4, c4, d4, e4, f4, g4);
    x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
}
273 
// In-place radix-8 butterfly on eight packed (real, imag) complex values.
// w..w7 are the stage twiddle factors. The hard-coded coefficient pairs are
// the fixed 8th roots of unity e^(-2*pi*i*k/8), i.e. combinations of
// {0, +-1, +-sqrt(2)/2}: output bin m sums each twiddled input rotated by the
// root for (input index * m) mod 8.
void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2,
           const float32x2_t &w3,
           const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6,
           const float32x2_t &w7)
{
    // Apply the stage twiddle factors.
    const auto a = x1;
    const auto b = c_mul_neon(w, x2);
    const auto c = c_mul_neon(w2, x3);
    const auto d = c_mul_neon(w3, x4);
    const auto e = c_mul_neon(w4, x5);
    const auto f = c_mul_neon(w5, x6);
    const auto g = c_mul_neon(w6, x7);
    const auto h = c_mul_neon(w7, x8);

    // bK/cK/.../hK: input rotated by the 8th root of unity used for bin K+1.
    const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b);
    const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
    const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b);
    const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
    const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b);
    const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
    const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b);

    const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
    const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
    const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
    const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
    const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
    const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
    const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);

    const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
    const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
    const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
    const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
    const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
    const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
    const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);

    const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
    const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
    const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
    const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
    const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
    const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
    const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);

    const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
    const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
    const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
    const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
    const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
    const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
    const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);

    const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
    const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
    const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
    const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
    const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
    const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
    const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);

    const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
    const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
    const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
    const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
    const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
    const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
    const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);

    // Combine into the eight output bins (bin 0 is the unrotated sum).
    x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
    x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
    x3 = reduce_sum_8(a, b1, c1, d1, e1, f1, g1, h1);
    x4 = reduce_sum_8(a, b2, c2, d2, e2, f2, g2, h2);
    x5 = reduce_sum_8(a, b3, c3, d3, e3, f3, g3, h3);
    x6 = reduce_sum_8(a, b4, c4, d4, e4, f4, g4, h4);
    x7 = reduce_sum_8(a, b5, c5, d5, e5, f5, g5, h5);
    x8 = reduce_sum_8(a, b6, c6, d6, e6, f6, g6, h6);
}
353 
// Radix-2 FFT stage along axis 0 (contiguous interleaved-complex data,
// 2 floats per element).
//   Nx:      butterfly span of this stage, in complex elements.
//   NxRadix: Nx * 2, the period of one butterfly group.
//   w_m:     per-group twiddle increment; w starts at 1 + 0i.
//   N:       total number of complex elements along the axis.
// When first_stage is true the two butterfly inputs are adjacent (Nx == 1 at
// the first stage — TODO confirm against caller), so both are moved with a
// single 128-bit load/store.
template <bool first_stage>
void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // k walks the start offsets (in floats) of the butterflies sharing w.
        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
        {
            auto a = float32x2_t{ 0, 0 };
            auto b = float32x2_t{ 0, 0 };

            // Load inputs
            if(first_stage)
            {
                const auto ab = wrapper::vloadq(in + k);
                a = wrapper::vgetlow(ab);
                b = wrapper::vgethigh(ab);
            }
            else
            {
                a = wrapper::vload(in + k);
                b = wrapper::vload(in + k + 2 * Nx);
            }

            // Base-case prime transform
            fft_2(a, b, w);

            // Write outputs
            if(first_stage)
            {
                wrapper::vstore(out + k, wrapper::vcombine(a, b));
            }
            else
            {
                wrapper::vstore(out + k, a);
                wrapper::vstore(out + k + 2 * Nx, b);
            }
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
396 
// Radix-2 FFT stage along axis 1: elements of one transform are separated by
// whole rows, so each access strides by (N + pad) floats. M is the number of
// complex elements along the transformed axis; in_pad_x/out_pad_x are the
// per-row padding of the input/output buffers, in floats.
void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
        {
            // Load inputs
            float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
            float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));

            // Base-case prime transform
            fft_2(a, b, w);

            // Write outputs
            wrapper::vstore(out + (N + out_pad_x) * k, a);
            wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
419 
// Radix-3 FFT stage along axis 0 (contiguous interleaved-complex data).
// See fft_radix_2_axes_0 for the meaning of Nx/NxRadix/w_m/N. In the
// first_stage path only the first two inputs are adjacent enough for a
// 128-bit load; the third is always loaded separately.
template <bool first_stage>
void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Higher powers of the twiddle factor for the extra inputs.
        const auto w2 = c_mul_neon(w, w);

        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
        {
            // Load inputs
            float32x2_t a = { 0, 0 };
            float32x2_t b = { 0, 0 };
            float32x2_t c = { 0, 0 };
            if(first_stage)
            {
                const auto ab = wrapper::vloadq(in + k);
                a = wrapper::vgetlow(ab);
                b = wrapper::vgethigh(ab);
            }
            else
            {
                a = wrapper::vload(in + k);
                b = wrapper::vload(in + k + 2 * Nx);
            }
            c = wrapper::vload(in + k + 4 * Nx);

            // Base-case prime transform
            fft_3(a, b, c, w, w2);

            // Write outputs
            if(first_stage)
            {
                wrapper::vstore(out + k, wrapper::vcombine(a, b));
            }
            else
            {
                wrapper::vstore(out + k, a);
                wrapper::vstore(out + k + 2 * Nx, b);
            }
            wrapper::vstore(out + k + 4 * Nx, c);
        }
        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
464 
// Radix-3 FFT stage along axis 1: strided row accesses, see
// fft_radix_2_axes_1 for the meaning of N/M/in_pad_x/out_pad_x.
void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Square of the twiddle factor for the third input.
        const auto w2 = c_mul_neon(w, w);

        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
        {
            // Load inputs
            float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
            float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
            float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));

            // Base-case prime transform
            fft_3(a, b, c, w, w2);

            // Store the output
            wrapper::vstore(out + (N + out_pad_x) * k, a);
            wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
            wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
        }
        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
490 
// Radix-4 FFT stage along axis 0 (contiguous interleaved-complex data).
// See fft_radix_2_axes_0 for the meaning of Nx/NxRadix/w_m/N. In the
// first_stage path the four inputs form two adjacent complex pairs, each
// moved with one 128-bit access.
template <bool first_stage>
void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Higher powers of the twiddle factor for the extra inputs.
        const auto w2 = c_mul_neon(w, w);
        const auto w3 = c_mul_neon(w2, w);

        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
        {
            float32x2_t a = { 0, 0 };
            float32x2_t b = { 0, 0 };
            float32x2_t c = { 0, 0 };
            float32x2_t d = { 0, 0 };
            if(first_stage)
            {
                const auto ab = wrapper::vloadq(in + k);
                const auto cd = wrapper::vloadq(in + k + 4 * Nx);
                a = wrapper::vgetlow(ab);
                b = wrapper::vgethigh(ab);
                c = wrapper::vgetlow(cd);
                d = wrapper::vgethigh(cd);
            }
            else
            {
                // Load inputs
                a = wrapper::vload(in + k);
                b = wrapper::vload(in + k + 2 * Nx);
                c = wrapper::vload(in + k + 4 * Nx);
                d = wrapper::vload(in + k + 6 * Nx);
            }

            // Base-case prime transform
            fft_4(a, b, c, d, w, w2, w3);

            // Write outputs
            if(first_stage)
            {
                wrapper::vstore(out + k, wrapper::vcombine(a, b));
                wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
            }
            else
            {
                wrapper::vstore(out + k, a);
                wrapper::vstore(out + k + 2 * Nx, b);
                wrapper::vstore(out + k + 4 * Nx, c);
                wrapper::vstore(out + k + 6 * Nx, d);
            }
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
544 
// Radix-4 FFT stage along axis 1: strided row accesses, see
// fft_radix_2_axes_1 for the meaning of N/M/in_pad_x/out_pad_x.
void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Higher powers of the twiddle factor for the extra inputs.
        const auto w2 = c_mul_neon(w, w);
        const auto w3 = c_mul_neon(w2, w);

        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
        {
            // Load inputs
            float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
            float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
            float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));
            float32x2_t d = wrapper::vload(in + (N + in_pad_x) * (k + 6 * Nx));

            // Base-case prime transform
            fft_4(a, b, c, d, w, w2, w3);

            // Store outputs
            wrapper::vstore(out + (N + out_pad_x) * k, a);
            wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
            wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
            wrapper::vstore(out + (N + out_pad_x) * (k + 6 * Nx), d);
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
573 
// Radix-5 FFT stage along axis 0 (contiguous interleaved-complex data).
// See fft_radix_2_axes_0 for the meaning of Nx/NxRadix/w_m/N. In the
// first_stage path the first four inputs are moved as two 128-bit vectors;
// the fifth is always loaded separately.
template <bool first_stage>
void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Higher powers of the twiddle factor for the extra inputs.
        const float32x2_t w2 = c_mul_neon(w, w);
        const float32x2_t w3 = c_mul_neon(w2, w);
        const float32x2_t w4 = c_mul_neon(w3, w);

        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
        {
            float32x2_t a = { 0, 0 };
            float32x2_t b = { 0, 0 };
            float32x2_t c = { 0, 0 };
            float32x2_t d = { 0, 0 };
            float32x2_t e = { 0, 0 };

            // Load inputs
            if(first_stage)
            {
                const auto ab = wrapper::vloadq(in + k);
                const auto cd = wrapper::vloadq(in + k + 4 * Nx);

                a = wrapper::vgetlow(ab);
                b = wrapper::vgethigh(ab);
                c = wrapper::vgetlow(cd);
                d = wrapper::vgethigh(cd);
            }
            else
            {
                a = wrapper::vload(in + k);
                b = wrapper::vload(in + k + 2 * Nx);
                c = wrapper::vload(in + k + 4 * Nx);
                d = wrapper::vload(in + k + 6 * Nx);
            }
            e = wrapper::vload(in + k + 8 * Nx);

            // Base-case prime transform
            fft_5(a, b, c, d, e, w, w2, w3, w4);

            // Store outputs
            if(first_stage)
            {
                wrapper::vstore(out + k, wrapper::vcombine(a, b));
                wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
            }
            else
            {
                wrapper::vstore(out + k, a);
                wrapper::vstore(out + k + 2 * Nx, b);
                wrapper::vstore(out + k + 4 * Nx, c);
                wrapper::vstore(out + k + 6 * Nx, d);
            }
            wrapper::vstore(out + k + 8 * Nx, e);
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
634 
// Radix-5 FFT stage along axis 1: strided row accesses, see
// fft_radix_2_axes_1 for the meaning of N/M/in_pad_x/out_pad_x.
void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Higher powers of the twiddle factor for the extra inputs.
        const float32x2_t w2 = c_mul_neon(w, w);
        const float32x2_t w3 = c_mul_neon(w2, w);
        const float32x2_t w4 = c_mul_neon(w3, w);

        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
        {
            // Load inputs
            float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
            float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
            float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));
            float32x2_t d = wrapper::vload(in + (N + in_pad_x) * (k + 6 * Nx));
            float32x2_t e = wrapper::vload(in + (N + in_pad_x) * (k + 8 * Nx));

            // Base-case prime transform
            fft_5(a, b, c, d, e, w, w2, w3, w4);

            // Store outputs
            wrapper::vstore(out + (N + out_pad_x) * k, a);
            wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
            wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
            wrapper::vstore(out + (N + out_pad_x) * (k + 6 * Nx), d);
            wrapper::vstore(out + (N + out_pad_x) * (k + 8 * Nx), e);
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
667 
// Radix-7 FFT stage along axis 0 (contiguous interleaved-complex data).
// See fft_radix_2_axes_0 for the meaning of Nx/NxRadix/w_m/N. In the
// first_stage path the first six inputs are moved as three 128-bit vectors;
// the seventh is always loaded separately.
template <bool first_stage>
void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Higher powers of the twiddle factor for the extra inputs.
        const float32x2_t w2 = c_mul_neon(w, w);
        const float32x2_t w3 = c_mul_neon(w2, w);
        const float32x2_t w4 = c_mul_neon(w3, w);
        const float32x2_t w5 = c_mul_neon(w4, w);
        const float32x2_t w6 = c_mul_neon(w5, w);

        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
        {
            float32x2_t a = { 0, 0 };
            float32x2_t b = { 0, 0 };
            float32x2_t c = { 0, 0 };
            float32x2_t d = { 0, 0 };
            float32x2_t e = { 0, 0 };
            float32x2_t f = { 0, 0 };
            float32x2_t g = { 0, 0 };

            // Load inputs
            if(first_stage)
            {
                const auto ab = wrapper::vloadq(in + k);
                const auto cd = wrapper::vloadq(in + k + 4 * Nx);
                const auto ef = wrapper::vloadq(in + k + 8 * Nx);

                a = wrapper::vgetlow(ab);
                b = wrapper::vgethigh(ab);
                c = wrapper::vgetlow(cd);
                d = wrapper::vgethigh(cd);
                e = wrapper::vgetlow(ef);
                f = wrapper::vgethigh(ef);
            }
            else
            {
                a = wrapper::vload(in + k);
                b = wrapper::vload(in + k + 2 * Nx);
                c = wrapper::vload(in + k + 4 * Nx);
                d = wrapper::vload(in + k + 6 * Nx);
                e = wrapper::vload(in + k + 8 * Nx);
                f = wrapper::vload(in + k + 10 * Nx);
            }
            g = wrapper::vload(in + k + 12 * Nx);

            // Base-case prime transform
            fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);

            // Write outputs
            if(first_stage)
            {
                wrapper::vstore(out + k, wrapper::vcombine(a, b));
                wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
                wrapper::vstore(out + k + 8 * Nx, wrapper::vcombine(e, f));
            }
            else
            {
                wrapper::vstore(out + k, a);
                wrapper::vstore(out + k + 2 * Nx, b);
                wrapper::vstore(out + k + 4 * Nx, c);
                wrapper::vstore(out + k + 6 * Nx, d);
                wrapper::vstore(out + k + 8 * Nx, e);
                wrapper::vstore(out + k + 10 * Nx, f);
            }
            wrapper::vstore(out + k + 12 * Nx, g);
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
739 
// Radix-7 FFT stage along axis 1: strided row accesses, see
// fft_radix_2_axes_1 for the meaning of N/M/in_pad_x/out_pad_x.
void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Higher powers of the twiddle factor for the extra inputs.
        const float32x2_t w2 = c_mul_neon(w, w);
        const float32x2_t w3 = c_mul_neon(w2, w);
        const float32x2_t w4 = c_mul_neon(w3, w);
        const float32x2_t w5 = c_mul_neon(w4, w);
        const float32x2_t w6 = c_mul_neon(w5, w);

        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
        {
            // Load inputs
            float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
            float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
            float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));
            float32x2_t d = wrapper::vload(in + (N + in_pad_x) * (k + 6 * Nx));
            float32x2_t e = wrapper::vload(in + (N + in_pad_x) * (k + 8 * Nx));
            float32x2_t f = wrapper::vload(in + (N + in_pad_x) * (k + 10 * Nx));
            float32x2_t g = wrapper::vload(in + (N + in_pad_x) * (k + 12 * Nx));

            // Base-case prime transform
            fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);

            // Store outputs
            wrapper::vstore(out + (N + out_pad_x) * k, a);
            wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
            wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
            wrapper::vstore(out + (N + out_pad_x) * (k + 6 * Nx), d);
            wrapper::vstore(out + (N + out_pad_x) * (k + 8 * Nx), e);
            wrapper::vstore(out + (N + out_pad_x) * (k + 10 * Nx), f);
            wrapper::vstore(out + (N + out_pad_x) * (k + 12 * Nx), g);
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
778 
// Radix-8 FFT stage along axis 0 (contiguous interleaved-complex data).
// See fft_radix_2_axes_0 for the meaning of Nx/NxRadix/w_m/N. In the
// first_stage path all eight inputs are moved as four 128-bit vectors.
template <bool first_stage>
void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Higher powers of the twiddle factor for the extra inputs.
        const float32x2_t w2 = c_mul_neon(w, w);
        const float32x2_t w3 = c_mul_neon(w2, w);
        const float32x2_t w4 = c_mul_neon(w3, w);
        const float32x2_t w5 = c_mul_neon(w4, w);
        const float32x2_t w6 = c_mul_neon(w5, w);
        const float32x2_t w7 = c_mul_neon(w6, w);

        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
        {
            float32x2_t a = { 0, 0 };
            float32x2_t b = { 0, 0 };
            float32x2_t c = { 0, 0 };
            float32x2_t d = { 0, 0 };
            float32x2_t e = { 0, 0 };
            float32x2_t f = { 0, 0 };
            float32x2_t g = { 0, 0 };
            float32x2_t h = { 0, 0 };

            // Load inputs
            if(first_stage)
            {
                const auto ab = wrapper::vloadq(in + k);
                const auto cd = wrapper::vloadq(in + k + 4 * Nx);
                const auto ef = wrapper::vloadq(in + k + 8 * Nx);
                const auto gh = wrapper::vloadq(in + k + 12 * Nx);

                a = wrapper::vgetlow(ab);
                b = wrapper::vgethigh(ab);
                c = wrapper::vgetlow(cd);
                d = wrapper::vgethigh(cd);
                e = wrapper::vgetlow(ef);
                f = wrapper::vgethigh(ef);
                g = wrapper::vgetlow(gh);
                h = wrapper::vgethigh(gh);
            }
            else
            {
                a = wrapper::vload(in + k);
                b = wrapper::vload(in + k + 2 * Nx);
                c = wrapper::vload(in + k + 4 * Nx);
                d = wrapper::vload(in + k + 6 * Nx);
                e = wrapper::vload(in + k + 8 * Nx);
                f = wrapper::vload(in + k + 10 * Nx);
                g = wrapper::vload(in + k + 12 * Nx);
                h = wrapper::vload(in + k + 14 * Nx);
            }

            // Base-case prime transform (applies the twiddle factors internally)
            fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);

            // Store outputs
            if(first_stage)
            {
                wrapper::vstore(out + k, wrapper::vcombine(a, b));
                wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
                wrapper::vstore(out + k + 8 * Nx, wrapper::vcombine(e, f));
                wrapper::vstore(out + k + 12 * Nx, wrapper::vcombine(g, h));
            }
            else
            {
                wrapper::vstore(out + k, a);
                wrapper::vstore(out + k + 2 * Nx, b);
                wrapper::vstore(out + k + 4 * Nx, c);
                wrapper::vstore(out + k + 6 * Nx, d);
                wrapper::vstore(out + k + 8 * Nx, e);
                wrapper::vstore(out + k + 10 * Nx, f);
                wrapper::vstore(out + k + 12 * Nx, g);
                wrapper::vstore(out + k + 14 * Nx, h);
            }
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
860 
// Radix-8 FFT stage along axis 1: strided row accesses, see
// fft_radix_2_axes_1 for the meaning of N/M/in_pad_x/out_pad_x.
void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
{
    float32x2_t w{ 1.0f, 0.0f }; // current twiddle factor
    for(unsigned int j = 0; j < Nx; j++)
    {
        // Higher powers of the twiddle factor for the extra inputs.
        const float32x2_t w2 = c_mul_neon(w, w);
        const float32x2_t w3 = c_mul_neon(w2, w);
        const float32x2_t w4 = c_mul_neon(w3, w);
        const float32x2_t w5 = c_mul_neon(w4, w);
        const float32x2_t w6 = c_mul_neon(w5, w);
        const float32x2_t w7 = c_mul_neon(w6, w);

        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
        {
            // Load inputs
            float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
            float32x2_t b = wrapper::vload(in + (N + in_pad_x) * (k + 2 * Nx));
            float32x2_t c = wrapper::vload(in + (N + in_pad_x) * (k + 4 * Nx));
            float32x2_t d = wrapper::vload(in + (N + in_pad_x) * (k + 6 * Nx));
            float32x2_t e = wrapper::vload(in + (N + in_pad_x) * (k + 8 * Nx));
            float32x2_t f = wrapper::vload(in + (N + in_pad_x) * (k + 10 * Nx));
            float32x2_t g = wrapper::vload(in + (N + in_pad_x) * (k + 12 * Nx));
            float32x2_t h = wrapper::vload(in + (N + in_pad_x) * (k + 14 * Nx));

            // Base-case prime transform
            fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);

            // Store outputs
            wrapper::vstore(out + (N + out_pad_x) * k, a);
            wrapper::vstore(out + (N + out_pad_x) * (k + 2 * Nx), b);
            wrapper::vstore(out + (N + out_pad_x) * (k + 4 * Nx), c);
            wrapper::vstore(out + (N + out_pad_x) * (k + 6 * Nx), d);
            wrapper::vstore(out + (N + out_pad_x) * (k + 8 * Nx), e);
            wrapper::vstore(out + (N + out_pad_x) * (k + 10 * Nx), f);
            wrapper::vstore(out + (N + out_pad_x) * (k + 12 * Nx), g);
            wrapper::vstore(out + (N + out_pad_x) * (k + 14 * Nx), h);
        }

        // Advance the twiddle factor for the next group.
        w = c_mul_neon(w, w_m);
    }
}
902 
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
    // Only a 2D decomposition is supported: the FFT axis must be 0 or 1.
    ARM_COMPUTE_RETURN_ERROR_ON(config.axis > 1);
    // Silences the unused-parameter warning on builds where the check above compiles away.
    ARM_COMPUTE_UNUSED(config);

    // Checks performed when output is configured
    if((output != nullptr) && (output->total_size() != 0))
    {
        // NOTE(review): this body is empty in this extraction; the jumps in the
        // residual line numbering (911 -> 915) suggest shape/data-type match
        // checks were lost here — verify against the upstream source.
    }

    return Status{};
}
919 
920 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
921 {
922  ARM_COMPUTE_UNUSED(config);
923 
924  if(output != nullptr)
925  {
926  auto_init_if_empty(*output, *input);
927  }
928 
929  Window win = calculate_max_window(*input, Steps());
930 
931  return std::make_pair(Status{}, win);
932 }
933 } // namespace
934 
// Default state: no tensors bound, Nx/axis/radix zeroed, and no axis-0/axis-1
// stage function selected yet (configure() fills these in).
// NOTE(review): the constructor declarator line is not visible in this extraction.
 : _input(nullptr), _output(nullptr), _Nx(0), _axis(0), _radix(0), _func_0(), _func_1()
{
}
939 
940 void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo &config)
941 {
942  // FFT table axis 0: [radix, first_stage]
943  static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
944 
945  if(fft_table_axis0.empty())
946  {
947  fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>;
948  fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>;
949  fft_table_axis0[4][false] = &fft_radix_4_axes_0<false>;
950  fft_table_axis0[5][false] = &fft_radix_5_axes_0<false>;
951  fft_table_axis0[7][false] = &fft_radix_7_axes_0<false>;
952  fft_table_axis0[8][false] = &fft_radix_8_axes_0<false>;
953 
954  fft_table_axis0[2][true] = &fft_radix_2_axes_0<true>;
955  fft_table_axis0[3][true] = &fft_radix_3_axes_0<true>;
956  fft_table_axis0[4][true] = &fft_radix_4_axes_0<true>;
957  fft_table_axis0[5][true] = &fft_radix_5_axes_0<true>;
958  fft_table_axis0[7][true] = &fft_radix_7_axes_0<true>;
959  fft_table_axis0[8][true] = &fft_radix_8_axes_0<true>;
960  }
961 
962  _func_0 = fft_table_axis0[config.radix][config.is_first_stage];
963 }
964 
965 void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo &config)
966 {
967  // FFT table axis 1: [radix, first_stage]
968  static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
969 
970  if(fft_table_axis1.empty())
971  {
972  fft_table_axis1[2] = &fft_radix_2_axes_1;
973  fft_table_axis1[3] = &fft_radix_3_axes_1;
974  fft_table_axis1[4] = &fft_radix_4_axes_1;
975  fft_table_axis1[5] = &fft_radix_5_axes_1;
976  fft_table_axis1[7] = &fft_radix_7_axes_1;
977  fft_table_axis1[8] = &fft_radix_8_axes_1;
978  }
979 
980  _func_1 = fft_table_axis1[config.radix];
981 }
982 
{
    // NOTE(review): the declarator (NEFFTRadixStageKernel::configure) and, per the
    // residual numbering gap (984 -> 986), a null-pointer check on `input` are not
    // visible in this extraction — verify against the upstream source.

    // Output auto inizialitation if not yet initialized
    if(output != nullptr)
    {
        auto_init_if_empty(*output->info(), *input->info()->clone());
    }

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));

    // A null output selects in-place execution on the input tensor.
    _input = input;
    _output = (output == nullptr) ? input : output;
    _Nx = config.Nx;
    _axis = config.axis;
    _radix = config.radix;

    // Bind the stage function matching the requested axis.
    switch(config.axis)
    {
        case 0:
            set_radix_stage_axis0(config);
            break;
        case 1:
            set_radix_stage_axis1(config);
            break;
        default:
            ARM_COMPUTE_ERROR("Axis not supported");
            break;
    }

    // Configure kernel window
    auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);
}
1019 
{
    // NOTE(review): the declarator (static Status NEFFTRadixStageKernel::validate)
    // is not visible in this extraction.

    // A null output (or output aliasing input) means the stage runs in place.
    const bool run_in_place = (output == nullptr) || (output == input);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
    // Validate window configuration on clones so the caller's infos stay untouched.
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                              (run_in_place) ? nullptr : output->clone().get(),
                                                              config)
                                .first);

    return Status{};
}
1031 
{
    // NOTE(review): the declarator (NEFFTRadixStageKernel::supported_radix) is not
    // visible in this extraction.
    // Radix values that have a dedicated butterfly implementation in this file.
    return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
}
1036 
{
    // NOTE(review): the declarator (NEFFTRadixStageKernel::run) and, per the gaps in
    // the residual numbering, the usual configured-kernel/window validation macros
    // are not visible in this extraction — verify against the upstream source.
    ARM_COMPUTE_UNUSED(info);

    // Collapse the FFT axis of the window: each stage function walks that whole
    // dimension internally.
    Window input_window = window;
    input_window.set(_axis, 0);

    Iterator in(_input, input_window);
    Iterator out(_output, input_window);

    // Precompute FFT constants
    const unsigned int NxRadix = _radix * _Nx;
    const float alpha = 2.0f * kPi / float(NxRadix);
    // Base twiddle factor w_m = e^(-i*alpha), stored as { real, imag }.
    const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };

    if(_axis == 0)
    {
        const unsigned int N = _input->info()->dimension(0);
        execute_window_loop(input_window, [&](const Coordinates &)
        {
            _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N);
        },
        in, out);
    }
    else
    {
        const unsigned int N = _input->info()->dimension(0);
        const unsigned int M = _input->info()->dimension(1);
        execute_window_loop(input_window, [&](const Coordinates &)
        {
            // Total horizontal padding (left + right) lets the stage function
            // convert element offsets into padded-row strides.
            _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M,
                    _input->info()->padding().right + _input->info()->padding().left,
                    _output->info()->padding().right + _output->info()->padding().left);
        },
        in, out);
    }
}
1079 } // namespace arm_compute
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
SimpleTensor< float > w
Definition: DFT.cpp:156
Traits defined on Arm® Neon™ vectors.
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
Static function to check if given info will lead to a valid configuration of NEFFTRadixStageKernel.
1 channel, 1 F32 per channel
void configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config)
Set the input and output tensors.
Store the tensor&#39;s metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Definition: sub.h:39
#define M_PI
unsigned int M
unsigned int axis
Axis to run the kernel on.
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
Interface for CPU tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2021 Arm Limited.
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
Definition: getlane.h:91
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
Coordinates of an item.
Definition: Coordinates.h:37
unsigned int N
static std::set< unsigned int > supported_radix()
Returns the radix that are support by the FFT kernel.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
Descriptor used by the FFT core kernels.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor&#39;s metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
uint8x8_t vgetlow(const uint8x16_t val)
Definition: getlow.h:39
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
virtual PaddingSize padding() const =0
Padding of tensor.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
Definition: combine.h:39
unsigned int left
left of the border
Definition: Types.h:380
unsigned int right
right of the border
Definition: Types.h:378
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
int8x8_t vneg(const int8x8_t &a)
Definition: neg.h:39
uint8x8_t vgethigh(const uint8x16_t val)
Definition: gethigh.h:39
unsigned int radix
Radix to use.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
Definition: mul.h:39
Information about executing thread and CPU.
Definition: CPPTypes.h:158
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:439
bool is_first_stage
Flags if the FFT kernels is the first stage of a decomposed FFT.
unsigned int Nx
Nx coefficient.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:541
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:788
uint8x8_t vrev64(const uint8x8_t &a)
Definition: rev64.h:39
uint8x8_t vload(const uint8_t *ptr)
Definition: load.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:46
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:201