27 #define GLFFT_SHADER_FROM_FILE
29 #ifndef GLFFT_SHADER_FROM_FILE
30 #include "glsl/fft_common.inc"
31 #include "glsl/fft_radix4.inc"
32 #include "glsl/fft_radix8.inc"
33 #include "glsl/fft_radix16.inc"
34 #include "glsl/fft_radix64.inc"
35 #include "glsl/fft_shared.inc"
36 #include "glsl/fft_main.inc"
40 using namespace GLFFT;
59 if (divisor > 1 && wg_size >= divisor)
64 else if (divisor > 1 && wg_size < divisor)
87 Mode mode,
unsigned vector_size,
bool shared_banked,
unsigned radix,
91 unsigned wg_x = 0, wg_y = 0;
93 if (Ny == 1 && size.
y > 1)
95 throw logic_error(
"WorkGroupSize.y must be 1, when Ny == 1.\n");
115 wg_x = (2 * Nx) / (vector_size * size.
x) + pow2_stride;
116 wg_y = Ny / (size.
y * radix);
120 vector_size =
max(vector_size, 4u);
121 wg_x = (4 * Nx) / (vector_size * size.
x);
122 wg_y = Ny / (size.
y * radix);
126 wg_x = (2 * Nx) / (vector_size * radix * size.
x);
131 vector_size =
max(vector_size, 4u);
132 wg_x = (4 * Nx) / (vector_size * radix * size.
x);
140 return {
size, wg_x, wg_y, radix, vector_size, shared_banked };
146 return {
size, Nx / size.
x, Ny / size.
y, 2, 2,
false };
151 Mode mode,
unsigned vector_size,
unsigned radix,
156 mode, vector_size,
false, radix,
160 return res.num_workgroups_x > 0 && res.num_workgroups_y > 0;
173 return opt ? opt->first.cost : Nx * Ny * (log2(
float(radix)) + 2.0f);
180 : cost(cost), radices(move(radices)) {}
186 if ((cost == 0.0 || new_cost < cost) && a.
cost != 0.0 && b.
cost != 0.0)
200 bool pow2_stride,
const FFTWisdom &wisdom,
double &accumulate_cost)
226 double cost_table[8] = {0.0};
231 cost_table[2] =
find_cost(Nx, Ny, mode, 4, options, wisdom);
232 cost_table[3] =
find_cost(Nx, Ny, mode, 8, options, wisdom);
233 cost_table[4] =
find_cost(Nx, Ny, mode, 16, options, wisdom);
234 cost_table[6] =
find_cost(Nx, Ny, mode, 64, options, wisdom);
236 auto is_valid = [&](
unsigned radix) ->
bool {
242 mode, opt.vector_size, radix,
243 { opt.workgroup_size_x, opt.workgroup_size_y, workgroup_size_z },
248 for (
unsigned i = 2; i <= 6; i++)
256 if (is_valid(1 << i))
258 cost_propagate[i] =
CostPropagate(cost_table[i], { 1u << i });
263 for (
unsigned i = 4; (1u << i) <=
N; i++)
265 auto &
target = cost_propagate[i];
267 for (
unsigned r = 2; i -
r >=
r; r++)
272 if ((1u << i) ==
N &&
target.cost == 0.0)
274 throw logic_error(
"There is no possible subdivision ...\n");
284 auto &cost = cost_propagate[unsigned(log2(
float(
N)))];
285 auto radices = move(cost.radices);
287 sort(begin(radices),
end(radices), greater<unsigned>());
289 if (accumulate(begin(radices),
end(radices), 1u, multiplies<unsigned>()) !=
N)
291 throw logic_error(
"Radix splits are invalid.");
294 vector<Radix> radices_out;
295 radices_out.reserve(radices.size());
298 for (
auto radix : radices)
300 bool first = radices_out.empty();
301 bool last = radices_out.size() + 1 == radices.size();
306 auto &orig_opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix,
mode,
SSBO,
SSBO, options);
307 auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, radix,
mode,
308 first ? input_target :
SSBO,
309 last ? output_target : SSBO,
310 { orig_opt, options.type });
313 mode, opts.vector_size, opts.shared_banked, radix,
314 { opts.workgroup_size_x, opts.workgroup_size_y,
radix_to_wg_z(radix) },
318 accumulate_cost += cost.cost;
324 auto itr = programs.find(parameters);
325 if (itr !=
end(programs))
327 return itr->second.get();
337 programs[parameters] =
Program(program);
342 GLuint prog = cache->find_program(params);
345 prog = build_program(params);
348 throw runtime_error(
"Failed to compile shader.\n");
350 cache->insert_program(params, prog);
376 FFT::FFT(
unsigned Nx,
unsigned Ny,
377 unsigned radix,
unsigned p,
379 std::shared_ptr<ProgramCache> program_cache,
const FFTOptions &options)
380 : cache(move(program_cache)), size_x(Nx), size_y(Ny)
384 if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
386 throw logic_error(
"FFT size is not POT.");
389 if (p != 1 && input_target != SSBO)
391 throw logic_error(
"P != 1 only supported with SSBO as input.");
394 if (p < radix && output_target != SSBO)
396 throw logic_error(
"P < radix only supported with SSBO as output.");
435 throw logic_error(
"Invalid workgroup sizes for this radix.");
453 for (
auto &radix : radices[0])
456 radix.size.x, radix.size.y, radix.size.z);
458 radix.num_workgroups_x, radix.num_workgroups_y);
466 for (
auto &radix : radices[1])
469 radix.size.x, radix.size.y, radix.size.z);
471 radix.num_workgroups_x, radix.num_workgroups_y);
500 std::shared_ptr<ProgramCache> program_cache,
const FFTOptions &options,
const FFTWisdom &wisdom)
501 : cache(move(program_cache)), size_x(Nx), size_y(Ny)
509 if (output_target != SSBO)
524 if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
526 throw logic_error(
"FFT size is not POT.");
531 throw logic_error(
"ComplexToReal transforms requires inverse transform.");
536 throw logic_error(
"RealToComplex transforms requires forward transform.");
541 throw logic_error(
"Input real-to-complex must use ImageReal target.");
546 throw logic_error(
"Output complex-to-real must use ImageReal target.");
549 vector<Radix> radices[2];
559 targets[0] = input_target;
560 targets[1] = Ny > 1 ? SSBO : output_target;
561 targets[2] = targets[1];
562 targets[3] = output_target;
564 radices[0] =
split_radices(Nx, Ny, modes[0], targets[0], targets[1], options,
false, wisdom,
cost);
565 radices[1] =
split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, expand, wisdom,
cost);
573 targets[0] = input_target;
574 targets[1] = Ny > 1 ? SSBO : input_target;
575 targets[2] = targets[1];
576 targets[3] = output_target;
578 radices[0] =
split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, expand, wisdom,
cost);
579 radices[1] =
split_radices(Nx, Ny, modes[1], targets[2], targets[3], options,
false, wisdom,
cost);
590 unsigned last_index = (radices[1].empty() && !expand) ? 0 : 1;
592 for (
auto &radix_direction : radices)
601 for (
auto &radix : radix_direction)
604 bool last_pass = index == last_index && i == radix_direction.size() - 1;
607 Target out_target = last_pass ? output_target :
SSBO;
633 radix.num_workgroups_x, radix.num_workgroups_y,
636 last_pass ? 0u : GL_SHADER_STORAGE_BARRIER_BIT,
650 bool last_pass = radices[1].empty();
653 Target out_target = last_pass ? output_target :
SSBO;
655 unsigned uv_scale_x = 1;
657 auto base_opts = options;
661 auto res =
build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 });
676 base_opts.type.fp16, base_opts.type.input_fp16, base_opts.type.output_fp16,
677 base_opts.type.normalize,
686 GL_SHADER_STORAGE_BARRIER_BIT,
701 throw runtime_error(
"Failed to load shader file from disk.\n");
712 file.write(source.data(), source.size());
718 str.reserve(16 * 1024);
755 str +=
"#define FFT_P1\n";
760 str +=
"#define FFT_POW2_STRIDE\n";
765 str +=
"#define FFT_FP16\n";
770 str +=
"#define FFT_INPUT_FP16\n";
775 str +=
"#define FFT_OUTPUT_FP16\n";
780 str +=
"#define FFT_NORMALIZE\n";
785 str +=
"#define FFT_CONVOLVE\n";
788 str += params.
shared_banked ?
"#define FFT_SHARED_BANKED 1\n" :
"#define FFT_SHARED_BANKED 0\n";
790 str += params.
direction ==
Forward ?
"#define FFT_FORWARD\n" :
"#define FFT_INVERSE\n";
791 str += string(
"#define FFT_RADIX ") + to_string(params.
radix) +
"\n";
797 str +=
"#define FFT_DUAL\n";
798 str +=
"#define FFT_VERT\n";
802 str +=
"#define FFT_VERT\n";
806 str +=
"#define FFT_DUAL\n";
807 str +=
"#define FFT_HORIZ\n";
811 str +=
"#define FFT_HORIZ\n";
815 str +=
"#define FFT_RESOLVE_REAL_TO_COMPLEX\n";
816 str +=
"#define FFT_HORIZ\n";
821 str +=
"#define FFT_RESOLVE_COMPLEX_TO_REAL\n";
822 str +=
"#define FFT_HORIZ\n";
830 str +=
"#define FFT_INPUT_REAL\n";
833 str +=
"#define FFT_INPUT_TEXTURE\n";
843 str +=
"#define FFT_OUTPUT_REAL\n";
846 str +=
"#define FFT_OUTPUT_IMAGE\n";
856 str +=
"#define FFT_VEC2\n";
860 str +=
"#define FFT_VEC4\n";
864 str +=
"#define FFT_VEC8\n";
868 str += string(
"layout(local_size_x = ") +
870 ", local_size_y = " +
872 ", local_size_z = " +
876 #ifdef GLFFT_SHADER_FROM_FILE
878 switch (params.
radix)
902 str += Blob::fft_common_source;
903 switch (params.
radix)
906 str += Blob::fft_radix4_source;
910 str += Blob::fft_radix8_source;
914 str += Blob::fft_radix4_source;
915 str += Blob::fft_shared_source;
916 str += Blob::fft_radix16_source;
920 str += Blob::fft_radix8_source;
921 str += Blob::fft_shared_source;
922 str += Blob::fft_radix64_source;
925 str += Blob::fft_main_source;
935 char shader_path[1024];
936 snprintf(shader_path,
sizeof(shader_path),
"glfft_shader_radix%u_first%u_mode%u_in_target%u_out_target%u.comp.src",
955 GL_CHECK(glShaderSource(shader, 2, sources, NULL));
959 GL_CHECK(glGetShaderiv(shader, GL_COMPILE_STATUS, &status));
960 if (status == GL_FALSE)
965 GL_CHECK(glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &len));
966 vector<char>
buf(len);
967 GL_CHECK(glGetShaderInfoLog(shader, len, &out_len, buf.data()));
968 glfft_log(
"GLFFT: Shader log:\n%s\n\n", buf.data());
980 if (status == GL_FALSE)
985 vector<char>
buf(len);
987 glfft_log(
"Program log:\n%s\n\n", buf.data());
998 unsigned warmup_iterations,
unsigned iterations,
unsigned dispatches_per_iteration,
double max_time)
1001 for (
unsigned i = 0; i < warmup_iterations; i++)
1011 for (
unsigned i = 0; i < iterations && (((
glfft_time() -
start_time) < max_time) || i == 0); i++)
1014 for (
unsigned d = 0; d < dispatches_per_iteration; d++)
1017 GL_CHECK(glMemoryBarrier(GL_ALL_BARRIER_BITS));
1022 total_time += iteration_end - iteration_start;
1025 return total_time / runs;
1044 if (
passes.front().parameters.input_target !=
SSBO)
1046 GL_CHECK(glActiveTexture(GL_TEXTURE1));
1047 GL_CHECK(glBindTexture(GL_TEXTURE_2D, input_aux));
1052 if (
ssbo.input_aux.size != 0)
1054 GL_CHECK(glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, input_aux,
1055 ssbo.input_aux.offset,
ssbo.input_aux.size));
1059 GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, input_aux));
1064 GLuint current_program = 0;
1066 unsigned pass_index = 0;
1069 if (
pass.program != current_program)
1072 current_program =
pass.program;
1075 if (
pass.parameters.p1)
1084 p *=
pass.parameters.radix;
1086 if (
pass.parameters.input_target != SSBO)
1088 GL_CHECK(glActiveTexture(GL_TEXTURE0));
1089 GL_CHECK(glBindTexture(GL_TEXTURE_2D, buffers[0]));
1099 if (buffers[0] == input &&
ssbo.input.size != 0)
1101 GL_CHECK(glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, buffers[0],
1102 ssbo.input.offset,
ssbo.input.size));
1104 else if (buffers[0] == output &&
ssbo.output.size != 0)
1108 GL_CHECK(glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, buffers[0],
1109 ssbo.output.offset,
ssbo.output.size));
1113 GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffers[0]));
1117 if (
pass.parameters.output_target != SSBO)
1128 switch (
pass.parameters.mode)
1132 format = GL_RGBA16F;
1145 GL_CHECK(glBindImageTexture(0, output, 0, GL_FALSE, 0, GL_WRITE_ONLY, format));
1149 if (buffers[1] == output &&
ssbo.output.size != 0)
1151 GL_CHECK(glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, buffers[1],
1152 ssbo.output.offset,
ssbo.output.size));
1156 GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, buffers[1]));
1162 if (
pass.barriers != 0)
1167 if (pass_index == 0)
1169 buffers[0] = passes.size() & 1 ?
1174 swap(buffers[0], buffers[1]);
static void store_shader_string(const char *path, const std::string &source)
void merge_if_better(const CostPropagate &a, const CostPropagate &b)
GLboolean GLboolean GLboolean GLboolean a
unsigned num_workgroups_y
void process(GLuint output, GLuint input, GLuint input_aux=0)
Process the FFT.
static unsigned mode_to_input_components(Mode mode)
static std::string load_shader_string(const char *path)
void glfft_log(const char *fmt,...)
bool normalize
Whether to apply 1 / N normalization factor.
Complex-to-real transform. N / 2 + 1 complex values are used per row with a stride of N complex sampl...
GLint GLsizei GLsizei GLenum format
bool input_fp16
Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
bool fp16
Whether internal shader should be mediump float.
static bool is_radix_valid(unsigned Nx, unsigned Ny, Mode mode, unsigned vector_size, unsigned radix, WorkGroupSize size, bool pow2_stride)
static Radix build_radix(unsigned Nx, unsigned Ny, Mode mode, unsigned vector_size, bool shared_banked, unsigned radix, WorkGroupSize size, bool pow2_stride)
unsigned workgroup_size_y
GLuint build_program(const Parameters ¶ms)
bool glfft_read_file_string(const char *path, char **out_buf)
vector< unsigned > radices
FFT(unsigned Nx, unsigned Ny, Type type, Direction direction, Target input_target, Target output_target, std::shared_ptr< ProgramCache > cache, const FFTOptions &options, const FFTWisdom &wisdom=FFTWisdom())
Creates a full FFT.
unsigned workgroup_size_z
GLuint compile_compute_shader(const char *src)
GLsizei GLsizei GLchar * source
Real-to-complex transform. N / 2 + 1 complex output samples are created per row with a stride of N co...
unsigned workgroup_size_x
double bench(GLuint output, GLuint input, unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, double max_time=std::numeric_limits< double >::max())
Run process() multiple times, timing the results.
Options for FFT implementation. Defaults for performance as conservative.
bool output_fp16
Whether output SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
void init(const void *data, size_t size, GLenum access)
static vector< Radix > split_radices(unsigned Nx, unsigned Ny, Mode mode, Target input_target, Target output_target, const FFTOptions &options, bool pow2_stride, const FFTWisdom &wisdom, double &accumulate_cost)
const std::pair< const WisdomPass, FFTOptions::Performance > * find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix, Mode mode, Target input_target, Target output_target, const FFTOptions::Type &base_options) const
struct GLFFT::FFT::@1::@2 input
static double find_cost(unsigned Nx, unsigned Ny, Mode mode, unsigned radix, const FFTOptions &options, const FFTWisdom &wisdom)
static void reduce(unsigned &wg_size, unsigned &divisor)
GL_SHADER_STORAGE_BUFFER.
void set_texture_offset_scale(float offset_x, float offset_y, float scale_x, float scale_y)
Sets offset and scale parameters for normalized texel coordinates when sampling textures.
float max(float x, float y)
static Radix build_resolve_radix(unsigned Nx, unsigned Ny, WorkGroupSize size)
const FFTOptions::Performance & find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix, Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const
Regular complex-to-complex transform.
struct GLFFT::FFT::@1::@2 output
GLuint get_program(const Parameters ¶ms)
static void print_radix_splits(const vector< Radix > radices[2])
GLenum GLuint GLintptr GLsizeiptr size
static unsigned type_to_input_components(Type type)
GLint GLint GLint GLint GLint x
unsigned num_workgroups_x
static unsigned radix_to_wg_z(unsigned radix)
GLenum GLuint GLenum GLsizei const GLchar * buf
GLboolean GLboolean GLboolean b
typedef GLenum(GL_APIENTRYP PFNGLGETGRAPHICSRESETSTATUSKHRPROC)(void)
struct GLFFT::FFTOptions::Type type
typedef GLuint(GL_APIENTRYP PFNGLGETDEBUGMESSAGELOGKHRPROC)(GLuint count
std::vector< Pass > passes
struct GLFFT::FFTOptions::Performance performance
CostPropagate(double cost, vector< unsigned > radices)
struct GLFFT::FFT::@1 ssbo