24 using namespace GLFFT;
31 GL_CHECK(glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &value));
33 if (strstr(renderer,
"GeForce"))
43 else if (strstr(renderer,
"Radeon"))
54 else if (strstr(renderer,
"Mali"))
70 const pair<double, FFTOptions::Performance> FFTWisdom::learn_optimal_options(
unsigned Nx,
unsigned Ny,
unsigned radix,
76 Nx, Ny, radix,
mode, input_target, output_target,
82 auto itr = library.find(pass);
83 if (itr !=
end(library))
85 return make_pair(itr->first.cost, itr->second);
89 auto result = study(pass, type);
90 pass.
cost = result.first;
91 library[
pass] = result.second;
97 void FFTWisdom::learn_optimal_options_exhaustive(
unsigned Nx,
unsigned Ny,
105 static const unsigned radices[] = { 4, 8, 16, 64 };
106 for (
auto radix : radices)
115 learn_optimal_options(Nx >> learn_resolve, Ny, radix, vertical_mode,
SSBO,
SSBO, fft_type);
117 learn_optimal_options(Nx >> learn_resolve, Ny, radix, horizontal_mode,
SSBO,
SSBO, fft_type);
121 if (input_target !=
SSBO)
125 learn_optimal_options(Nx >> learn_resolve, Ny, radix, vertical_mode, input_target,
SSBO, fft_type);
127 learn_optimal_options(Nx >> learn_resolve, Ny, radix, horizontal_mode, input_target,
SSBO, fft_type);
130 if (output_target !=
SSBO)
134 learn_optimal_options(Nx >> learn_resolve, Ny, radix, vertical_mode,
SSBO, output_target, fft_type);
136 learn_optimal_options(Nx >> learn_resolve, Ny, radix, horizontal_mode,
SSBO, output_target, fft_type);
146 auto resolve_type = fft_type;
147 resolve_type.
input_fp16 = resolve_type.output_fp16;
154 resolve_type = fft_type;
155 resolve_input_target = input_target;
166 learn_optimal_options(Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, output_target, resolve_type);
170 learn_optimal_options(Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target,
SSBO, resolve_type);
208 auto cache = make_shared<ProgramCache>();
218 vector<float> tmp(mode_size * pass.
pass.
Nx * pass.
pass.
Ny);
223 input_name = input.
get();
227 GLenum internal_format = 0;
229 unsigned Nx = pass.
pass.
Nx;
230 unsigned Ny = pass.
pass.
Ny;
236 internal_format = GL_RGBA32F;
242 internal_format = GL_RG32F;
247 internal_format = GL_RG32F;
253 throw logic_error(
"Invalid input mode.\n");
256 input_tex.
init(Nx, Ny, 1, internal_format);
257 input_tex.
upload(tmp.data(),
format, GL_FLOAT, 0, 0, Nx, Ny);
258 input_name = input_tex.
get();
263 output.init(
nullptr, tmp.size() *
sizeof(
float) >> type.
output_fp16, GL_STREAM_COPY);
264 output_name = output.get();
268 GLenum internal_format = 0;
269 unsigned Nx = pass.
pass.
Nx;
270 unsigned Ny = pass.
pass.
Ny;
276 internal_format = GL_RGBA32F;
281 internal_format = GL_RG32F;
285 internal_format = GL_RG32F;
290 throw logic_error(
"Invalid output mode.\n");
293 output_tex.
init(Nx, Ny, 1, internal_format);
294 output_name = output_tex.
get();
300 double minimum_cost = bench(output_name, input_name, pass, { best_perf, type }, cache);
303 static const unsigned vector_size_values[] = { 2, 4, 8 };
304 static const unsigned workgroup_size_x_values[] = { 4, 8, 16, 32, 64, 128, 256 };
305 static const unsigned workgroup_size_y_values[] = { 1, 2, 4, 8, };
309 unsigned bench_count = 0;
311 for (
auto shared_banked : shared_banked_values)
314 if (pass.
pass.
radix < 16 && shared_banked)
319 bool fair_shared_banked = (pass.
pass.
radix < 16) ||
320 (static_wisdom.shared_banked == FFTStaticWisdom::DontCare) ||
321 (shared_banked == static_wisdom.shared_banked);
323 if (!fair_shared_banked)
328 for (
auto vector_size : vector_size_values)
331 if (test_resolve && (vector_size != 2 || shared_banked))
343 if (test_dual && vector_size < 4)
348 for (
auto workgroup_size_x : workgroup_size_x_values)
350 for (
auto workgroup_size_y : workgroup_size_y_values)
352 unsigned workgroup_size = workgroup_size_x * workgroup_size_y;
354 unsigned min_workgroup_size = pass.
pass.
radix >= 16 ? static_wisdom.min_workgroup_size_shared :
355 static_wisdom.min_workgroup_size;
357 unsigned min_vector_size = test_dual ?
max(4u, static_wisdom.min_vector_size) : static_wisdom.min_vector_size;
358 unsigned max_vector_size = test_dual ?
max(4u, static_wisdom.max_vector_size) : static_wisdom.max_vector_size;
360 bool fair_workgroup_size = workgroup_size <= static_wisdom.max_workgroup_size &&
361 workgroup_size >= min_workgroup_size;
362 if (pass.
pass.
Ny == 1 && workgroup_size_y > 1)
364 fair_workgroup_size =
false;
367 if (!fair_workgroup_size)
373 bool fair_vector_size = test_resolve || (vector_size <= max_vector_size &&
374 vector_size >= min_vector_size);
376 if (!fair_vector_size)
390 double cost = bench(output_name, input_name, pass, { perf, type }, cache);
397 glfft_log(
" Shared banked: %3s\n", shared_banked ?
"yes" :
"no");
398 glfft_log(
" Vector size: %u\n", vector_size);
399 glfft_log(
" Workgroup size: (%u, %u)\n", workgroup_size_x, workgroup_size_y);
403 if (cost < minimum_cost)
406 glfft_log(
" New optimal solution! (%g -> %g)\n", minimum_cost, cost);
422 glfft_log(
"Tested %u variants!\n", bench_count);
423 return make_pair(minimum_cost, best_perf);
426 const pair<const WisdomPass, FFTOptions::Performance>* FFTWisdom::find_optimal_options(
unsigned Nx,
unsigned Ny,
unsigned radix,
431 Nx, Ny, radix,
mode, input_target, output_target,
437 auto itr = library.find(pass);
438 return itr !=
end(library) ? (&(*itr)) :
nullptr;
446 Nx, Ny, radix,
mode, input_target, output_target,
452 auto itr = library.find(pass);
455 if (itr ==
end(library))
457 glfft_log(
"Didn't find options for (%u x %u, radix %u, mode %u, input_target %u, output_target %u)\n",
458 Nx, Ny, radix,
unsigned(mode),
unsigned(input_target),
unsigned(output_target));
462 return itr !=
end(library) ? itr->second : base_options.
performance;
void upload(const void *data, GLenum format, GLenum type, unsigned x_off, unsigned y_off, unsigned width, unsigned height)
float min(float x, float y)
void glfft_log(const char *fmt,...)
Complex-to-real transform. N / 2 + 1 complex values are used per row with a stride of N complex samples.
void init(unsigned width, unsigned height, unsigned levels, GLenum internal_format, GLenum wrap_s=GL_REPEAT, GLenum wrap_t=GL_REPEAT, GLenum min_filter=GL_NEAREST, GLenum mag_filter=GL_NEAREST)
GLint GLsizei GLsizei GLenum format
bool input_fp16
Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
unsigned max_workgroup_size
bool fp16
Whether internal shader should be mediump float.
struct GLFFT::WisdomPass::@3 pass
Real-to-complex transform. N / 2 + 1 complex output samples are created per row with a stride of N complex samples.
double bench(GLuint output, GLuint input, unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, double max_time=std::numeric_limits< double >::max())
Run process() multiple times, timing the results.
Options for FFT implementation. Defaults for performance are conservative.
bool output_fp16
Whether output SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
void init(const void *data, size_t size, GLenum access)
GL_SHADER_STORAGE_BUFFER.
float max(float x, float y)
static unsigned mode_to_size(Mode mode)
unsigned min_workgroup_size
typedef GLenum(GL_APIENTRYP PFNGLGETGRAPHICSRESETSTATUSKHRPROC)(void)
struct GLFFT::FFTOptions::Type type
unsigned min_workgroup_size_shared
typedef GLuint(GL_APIENTRYP PFNGLGETDEBUGMESSAGELOGKHRPROC)(GLuint count
struct GLFFT::FFTOptions::Performance performance