OpenGL ES SDK for Android ARM Developer Center
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
glfft_wisdom.cpp
Go to the documentation of this file.
1 /* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
2  *
3  * Permission is hereby granted, free of charge,
4  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
5  * to deal in the Software without restriction, including without limitation the rights to
6  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
7  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
8  *
9  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
12  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
14  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
15  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17  */
18 
19 #include "glfft_wisdom.hpp"
20 #include "glfft.hpp"
21 #include <utility>
22 
23 using namespace std;
24 using namespace GLFFT;
25 
26 FFTStaticWisdom FFTWisdom::get_static_wisdom_from_renderer(const char *renderer)
27 {
28  FFTStaticWisdom res;
29 
30  GLint value = 0;
31  GL_CHECK(glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &value));
32 
33  if (strstr(renderer, "GeForce"))
34  {
35  glfft_log("Detected GeForce GPU.\n");
36  res.min_workgroup_size = 32; // Warp threads.
38  res.max_workgroup_size = min(value, 256); // Very unlikely that more than 256 threads will do anything good.
39  res.min_vector_size = 2;
40  res.max_vector_size = 2;
41  res.shared_banked = FFTStaticWisdom::True;
42  }
43  else if (strstr(renderer, "Radeon"))
44  {
45  glfft_log("Detected Radeon GPU.\n");
46  res.min_workgroup_size = 64; // Wavefront threads (GCN).
47  res.min_workgroup_size_shared = 128;
48  res.max_workgroup_size = min(value, 256); // Very unlikely that more than 256 threads will do anything good.
49  // TODO: Find if we can restrict this to 2 or 4 always.
50  res.min_vector_size = 2;
51  res.max_vector_size = 4;
52  res.shared_banked = FFTStaticWisdom::True;
53  }
54  else if (strstr(renderer, "Mali"))
55  {
56  glfft_log("Detected Mali GPU.\n");
57 
58  res.min_workgroup_size = 4;
60  res.max_workgroup_size = 64; // Going beyond 64 threads per WG is not a good idea.
61  res.min_vector_size = 4;
62  res.max_vector_size = 4;
63  res.shared_banked = FFTStaticWisdom::False;
64  }
65  // TODO: Add more GPUs.
66 
67  return res;
68 }
69 
70 const pair<double, FFTOptions::Performance> FFTWisdom::learn_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
71  Mode mode, Target input_target, Target output_target,
72  const FFTOptions::Type &type)
73 {
74  WisdomPass pass = {
75  {
76  Nx, Ny, radix, mode, input_target, output_target,
77  type,
78  },
79  0.0,
80  };
81 
82  auto itr = library.find(pass);
83  if (itr != end(library))
84  {
85  return make_pair(itr->first.cost, itr->second);
86  }
87  else
88  {
89  auto result = study(pass, type);
90  pass.cost = result.first;
91  library[pass] = result.second;
92 
93  return result;
94  }
95 }
96 
// Trains wisdom for every pass an Nx x Ny transform of the given type could
// need: all radices, horizontal and vertical directions, texture first/last
// pass variants, and (for real transforms) the resolve pass.
// Failures to build individual passes are deliberately ignored so that
// unsupported configurations simply contribute no wisdom.
void FFTWisdom::learn_optimal_options_exhaustive(unsigned Nx, unsigned Ny,
        Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type)
{
    // Real transforms need a dedicated resolve pass, and their complex passes
    // run at half width (hence the `Nx >> learn_resolve` shifts below).
    bool learn_resolve = type == ComplexToReal || type == RealToComplex;
    Mode vertical_mode = type == ComplexToComplexDual ? VerticalDual : Vertical;
    Mode horizontal_mode = type == ComplexToComplexDual ? HorizontalDual : Horizontal;

    // Create wisdom for horizontal transforms and vertical transform.
    static const unsigned radices[] = { 4, 8, 16, 64 };
    for (auto radix : radices)
    {
        try
        {
            // If we're doing SSBO -> Image or Image -> SSBO. Create wisdom for the two variants.

            // Learn plain transforms.
            // Vertical passes only exist for 2D transforms (Ny > 1).
            if (Ny > 1)
            {
                learn_optimal_options(Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, SSBO, fft_type);
            }
            learn_optimal_options(Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, SSBO, fft_type);

            // Learn the first/last pass transforms. Can be fairly significant since accessing textures makes more sense with
            // block interleave and larger WG_Y sizes.
            if (input_target != SSBO)
            {
                if (Ny > 1)
                {
                    learn_optimal_options(Nx >> learn_resolve, Ny, radix, vertical_mode, input_target, SSBO, fft_type);
                }
                learn_optimal_options(Nx >> learn_resolve, Ny, radix, horizontal_mode, input_target, SSBO, fft_type);
            }

            if (output_target != SSBO)
            {
                if (Ny > 1)
                {
                    learn_optimal_options(Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, output_target, fft_type);
                }
                learn_optimal_options(Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, output_target, fft_type);
            }
        }
        catch (...)
        {
            // If our default options cannot successfully create the radix pass (i.e. throws),
            // just ignore it for purpose of creating wisdom.
        }
    }

    // The resolve pass normally reads the output of earlier SSBO passes,
    // so its input precision follows the output precision of the chain.
    auto resolve_type = fft_type;
    resolve_type.input_fp16 = resolve_type.output_fp16;
    Mode resolve_mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex;
    Target resolve_input_target = SSBO;

    // If we have C2R Nx1 transform, the first pass is resolve, so use those types.
    if (type == ComplexToReal && Ny == 1)
    {
        resolve_type = fft_type;
        resolve_input_target = input_target;
    }

    // If we need to do a resolve pass, train this case as well.
    if (learn_resolve)
    {
        try
        {
            // If Ny == 1 and we're doing RealToComplex, this will be the last pass, so use output_target as target.
            if (Ny == 1 && resolve_mode == ResolveRealToComplex)
            {
                learn_optimal_options(Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, output_target, resolve_type);
            }
            else
            {
                learn_optimal_options(Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, SSBO, resolve_type);
            }
        }
        catch (...)
        {
            // If our default options cannot successfully create the radix pass (i.e. throws),
            // just ignore it for purpose of creating wisdom.
        }
    }
}
180 
181 double FFTWisdom::bench(GLuint output, GLuint input,
182  const WisdomPass &pass, const FFTOptions &options, const shared_ptr<ProgramCache> &cache) const
183 {
184  FFT fft(pass.pass.Nx, pass.pass.Ny, pass.pass.radix, pass.pass.input_target != SSBO ? 1 : pass.pass.radix,
185  pass.pass.mode, pass.pass.input_target, pass.pass.output_target,
186  cache, options);
187 
188  return fft.bench(output, input, params.warmup, params.iterations, params.dispatches, params.timeout);
189 }
190 
191 static inline unsigned mode_to_size(Mode mode)
192 {
193  switch (mode)
194  {
195  case VerticalDual:
196  case HorizontalDual:
199  return 4;
200 
201  default:
202  return 2;
203  }
204 }
205 
206 std::pair<double, FFTOptions::Performance> FFTWisdom::study(const WisdomPass &pass, FFTOptions::Type type) const
207 {
208  auto cache = make_shared<ProgramCache>();
209 
210  Buffer output;
211  Buffer input;
212  Texture output_tex;
213  Texture input_tex;
214  GLuint output_name = 0;
215  GLuint input_name = 0;
216 
217  unsigned mode_size = mode_to_size(pass.pass.mode);
218  vector<float> tmp(mode_size * pass.pass.Nx * pass.pass.Ny);
219 
220  if (pass.pass.input_target == SSBO)
221  {
222  input.init(tmp.data(), tmp.size() * sizeof(float) >> type.input_fp16, GL_STATIC_COPY);
223  input_name = input.get();
224  }
225  else
226  {
227  GLenum internal_format = 0;
228  GLenum format = 0;
229  unsigned Nx = pass.pass.Nx;
230  unsigned Ny = pass.pass.Ny;
231 
232  switch (pass.pass.mode)
233  {
234  case VerticalDual:
235  case HorizontalDual:
236  internal_format = GL_RGBA32F;
237  format = GL_RGBA;
238  break;
239 
240  case Vertical:
241  case Horizontal:
242  internal_format = GL_RG32F;
243  format = GL_RG;
244  break;
245 
247  internal_format = GL_RG32F;
248  format = GL_RG;
249  Nx *= 2;
250  break;
251 
252  default:
253  throw logic_error("Invalid input mode.\n");
254  }
255 
256  input_tex.init(Nx, Ny, 1, internal_format);
257  input_tex.upload(tmp.data(), format, GL_FLOAT, 0, 0, Nx, Ny);
258  input_name = input_tex.get();
259  }
260 
261  if (pass.pass.output_target == SSBO)
262  {
263  output.init(nullptr, tmp.size() * sizeof(float) >> type.output_fp16, GL_STREAM_COPY);
264  output_name = output.get();
265  }
266  else
267  {
268  GLenum internal_format = 0;
269  unsigned Nx = pass.pass.Nx;
270  unsigned Ny = pass.pass.Ny;
271 
272  switch (pass.pass.mode)
273  {
274  case VerticalDual:
275  case HorizontalDual:
276  internal_format = GL_RGBA32F;
277  break;
278 
279  case Vertical:
280  case Horizontal:
281  internal_format = GL_RG32F;
282  break;
283 
285  internal_format = GL_RG32F;
286  Nx *= 2;
287  break;
288 
289  default:
290  throw logic_error("Invalid output mode.\n");
291  }
292 
293  output_tex.init(Nx, Ny, 1, internal_format);
294  output_name = output_tex.get();
295  }
296 
297  // Exhaustive search, look for every sensible combination, and find fastest parameters.
298  // Get initial best cost with defaults.
299  FFTOptions::Performance best_perf;
300  double minimum_cost = bench(output_name, input_name, pass, { best_perf, type }, cache);
301 
302  static const FFTStaticWisdom::Tristate shared_banked_values[] = { FFTStaticWisdom::False, FFTStaticWisdom::True };
303  static const unsigned vector_size_values[] = { 2, 4, 8 };
304  static const unsigned workgroup_size_x_values[] = { 4, 8, 16, 32, 64, 128, 256 };
305  static const unsigned workgroup_size_y_values[] = { 1, 2, 4, 8, };
306 
307  bool test_resolve = pass.pass.mode == ResolveComplexToReal || pass.pass.mode == ResolveRealToComplex;
308  bool test_dual = pass.pass.mode == VerticalDual || pass.pass.mode == HorizontalDual;
309  unsigned bench_count = 0;
310 
311  for (auto shared_banked : shared_banked_values)
312  {
313  // Useless test, since shared banked is only relevant for radix 16/64.
314  if (pass.pass.radix < 16 && shared_banked)
315  {
316  continue;
317  }
318 
319  bool fair_shared_banked = (pass.pass.radix < 16) ||
320  (static_wisdom.shared_banked == FFTStaticWisdom::DontCare) ||
321  (shared_banked == static_wisdom.shared_banked);
322 
323  if (!fair_shared_banked)
324  {
325  continue;
326  }
327 
328  for (auto vector_size : vector_size_values)
329  {
330  // Resolve passes currently only support vector size 2. Shared banked makes no sense either.
331  if (test_resolve && (vector_size != 2 || shared_banked))
332  {
333  continue;
334  }
335 
336  // We can only use vector_size 8 with FP16.
337  if (vector_size == 8 && (!type.fp16 || !type.input_fp16 || !type.output_fp16))
338  {
339  continue;
340  }
341 
342  // Makes little sense to test since since vector_size will be bumped to 4 anyways.
343  if (test_dual && vector_size < 4)
344  {
345  continue;
346  }
347 
348  for (auto workgroup_size_x : workgroup_size_x_values)
349  {
350  for (auto workgroup_size_y : workgroup_size_y_values)
351  {
352  unsigned workgroup_size = workgroup_size_x * workgroup_size_y;
353 
354  unsigned min_workgroup_size = pass.pass.radix >= 16 ? static_wisdom.min_workgroup_size_shared :
355  static_wisdom.min_workgroup_size;
356 
357  unsigned min_vector_size = test_dual ? max(4u, static_wisdom.min_vector_size) : static_wisdom.min_vector_size;
358  unsigned max_vector_size = test_dual ? max(4u, static_wisdom.max_vector_size) : static_wisdom.max_vector_size;
359 
360  bool fair_workgroup_size = workgroup_size <= static_wisdom.max_workgroup_size &&
361  workgroup_size >= min_workgroup_size;
362  if (pass.pass.Ny == 1 && workgroup_size_y > 1)
363  {
364  fair_workgroup_size = false;
365  }
366 
367  if (!fair_workgroup_size)
368  {
369  continue;
370  }
371 
372  // If we have dual mode, accept vector sizes larger than max.
373  bool fair_vector_size = test_resolve || (vector_size <= max_vector_size &&
374  vector_size >= min_vector_size);
375 
376  if (!fair_vector_size)
377  {
378  continue;
379  }
380 
382  perf.shared_banked = shared_banked;
383  perf.vector_size = vector_size;
384  perf.workgroup_size_x = workgroup_size_x;
385  perf.workgroup_size_y = workgroup_size_y;
386 
387  try
388  {
389  // If workgroup sizes are too big for our test, this will throw.
390  double cost = bench(output_name, input_name, pass, { perf, type }, cache);
391  bench_count++;
392 
393 #if 1
394  glfft_log("\nWisdom run (mode = %u, radix = %u):\n", pass.pass.mode, pass.pass.radix);
395  glfft_log(" Width: %4u\n", pass.pass.Nx);
396  glfft_log(" Height: %4u\n", pass.pass.Ny);
397  glfft_log(" Shared banked: %3s\n", shared_banked ? "yes" : "no");
398  glfft_log(" Vector size: %u\n", vector_size);
399  glfft_log(" Workgroup size: (%u, %u)\n", workgroup_size_x, workgroup_size_y);
400  glfft_log(" Cost: %8.3g\n", cost);
401 #endif
402 
403  if (cost < minimum_cost)
404  {
405 #if 1
406  glfft_log(" New optimal solution! (%g -> %g)\n", minimum_cost, cost);
407 #endif
408  best_perf = perf;
409  minimum_cost = cost;
410  }
411  }
412  catch (...)
413  {
414  // If we pass in bogus parameters,
415  // FFT will throw and we just ignore this.
416  }
417  }
418  }
419  }
420  }
421 
422  glfft_log("Tested %u variants!\n", bench_count);
423  return make_pair(minimum_cost, best_perf);
424 }
425 
426 const pair<const WisdomPass, FFTOptions::Performance>* FFTWisdom::find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
427  Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type) const
428 {
429  WisdomPass pass = {
430  {
431  Nx, Ny, radix, mode, input_target, output_target,
432  type,
433  },
434  0.0,
435  };
436 
437  auto itr = library.find(pass);
438  return itr != end(library) ? (&(*itr)) : nullptr;
439 }
440 
441 const FFTOptions::Performance& FFTWisdom::find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
442  Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const
443 {
444  WisdomPass pass = {
445  {
446  Nx, Ny, radix, mode, input_target, output_target,
447  base_options.type,
448  },
449  0.0,
450  };
451 
452  auto itr = library.find(pass);
453 
454 #if 1
455  if (itr == end(library))
456  {
457  glfft_log("Didn't find options for (%u x %u, radix %u, mode %u, input_target %u, output_target %u)\n",
458  Nx, Ny, radix, unsigned(mode), unsigned(input_target), unsigned(output_target));
459  }
460 #endif
461 
462  return itr != end(library) ? itr->second : base_options.performance;
463 }
464 
GLuint get() const
void ** params
Definition: gl2ext.h:143
int pass
Definition: app.cpp:216
GLuint GLuint end
Definition: gl2ext.h:323
void upload(const void *data, GLenum format, GLenum type, unsigned x_off, unsigned y_off, unsigned width, unsigned height)
float min(float x, float y)
Definition: noise.cpp:34
void glfft_log(const char *fmt,...)
Complex-to-real transform. N / 2 + 1 complex values are used per row with a stride of N complex sampl...
void init(unsigned width, unsigned height, unsigned levels, GLenum internal_format, GLenum wrap_s=GL_REPEAT, GLenum wrap_t=GL_REPEAT, GLenum min_filter=GL_NEAREST, GLenum mag_filter=GL_NEAREST)
GLint GLsizei GLsizei GLenum format
Definition: gl2ext.h:179
bool input_fp16
Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
GLint value
Definition: gl2ext.h:558
unsigned vector_size
Vector size. Very GPU dependent. "Scalar" GPUs prefer 2 here, vector GPUs prefer 4 (and maybe 8)...
GLuint get() const
bool fp16
Whether internal shader should be mediump float.
GLenum mode
Definition: gl2ext.h:302
struct GLFFT::WisdomPass::@3 pass
Real-to-complex transform. N / 2 + 1 complex output samples are created per row with a stride of N co...
double bench(GLuint output, GLuint input, unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, double max_time=std::numeric_limits< double >::max())
Run process() multiple times, timing the results.
Definition: glfft.cpp:997
Options for FFT implementation. Defaults for performance as conservative.
bool output_fp16
Whether output SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
void init(const void *data, size_t size, GLenum access)
#define GL_CHECK(x)
Definition: AstcTextures.h:59
GL_SHADER_STORAGE_BUFFER.
GLenum type
Definition: gl2ext.h:133
float max(float x, float y)
Definition: noise.cpp:29
static unsigned mode_to_size(Mode mode)
precision highp float
Definition: hiz_cull.cs:37
typedef GLenum(GL_APIENTRYP PFNGLGETGRAPHICSRESETSTATUSKHRPROC)(void)
struct GLFFT::FFTOptions::Type type
unsigned min_workgroup_size_shared
typedef GLuint(GL_APIENTRYP PFNGLGETDEBUGMESSAGELOGKHRPROC)(GLuint count
struct GLFFT::FFTOptions::Performance performance