OpenGL ES SDK for Android ARM Developer Center
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
glfft.cpp
Go to the documentation of this file.
1 /* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
2  *
3  * Permission is hereby granted, free of charge,
4  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
5  * to deal in the Software without restriction, including without limitation the rights to
6  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
7  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
8  *
9  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
12  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
14  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
15  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17  */
18 
19 #include "glfft.hpp"
20 #include <algorithm>
21 #include <stdexcept>
22 #include <numeric>
23 #include <fstream>
24 #include <sstream>
25 #include <assert.h>
26 
27 #define GLFFT_SHADER_FROM_FILE
28 
29 #ifndef GLFFT_SHADER_FROM_FILE
30 #include "glsl/fft_common.inc"
31 #include "glsl/fft_radix4.inc"
32 #include "glsl/fft_radix8.inc"
33 #include "glsl/fft_radix16.inc"
34 #include "glsl/fft_radix64.inc"
35 #include "glsl/fft_shared.inc"
36 #include "glsl/fft_main.inc"
37 #endif
38 
39 using namespace std;
40 using namespace GLFFT;
41 
// Compute workgroup dimensions for a single FFT pass.
// (The "struct WorkGroupSize" declaration line was lost in doc extraction; restored.)
struct WorkGroupSize
{
    unsigned x, y, z;
};
46 
47 struct Radix
48 {
50  unsigned num_workgroups_x;
51  unsigned num_workgroups_y;
52  unsigned radix;
53  unsigned vector_size;
55 };
56 
// Folds as much of 'divisor' as possible into 'wg_size'.
// Afterwards, either divisor == 1 (fully absorbed) or wg_size == 1
// (workgroup size exhausted, remainder left in divisor).
static void reduce(unsigned &wg_size, unsigned &divisor)
{
    if (divisor <= 1)
    {
        return;
    }

    if (wg_size >= divisor)
    {
        wg_size /= divisor;
        divisor = 1;
    }
    else
    {
        divisor /= wg_size;
        wg_size = 1;
    }
}
70 
// Maps an FFT radix to the workgroup Z dimension it requires.
// Composite radices (16 = 4x4, 64 = 8x8) stack sub-transforms in Z;
// plain radices run with Z == 1.
static unsigned radix_to_wg_z(unsigned radix)
{
    if (radix == 16)
    {
        return 4;
    }

    if (radix == 64)
    {
        return 8;
    }

    return 1;
}
85 
86 static Radix build_radix(unsigned Nx, unsigned Ny,
87  Mode mode, unsigned vector_size, bool shared_banked, unsigned radix,
89  bool pow2_stride)
90 {
91  unsigned wg_x = 0, wg_y = 0;
92 
93  if (Ny == 1 && size.y > 1)
94  {
95  throw logic_error("WorkGroupSize.y must be 1, when Ny == 1.\n");
96  }
97 
98  // To avoid too many threads per workgroup due to workgroup_size_z,
99  // try to divide workgroup_size_y, then workgroup_size_x.
100  // TODO: Make a better constraint solver which takes into account cache line sizes,
101  // and image swizzling patterns, etc ... Not that critical though, since wisdom interface
102  // will find the optimal options despite this.
103  unsigned divisor = size.z;
104  reduce(size.y, divisor);
105  reduce(size.x, divisor);
106 
107  switch (mode)
108  {
109  case Vertical:
110  // If we have pow2_stride, we need to transform 2^n + 1 elements horizontally,
111  // so just add a single workgroup in X.
112  // We pad by going up to pow2 stride anyways.
113  // We will transform some garbage,
114  // but it's better than transforming close to double the amount.
115  wg_x = (2 * Nx) / (vector_size * size.x) + pow2_stride;
116  wg_y = Ny / (size.y * radix);
117  break;
118 
119  case VerticalDual:
120  vector_size = max(vector_size, 4u);
121  wg_x = (4 * Nx) / (vector_size * size.x);
122  wg_y = Ny / (size.y * radix);
123  break;
124 
125  case Horizontal:
126  wg_x = (2 * Nx) / (vector_size * radix * size.x);
127  wg_y = Ny / size.y;
128  break;
129 
130  case HorizontalDual:
131  vector_size = max(vector_size, 4u);
132  wg_x = (4 * Nx) / (vector_size * radix * size.x);
133  wg_y = Ny / size.y;
134  break;
135 
136  default:
137  assert(0);
138  }
139 
140  return { size, wg_x, wg_y, radix, vector_size, shared_banked };
141 }
142 
143 // Resolve radices are simpler, and don't yet support different vector sizes, etc.
144 static Radix build_resolve_radix(unsigned Nx, unsigned Ny, WorkGroupSize size)
145 {
146  return { size, Nx / size.x, Ny / size.y, 2, 2, false };
147 }
148 
149 // Smaller FFT with larger workgroups are not always possible to create.
150 static bool is_radix_valid(unsigned Nx, unsigned Ny,
151  Mode mode, unsigned vector_size, unsigned radix,
153  bool pow2_stride)
154 {
155  auto res = build_radix(Nx, Ny,
156  mode, vector_size, false, radix,
157  size,
158  pow2_stride);
159 
160  return res.num_workgroups_x > 0 && res.num_workgroups_y > 0;
161 }
162 
163 static double find_cost(unsigned Nx, unsigned Ny, Mode mode, unsigned radix,
164  const FFTOptions &options, const FFTWisdom &wisdom)
165 {
166  auto opt = wisdom.find_optimal_options(Nx, Ny, radix, mode, SSBO, SSBO, options.type);
167 
168  // Return a very rough estimate if we cannot find cost.
169  // The cost functions generated here are expected to be huge,
170  // always much larger than true cost functions.
171  // The purpose of this is to give a strong bias towards radices we have wisdom for.
172  // We also give a bias towards larger radices, since they are generally more BW efficient.
173  return opt ? opt->first.cost : Nx * Ny * (log2(float(radix)) + 2.0f);
174 }
175 
// Tracks the cheapest known way to decompose a transform of a given size into
// radix passes. A cost of 0.0 means "no valid decomposition known".
// (The struct header and merge_if_better signature lines were lost in doc
// extraction; restored from the declaration dump. std:: qualification added so
// the type does not depend on the file-level using-directive.)
struct CostPropagate
{
    CostPropagate() = default;
    CostPropagate(double cost, std::vector<unsigned> radices)
        : cost(cost), radices(std::move(radices)) {}

    // Adopt the concatenated split (a followed by b) if both inputs are valid
    // and their combined cost beats the current one.
    void merge_if_better(const CostPropagate &a, const CostPropagate &b)
    {
        double new_cost = a.cost + b.cost;

        if ((cost == 0.0 || new_cost < cost) && a.cost != 0.0 && b.cost != 0.0)
        {
            cost = new_cost;
            radices = a.radices;
            radices.insert(std::end(radices), std::begin(b.radices), std::end(b.radices));
        }
    }

    double cost = 0.0;              // Total estimated cost; 0.0 == invalid/unset.
    std::vector<unsigned> radices;  // Radix of each pass in the decomposition.
};
197 
// Decomposes one transform direction (horizontal or vertical) of an Nx x Ny FFT
// into a sequence of radix passes with minimal total estimated cost.
// Uses dynamic programming over log2(N): cost_propagate[i] holds the best known
// decomposition for a 2^i-point transform. Adds the chosen decomposition's cost
// to accumulate_cost and returns the fully-resolved per-pass Radix structs.
// Returns an empty vector for modes/sizes that need no passes in this direction.
static vector<Radix> split_radices(unsigned Nx, unsigned Ny, Mode mode, Target input_target, Target output_target,
        const FFTOptions &options,
        bool pow2_stride, const FFTWisdom &wisdom, double &accumulate_cost)
{
    // N is the length of the 1D transform along the direction we're splitting.
    unsigned N;
    switch (mode)
    {
    case Vertical:
    case VerticalDual:
        N = Ny;
        break;

    case Horizontal:
    case HorizontalDual:
        N = Nx;
        break;

    default:
        // Resolve modes don't go through radix splitting.
        return {};
    }

    // N == 1 is for things like Nx1 transforms where we don't do any vertical transforms.
    if (N == 1)
    {
        return {};
    }

    // Treat cost 0.0 as invalid.
    // Indexed by log2(radix): only 2 (radix 4), 3 (radix 8), 4 (radix 16), 6 (radix 64) used.
    double cost_table[8] = {0.0};
    CostPropagate cost_propagate[32];

    // Fill table with fastest known ways to do radix 4, radix 8, radix 16, and 64.
    // We'll then find the optimal subdivision which has the lowest additive cost.
    cost_table[2] = find_cost(Nx, Ny, mode, 4, options, wisdom);
    cost_table[3] = find_cost(Nx, Ny, mode, 8, options, wisdom);
    cost_table[4] = find_cost(Nx, Ny, mode, 16, options, wisdom);
    cost_table[6] = find_cost(Nx, Ny, mode, 64, options, wisdom);

    // True if a single pass of this radix yields a non-degenerate dispatch.
    auto is_valid = [&](unsigned radix) -> bool {
        unsigned workgroup_size_z = radix_to_wg_z(radix);
        auto &opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options);

        // We don't want pow2_stride to round up a very inefficient work group and make the is_valid test pass.
        return is_radix_valid(Nx, Ny,
                mode, opt.vector_size, radix,
                { opt.workgroup_size_x, opt.workgroup_size_y, workgroup_size_z },
                false);
    };

    // If our work-space is too small to allow certain radices, we disable them from consideration here.
    for (unsigned i = 2; i <= 6; i++)
    {
        // Don't check the composite radix.
        if (i == 5)
        {
            continue;
        }

        if (is_valid(1 << i))
        {
            cost_propagate[i] = CostPropagate(cost_table[i], { 1u << i });
        }
    }

    // Now start bubble this up all the way to N, starting from radix 16.
    // Each 2^i entry considers every split into two smaller valid decompositions.
    for (unsigned i = 4; (1u << i) <= N; i++)
    {
        auto &target = cost_propagate[i];

        for (unsigned r = 2; i - r >= r; r++)
        {
            target.merge_if_better(cost_propagate[r], cost_propagate[i - r]);
        }

        if ((1u << i) == N && target.cost == 0.0)
        {
            throw logic_error("There is no possible subdivision ...\n");
        }
    }

    // Ensure that the radix splits are sensible.
    // A radix-N non p-1 transform mandates that p factor is at least N.
    // Sort the splits so that larger radices come first.
    // For composite radices like 16 and 64, they are built with 4x4 and 8x8, so we only
    // need p factors for 4 and 8 for those cases.
    // The cost function doesn't depend in which order we split the radices.
    auto &cost = cost_propagate[unsigned(log2(float(N)))];
    auto radices = move(cost.radices);

    sort(begin(radices), end(radices), greater<unsigned>());

    // Sanity check: the product of the chosen radices must reproduce N exactly.
    if (accumulate(begin(radices), end(radices), 1u, multiplies<unsigned>()) != N)
    {
        throw logic_error("Radix splits are invalid.");
    }

    vector<Radix> radices_out;
    radices_out.reserve(radices.size());

    // Fill in the structs with all information.
    for (auto radix : radices)
    {
        bool first = radices_out.empty();
        bool last = radices_out.size() + 1 == radices.size();

        // Use known performance options as a fallback.
        // We used SSBO -> SSBO cost functions to find the optimal radix splits,
        // but replace first and last options with Image -> SSBO / SSBO -> Image cost functions if appropriate.
        auto &orig_opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options);
        auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode,
                first ? input_target : SSBO,
                last ? output_target : SSBO,
                { orig_opt, options.type });

        radices_out.push_back(build_radix(Nx, Ny,
                mode, opts.vector_size, opts.shared_banked, radix,
                { opts.workgroup_size_x, opts.workgroup_size_y, radix_to_wg_z(radix) },
                pow2_stride));
    }

    accumulate_cost += cost.cost;
    return radices_out;
}
321 
322 GLuint ProgramCache::find_program(const Parameters &parameters) const
323 {
324  auto itr = programs.find(parameters);
325  if (itr != end(programs))
326  {
327  return itr->second.get();
328  }
329  else
330  {
331  return 0;
332  }
333 }
334 
// Caches a compiled program under its pass parameters.
// The Program wrapper takes ownership of the GL object; any previously cached
// program for the same parameters is released by the assignment.
void ProgramCache::insert_program(const Parameters &parameters, GLuint program)
{
    programs[parameters] = Program(program);
}
339 
340 GLuint FFT::get_program(const Parameters &params)
341 {
342  GLuint prog = cache->find_program(params);
343  if (!prog)
344  {
345  prog = build_program(params);
346  if (!prog)
347  {
348  throw runtime_error("Failed to compile shader.\n");
349  }
350  cache->insert_program(params, prog);
351  }
352  return prog;
353 }
354 
355 static inline unsigned mode_to_input_components(Mode mode)
356 {
357  switch (mode)
358  {
359  case HorizontalDual:
360  case VerticalDual:
361  return 4;
362 
363  case Horizontal:
364  case Vertical:
366  return 2;
367 
369  return 1;
370 
371  default:
372  return 0;
373  }
374 }
375 
376 FFT::FFT(unsigned Nx, unsigned Ny,
377  unsigned radix, unsigned p,
378  Mode mode, Target input_target, Target output_target,
379  std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options)
380  : cache(move(program_cache)), size_x(Nx), size_y(Ny)
381 {
382  set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
383 
384  if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
385  {
386  throw logic_error("FFT size is not POT.");
387  }
388 
389  if (p != 1 && input_target != SSBO)
390  {
391  throw logic_error("P != 1 only supported with SSBO as input.");
392  }
393 
394  if (p < radix && output_target != SSBO)
395  {
396  throw logic_error("P < radix only supported with SSBO as output.");
397  }
398 
399  // We don't really care about transform direction since it's just a matter of sign-flipping twiddles,
400  // but we have to obey some fundamental assumptions of resolve passes.
401  Direction direction = mode == ResolveComplexToReal ? Inverse : Forward;
402 
403  Radix res;
404  if (mode == ResolveRealToComplex || mode == ResolveComplexToReal)
405  {
406  res = build_resolve_radix(Nx, Ny, { options.performance.workgroup_size_x, options.performance.workgroup_size_y, 1 });
407  }
408  else
409  {
410  res = build_radix(Nx, Ny,
411  mode, options.performance.vector_size, options.performance.shared_banked, radix,
413  false);
414  }
415 
416  const Parameters params = {
417  res.size.x,
418  res.size.y,
419  res.size.z,
420  res.radix,
421  res.vector_size,
422  direction,
423  mode,
424  input_target,
425  output_target,
426  p == 1,
427  false,
428  res.shared_banked,
429  options.type.fp16, options.type.input_fp16, options.type.output_fp16,
430  options.type.normalize,
431  };
432 
433  if (res.num_workgroups_x == 0 || res.num_workgroups_y == 0)
434  {
435  throw logic_error("Invalid workgroup sizes for this radix.");
436  }
437 
438  unsigned uv_scale_x = res.vector_size / mode_to_input_components(mode);
439  const Pass pass = {
440  params,
442  uv_scale_x,
443  get_program(params),
444  0u,
445  };
446 
447  passes.push_back(pass);
448 }
449 
450 static inline void print_radix_splits(const vector<Radix> radices[2])
451 {
452  glfft_log("Transform #1\n");
453  for (auto &radix : radices[0])
454  {
455  glfft_log(" Size: (%u, %u, %u)\n",
456  radix.size.x, radix.size.y, radix.size.z);
457  glfft_log(" Dispatch: (%u, %u)\n",
458  radix.num_workgroups_x, radix.num_workgroups_y);
459  glfft_log(" Radix: %u\n",
460  radix.radix);
461  glfft_log(" VectorSize: %u\n\n",
462  radix.vector_size);
463  }
464 
465  glfft_log("Transform #2\n");
466  for (auto &radix : radices[1])
467  {
468  glfft_log(" Size: (%u, %u, %u)\n",
469  radix.size.x, radix.size.y, radix.size.z);
470  glfft_log(" Dispatch: (%u, %u)\n",
471  radix.num_workgroups_x, radix.num_workgroups_y);
472  glfft_log(" Radix: %u\n",
473  radix.radix);
474  glfft_log(" VectorSize: %u\n\n",
475  radix.vector_size);
476  }
477 }
478 
479 static inline unsigned type_to_input_components(Type type)
480 {
481  switch (type)
482  {
483  case ComplexToComplex:
484  case ComplexToReal:
485  return 2;
486 
487  case RealToComplex:
488  return 1;
489 
491  return 4;
492 
493  default:
494  return 0;
495  }
496 }
497 
498 FFT::FFT(unsigned Nx, unsigned Ny,
499  Type type, Direction direction, Target input_target, Target output_target,
500  std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom)
501  : cache(move(program_cache)), size_x(Nx), size_y(Ny)
502 {
503  set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
504 
505  size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2);
506  temp_buffer_size >>= options.type.output_fp16;
507 
508  temp_buffer.init(nullptr, temp_buffer_size, GL_STREAM_COPY);
509  if (output_target != SSBO)
510  {
511  temp_buffer_image.init(nullptr, temp_buffer_size, GL_STREAM_COPY);
512  }
513 
514  bool expand = false;
515  if (type == ComplexToReal || type == RealToComplex)
516  {
517  // If we're doing C2R or R2C, we'll need double the scratch memory,
518  // so make sure we're dividing Nx *after* allocating.
519  Nx /= 2;
520  expand = true;
521  }
522 
523  // Sanity checks.
524  if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
525  {
526  throw logic_error("FFT size is not POT.");
527  }
528 
529  if (type == ComplexToReal && direction == Forward)
530  {
531  throw logic_error("ComplexToReal transforms requires inverse transform.");
532  }
533 
534  if (type == RealToComplex && direction != Forward)
535  {
536  throw logic_error("RealToComplex transforms requires forward transform.");
537  }
538 
539  if (type == RealToComplex && input_target == Image)
540  {
541  throw logic_error("Input real-to-complex must use ImageReal target.");
542  }
543 
544  if (type == ComplexToReal && output_target == Image)
545  {
546  throw logic_error("Output complex-to-real must use ImageReal target.");
547  }
548 
549  vector<Radix> radices[2];
550  Mode modes[2];
551  Target targets[4];
552 
553  switch (direction)
554  {
555  case Forward:
556  modes[0] = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
557  modes[1] = type == ComplexToComplexDual ? VerticalDual : Vertical;
558 
559  targets[0] = input_target;
560  targets[1] = Ny > 1 ? SSBO : output_target;
561  targets[2] = targets[1];
562  targets[3] = output_target;
563 
564  radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, false, wisdom, cost);
565  radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, expand, wisdom, cost);
566  break;
567 
568  case Inverse:
569  case InverseConvolve:
570  modes[0] = type == ComplexToComplexDual ? VerticalDual : Vertical;
571  modes[1] = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
572 
573  targets[0] = input_target;
574  targets[1] = Ny > 1 ? SSBO : input_target;
575  targets[2] = targets[1];
576  targets[3] = output_target;
577 
578  radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, expand, wisdom, cost);
579  radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, false, wisdom, cost);
580  break;
581  }
582 
583 #if 1
584  print_radix_splits(radices);
585 #endif
586 
587  passes.reserve(radices[0].size() + radices[1].size() + expand);
588 
589  unsigned index = 0;
590  unsigned last_index = (radices[1].empty() && !expand) ? 0 : 1;
591 
592  for (auto &radix_direction : radices)
593  {
594  unsigned p = 1;
595  unsigned i = 0;
596 
597  // If we have R2C or C2R, we have a padded buffer to accomodate 2^n + 1 elements horizontally.
598  // For simplicity, this is implemented as a shader variant.
599  bool pow2_stride = expand && modes[index] == Vertical;
600 
601  for (auto &radix : radix_direction)
602  {
603  // If this is the last pass and we're writing to an image, use a special shader variant.
604  bool last_pass = index == last_index && i == radix_direction.size() - 1;
605 
606  bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
607  Target out_target = last_pass ? output_target : SSBO;
608  Target in_target = passes.empty() ? input_target : SSBO;
609  Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
610  unsigned uv_scale_x = radix.vector_size / type_to_input_components(type);
611 
612  const Parameters params = {
613  radix.size.x,
614  radix.size.y,
615  radix.size.z,
616  radix.radix,
617  radix.vector_size,
618  dir,
619  modes[index],
620  in_target,
621  out_target,
622  p == 1,
623  pow2_stride,
624  radix.shared_banked,
625  options.type.fp16, input_fp16, options.type.output_fp16,
626  options.type.normalize,
627  };
628 
629  // For last pass, we don't know how our resource will be used afterwards,
630  // so let barrier decisions be up to the API user.
631  const Pass pass = {
632  params,
633  radix.num_workgroups_x, radix.num_workgroups_y,
634  uv_scale_x,
635  get_program(params),
636  last_pass ? 0u : GL_SHADER_STORAGE_BARRIER_BIT,
637  };
638 
639  passes.push_back(pass);
640 
641  p *= radix.radix;
642  i++;
643  }
644 
645  // After the first transform direction, inject either a real-to-complex resolve or complex-to-real resolve.
646  // This way, we avoid having special purpose transforms for all FFT variants.
647  if (index == 0 && (type == ComplexToReal || type == RealToComplex))
648  {
649  bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
650  bool last_pass = radices[1].empty();
651  Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
652  Target in_target = passes.empty() ? input_target : SSBO;
653  Target out_target = last_pass ? output_target : SSBO;
655  unsigned uv_scale_x = 1;
656 
657  auto base_opts = options;
658  base_opts.type.input_fp16 = input_fp16;
659 
660  auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts);
661  auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 });
662 
663  const Parameters params = {
664  res.size.x,
665  res.size.y,
666  res.size.z,
667  res.radix,
668  res.vector_size,
669  dir,
670  mode,
671  in_target,
672  out_target,
673  true,
674  false,
675  false,
676  base_opts.type.fp16, base_opts.type.input_fp16, base_opts.type.output_fp16,
677  base_opts.type.normalize,
678  };
679 
680  const Pass pass = {
681  params,
682  Nx / res.size.x,
683  Ny / res.size.y,
684  uv_scale_x,
685  get_program(params),
686  GL_SHADER_STORAGE_BARRIER_BIT,
687  };
688 
689  passes.push_back(pass);
690  }
691 
692  index++;
693  }
694 }
695 
696 string FFT::load_shader_string(const char *path)
697 {
698  char *buf = nullptr;
699  if (!glfft_read_file_string(path, &buf))
700  {
701  throw runtime_error("Failed to load shader file from disk.\n");
702  }
703 
704  string ret(buf);
705  free(buf);
706  return ret;
707 }
708 
709 void FFT::store_shader_string(const char *path, const string &source)
710 {
711  ofstream file(path);
712  file.write(source.data(), source.size());
713 }
714 
716 {
717  string str;
718  str.reserve(16 * 1024);
719 
720 #if 0
721  glfft_log("Building program:\n");
722  glfft_log(
723  " WG_X: %u\n"
724  " WG_Y: %u\n"
725  " WG_Z: %u\n"
726  " P1: %u\n"
727  " Radix: %u\n"
728  " Dir: %d\n"
729  " Mode: %u\n"
730  " InTarget: %u\n"
731  " OutTarget: %u\n"
732  " POW2: %u\n"
733  " FP16: %u\n"
734  " InFP16: %u\n"
735  " OutFP16: %u\n"
736  " Norm: %u\n",
737  params.workgroup_size_x,
738  params.workgroup_size_y,
739  params.workgroup_size_z,
740  params.p1,
741  params.radix,
742  params.direction,
743  params.mode,
744  params.input_target,
745  params.output_target,
746  params.pow2_stride,
747  params.fft_fp16,
748  params.input_fp16,
749  params.output_fp16,
750  params.fft_normalize);
751 #endif
752 
753  if (params.p1)
754  {
755  str += "#define FFT_P1\n";
756  }
757 
758  if (params.pow2_stride)
759  {
760  str += "#define FFT_POW2_STRIDE\n";
761  }
762 
763  if (params.fft_fp16)
764  {
765  str += "#define FFT_FP16\n";
766  }
767 
768  if (params.input_fp16)
769  {
770  str += "#define FFT_INPUT_FP16\n";
771  }
772 
773  if (params.output_fp16)
774  {
775  str += "#define FFT_OUTPUT_FP16\n";
776  }
777 
778  if (params.fft_normalize)
779  {
780  str += "#define FFT_NORMALIZE\n";
781  }
782 
783  if (params.direction == InverseConvolve)
784  {
785  str += "#define FFT_CONVOLVE\n";
786  }
787 
788  str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n";
789 
790  str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n";
791  str += string("#define FFT_RADIX ") + to_string(params.radix) + "\n";
792 
793  unsigned vector_size = params.vector_size;
794  switch (params.mode)
795  {
796  case VerticalDual:
797  str += "#define FFT_DUAL\n";
798  str += "#define FFT_VERT\n";
799  break;
800 
801  case Vertical:
802  str += "#define FFT_VERT\n";
803  break;
804 
805  case HorizontalDual:
806  str += "#define FFT_DUAL\n";
807  str += "#define FFT_HORIZ\n";
808  break;
809 
810  case Horizontal:
811  str += "#define FFT_HORIZ\n";
812  break;
813 
815  str += "#define FFT_RESOLVE_REAL_TO_COMPLEX\n";
816  str += "#define FFT_HORIZ\n";
817  vector_size = 2;
818  break;
819 
821  str += "#define FFT_RESOLVE_COMPLEX_TO_REAL\n";
822  str += "#define FFT_HORIZ\n";
823  vector_size = 2;
824  break;
825  }
826 
827  switch (params.input_target)
828  {
829  case ImageReal:
830  str += "#define FFT_INPUT_REAL\n";
831  // Fallthrough
832  case Image:
833  str += "#define FFT_INPUT_TEXTURE\n";
834  break;
835 
836  default:
837  break;
838  }
839 
840  switch (params.output_target)
841  {
842  case ImageReal:
843  str += "#define FFT_OUTPUT_REAL\n";
844  // Fallthrough
845  case Image:
846  str += "#define FFT_OUTPUT_IMAGE\n";
847  break;
848 
849  default:
850  break;
851  }
852 
853  switch (vector_size)
854  {
855  case 2:
856  str += "#define FFT_VEC2\n";
857  break;
858 
859  case 4:
860  str += "#define FFT_VEC4\n";
861  break;
862 
863  case 8:
864  str += "#define FFT_VEC8\n";
865  break;
866  }
867 
868  str += string("layout(local_size_x = ") +
869  to_string(params.workgroup_size_x) +
870  ", local_size_y = " +
871  to_string(params.workgroup_size_y) +
872  ", local_size_z = " +
873  to_string(params.workgroup_size_z) +
874  ") in;\n";
875 
876 #ifdef GLFFT_SHADER_FROM_FILE
877  str += load_shader_string("fft_common.comp");
878  switch (params.radix)
879  {
880  case 4:
881  str += load_shader_string("fft_radix4.comp");
882  break;
883 
884  case 8:
885  str += load_shader_string("fft_radix8.comp");
886  break;
887 
888  case 16:
889  str += load_shader_string("fft_radix4.comp");
890  str += load_shader_string("fft_shared.comp");
891  str += load_shader_string("fft_radix16.comp");
892  break;
893 
894  case 64:
895  str += load_shader_string("fft_radix8.comp");
896  str += load_shader_string("fft_shared.comp");
897  str += load_shader_string("fft_radix64.comp");
898  break;
899  }
900  str += load_shader_string("fft_main.comp");
901 #else
902  str += Blob::fft_common_source;
903  switch (params.radix)
904  {
905  case 4:
906  str += Blob::fft_radix4_source;
907  break;
908 
909  case 8:
910  str += Blob::fft_radix8_source;
911  break;
912 
913  case 16:
914  str += Blob::fft_radix4_source;
915  str += Blob::fft_shared_source;
916  str += Blob::fft_radix16_source;
917  break;
918 
919  case 64:
920  str += Blob::fft_radix8_source;
921  str += Blob::fft_shared_source;
922  str += Blob::fft_radix64_source;
923  break;
924  }
925  str += Blob::fft_main_source;
926 #endif
927 
928  GLuint prog = compile_compute_shader(str.c_str());
929  if (!prog)
930  {
931  puts(str.c_str());
932  }
933 
934 #if 0
935  char shader_path[1024];
936  snprintf(shader_path, sizeof(shader_path), "glfft_shader_radix%u_first%u_mode%u_in_target%u_out_target%u.comp.src",
937  params.radix, params.p1, params.mode, unsigned(params.input_target), unsigned(params.output_target));
938  store_shader_string(shader_path, str);
939 #endif
940 
941  return prog;
942 }
943 
945 {
946  GL_CHECK(GLuint program = glCreateProgram());
947  if (!program)
948  {
949  return 0;
950  }
951 
952  GL_CHECK(GLuint shader = glCreateShader(GL_COMPUTE_SHADER));
953 
954  const char *sources[] = { GLFFT_GLSL_LANG_STRING, src };
955  GL_CHECK(glShaderSource(shader, 2, sources, NULL));
956  GL_CHECK(glCompileShader(shader));
957 
958  GLint status;
959  GL_CHECK(glGetShaderiv(shader, GL_COMPILE_STATUS, &status));
960  if (status == GL_FALSE)
961  {
962  GLint len;
963  GLsizei out_len;
964 
965  GL_CHECK(glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &len));
966  vector<char> buf(len);
967  GL_CHECK(glGetShaderInfoLog(shader, len, &out_len, buf.data()));
968  glfft_log("GLFFT: Shader log:\n%s\n\n", buf.data());
969 
970  GL_CHECK(glDeleteShader(shader));
971  GL_CHECK(glDeleteProgram(program));
972  return 0;
973  }
974 
975  GL_CHECK(glAttachShader(program, shader));
976  GL_CHECK(glLinkProgram(program));
977  GL_CHECK(glDeleteShader(shader));
978 
979  GL_CHECK(glGetProgramiv(program, GL_LINK_STATUS, &status));
980  if (status == GL_FALSE)
981  {
982  GLint len;
983  GLsizei out_len;
984  GL_CHECK(glGetProgramiv(program, GL_INFO_LOG_LENGTH, &len));
985  vector<char> buf(len);
986  GL_CHECK(glGetProgramInfoLog(program, len, &out_len, buf.data()));
987  glfft_log("Program log:\n%s\n\n", buf.data());
988 
989  GL_CHECK(glDeleteProgram(program));
990  GL_CHECK(glDeleteShader(shader));
991  return 0;
992  }
993 
994  return program;
995 }
996 
// Benchmarks the full FFT chain. Runs warmup_iterations first, then up to
// 'iterations' timed batches of 'dispatches_per_iteration' transforms each,
// stopping early once max_time (seconds) of wall time has elapsed.
// Returns the average wall-clock time per transform in seconds.
double FFT::bench(GLuint output, GLuint input,
        unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, double max_time)
{
    // Drain the GL pipeline so warmup starts from an idle GPU.
    GL_CHECK(glFinish());
    for (unsigned i = 0; i < warmup_iterations; i++)
    {
        process(output, input);
    }
    GL_CHECK(glFinish());

    unsigned runs = 0;
    double start_time = glfft_time();
    double total_time = 0.0;

    // Always run at least one iteration (i == 0) even if max_time is tiny.
    for (unsigned i = 0; i < iterations && (((glfft_time() - start_time) < max_time) || i == 0); i++)
    {
        double iteration_start = glfft_time();
        for (unsigned d = 0; d < dispatches_per_iteration; d++)
        {
            process(output, input);
            // Full barrier between dispatches so runs don't overlap in flight.
            GL_CHECK(glMemoryBarrier(GL_ALL_BARRIER_BITS));
            runs++;
        }
        // glFinish ensures the GPU work is complete before reading the clock.
        GL_CHECK(glFinish());
        double iteration_end = glfft_time();
        total_time += iteration_end - iteration_start;
    }

    return total_time / runs;
}
1027 
// Executes the full FFT pass chain: input -> (ping-pong scratch) -> output.
// buffers[0] is the current read resource, buffers[1] the current write
// resource; they are swapped after each pass. The initial write target is
// chosen so that the final pass lands in 'output' (or in temp_buffer_image
// when the final target is an image, since an image can't be written via SSBO
// bindings mid-chain). input_aux is an optional second input (e.g. for
// convolution), bound as texture unit 1 or SSBO binding 2.
void FFT::process(GLuint output, GLuint input, GLuint input_aux)
{
    if (passes.empty())
    {
        return;
    }

    // With an odd number of passes the first write goes to the final target;
    // with an even number it goes to scratch, so the last swap ends on target.
    GLuint buffers[2] = {
        input,
        passes.size() & 1 ?
            (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) :
            temp_buffer.get(),
    };

    // Bind the auxiliary input once up front; it is only read by the first pass.
    if (input_aux != 0)
    {
        if (passes.front().parameters.input_target != SSBO)
        {
            GL_CHECK(glActiveTexture(GL_TEXTURE1));
            GL_CHECK(glBindTexture(GL_TEXTURE_2D, input_aux));
            GL_CHECK(glBindSampler(1, texture.samplers[1]));
        }
        else
        {
            if (ssbo.input_aux.size != 0)
            {
                GL_CHECK(glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, input_aux,
                        ssbo.input_aux.offset, ssbo.input_aux.size));
            }
            else
            {
                GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, input_aux));
            }
        }
    }

    GLuint current_program = 0;
    unsigned p = 1;  // Butterfly stride factor, accumulated across passes.
    unsigned pass_index = 0;
    for (auto &pass : passes)
    {
        // Avoid redundant glUseProgram calls when consecutive passes share one.
        if (pass.program != current_program)
        {
            GL_CHECK(glUseProgram(pass.program));
            current_program = pass.program;
        }

        if (pass.parameters.p1)
        {
            // P == 1 passes bake the factor into the shader; reset the counter.
            p = 1;
        }
        else
        {
            // NOTE(review): not wrapped in GL_CHECK unlike every other GL call
            // here — looks like an oversight; confirm upstream.
            glUniform1ui(0, p);
        }

        p *= pass.parameters.radix;

        if (pass.parameters.input_target != SSBO)
        {
            GL_CHECK(glActiveTexture(GL_TEXTURE0));
            GL_CHECK(glBindTexture(GL_TEXTURE_2D, buffers[0]));
            GL_CHECK(glBindSampler(0, texture.samplers[0]));

            // If one compute thread reads multiple texels in X dimension, scale this accordingly.
            float scale_x = texture.scale_x * pass.uv_scale_x;
            GL_CHECK(glUniform2f(1, texture.offset_x, texture.offset_y));
            GL_CHECK(glUniform2f(2, scale_x, texture.scale_y));
        }
        else
        {
            // Honor user-provided sub-ranges when reading directly from the
            // caller's input/output buffers; otherwise bind the whole buffer.
            if (buffers[0] == input && ssbo.input.size != 0)
            {
                GL_CHECK(glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, buffers[0],
                        ssbo.input.offset, ssbo.input.size));
            }
            else if (buffers[0] == output && ssbo.output.size != 0)
            {
                // This can behave weirdly if output is an image and our temp buffers GLuint aliases with
                // the output texture name, but we shouldn't set ssbo.output.size in this case anyways.
                GL_CHECK(glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, buffers[0],
                        ssbo.output.offset, ssbo.output.size));
            }
            else
            {
                GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffers[0]));
            }
        }

        if (pass.parameters.output_target != SSBO)
        {
            GLenum format = 0;

            // TODO: Make this more flexible, would require shader variants per-format though.
            if (pass.parameters.output_target == ImageReal)
            {
                format = GL_R32F;
            }
            else
            {
                switch (pass.parameters.mode)
                {
                case VerticalDual:
                case HorizontalDual:
                    format = GL_RGBA16F;
                    break;

                case Vertical:
                case Horizontal:
                case ResolveRealToComplex:
                    format = GL_R32UI;
                    break;

                default:
                    break;
                }
            }
            GL_CHECK(glBindImageTexture(0, output, 0, GL_FALSE, 0, GL_WRITE_ONLY, format));
        }
        else
        {
            if (buffers[1] == output && ssbo.output.size != 0)
            {
                GL_CHECK(glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, buffers[1],
                        ssbo.output.offset, ssbo.output.size));
            }
            else
            {
                GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, buffers[1]));
            }
        }

        GL_CHECK(glDispatchCompute(pass.workgroups_x, pass.workgroups_y, 1));

        // Intermediate passes carry GL_SHADER_STORAGE_BARRIER_BIT; the final
        // pass has 0 so the caller decides how to synchronize.
        if (pass.barriers != 0)
        {
            GL_CHECK(glMemoryBarrier(pass.barriers));
        }

        // After the first pass, retire 'input' from the ping-pong set and
        // substitute the complementary scratch/output resource.
        if (pass_index == 0)
        {
            buffers[0] = passes.size() & 1 ?
                temp_buffer.get() :
                (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output);
        }

        swap(buffers[0], buffers[1]);
        pass_index++;
    }
}
1178 
static void store_shader_string(const char *path, const std::string &source)
Definition: glfft.cpp:709
GLuint get() const
unsigned x
Definition: glfft.cpp:44
void ** params
Definition: gl2ext.h:143
int pass
Definition: app.cpp:216
GLuint GLuint end
Definition: gl2ext.h:323
Buffer temp_buffer_image
Definition: glfft.hpp:189
Inverse FFT transform.
Forward FFT transform.
void merge_if_better(const CostPropagate &a, const CostPropagate &b)
Definition: glfft.cpp:182
GLboolean GLboolean GLboolean GLboolean a
Definition: gl2ext.h:306
unsigned num_workgroups_y
Definition: glfft.cpp:51
void process(GLuint output, GLuint input, GLuint input_aux=0)
Process the FFT.
Definition: glfft.cpp:1028
GLuint divisor
Definition: gl2ext.h:840
float scale_x
Definition: glfft.hpp:202
Definition: Native.cpp:67
GLboolean r
Definition: gl2ext.h:306
static unsigned mode_to_input_components(Mode mode)
Definition: glfft.cpp:355
static std::string load_shader_string(const char *path)
Definition: glfft.cpp:696
void glfft_log(const char *fmt,...)
bool normalize
Whether to apply 1 / N normalization factor.
GLsizei GLenum * sources
Definition: gl2ext.h:136
Complex-to-real transform. N / 2 + 1 complex values are used per row with a stride of N complex sampl...
GLint GLsizei GLsizei GLenum format
Definition: gl2ext.h:179
bool input_fp16
Whether input SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
WorkGroupSize size
Definition: glfft.cpp:49
Buffer temp_buffer
Definition: glfft.hpp:188
unsigned vector_size
Vector size. Very GPU dependent. "Scalar" GPUs prefer 2 here, vector GPUs prefer 4 (and maybe 8)...
bool fp16
Whether internal shader should be mediump float.
unsigned radix
Definition: glfft.cpp:52
GLuint index
Definition: gl2ext.h:300
unsigned y
Definition: glfft.cpp:44
static bool is_radix_valid(unsigned Nx, unsigned Ny, Mode mode, unsigned vector_size, unsigned radix, WorkGroupSize size, bool pow2_stride)
Definition: glfft.cpp:150
static Radix build_radix(unsigned Nx, unsigned Ny, Mode mode, unsigned vector_size, bool shared_banked, unsigned radix, WorkGroupSize size, bool pow2_stride)
Definition: glfft.cpp:86
GLenum mode
Definition: gl2ext.h:302
unsigned workgroup_size_y
GLuint build_program(const Parameters &params)
Definition: glfft.cpp:715
bool glfft_read_file_string(const char *path, char **out_buf)
GLint first
Definition: gl2ext.h:838
GLenum target
Definition: gl2ext.h:720
vector< unsigned > radices
Definition: glfft.cpp:195
double cost
Definition: glfft.hpp:186
#define GLFFT_GLSL_LANG_STRING
FFT(unsigned Nx, unsigned Ny, Type type, Direction direction, Target input_target, Target output_target, std::shared_ptr< ProgramCache > cache, const FFTOptions &options, const FFTWisdom &wisdom=FFTWisdom())
Creates a full FFT.
Definition: glfft.cpp:498
unsigned workgroup_size_z
GLuint compile_compute_shader(const char *src)
Definition: glfft.cpp:944
GLsizei GLsizei GLchar * source
Definition: gl2ext.h:877
Real-to-complex transform. N / 2 + 1 complex output samples are created per row with a stride of N co...
unsigned workgroup_size_x
double bench(GLuint output, GLuint input, unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, double max_time=std::numeric_limits< double >::max())
Run process() multiple times, timing the results.
Definition: glfft.cpp:997
Options for FFT implementation. Defaults for performance as conservative.
static float total_time
Definition: ocean.cpp:244
bool output_fp16
Whether output SSBO is a packed 2xfp16 format. Otherwise, regular FP32.
void init(const void *data, size_t size, GLenum access)
GLfloat GLfloat f
Definition: gl2ext.h:2707
static vector< Radix > split_radices(unsigned Nx, unsigned Ny, Mode mode, Target input_target, Target output_target, const FFTOptions &options, bool pow2_stride, const FFTWisdom &wisdom, double &accumulate_cost)
Definition: glfft.cpp:198
const std::pair< const WisdomPass, FFTOptions::Performance > * find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix, Mode mode, Target input_target, Target output_target, const FFTOptions::Type &base_options) const
struct GLFFT::FFT::@1::@2 input
#define GL_CHECK(x)
Definition: AstcTextures.h:59
static double find_cost(unsigned Nx, unsigned Ny, Mode mode, unsigned radix, const FFTOptions &options, const FFTWisdom &wisdom)
Definition: glfft.cpp:163
static void reduce(unsigned &wg_size, unsigned &divisor)
Definition: glfft.cpp:57
GL_SHADER_STORAGE_BUFFER.
void set_texture_offset_scale(float offset_x, float offset_y, float scale_x, float scale_y)
Sets offset and scale parameters for normalized texel coordinates when sampling textures.
Definition: glfft.hpp:126
GLsizei size
Definition: glfft.hpp:211
GLenum GLuint texture
Definition: gl2ext.h:385
GLenum type
Definition: gl2ext.h:133
float max(float x, float y)
Definition: noise.cpp:29
static Radix build_resolve_radix(unsigned Nx, unsigned Ny, WorkGroupSize size)
Definition: glfft.cpp:144
const FFTOptions::Performance & find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix, Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const
Regular complex-to-complex transform.
struct GLFFT::FFT::@1::@2 output
GLuint get_program(const Parameters &params)
Definition: glfft.cpp:340
static void print_radix_splits(const vector< Radix > radices[2])
Definition: glfft.cpp:450
GLenum GLuint GLintptr GLsizeiptr size
Definition: gl2ext.h:629
unsigned z
Definition: glfft.cpp:44
static unsigned type_to_input_components(Type type)
Definition: glfft.cpp:479
GLint GLint GLint GLint GLint x
Definition: gl2ext.h:574
void glfft_time()
unsigned num_workgroups_x
Definition: glfft.cpp:50
unsigned vector_size
Definition: glfft.cpp:53
Definition: glfft.cpp:47
static unsigned radix_to_wg_z(unsigned radix)
Definition: glfft.cpp:71
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition: gl2ext.h:134
precision highp float
Definition: hiz_cull.cs:37
GLboolean GLboolean GLboolean b
Definition: gl2ext.h:306
typedef GLenum(GL_APIENTRYP PFNGLGETGRAPHICSRESETSTATUSKHRPROC)(void)
timeval start_time
Definition: timer.cpp:24
GLuint program
Definition: gl2ext.h:1475
#define N
Definition: geometry.cpp:25
struct GLFFT::FFTOptions::Type type
typedef GLuint(GL_APIENTRYP PFNGLGETDEBUGMESSAGELOGKHRPROC)(GLuint count
bool shared_banked
Definition: glfft.cpp:54
std::vector< Pass > passes
Definition: glfft.hpp:190
GLint y
Definition: gl2ext.h:179
struct GLFFT::FFTOptions::Performance performance
CostPropagate(double cost, vector< unsigned > radices)
Definition: glfft.cpp:179
struct GLFFT::FFT::@1 ssbo
GLenum src
Definition: gl2ext.h:304
double cost
Definition: glfft.cpp:194