OpenGL ES SDK for Android ARM Developer Center
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
scan_first.cs
Go to the documentation of this file.
1 #version 310 es
2 
3 /* Copyright (c) 2014-2017, ARM Limited and Contributors
4  *
5  * SPDX-License-Identifier: MIT
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 /*
24  * The main idea of the sorting algorithm is based on counting.
25  * For example, say that we need to put the number 3 in sorted order,
26  * and we know that there are in total 5 numbers less than 3 in the input.
27  * Then we know that our number must come after all of these - that is, in position 5!
28  *
29  * To determine such a value for each number in our input, we use a prefix sum
30  * (also known as a scan).
31  *
32  * It works like this, let's make this our input that we want to sort:
33  * 1 3 2 0 1 0 2 2
34  *
35  * (the actual input may have values greater than 3, but the scan only operates
36  * on 2 bit values, because the radix sort works on 2-bit stages. We mask out
37  * the interesting digit based on the <bitOffset> value using bitfieldExtract().)
38  *
39  * We then construct four flag arrays, one for each possible digit,
40  * that has a 1 where the key matches the digit, and 0 elsewhere:
41  * 0 0 0 1 0 1 0 0 <- 0
42  * 1 0 0 0 1 0 0 0 <- 1
43  * 0 0 1 0 0 0 1 1 <- 2
44  * 0 1 0 0 0 0 0 0 <- 3
45  *
46  * If we do an exclusive prefix sum over these arrays (carrying over the sum
47  * from each array to the next) we get:
48  * 0 0 0 0 1 1 2 2 <- 0
49  * 2 3 3 3 3 4 4 4 <- 1
50  * 4 4 4 5 5 5 5 6 <- 2
51  * 7 7 8 8 8 8 8 8 <- 3 (note that 7 was carried over instead of 6)
52  *
53  * Now we have all we need!
54  * We then go through each element in the input, and look at this table to find
55  * out where the element should go in the sorted output.
56  *
57  * For example, the first 0 is located in the fourth column (as marked by the flag).
58  * The scan array that corresponds to 0 contains the number 0 at this columns.
59  * Thus, the first 0 should go to location 0.
60  *
61  * Not too bad!
62  * What about the first 1?
63  *
64  * It is masked in the first column, second row.
65  * We then look at the second row to determine its offset.
66  * The scan value there is 2. So the first 1 should go to index 2 in the output.
67 */
68 
69 /*
70  * For efficiency, we perform the prefix sum in blocks over multiple stages.
71  * For example, with blocks of 4-by-4 elements, calculating the prefix sum of
72  * 0 0 0 1 0 1 0 0
73  * is done by first scanning each block individually
74  * 0 0 0 0 | 0 0 1 1
75  *
76  * This can be done highly parallel over multiple shader cores and lots of threads
77  * if the work set is large enough.
78  *
79  * To merge these two we need to add the sum of elements in the first block,
80  * to each element in the second block. So first we compute the sums
81  * 1 | 1
82  * then we take the prefix sum of this again!
83  * 0 | 1
84  * Finally we add blocksum[i] to each element in block[i], and so on
85  * 0 0 0 0 (+ 0) | 0 0 1 1 (+ 1)
86  * 0 0 0 0 | 1 1 2 2
87  *
88  * This is a recursive problem since we need to take the prefix sum of block sums.
89  * We might have enough blocks that this problem can also be quite parallel.
90  * Our implementation takes a recursive approach where we keep scanning the
91  * residual block sums in parallel until the blocksum becomes a single value.
92  * Once we have worked our way down, we need to "resolve" the prefix sum
93  * in the smaller blocks all the way up so we can get the complete prefix sum again.
94  *
95  * Because we need to perform four prefix sums, one for every digit, the sums array is a uvec4 array.
96  * The xyzw components correspond to the digit 0, 1, 2 and 3
97  * scan arrays, respectively. Storing it as a vector allows us to use vector math
98  * to operate on each array in single expressions.
99  *
100  * In the implementation, we use inclusive scan (postfix sum) instead of exclusive scan (prefix sum).
101  * The difference between these is trivially resolved in scan_reorder.cs.
102  */
103 
104 layout(local_size_x = 32) in; // We work on 4 items at once, so this value should be BLOCK_SIZE / 4.
105 #define NUM_STEPS 4u
106 layout(binding = 0, std430) readonly buffer Data
107 {
108  vec4 in_points[];
109 };
110 
111 layout(binding = 1, std430) writeonly buffer OutData
112 {
113  uvec4 outbuf[];
114 };
115 
116 layout(binding = 2, std430) writeonly buffer BlockSumData
117 {
118  uvec4 blocksum[];
119 };
120 
121 layout(binding = 3, std430) writeonly buffer FlagsData
122 {
123  uvec4 flags[];
124 };
125 
126 shared uvec4 sharedData[gl_WorkGroupSize.x];
127 
130 uniform float zMin;
131 uniform float zMax;
132 
133 /*
134  * The particles are sorted by increasing distance along the sorting axis.
135  * We find the distance by a simple dot product. The sorting algorithm
136  * needs integer keys (16-bit in this case), so we convert the distance from
137  * the range [zMin, zMax] -> [0, 65535].
138  * Finally we extract the current working digits (2-bit in this case) from the key.
139  *
140  * Read four particles at once to improve vectorization a bit.
141  */
142 uvec4 decodeKeys(uint index)
143 {
144  vec4 z = vec4(
145  dot(in_points[4u * index + 0u].xyz, axis),
146  dot(in_points[4u * index + 1u].xyz, axis),
147  dot(in_points[4u * index + 2u].xyz, axis),
148  dot(in_points[4u * index + 3u].xyz, axis));
149  z = 65535.0 * clamp((z - zMin) / (zMax - zMin), vec4(0.0), vec4(1.0));
150  return bitfieldExtract(uvec4(z), bitOffset, 2);
151 }
152 
153 void main()
154 {
155  uint ident = gl_GlobalInvocationID.x;
156  uint local_ident = gl_LocalInvocationID.x;
157 
158  uvec4 flagdata = decodeKeys(ident);
159  // Save the flags data for later. Will be used in scan_reorder.cs.
160  // We could recompute in scan_reorder.cs of course if we wanted.
161  flags[ident] = flagdata;
162 
163  // Create flags by comparing against the possible 2-bit values.
164  uvec4 miniblock0 = uvec4(equal(flagdata.xxxx, uvec4(0u, 1u, 2u, 3u)));
165  uvec4 miniblock1 = uvec4(equal(flagdata.yyyy, uvec4(0u, 1u, 2u, 3u)));
166  uvec4 miniblock2 = uvec4(equal(flagdata.zzzz, uvec4(0u, 1u, 2u, 3u)));
167  uvec4 miniblock3 = uvec4(equal(flagdata.wwww, uvec4(0u, 1u, 2u, 3u)));
168 
169  // First we want to do the scan in registers to reduce shared memory pressure a bit.
170  miniblock1 += miniblock0;
171  miniblock2 += miniblock1;
172  miniblock3 += miniblock2;
173 
174  // We have now done inclusive scan for our "miniblock".
175  // We only share our accumulated sum (miniblock3) with other threads.
176 
177  // Share miniblock sum with other threads
178  sharedData[local_ident] = miniblock3;
179  memoryBarrierShared();
180  barrier();
181 
182  for (uint step = 0u; step < NUM_STEPS; step++) {
183  // Half the threads will have something useful to do every step.
184  // Branching like this is a not an issue on Mali Midgard as long as we keep enough threads busy doing something useful.
185  if ((local_ident & (1u << step)) != 0u) {
186  // Get previous index. This value will be the same for every thread within this "block".
187  uint prev = ((local_ident >> step) << step) - 1u;
188 
189  // Update our block. Always accumulate data in registers.
190  uvec4 sum_prev = sharedData[prev];
191  miniblock0 += sum_prev;
192  miniblock1 += sum_prev;
193  miniblock2 += sum_prev;
194  miniblock3 += sum_prev;
195 
196  // Write out current value.
197  sharedData[local_ident] = miniblock3;
198  }
199  memoryBarrierShared();
200  barrier();
201  }
202 
203  // We don't need barrier after last iteration, so unroll that manually.
204  if ((local_ident & (1u << NUM_STEPS)) != 0u) {
205  // Get previous index. This value will be the same for every thread within this "block".
206  uint prev = ((local_ident >> NUM_STEPS) << NUM_STEPS) - 1u;
207 
208  // Update our block. Always accumulate data in registers.
209  uvec4 sum_prev = sharedData[prev];
210  miniblock0 += sum_prev;
211  miniblock1 += sum_prev;
212  miniblock2 += sum_prev;
213  miniblock3 += sum_prev;
214  }
215 
216  // Write out inclusive scan results.
217  outbuf[4u * ident + 0u] = miniblock0;
218  outbuf[4u * ident + 1u] = miniblock1;
219  outbuf[4u * ident + 2u] = miniblock2;
220  outbuf[4u * ident + 3u] = miniblock3;
221 
222  // Last thread knows the inclusive scan for this work group, so write out to blocksum.
223  if (local_ident == (gl_WorkGroupSize.x - 1u))
224  blocksum[gl_WorkGroupID.x] = miniblock3;
225 }
uniform int bitOffset
Definition: scan_first.cs:128
Definition: matrix.h:51
float clamp(float x, float min, float max)
Definition: noise.cpp:24
GLbitfield flags
Definition: gl2ext.h:951
void main()
Definition: scan_first.cs:153
Definition: matrix.h:75
GLuint index
Definition: gl2ext.h:300
shared uvec4 sharedData[gl_WorkGroupSize.x]
Definition: scan_first.cs:124
uniform float zMax
Definition: scan_first.cs:131
#define NUM_STEPS
Definition: scan_first.cs:105
layout(local_size_x=32) in
void uniform(string name, const mat4 &v)
Definition: glutil.cpp:97
GLint GLint GLint GLint GLint x
Definition: gl2ext.h:574
uniform vec3 axis
Definition: scan_first.cs:129
GLenum GLuint buffer
Definition: gl2ext.h:628
uniform float zMin
Definition: scan_first.cs:130
uvec4 decodeKeys(uint index)
Definition: scan_first.cs:142