OpenGL ES SDK for Android ARM Developer Center
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
scan_reorder.cs
Go to the documentation of this file.
1 #version 310 es
2 
3 /* Copyright (c) 2014-2017, ARM Limited and Contributors
4  *
5  * SPDX-License-Identifier: MIT
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 /*
24  * This program merges the block-wise prefix sums of the input into a
25  * single prefix sum. It does this by adding the sum of elements in a previous
26  * block to the next one, accumulating the sum as we go along. The sums to
27  * add are located in the blocksum buffer.
28  *
29  * Finally, we shuffle the elements to their correct positions.
30  * The new position is given by the prefix
31  * sum value in the element's scan array.
32  *
33  * For example, if our keys (for the current stage) are as following:
34  * 1 3 2 0 1 0 2 2
35  *
36  * We get these digit scan arrays
37  * 0 0 0 0 1 1 2 2 <- 0
38  * 2 3 3 3 3 4 4 4 <- 1
39  * 4 4 4 5 5 5 5 6 <- 2
40  * 7 7 8 8 8 8 8 8 <- 3
41  *
42  * And these flags
43  * 0 0 0 1 0 1 0 0 <- 0
44  * 1 0 0 0 1 0 0 0 <- 1
45  * 0 0 1 0 0 0 1 1 <- 2
46  * 0 1 0 0 0 0 0 0 <- 3
47  *
48  * (Note that the columns of these are stored as uvec4s in the buffers)
49  *
50  * In column 1 we have a flag in the second row, because the key is (1) in the first position.
51  * Looking at the scan array belonging to (1) we find that the offset is 2.
52  * So the first element should be reordered to position 2 in the sorted array.
53  */
54 
55 layout(local_size_x = 32) in; // We work on 4 items at once, so this value should be BLOCK_SIZE / 4.
56 layout(binding = 0, std430) readonly buffer SortData
57 {
58  vec4 sort_buf[];
59 };
60 
61 layout(binding = 1, std430) readonly buffer Data
62 {
63  uvec4 buf[];
64 };
65 
66 layout(binding = 2, std430) readonly buffer BlockSumData
67 {
68  uvec4 blocksum[];
69 };
70 
71 layout(binding = 3, std430) writeonly buffer OutSortData
72 {
73  vec4 out_sort_buf[];
74 };
75 
76 layout(binding = 4, std430) readonly buffer FlagsData
77 {
78  uvec4 flags[];
79 };
80 
81 layout(location = 0) uniform int uShift;
82 
83 void main()
84 {
85  uint ident = gl_GlobalInvocationID.x;
86  uint wg_ident = gl_WorkGroupID.x;
87 
88  // The last blocksum in our buffer contains the count of values which are 0, 1, 2, and 3 respectively
89  // since it's an inclusive scan.
90  uvec4 carry = blocksum[gl_NumWorkGroups.x - 1u];
91 
92  // Do an exclusive scan over the carry values.
93  // We can now figure out the base offset for all the different values.
94  carry = uvec4(0u, carry.x, carry.x + carry.y, carry.x + carry.y + carry.z);
95 
96  // Cached results computed in scan_first.cs.
97  uvec4 lookups = flags[ident];
98 
99  // Swizzle the carry vector so we know the base offset for our data.
100  // Subtract -1 on the carry to get exclusive scan instead of inclusive scan.
101  // We're always going to add in 1 due to inclusive scan results so we have to compensate for it somewhere.
102  carry = uvec4(carry[lookups.x], carry[lookups.y], carry[lookups.z], carry[lookups.w]) - 1u;
103 
104  // Add in per-digit inclusive scan results.
105  carry.x += buf[4u * ident + 0u][lookups.x];
106  carry.y += buf[4u * ident + 1u][lookups.y];
107  carry.z += buf[4u * ident + 2u][lookups.z];
108  carry.w += buf[4u * ident + 3u][lookups.w];
109 
110  // We also have to add in blocksum for the previous block.
111  // If we're the first block, there is nothing to add.
112  if (wg_ident != 0u) {
113  uvec4 prev_sum = blocksum[wg_ident - 1u];
114  carry.x += prev_sum[lookups.x];
115  carry.y += prev_sum[lookups.y];
116  carry.z += prev_sum[lookups.z];
117  carry.w += prev_sum[lookups.w];
118  }
119 
120  // Reorder data since we now know the output indices.
121  // Not the most cache friendly operation.
122  out_sort_buf[carry.x] = sort_buf[4u * ident + 0u];
123  out_sort_buf[carry.y] = sort_buf[4u * ident + 1u];
124  out_sort_buf[carry.z] = sort_buf[4u * ident + 2u];
125  out_sort_buf[carry.w] = sort_buf[4u * ident + 3u];
126 }
GLbitfield flags
Definition: gl2ext.h:951
Definition: matrix.h:75
void main()
Definition: scan_reorder.cs:83
GLfloat GLfloat GLfloat w
Definition: gl2ext.h:2701
layout(local_size_x=32) in
GLint location
Definition: gl2ext.h:180
void uniform(string name, const mat4 &v)
Definition: glutil.cpp:97
GLint GLint GLint GLint GLint x
Definition: gl2ext.h:574
GLenum GLuint buffer
Definition: gl2ext.h:628
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition: gl2ext.h:134
GLint y
Definition: gl2ext.h:179