OpenGL ES SDK for Android ARM Developer Center
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
scan.cs
Go to the documentation of this file.
1 #version 310 es
2 
3 /* Copyright (c) 2014-2017, ARM Limited and Contributors
4  *
5  * SPDX-License-Identifier: MIT
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 /*
24  * See scan_first.cs for more detailed comments.
25  * This shader is used for scan passes after the first one and is very similar.
26  */
27 
28 layout(local_size_x = 32) in; // We work on 4 items at once, so this value should be BLOCK_SIZE / 4.
29 #define NUM_STEPS 4u // Barrier-synchronised loop steps in main(); one more step is unrolled after the loop, so 5 = log2(32) steps run in total.
30 
31 layout(binding = 0, std430) readonly buffer Data // Scan input.
32 {
33  uvec4 buf[]; // main() reads entries 4 * gl_GlobalInvocationID.x + 0..3 per invocation.
34 };
35 
36 layout(binding = 1, std430) writeonly buffer OutData // Scan output.
37 {
38  uvec4 outbuf[]; // Written by main() at the same indices it reads from buf.
39 };
40 
41 layout(binding = 2, std430) writeonly buffer BlockSumData // Per-work-group totals.
42 {
43  uvec4 blocksum[]; // Indexed by gl_WorkGroupID.x; written only by the last invocation of each work group.
44 };
45 
46 shared uvec4 sharedData[gl_WorkGroupSize.x]; // One slot per invocation, indexed by gl_LocalInvocationID.x; holds that invocation's running total (miniblock3).
47 
48 void main() // Inclusive scan (component-wise uvec4 sums) over this work group's slice of buf; results go to outbuf and the work group's total goes to blocksum.
49 {
50  uint ident = gl_GlobalInvocationID.x; // Selects this invocation's four consecutive uvec4 entries (4 * ident + 0..3).
51  uint local_ident = gl_LocalInvocationID.x; // This invocation's slot in sharedData.
52 
53  // First we want to do the scan in registers to reduce shared memory pressure a bit.
54  uvec4 miniblock0 = buf[4u * ident + 0u];
55  uvec4 miniblock1 = buf[4u * ident + 1u];
56  uvec4 miniblock2 = buf[4u * ident + 2u];
57  uvec4 miniblock3 = buf[4u * ident + 3u];
58  miniblock1 += miniblock0;
59  miniblock2 += miniblock1;
60  miniblock3 += miniblock2; // miniblockN now holds the sum of loaded entries 0..N.
61 
62  // We have now done inclusive scan for our "miniblock".
63  // We only share our accumulated sum (miniblock3) with other threads.
64 
65  // Share miniblock sum with other threads
66  sharedData[local_ident] = miniblock3;
67  memoryBarrierShared();
68  barrier(); // Every slot must be written before the first step below reads a neighbour's slot.
69 
70  // Now we have to start accumulating across threads.
71  // We double the "block size" every iteration. Odd blocks accumulate the scan value from just before the block.
72 
73  for (uint step = 0u; step < NUM_STEPS; step++) {
74  // Half the threads will have something useful to do every step.
75  // Branching like this is not an issue on Mali as long as we keep enough threads busy doing something useful.
76  if ((local_ident & (1u << step)) != 0u) { // Bit `step` set: this invocation is in an odd block of size 2^step.
77  // Get previous index. This value will be the same for every thread within this "block".
78  uint prev = ((local_ident >> step) << step) - 1u; // Clear the low `step` bits, then step back one: last slot of the preceding block. Bit `step` of prev is 0, so that slot is not rewritten during this step.
79 
80  // Update our block. Always accumulate data in registers.
81  uvec4 sum_prev = sharedData[prev];
82  miniblock0 += sum_prev;
83  miniblock1 += sum_prev;
84  miniblock2 += sum_prev;
85  miniblock3 += sum_prev;
86 
87  // Write out current value.
88  sharedData[local_ident] = miniblock3;
89  }
90  memoryBarrierShared();
91  barrier(); // Placed outside the if, so every invocation reaches it; the next step reads the totals written above.
92  }
93 
94  // We don't need barrier after last iteration, so unroll that manually.
95  if ((local_ident & (1u << NUM_STEPS)) != 0u) { // Bit 4: with local_size_x = 32 this selects invocations 16..31.
96  // Get previous index. This value will be the same for every thread within this "block".
97  uint prev = ((local_ident >> NUM_STEPS) << NUM_STEPS) - 1u; // Evaluates to 15 for invocations 16..31.
98 
99  // Update our block. Always accumulate data in registers.
100  uvec4 sum_prev = sharedData[prev];
101  miniblock0 += sum_prev;
102  miniblock1 += sum_prev;
103  miniblock2 += sum_prev;
104  miniblock3 += sum_prev; // Not stored back to sharedData: nothing reads sharedData after this point.
105  }
106 
107  // Write out inclusive scan results.
108  outbuf[4u * ident + 0u] = miniblock0;
109  outbuf[4u * ident + 1u] = miniblock1;
110  outbuf[4u * ident + 2u] = miniblock2;
111  outbuf[4u * ident + 3u] = miniblock3;
112 
113  // Last thread knows the inclusive scan for this work group, so write out to blocksum.
114  if (local_ident == (gl_WorkGroupSize.x - 1u))
115  blocksum[gl_WorkGroupID.x] = miniblock3; // One uvec4 total per work group.
116 }
#define NUM_STEPS
Definition: scan.cs:29
void main()
Definition: scan.cs:48
layout(local_size_x=32) in
shared uvec4 sharedData[gl_WorkGroupSize.x]
Definition: scan.cs:46
GLint GLint GLint GLint GLint x
Definition: gl2ext.h:574
GLenum GLuint buffer
Definition: gl2ext.h:628
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition: gl2ext.h:134