// Compute shader: each workgroup tallies its invocations into two
// workgroup-shared atomic counters (split by local x index), then folds those
// tallies into two atomic counters that live in a storage buffer.
layout(local_size_x = 256) in;
const uint WORKGROUP_SIZE = 256;  // same value as local_size_x above

// Running totals accumulated across workgroups.
struct GlobalCounts {
    atomicUint firstHalfCount;   // receives localCounts[0]
    atomicUint secondHalfCount;  // receives localCounts[1]
};
layout(metal, binding = 0) buffer ssbo {
    GlobalCounts globalCounts;
};

// Workgroup-shared tallies: [0] for local x below WORKGROUP_SIZE / 2,
// [1] for everything else.
workgroup atomicUint localCounts[2];

void main() {
    // Invocation 0 handles both the setup and the final flush for its workgroup.
    bool isLeader = (sk_LocalInvocationID.x == 0);

    // Zero both shared tallies before anyone increments them.
    if (isLeader) {
        atomicStore(localCounts[0], 0);
        atomicStore(localCounts[1], 0);
    }

    // No invocation may increment until the zeroing above has completed.
    workgroupBarrier();

    // Choose the tally that matches this invocation's half of the x range and
    // bump it by one.
    uint slot = 0;
    if (sk_LocalInvocationID.x >= (WORKGROUP_SIZE / 2)) {
        slot = 1;
    }
    atomicAdd(localCounts[slot], 1);

    // Wait for every increment in the workgroup to land so the loads below
    // read the finished tallies.
    workgroupBarrier();

    // Publish this workgroup's tallies to the global counters exactly once.
    if (isLeader) {
        uint firstHalfTally = atomicLoad(localCounts[0]);
        atomicAdd(globalCounts.firstHalfCount, firstHalfTally);

        uint secondHalfTally = atomicLoad(localCounts[1]);
        atomicAdd(globalCounts.secondHalfCount, secondHalfTally);
    }
}