// Parallel prefix sum computed by a single 256-thread workgroup.
//
// Every invocation owns two consecutive elements: it copies them from
// `in_data` into workgroup-shared scratch storage, takes part in
// log2(SIZE) + 1 combine passes separated by barriers, and finally copies
// its two elements of the scratch array to `out_data`.

layout(local_size_x = 256) in;
const int SIZE = 256;

// Source values; read-only.
layout(set=0, binding=0) readonly buffer inputs {
    float[] in_data;
};

// Destination for the scanned values; write-only.
layout(set=0, binding=1) writeonly buffer outputs {
    float[] out_data;
};

// Scratch array shared by the workgroup: two slots per invocation.
workgroup float[SIZE * 2] shared_data;

// Writes `value` into slot `i` of the shared scratch array.
//
// Kept as a separate `noinline` function on purpose: it exercises access to
// workgroup-shared variables from inside a user-defined function.
noinline void store(uint i, float value) {
    shared_data[i] = value;
}

void main() {
    // NOTE(review): the shared array is indexed with the *global* invocation
    // id, which only stays inside its SIZE * 2 slots while the id is below
    // SIZE -- presumably this is dispatched as a single workgroup; confirm
    // against the caller.
    uint id = sk_GlobalInvocationID.x;

    // The two slots owned by this invocation.
    uint first = id * 2;
    uint second = id * 2 + 1;

    // Stage this invocation's pair of inputs in shared memory.
    shared_data[first] = in_data[first];
    shared_data[second] = in_data[second];

    // All inputs must be staged before any invocation starts combining.
    workgroupBarrier();

    const uint steps = uint(log2(float(SIZE))) + 1;
    for (uint step = 0; step < steps; ++step) {
        // Low `step` bits of the id select a position inside the upper half
        // of a block of 2^(step + 1) slots.
        uint low_bits = (1 << step) - 1;

        // Last slot of the lower half of this invocation's block.
        uint src = ((id >> step) << (step + 1)) + low_bits;

        // Slot in the upper half of the same block that this invocation
        // updates during the current pass.
        uint dst = src + 1 + (id & low_bits);

        // Add the lower-half value into the upper-half slot.
        store(dst, shared_data[dst] + shared_data[src]);

        // The next pass reads slots written by other invocations in this one.
        workgroupBarrier();
    }

    // Publish this invocation's two results.
    out_data[first] = shared_data[first];
    out_data[second] = shared_data[second];
}