// Workgroup size along x.  LOCALSIZE is not defined in this chunk
// (presumably a macro supplied when the shader is compiled -- confirm).
layout(local_size_x=LOCALSIZE) in;
// Product of the two compile-time sizes LOCALSIZE and BLOCKSIZE
// (BLOCKSIZE is likewise not defined in this chunk).
const uint groupSize=LOCALSIZE*BLOCKSIZE;
// Number of consecutive globalSum elements handled by each invocation;
// set at run time, independently of the compile-time BLOCKSIZE.
uniform uint blockSize;
// Storage buffer at binding 3.  main() both reads and overwrites it, so the
// result replaces the input in place.
layout(binding=3, std430) buffer globalSumBuffer
{
uint globalSum[];
};
// One slot per invocation of the workgroup, used to exchange per-invocation
// totals between invocations.
shared uint groupSum[LOCALSIZE];
// In-place exclusive prefix sum of globalSum, computed cooperatively by the
// invocations of one workgroup.
//
// Invocation `id` owns the contiguous slice
//   globalSum[blockSize*id] .. globalSum[blockSize*id+blockSize-1]
// and the work proceeds in three phases:
//   1. every invocation totals its own slice;
//   2. the per-invocation totals are converted to an inclusive scan in
//      shared memory (groupSum);
//   3. every invocation rewrites its slice as a running sum that starts from
//      the combined total of all preceding slices.
//
// Phase 3 re-reads each element just before overwriting it, so no
// per-invocation scratch array is needed and blockSize is not limited by any
// compile-time array capacity.  All arithmetic is unsigned and wraps modulo
// 2^32.
//
// NOTE(review): only gl_LocalInvocationID is used for addressing, so this
// presumably expects a dispatch of exactly one workgroup and a buffer holding
// at least LOCALSIZE*blockSize elements -- confirm against the host code.
void main()
{
  uint id=gl_LocalInvocationID.x;
  uint dataOffset=blockSize*id;

  // Phase 1: total of this invocation's slice.
  uint sum=0u;
  for(uint i=0u; i < blockSize; ++i)
    sum += globalSum[dataOffset+i];
  groupSum[id]=sum;
  barrier();

  // Phase 2: inclusive scan of groupSum with doubling stride.  The read and
  // the write are separated by a barrier so that no invocation overwrites a
  // slot that another invocation still has to read in the same step.  Both
  // barriers sit outside the conditionals, so every invocation reaches them.
  for(uint shift=1u; shift < LOCALSIZE; shift *= 2u) {
    uint read;
    if(shift <= id)
      read=groupSum[id]+groupSum[id-shift];
    barrier();
    if(shift <= id)
      groupSum[id]=read;
    barrier();
  }

  // Phase 3: exclusive scan of the slice, seeded with the total of all
  // preceding slices (zero for the first invocation).  Each element is read
  // before it is overwritten, and no other invocation touches this slice.
  uint running=id > 0u ? groupSum[id-1u] : 0u;
  for(uint i=0u; i < blockSize; ++i) {
    uint index=dataOffset+i;
    uint value=globalSum[index];
    globalSum[index]=running;
    running += value;
  }
}