layout(local_size_x=LOCALSIZE) in;
const uint groupSize=LOCALSIZE*BLOCKSIZE;
layout(binding=3, std430) buffer globalSumBuffer
{
uint globalSum[];
};
layout(binding=8, std430) buffer feedbackBuffer
{
uint maxSize;
uint fragments;
};
shared uint shuffle[groupSize+LOCALSIZE-1u];
shared uint groupSum[LOCALSIZE+1u];
void main()
{
uint id=gl_LocalInvocationID.x;
// avoid bank conflicts and coalesce global memory accesses
uint shuffleOffset=id/BLOCKSIZE+id;
const uint stride=LOCALSIZE/BLOCKSIZE+LOCALSIZE;
for(uint i=0u; i < BLOCKSIZE; i++)
shuffle[shuffleOffset+i*stride]=globalSum[id+i*LOCALSIZE];
barrier();
uint Offset=id*BLOCKSIZE+id;
uint stop=Offset+BLOCKSIZE;
uint sum=0u;
for(uint i=Offset; i < stop; ++i) {
uint value=shuffle[i];
shuffle[i]=sum;
sum += value;
}
if(id == 0u)
groupSum[0u]=0u;
groupSum[id+1u]=sum;
barrier();
// Apply Hillis-Steele algorithm over all sums in workgroup
for(uint shift=1u; shift < LOCALSIZE; shift *= 2u) {
uint read;
if(shift <= id)
read=groupSum[id]+groupSum[id-shift];
barrier();
if(shift <= id)
groupSum[id]=read;
barrier();
}
for(uint i=0u; i < BLOCKSIZE; i++)
globalSum[id+i*LOCALSIZE]=
shuffle[shuffleOffset+i*stride]+groupSum[(i*LOCALSIZE+id)/BLOCKSIZE];
}