// Compute-shader interface for a per-work-group partial sum of count[].
// LOCALSIZE and BLOCKSIZE are not defined in this text; presumably they are
// supplied as preprocessor definitions by the host before compilation
// (TODO confirm).

// One-dimensional work group containing LOCALSIZE invocations.
layout(local_size_x=LOCALSIZE) in;

// Number of count[] entries consumed by one work group: each of the
// LOCALSIZE invocations accumulates BLOCKSIZE entries in main().
const uint groupSize=LOCALSIZE*BLOCKSIZE;

// NOTE(review): not referenced anywhere in the visible shader code.
// Presumably the number of valid entries in count[] -- confirm against the
// host code before relying on it.
uniform uint elements;

// Input storage buffer (binding 2).
layout(binding=2, std430) buffer countBuffer
{
 // Not referenced in the visible shader code; it occupies the first uint of
 // the buffer, so count[] starts after it.
 uint maxSize;
 // Values to be summed; main() reads groupSize consecutive entries per
 // work group.
 uint count[];
};

// Output storage buffer (binding 3).
layout(binding=3, std430) buffer globalSumBuffer
{
 // main() stores one total per work group, indexed by gl_WorkGroupID.x.
 uint globalSum[];
};

// Work-group shared scratch space for the reduction: one slot per invocation.
shared uint groupSum[LOCALSIZE];

// Sums groupSize consecutive entries of count[] per work group and stores the
// total in globalSum[gl_WorkGroupID.x].
//
// Each invocation first accumulates every LOCALSIZE-th entry of its group's
// slice, then the per-invocation partial sums are combined in shared memory
// by repeated halving.
//
// Assumptions visible in the code:
//  - No bounds check is performed, so count[] must hold groupSize entries for
//    every dispatched work group.
//  - The halving reduction only covers every slot of groupSum when LOCALSIZE
//    is a power of two.
void main()
{
 uint lane=gl_LocalInvocationID.x;
 uint group=gl_WorkGroupID.x;

 // Slice of count[] owned by this work group, offset by this invocation's
 // lane so that neighbouring invocations read neighbouring entries.
 uint first=group*groupSize+lane;
 uint last=first+groupSize;

 // Strided accumulation of this invocation's share of the slice.
 uint partial=0u;
 uint index=first;
 while(index < last) {
   partial += count[index];
   index += LOCALSIZE;
 }

 groupSum[lane]=partial;
 // Every partial sum must be visible before the reduction reads them.
 barrier();

 // Pairwise reduction: at each step the lower half of the active range
 // absorbs the upper half, until only groupSum[0] remains.
 uint stride=LOCALSIZE/2;
 while(stride > 0u) {
   if(lane < stride)
     groupSum[lane] += groupSum[lane+stride];
   // Executed by all invocations (outside the if) to keep control flow
   // uniform and to order this step's writes before the next step's reads.
   barrier();
   stride >>= 1u;
 }

 // A single invocation publishes the work group's total.
 if(lane == 0u)
   globalSum[group]=groupSum[0u];
}