#include "cuda_utils.h"

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__<110
#error compute capability 1.1 is required for atomic operations
#endif

/*
 * Bins particles into per-cell lists of a 2D grid.
 *
 * Each particle i is assigned to the cell (x,y) obtained by truncating its
 * position to integers. A slot in that cell's list is claimed with an atomic
 * increment of the cell's counter, and the particle index is stored there.
 *
 * Parameters:
 *   positionsX, positionsY  particle coordinates, particleCount entries each
 *   particleLists           output; the entry for slot s of cell c is stored
 *                           at index c + gridCellCount*s
 *   listSizes               per-cell counters, incremented atomically; the
 *                           caller is expected to zero them before launch
 *   particleCount           number of particles
 *   gridSizeX               grid width, used to flatten (x,y) to y*gridSizeX+x
 *   gridCellCount           total number of cells
 *   particleReservation     number of slots reserved per cell
 *
 * Launch layout: only the x components of the thread/block indices are used;
 * the loop strides by the total thread count, so any 1D launch size covers
 * all particles.
 *
 * Overflow: a particle that finds its cell full is NOT stored and the thread
 * stops processing. The cell counter has already been incremented at that
 * point, so listSizes may end up larger than particleReservation.
 *
 * NOTE(review): positions are assumed to lie inside the grid; no bounds
 * check is done on the computed cell index.
 */
__global__ void ParticleSort(float* positionsX, float* positionsY,
                             int* particleLists, unsigned int* listSizes,
                             int particleCount, int gridSizeX,
                             int gridCellCount,int particleReservation) {

  const int stride=blockDim.x*gridDim.x;

  for(int i=threadIdx.x+blockIdx.x*blockDim.x;i<particleCount;i+=stride) {

    int x=(int)positionsX[i];
    int y=(int)positionsY[i];
    int cell=y*gridSizeX+x;

    // claim the next free slot of this cell; oldSize is the slot index
    unsigned int oldSize=atomicAdd(&(listSizes[cell]),1u);

    // Every slot index at or beyond the reservation is an overflow, not only
    // the first one: writing it would land outside the cell's reserved slots.
    if (oldSize>=(unsigned int)particleReservation) {
      // Note: printf in device code works only with cc>=2.0 //
      #if (__CUDA_ARCH__>=200)
      // report each overflowing cell only once
      if (oldSize==(unsigned int)particleReservation) {
        printf("error: overflow in grid cell (%i,%i)\n",x,y);
      }
      #endif
      return;
    }
    particleLists[cell+gridCellCount*oldSize]=i;
  }
}

/*
 * Host-side check of the per-cell particle lists.
 *
 * Two checks are performed, and diagnostics go to stderr:
 *   1. the list sizes of all cells must add up to particleCount;
 *   2. every particle index must appear in the list of the cell that its
 *      truncated position maps to.
 * The success message is printed only if both checks pass. The function
 * returns at the first particle that is missing from its cell.
 *
 * Parameters:
 *   positionsX, positionsY  particle coordinates, particleCount entries each
 *   particleLists           entry for slot s of cell c is read from index
 *                           c + gridCellCount*s
 *   listSizes               number of entries in each cell's list
 *   particleCount           number of particles
 *   gridSizeX, gridSizeY    grid dimensions
 *
 * NOTE(review): positions are assumed to lie inside the grid and listSizes
 * entries are assumed not to exceed the per-cell capacity of particleLists;
 * neither is checked here.
 */
void validateResults(float* positionsX, float* positionsY,
                     int* particleLists, unsigned int* listSizes,
                     int particleCount, int gridSizeX,int gridSizeY) {

  int gridCellCount = gridSizeX*gridSizeY;
  bool resultIsCorrect = true;

  int sortedParticleCount=0;
  for(int i=0;i<gridCellCount;i++) {
    sortedParticleCount+=listSizes[i];
  }
  if (sortedParticleCount!=particleCount) {
    fprintf(stderr,"error: particle count is wrong: actual = %i, "
            "expected = %i\n",
            sortedParticleCount,particleCount);
    // keep checking individual particles, but never report success
    resultIsCorrect = false;
  }

  for(int i=0;i<particleCount;i++) {

    int x = (int)positionsX[i];
    int y = (int)positionsY[i];
    int cell = y*gridSizeX+x;
    int listSize = listSizes[cell];
    bool particleFound = false;
    for(int j=0;j<listSize;j++) {
      if (particleLists[cell+gridCellCount*j]==i) {
        particleFound = true;
        break;  // no need to scan the rest of the list
      }
    }
    if (!particleFound) {
      fprintf(stderr,"error: particle %i is expected to be in cell (%i,%i) "
              "but was not found.\n",i,x,y);
      return;
    }
  }
  if (resultIsCorrect) {
    fprintf(stderr,"computation result is correct.\n");
  }
}

/*
 * Benchmark driver: generates random particle positions on a 400x400 grid,
 * bins them into per-cell lists on the GPU with ParticleSort, reports the
 * measured duration and derived rates, and validates the result on the host.
 *
 * Returns 0 on normal completion and 1 if a host allocation fails.
 */
int main() {

  srand48(time(NULL));

  int gridSizeX=400;
  int gridSizeY=400;
  int particleCount=10000000;
  int gridCellCount=gridSizeX*gridSizeY;

  // average rounded up
  int averageParticlesPerCell
    =(particleCount+gridCellCount-1)/gridCellCount;

  // maximum length of per-cell list of particles
  int particleReservation=(int)(averageParticlesPerCell*2.5+10);

  // Buffer sizes in bytes, computed once in size_t so that the products
  // cannot overflow int and every allocation/copy uses the same value.
  size_t positionBytes=(size_t)particleCount*sizeof(float);
  size_t countBytes=(size_t)gridCellCount*sizeof(unsigned int);
  size_t listBytes=(size_t)gridCellCount*particleReservation*sizeof(int);

  float* particlePositionsX_cpu = (float*)malloc(positionBytes);
  float* particlePositionsY_cpu = (float*)malloc(positionBytes);
  // per-cell lists of particles
  int*  particleLists_cpu = (int*)malloc(listBytes);
  // particle list sizes
  unsigned int*  cellParticleCounts_cpu = (unsigned int*)malloc(countBytes);

  if (!particlePositionsX_cpu || !particlePositionsY_cpu ||
      !particleLists_cpu || !cellParticleCounts_cpu) {
    fprintf(stderr,"error: host memory allocation failed\n");
    // free(NULL) is a no-op, so releasing all four is safe
    free(particleLists_cpu);
    free(cellParticleCounts_cpu);
    free(particlePositionsX_cpu);
    free(particlePositionsY_cpu);
    return 1;
  }

  // initialize particle coordinates //
  // scaling by (size-0.001) keeps every coordinate strictly below the grid
  // size, so the truncated cell index stays inside the grid
  for(int i=0;i<particleCount;i++) {
    particlePositionsX_cpu[i]=drand48()*(gridSizeX-0.001);
    particlePositionsY_cpu[i]=drand48()*(gridSizeY-0.001);
  }

  // initialize lists //
  for(int i=0;i<gridCellCount;i++) {
    cellParticleCounts_cpu[i]=0;
  }

  float*        particlePositionsX_gpu;
  float*        particlePositionsY_gpu;
  unsigned int* cellParticleCounts_gpu;
  int*          particleLists_gpu;

  cudaVerify(cudaMalloc((void**)&particlePositionsX_gpu,positionBytes));
  cudaVerify(cudaMalloc((void**)&particlePositionsY_gpu,positionBytes));
  cudaVerify(cudaMalloc((void**)&cellParticleCounts_gpu,countBytes));
  cudaVerify(cudaMalloc((void**)&particleLists_gpu,listBytes));

  cudaVerify(cudaMemcpy(particlePositionsX_gpu,particlePositionsX_cpu,
                       positionBytes,cudaMemcpyHostToDevice));
  cudaVerify(cudaMemcpy(particlePositionsY_gpu,particlePositionsY_cpu,
                       positionBytes,cudaMemcpyHostToDevice));
  // the zeroed counters must be on the device before the kernel runs
  cudaVerify(cudaMemcpy(cellParticleCounts_gpu,cellParticleCounts_cpu,
                       countBytes,cudaMemcpyHostToDevice));

  dim3 blockSize(512);
  dim3 gridSize(256);

  fprintf(stderr,"particle reservation=%i\n",particleReservation);

  Timer timer;
  initTimer(&timer);

  cudaVerifyKernel((ParticleSort<<<gridSize,blockSize>>>
                    (particlePositionsX_gpu, particlePositionsY_gpu,
                     particleLists_gpu, cellParticleCounts_gpu, particleCount,
                     gridSizeX, gridCellCount, particleReservation)));

  // wait for the kernel to finish so the timer covers its execution;
  // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize
  cudaVerify(cudaDeviceSynchronize());

  double duration=getTimer(&timer);

  cudaVerify(cudaMemcpy(particleLists_cpu, particleLists_gpu,
                       listBytes,cudaMemcpyDeviceToHost));
  cudaVerify(cudaMemcpy(cellParticleCounts_cpu, cellParticleCounts_gpu,
                       countBytes,cudaMemcpyDeviceToHost));

  // NOTE(review): the factor 20 is presumably the bytes of memory traffic
  // per particle -- confirm
  fprintf(stderr,"particles=%i cells=%i duration=%e particles per second=%e "
          "memory bandwidth=%e\n",
          particleCount,gridCellCount,duration,
          particleCount/duration,20.*particleCount/duration);

  validateResults(particlePositionsX_cpu, particlePositionsY_cpu,
                  particleLists_cpu, cellParticleCounts_cpu,
                  particleCount, gridSizeX, gridSizeY);

  cudaVerify(cudaFree(particlePositionsX_gpu));
  cudaVerify(cudaFree(particlePositionsY_gpu));
  cudaVerify(cudaFree(cellParticleCounts_gpu));
  cudaVerify(cudaFree(particleLists_gpu));

  free(particleLists_cpu);
  free(cellParticleCounts_cpu);
  free(particlePositionsX_cpu);
  free(particlePositionsY_cpu);

  return 0;
}
