#define _SVID_SOURCE
#include <malloc.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "cuda_utils.h"

// Bins particles into per-cell lists of a 2D grid.
//
// A particle at (positionsX[i], positionsY[i]) belongs to the cell whose
// integer coordinates are the truncated position; the cell index is
// y*gridSizeX+x. Entry k of cell c is stored at
// particleLists[c + gridCellCount*k], and listSizes[c] holds the number of
// entries currently stored for that cell.
//
// positionsX, positionsY  particle coordinates, particleCount entries each
// particleLists           output lists, gridCellCount*particleReservation ints
// listSizes               per-cell counters, gridCellCount entries; they are
//                         incremented, not reset, so the caller zeroes them
// particleCount           number of particles to bin
// gridSizeX               number of cells per grid row
// gridCellCount           total number of cells in the grid
// particleReservation     maximum number of entries per cell list
//
// On the first particle that lies outside the grid, or whose cell list is
// already full, an error is printed and the function returns early; the
// particles processed so far stay binned and no counter ever exceeds
// particleReservation.
void ParticleSort(float* positionsX, float* positionsY,
                  int* particleLists, unsigned int* listSizes,
                  int particleCount, int gridSizeX,
                  int gridCellCount,int particleReservation) {

  // Number of grid rows implied by the arguments. A degenerate grid yields
  // zero rows, so every particle is rejected by the range test below.
  int gridSizeY=(gridSizeX>0 && gridCellCount>0) ? gridCellCount/gridSizeX : 0;

  // A non-positive reservation means "no room at all"; comparing in unsigned
  // avoids the signed/unsigned mismatch against the counters.
  unsigned int capacity
    =particleReservation>0 ? (unsigned int)particleReservation : 0u;

  for(int i=0;i<particleCount;i++) {

    float fx=positionsX[i];
    float fy=positionsY[i];

    // Written as a positive range test so that NaN is rejected as well.
    // The lower bound is -1 (exclusive) because truncation toward zero maps
    // (-1,0) onto cell coordinate 0.
    if (!(fx>-1.0f && fx<(float)gridSizeX &&
          fy>-1.0f && fy<(float)gridSizeY)) {
      printf("error: particle %i lies outside the grid\n",i);
      return;
    }

    int x=(int)fx;
    int y=(int)fy;
    size_t cell=(size_t)y*(size_t)gridSizeX+(size_t)x;

    // Test before incrementing so the counter never exceeds the reservation.
    unsigned int oldSize=listSizes[cell];
    if (oldSize>=capacity) {
      printf("error: overflow in grid cell (%i,%i)\n",x,y);
      return;
    }
    listSizes[cell]=oldSize+1;
    particleLists[cell+(size_t)gridCellCount*oldSize]=i;
  }
}

// Checks the per-cell particle lists against the particle positions and
// reports the outcome on stderr.
//
// Two checks are made, in this order:
//   1. the per-cell counters in listSizes add up to particleCount;
//   2. every particle i appears in the list of the cell given by its
//      truncated position, where cell (x,y) has index y*gridSizeX+x and its
//      entry j is stored at particleLists[cell + gridCellCount*j].
//
// positionsX, positionsY  particle coordinates, particleCount entries each
// particleLists           per-cell lists to verify
// listSizes               per-cell list lengths, gridSizeX*gridSizeY entries
// particleCount           number of particles
// gridSizeX, gridSizeY    grid dimensions in cells
//
// The first failed check prints an error and ends the validation, so the
// success message is printed only when every check has passed.
void validateResults(float* positionsX, float* positionsY,
                     int* particleLists, unsigned int* listSizes,
                     int particleCount, int gridSizeX,int gridSizeY) {

  int gridCellCount = gridSizeX*gridSizeY;

  int sortedParticleCount=0;
  for(int i=0;i<gridCellCount;i++) {
    sortedParticleCount+=listSizes[i];
  }
  if (sortedParticleCount!=particleCount) {
    fprintf(stderr,"error: particle count is wrong: actual = %i, "
            "expected = %i\n",
            sortedParticleCount,particleCount);
    // Stop here: the result is already known to be wrong, so it must not be
    // reported as correct further down.
    return;
  }

  for(int i=0;i<particleCount;i++) {

    float fx = positionsX[i];
    float fy = positionsY[i];

    // Positive range test so that NaN is rejected too; the lower bound is -1
    // (exclusive) because truncation toward zero maps (-1,0) onto 0.
    if (!(fx>-1.0f && fx<(float)gridSizeX &&
          fy>-1.0f && fy<(float)gridSizeY)) {
      fprintf(stderr,"error: particle %i lies outside the grid.\n",i);
      return;
    }

    int x = (int)fx;
    int y = (int)fy;
    size_t cell = (size_t)y*(size_t)gridSizeX+(size_t)x;
    unsigned int listSize = listSizes[cell];

    bool particleFound = false;
    for(unsigned int j=0;j<listSize;j++) {
      if (particleLists[cell+(size_t)gridCellCount*j]==i) {
        particleFound = true;
        break;  // one hit is enough
      }
    }
    if (!particleFound) {
      fprintf(stderr,"error: particle %i is expected to be in cell (%i,%i) "
              "but was not found.\n",i,x,y);
      return;
    }
  }
  fprintf(stderr,"computation result is correct.\n");
}

// Benchmark driver: scatters random particles over a 400x400 grid, times
// ParticleSort on them, prints throughput figures to stderr and finally
// checks the resulting lists with validateResults.
//
// Returns 0 after a completed run and EXIT_FAILURE when the working buffers
// cannot be allocated.
int main() {

  srand48(time(NULL));

  int gridSizeX=400;
  int gridSizeY=400;
  int particleCount=10000000;

  int gridCellCount=gridSizeX*gridSizeY;

  // rounded-up mean number of particles per cell
  int averageParticlesPerCell
    =(particleCount+gridCellCount-1)/gridCellCount;

  // maximum length of per-cell list of particles
  int particleReservation=(int)(averageParticlesPerCell*2.5+10);

  // Buffer sizes are computed in size_t so that the products cannot wrap
  // around in int arithmetic when the constants above are enlarged.
  size_t positionBytes=(size_t)particleCount*sizeof(float);
  size_t listBytes
    =(size_t)gridCellCount*(size_t)particleReservation*sizeof(int);
  size_t countBytes=(size_t)gridCellCount*sizeof(unsigned int);

  float* particlePositionsX_cpu = (float*)malloc(positionBytes);
  float* particlePositionsY_cpu = (float*)malloc(positionBytes);
  // per-cell lists of particles
  int*  particleLists_cpu = (int*) malloc(listBytes);
  // particle list sizes
  unsigned int*  cellParticleCounts_cpu = (unsigned int*) malloc(countBytes);

  if (particlePositionsX_cpu==NULL || particlePositionsY_cpu==NULL ||
      particleLists_cpu==NULL || cellParticleCounts_cpu==NULL) {
    fprintf(stderr,"error: out of memory\n");
    // free(NULL) is a no-op, so all four can be released unconditionally
    free(particleLists_cpu);
    free(cellParticleCounts_cpu);
    free(particlePositionsX_cpu);
    free(particlePositionsY_cpu);
    return EXIT_FAILURE;
  }

  // initialize particle coordinates //
  // drand48() yields values in [0,1); scaling by slightly less than the grid
  // size keeps each coordinate below the grid size, so its truncation is a
  // valid cell coordinate.
  for(int i=0;i<particleCount;i++) {
    particlePositionsX_cpu[i]=drand48()*(gridSizeX-0.001);
    particlePositionsY_cpu[i]=drand48()*(gridSizeY-0.001);
  }

  // initialize lists //
  for(int i=0;i<gridCellCount;i++) {
    cellParticleCounts_cpu[i]=0;
  }


  fprintf(stderr,"particle reservation=%i\n",particleReservation);

  Timer timer;
  initTimer(&timer);

  ParticleSort(particlePositionsX_cpu, particlePositionsY_cpu,
               particleLists_cpu, cellParticleCounts_cpu, particleCount,
               gridSizeX, gridCellCount, particleReservation);

  double duration=getTimer(&timer);

  // NOTE(review): the factor 20 is presumably the number of bytes moved per
  // particle (two float reads, counter read and write, one list write) --
  // confirm.
  fprintf(stderr,"particles=%i cells=%i duration=%e particles per second=%e "
          "memory bandwidth=%e\n",
          particleCount,gridCellCount,duration,
          particleCount/duration,20.*particleCount/duration);

  validateResults(particlePositionsX_cpu, particlePositionsY_cpu,
                  particleLists_cpu, cellParticleCounts_cpu,
                  particleCount, gridSizeX, gridSizeY);

  free(particleLists_cpu);
  free(cellParticleCounts_cpu);
  free(particlePositionsX_cpu);
  free(particlePositionsY_cpu);

  return 0;
}
