#include "cuda_utils.h"

#include <stdio.h>

// element type of the field buffers (host and device) //
typedef float FieldType;

// thread-block dimensions; used both for the kernel launch configuration //
// in main() and inside FdtdStep to compute each thread's grid coordinates, //
// so the launch must use exactly blockSizeX x blockSizeY threads per block //
const int blockSizeX=32;
const int blockSizeY=6;

// One five-point stencil step on a row-major size_x * size_y field. //
//
// Each thread updates one interior cell:
//   out = 0.5*in + 0.125*(left + right + up + down)
// Cells on the outer border (first/last row, first/last column) are never
// written, so dataOut keeps whatever values it already holds there.
//
// Launch layout: 2D blocks of exactly blockSizeX x blockSizeY threads
// (the index computation uses these compile-time constants, not blockDim).
// The x coordinate starts at column 0 (that thread exits immediately),
// the y coordinate starts at row 1.
//
// dataIn is only read; dataIn and dataOut must not overlap.
void __global__ FdtdStep(int size_x,int size_y,
                         FieldType* __restrict__ dataOut,
                         const FieldType* __restrict__ dataIn) {

  const int myX=blockIdx.x*blockSizeX+threadIdx.x;
  const int myY=blockIdx.y*blockSizeY+threadIdx.y+1;

  // skip border columns, the last row and any threads past the field //
  if (myX==0 || myX>=size_x-1 || myY>=size_y-1) {
    return;
  }

  // flat index in size_t: myY*size_x overflows int for fields > 2^31 cells //
  const size_t rowStride=(size_t)size_x;
  const size_t myIndex=(size_t)myY*rowStride+(size_t)myX;

  // myY>=1 guarantees myIndex>=rowStride, so the subtraction cannot wrap //
  dataOut[myIndex] = 0.5f*dataIn[myIndex] +
    0.125f * (   dataIn[myIndex-1]         + dataIn[myIndex+1]
               + dataIn[myIndex-rowStride] + dataIn[myIndex+rowStride] );
}

// Benchmark driver for FdtdStep. //
//
// Allocates two size_x * size_y fields in pinned host memory and on the
// device, fills them with 1 (plus a single 2 in the centre of field A),
// runs `loops` iterations of two ping-pong kernel steps (A->B, then B->A),
// prints timing / throughput to stderr, copies the fields back and prints
// a small window of field A around the centre.
int main() {

  // define field size //
  const int size_x = 2048;
  const int size_y = 4096;
  const int loops = 100;
  // every loop iteration launches two kernel steps (a->b and b->a) //
  const int steps = 2*loops;

  dim3 threads(blockSizeX,blockSizeY);
  // x: kernel threads start at column 0 and must reach column size_x-2, //
  //    i.e. size_x-1 threads -> ceil((size_x-1)/blockSizeX) blocks      //
  // y: kernel threads start at row 1 and must reach row size_y-2,       //
  //    i.e. size_y-2 threads -> ceil((size_y-2)/blockSizeY) blocks      //
  dim3 grid((size_x+blockSizeX-2)/blockSizeX,(size_y+blockSizeY-3)/blockSizeY);

  FieldType* dataA;
  FieldType* dataB;
  FieldType* dataDeviceA;
  FieldType* dataDeviceB;

  // byte count kept in size_t so large fields do not truncate //
  const size_t dataSize=sizeof(FieldType)*size_x*size_y;

  // allocate memory on cpu //
  cudaVerify(cudaMallocHost((void**)&dataA,dataSize));
  cudaVerify(cudaMallocHost((void**)&dataB,dataSize));
  
  // initialize fields //
  for(int i=0;i<size_y;i++) {
    for(int j=0;j<size_x;j++) {
      const size_t idx=(size_t)i*size_x+j;
      dataA[idx]=1.f;
      dataB[idx]=1.f;
    }
  }
  // single disturbance in the centre of field A //
  dataA[(size_t)(size_y/2)*size_x+(size_x/2)]=2.f;

  // allocate memory on gpu //
  cudaVerify(cudaMalloc((void**)&dataDeviceA,dataSize));
  cudaVerify(cudaMalloc((void**)&dataDeviceB,dataSize));

  // copy input data from cpu to gpu //
  cudaVerify(cudaMemcpy(dataDeviceA,dataA,dataSize,
                       cudaMemcpyHostToDevice));
  cudaVerify(cudaMemcpy(dataDeviceB,dataB,dataSize,
                       cudaMemcpyHostToDevice));

  Timer timer;
  initTimer(&timer);

  // computation main loop //
  for(int i=0;i<loops;i++) {

      // call kernel a->b //
      cudaVerifyKernel((FdtdStep<<< grid,threads >>>(size_x,size_y,
                                                   dataDeviceB,dataDeviceA)));

      // call kernel b->a //
      cudaVerifyKernel((FdtdStep<<< grid,threads >>>(size_x,size_y,
                                                   dataDeviceA,dataDeviceB)));
  }
  // wait for all launched kernels before reading the timer //
  cudaVerify(cudaDeviceSynchronize());

  double duration = getTimer(&timer);
  // 6 floating point operations per interior cell and kernel step //
  fprintf(stderr,"size_x=%i size_y=%i duration/step=%e s, "
          "performance=%e flops\n",
          size_x,size_y,duration/steps,
          6.*steps*(size_x-2)*(size_y-2)/duration);

  // copy results from gpu to cpu //
  cudaVerify(cudaMemcpy(dataA,dataDeviceA,
                       dataSize,cudaMemcpyDeviceToHost));
  cudaVerify(cudaMemcpy(dataB,dataDeviceB,
                       dataSize,cudaMemcpyDeviceToHost));
  
  // print an 11 x 6 window of field A around the centre //
  for(int i=size_y/2-5;i<size_y/2+6;i++) {
    for(int j=size_x/2-3;j<size_x/2+3;j++) {
      fprintf(stderr,"%e ",dataA[(size_t)i*size_x+j]);
    }
    fprintf(stderr,"\n");
  }

  // free cpu memory //
  cudaVerify(cudaFreeHost(dataA));
  cudaVerify(cudaFreeHost(dataB));

  // free gpu memory //
  cudaVerify(cudaFree(dataDeviceA));
  cudaVerify(cudaFree(dataDeviceB));

  // exit //
  return 0;
}
