#include "cuda_utils.h"

#include <stdio.h>

// Element type of the simulated field (host and device buffers). //
typedef float FieldType;

// Thread-block dimensions (threads per block in x and y) from which main() //
// builds the kernel launch configuration: 32*6 = 192 threads per block.    //
const int block_size_x=32;
const int block_size_y=6;

// One stencil step on a 2D field stored line by line in a 1D array          //
// (element (x,y) lives at index y*size_x+x).                                //
//                                                                           //
// Each thread updates one interior point:                                   //
//   out(x,y) = 0.5*in(x,y)                                                  //
//            + 0.125*( in(x-1,y) + in(x+1,y) + in(x,y-1) + in(x,y+1) )      //
//                                                                           //
// Launch layout: 2D grid of 2D blocks of any shape. Thread coordinates are  //
// taken from blockDim, so the kernel does not depend on a particular block  //
// size. The grid must cover x in [1,size_x-2] and y in [1,size_y-2]; the    //
// y coordinate starts at 1, surplus threads return without touching memory. //
//                                                                           //
// Lines y=0 and y=size_y-1 and columns x=0 and x=size_x-1 are only read,    //
// never written. dataOut and dataIn must be different buffers, because      //
// neighbouring points are read while other threads write their results.     //
//                                                                           //
// TODO: if peer-to-peer access is used,                                     //
//       modify kernel to read data from other device on first/last line     //
void __global__ FdtdStep(int size_x,int size_y,
                         FieldType* dataOut,const FieldType* dataIn) {

  const int myX=blockIdx.x*blockDim.x+threadIdx.x;
  const int myY=blockIdx.y*blockDim.y+threadIdx.y+1;

  // skip the boundary columns/lines and any thread outside the field //
  if (myX==0 || myX>=size_x-1 || myY>=size_y-1) {
    return;
  }

  // flat index in size_t so that myY*size_x cannot overflow int //
  const size_t lineStride=(size_t)size_x;
  const size_t myIndex=lineStride*(size_t)myY+(size_t)myX;

  dataOut[myIndex] = 0.5f*dataIn[myIndex] +
    0.125f * (   dataIn[myIndex-1]          + dataIn[myIndex+1]
               + dataIn[myIndex-lineStride] + dataIn[myIndex+lineStride] );
}

// Exchanges the halo lines of one field between GPU 0 and GPU 1.           //
//                                                                          //
// field[0]/field[1] are the device buffers of that field on GPU 0/GPU 1,   //
// each holding size_y+1 lines of size_x elements (see the layout notes in  //
// main). size_y is the per-GPU line count (size_y_gpu in main).            //
//                                                                          //
// According to the CUDA runtime documentation cudaMemcpyPeer is serialized //
// with respect to all pending and future asynchronous work on the source   //
// and destination device, so the previously launched kernels finish before //
// the copy starts and the following kernels wait for it; no additional     //
// synchronization is needed. It does not require peer access to be enabled.//
static void ExchangeHalo(FieldType* const field[2],int size_x,int size_y) {
  const size_t lineElems = (size_t)size_x;
  const size_t lineBytes = sizeof(FieldType)*lineElems;

  // GPU 1, line y=1 (first computed line)  ->  GPU 0, halo line y=size_y //
  cudaVerify(cudaMemcpyPeer(field[0]+lineElems*size_y,0,
                            field[1]+lineElems,1,
                            lineBytes));
  // GPU 0, line y=size_y-1 (last computed line)  ->  GPU 1, halo line y=0 //
  cudaVerify(cudaMemcpyPeer(field[1],1,
                            field[0]+lineElems*(size_y-1),0,
                            lineBytes));
}

// Runs the stencil on a field split in y direction across two GPUs,       //
// prints timing to stderr and dumps the values around the domain split.   //
// Returns 0 on success and 1 if fewer than two CUDA devices are present.  //
int main() {

  // define field size //
  const int size_x_gpu = 2048;
  const int size_y_gpu = 2048;
  const int loops = 100;

  const int size_x_cpu = size_x_gpu;
  const int size_y_cpu = size_y_gpu*2;

  // the decomposition below is hard-wired to two devices; fail early with //
  // a clear message instead of on the first cudaSetDevice(1)              //
  int deviceCount = 0;
  cudaVerify(cudaGetDeviceCount(&deviceCount));
  if (deviceCount<2) {
    fprintf(stderr,"this program needs 2 CUDA devices, found %i\n",
            deviceCount);
    return 1;
  }

  // launch configuration: the grid covers x=0..size_x_gpu-2 and (because  //
  // the kernel adds 1 to y) y=1..size_y_gpu-1, rounded up to whole blocks //
  dim3 threads(block_size_x,block_size_y);
  dim3 grid((size_x_gpu+block_size_x-2)/block_size_x,
            (size_y_gpu+block_size_y-2)/block_size_y);

  FieldType* dataA;
  FieldType* dataB;
  FieldType* dataDeviceA[2];
  FieldType* dataDeviceB[2];

  // byte sizes are kept in size_t (the type of sizeof and of the CUDA API //
  // size parameters) to avoid narrowing to int                            //
  const size_t dataSizeHost =
    sizeof(FieldType)*size_x_cpu*size_y_cpu;
  const size_t dataSizeDevice =
    sizeof(FieldType)*size_x_gpu*size_y_gpu;
  const size_t dataSizeDeviceWithHalo =
    sizeof(FieldType)*size_x_gpu*(size_y_gpu+1);

  // allocate (pinned) memory on cpu //
  cudaVerify(cudaMallocHost((void**)&dataA,dataSizeHost));
  cudaVerify(cudaMallocHost((void**)&dataB,dataSizeHost));

  // Note: 2D field element a(x,y) is stored in 1D-array //
  //       as dataA(y*sizeX+x)                           //

  // initialize fields: 1 everywhere, plus a single disturbance in field A //
  // on the first line owned by GPU 1, so that it only spreads into the    //
  // GPU 0 half if the halo exchange works                                 //
  for(int i=0;i<size_y_cpu;i++) {
    for(int j=0;j<size_x_cpu;j++) {
      dataA[i*size_x_cpu+j]=1.f;
      dataB[i*size_x_cpu+j]=1.f;
    }
  }
  dataA[size_y_gpu*size_x_cpu+(size_x_cpu/2)]=2.f;

  for(int i=0;i<2;i++) {

    cudaVerify(cudaSetDevice(i));

    // TODO: if peer-to-peer access is used, uncomment following line //
    // cudaVerify(cudaDeviceEnablePeerAccess(1-i,0));

    // allocate memory on gpu //
    cudaVerify(cudaMalloc((void**)&dataDeviceA[i],dataSizeDeviceWithHalo));
    cudaVerify(cudaMalloc((void**)&dataDeviceB[i],dataSizeDeviceWithHalo));

    // copy input data from cpu to gpus: GPU 0 gets global lines           //
    // 0..size_y_gpu, GPU 1 gets global lines size_y_gpu-1..size_y_cpu-1,  //
    // i.e. each GPU receives its own lines plus one halo line             //
    const size_t hostOffset = (size_t)i*size_x_gpu*(size_y_gpu-1);
    cudaVerify(cudaMemcpy(dataDeviceA[i],dataA+hostOffset,
                          dataSizeDeviceWithHalo,cudaMemcpyHostToDevice));
    cudaVerify(cudaMemcpy(dataDeviceB[i],dataB+hostOffset,
                          dataSizeDeviceWithHalo,cudaMemcpyHostToDevice));
  }

  // Notes on halo exchange:                                             //
  // Domain decomposition is done in y direction (slow index).           //
  // The total field size in y direction is 2*size_y_gpu  .              //
  // Thus the computation on the field is done as follows:               //
  // y=0:            boundary                                            //
  // y=1:            computed by gpu 0                                   //
  // ...             ...                                                 //
  // y=size_y_gpu-1: computed by gpu 0                                   //
  // y=size_y_gpu:   computed by gpu 1                                   //
  // ...             ...                                                 //
  // y=size_y_cpu-2: computed by gpu 1                                   //
  // y=size_y_cpu-1: boundary                                            //
  // The stored field on GPU 0 is used as follows:                       //
  // y=0:            boundary                                            //
  // y=1:            computed by gpu 0                                   //
  // ...             ...                                                 //
  // y=size_y_gpu-1: computed by gpu 0                                   //
  // y=size_y_gpu:   halo to be copied from line y=1 on GPU 1            //
  // The stored field on GPU 1 is used as follows:                       //
  // y=0:            halo to be copied from line y=size_y_gpu-1 on GPU 0 //
  // y=1:            computed by gpu 1                                   //
  // ...             ...                                                 //
  // y=size_y_gpu-1: computed by gpu 1                                   //
  // y=size_y_gpu:   boundary                                            //

  Timer timer;
  initTimer(&timer);

  // computation main loop: every iteration performs two steps (a->b and  //
  // b->a); after each step the freshly written field gets its halo lines //
  // refreshed from the neighbouring GPU                                  //
  for(int i=0;i<loops;i++) {

    for(int j=0;j<2;j++) {
      cudaVerify(cudaSetDevice(j));
      // call kernel a->b //
      cudaVerifyKernel((FdtdStep<<<grid,threads>>>(size_x_gpu,size_y_gpu+1,
                                                   dataDeviceB[j],dataDeviceA[j])));
    }

    // exchange halo lines of field B //
    // Note: in case of using peer-to-peer access from kernel memcpy is not //
    //       needed here, but do not forget correct synchronization!        //
    ExchangeHalo(dataDeviceB,size_x_gpu,size_y_gpu);

    for(int j=0;j<2;j++) {
      cudaVerify(cudaSetDevice(j));
      // call kernel b->a //
      cudaVerifyKernel((FdtdStep<<< grid,threads >>>(size_x_gpu,size_y_gpu+1,
                                                     dataDeviceA[j],dataDeviceB[j])));
    }

    // exchange halo lines of field A //
    ExchangeHalo(dataDeviceA,size_x_gpu,size_y_gpu);

  }
  // wait for both devices before stopping the timer //
  for(int i=0;i<2;i++) {
    cudaVerify(cudaSetDevice(i));
    cudaVerify(cudaDeviceSynchronize());
  }

  double duration = getTimer(&timer);
  fprintf(stderr,"size_x_cpu=%i size_y_cpu=%i duration/step=%e s, "
          "performance=%e flops\n",
          size_x_cpu,size_y_cpu,duration/loops,
          2.*6.*loops*(size_x_cpu-2)*(size_y_cpu-2)/duration);

  for(int i=0;i<2;i++) {
    cudaVerify(cudaSetDevice(i));
    // copy results from gpu to cpu: size_y_gpu lines per GPU; on GPU 1 the //
    // halo line y=0 is skipped by offsetting the source by one line        //
    const size_t hostOffset   = (size_t)i*size_x_gpu*size_y_gpu;
    const size_t deviceOffset = (size_t)i*size_x_gpu;
    cudaVerify(cudaMemcpy(dataA+hostOffset,
                          dataDeviceA[i]+deviceOffset,
                          dataSizeDevice,
                          cudaMemcpyDeviceToHost));
    cudaVerify(cudaMemcpy(dataB+hostOffset,
                          dataDeviceB[i]+deviceOffset,
                          dataSizeDevice,
                          cudaMemcpyDeviceToHost));
  }

  // print the values of field A around the initial disturbance, i.e. //
  // on both sides of the split between the two GPUs                  //
  for(int i=size_y_gpu-5;i<size_y_gpu+6;i++) {
    for(int j=size_x_cpu/2-3;j<=size_x_cpu/2+4;j++) {
      fprintf(stderr,"%e ",dataA[i*size_x_cpu+j]);
    }
    fprintf(stderr,"\n");
  }

  cudaVerify(cudaSetDevice(0));

  // free cpu memory //
  cudaVerify(cudaFreeHost(dataA));
  cudaVerify(cudaFreeHost(dataB));

  for(int i=0;i<2;i++) {
    cudaVerify(cudaSetDevice(i));
    // free gpu memory //
    cudaVerify(cudaFree(dataDeviceA[i]));
    cudaVerify(cudaFree(dataDeviceB[i]));
  }

  // exit //
  return 0;
}
