#include "cuda_utils.h"

#include <stdio.h>

// Element type of the field buffers used on both host and device.
typedef float FieldType;

// Thread-block dimensions. The kernels in this file compute their global
// indices from these compile-time constants (not from blockDim), so every
// launch must use exactly block_size_x by block_size_y threads per block.
const int block_size_x=32;
const int block_size_y=6;

// One 5-point averaging step for the slab owned by device 0.
//
// Launch layout: 2D grid of block_size_x by block_size_y thread blocks; each
// thread handles one cell (column from x, row from y shifted by one so that
// row 0 is never written).
//
//   size_x, size_y  row length and number of rows of the local buffers
//   dataOut         destination buffer (local)
//   dataIn          source buffer (local)
//   dataInOther     source buffer of the other device; only its row 1 is read
//
// Rows 1 .. size_y-2 and columns 1 .. size_x-2 are updated. For the last
// updated row (size_y-2) the lower neighbour is fetched from row 1 of
// dataInOther instead of the local row size_y-1.
void __global__ FdtdStepDevice0(int size_x,int size_y,FieldType* dataOut,
                                FieldType* dataIn,FieldType* dataInOther) {

  const int col = blockIdx.x*block_size_x + threadIdx.x;
  const int row = blockIdx.y*block_size_y + threadIdx.y + 1;

  // Skip the left/right boundary columns and every row past the last
  // updated one.
  const bool interiorCol = (col > 0) && (col < size_x-1);
  if (!interiorCol || row > size_y-2)
    return;

  const int center = row*size_x + col;

  // The lower neighbour of the last updated row lives on the other device.
  const FieldType lower = (row == size_y-2) ? dataInOther[col+size_x]
                                            : dataIn[center+size_x];

  // Summation order (left, right, upper, lower) is kept fixed on purpose.
  dataOut[center] = 0.5f*dataIn[center] +
    0.125f * (   dataIn[center-1]      + dataIn[center+1]
               + dataIn[center-size_x] + lower );
}

// One 5-point averaging step for the slab owned by device 1.
//
// Launch layout: 2D grid of block_size_x by block_size_y thread blocks; each
// thread handles one cell (column from x, row from y shifted by one so that
// row 0 is never written).
//
//   size_x, size_y  row length and number of rows of the local buffers
//   dataOut         destination buffer (local)
//   dataIn          source buffer (local)
//   dataInOther     source buffer of the other device; only its row
//                   size_y-2 is read
//
// Rows 1 .. size_y-2 and columns 1 .. size_x-2 are updated. For row 1 the
// upper neighbour is fetched from row size_y-2 of dataInOther instead of the
// local row 0.
void __global__ FdtdStepDevice1(int size_x,int size_y,FieldType* dataOut,
                                FieldType* dataIn,FieldType* dataInOther) {

  const int col = blockIdx.x*block_size_x + threadIdx.x;
  const int row = blockIdx.y*block_size_y + threadIdx.y + 1;

  // Skip the left/right boundary columns and the last row (and beyond).
  if (col==0 || col>=size_x-1 || row>=size_y-1)
    return;

  const int center = row*size_x + col;

  // The upper neighbour of the first updated row lives on the other device.
  const FieldType upper = (row == 1) ? dataInOther[size_x*(size_y-2)+col]
                                     : dataIn[center-size_x];

  // Summation order (left, right, upper, lower) is kept fixed on purpose.
  dataOut[center] = 0.5f*dataIn[center] +
    0.125f * (   dataIn[center-1] + dataIn[center+1]
               + upper            + dataIn[center+size_x] );
}

// Two-GPU benchmark of the 5-point averaging stencil.
//
// The host field has size_x_cpu columns and size_y_cpu = 2*size_y_gpu rows and
// is split along y. Each device buffer holds size_y_gpu+1 rows:
//   device 0 receives host rows 0 .. size_y_gpu,
//   device 1 receives host rows size_y_gpu-1 .. 2*size_y_gpu-1,
// so each device carries one extra row next to the split. That extra row is
// not updated locally; the kernels read the up-to-date neighbour row directly
// from the other device's buffer, which is why peer access is required.
//
// Every loop iteration performs two steps (A->B, then B->A), so the final
// result ends up in the A buffers.
//
// Returns 0 on success, 1 if fewer than two devices are present or the two
// devices cannot access each other's memory.
int main() {

  // define field size //
  const int size_x_gpu = 2048;
  const int size_y_gpu = 2048;
  const int loops = 100;

  const int size_x_cpu = size_x_gpu;
  const int size_y_cpu = size_y_gpu*2;

  // make sure the required hardware is there before allocating anything //
  int deviceCount = 0;
  cudaVerify(cudaGetDeviceCount(&deviceCount));
  if (deviceCount < 2) {
    fprintf(stderr,"this benchmark needs 2 CUDA devices, found %i\n",
            deviceCount);
    return 1;
  }
  for(int dev=0;dev<2;dev++) {
    int canAccessPeer = 0;
    cudaVerify(cudaDeviceCanAccessPeer(&canAccessPeer,dev,1-dev));
    if (!canAccessPeer) {
      fprintf(stderr,"device %i cannot access memory of device %i, "
              "peer access is required\n",dev,1-dev);
      return 1;
    }
  }

  // The kernels start at row 1, so the grid has to cover size-1 rows/columns;
  // this is a ceil-division of (size-1) by the block size.
  dim3 threads(block_size_x,block_size_y);
  dim3 grid((size_x_gpu+block_size_x-2)/block_size_x,
            (size_y_gpu+block_size_y-2)/block_size_y);

  FieldType* dataA;
  FieldType* dataB;
  FieldType* dataDeviceA[2];
  FieldType* dataDeviceB[2];

  // Byte counts are kept in size_t so larger fields do not overflow int.
  const size_t dataSizeHost           = sizeof(FieldType)*size_x_cpu*size_y_cpu;
  const size_t dataSizeDevice         = sizeof(FieldType)*size_x_gpu*size_y_gpu;
  const size_t dataSizeDeviceWithHalo = sizeof(FieldType)*size_x_gpu*(size_y_gpu+1);

  // allocate memory on cpu //
  cudaVerify(cudaMallocHost((void**)&dataA,dataSizeHost));
  cudaVerify(cudaMallocHost((void**)&dataB,dataSizeHost));

  // initialize fields //
  for(int i=0;i<size_y_cpu;i++) {
    for(int j=0;j<size_x_cpu;j++) {
      dataA[i*size_x_cpu+j]=1.f;
      dataB[i*size_x_cpu+j]=1.f;
    }
  }
  // single disturbance in the first row of the lower half, middle column //
  dataA[size_y_gpu*size_x_cpu+(size_x_cpu/2)]=2.f;

  for(int dev=0;dev<2;dev++) {

    cudaVerify(cudaSetDevice(dev));
    cudaVerify(cudaDeviceEnablePeerAccess(1-dev,0));

    // allocate memory on gpu //
    cudaVerify(cudaMalloc((void**)&dataDeviceA[dev],dataSizeDeviceWithHalo));
    cudaVerify(cudaMalloc((void**)&dataDeviceB[dev],dataSizeDeviceWithHalo));

    // copy input data from cpu to gpus //
    // device 1 starts one row before the split so that its local row 0 is
    // the neighbour row owned by device 0
    cudaVerify(cudaMemcpy(dataDeviceA[dev],dataA+dev*size_x_gpu*(size_y_gpu-1),
                          dataSizeDeviceWithHalo,cudaMemcpyHostToDevice));
    cudaVerify(cudaMemcpy(dataDeviceB[dev],dataB+dev*size_x_gpu*(size_y_gpu-1),
                          dataSizeDeviceWithHalo,cudaMemcpyHostToDevice));
  }

  Timer timer;
  initTimer(&timer);

  // computation main loop //
  for(int iter=0;iter<loops;iter++) {

    cudaVerify(cudaSetDevice(0));
    // call kernel a->b, device 0 //
    cudaVerifyKernel((FdtdStepDevice0<<<grid,threads>>>(size_x_gpu,size_y_gpu+1,
                                                        dataDeviceB[0],
                                                        dataDeviceA[0],
                                                        dataDeviceA[1])));
    cudaVerify(cudaSetDevice(1));
    // call kernel a->b, device 1 //
    cudaVerifyKernel((FdtdStepDevice1<<<grid,threads>>>(size_x_gpu,size_y_gpu+1,
                                                        dataDeviceB[1],
                                                        dataDeviceA[1],
                                                        dataDeviceA[0])));

    // both devices must finish before the next step reads the peer buffer //
    for(int dev=0;dev<2;dev++) {
      cudaVerify(cudaSetDevice(dev));
      cudaVerify(cudaDeviceSynchronize());
    }

    cudaVerify(cudaSetDevice(0));
    // call kernel b->a, device 0 //
    cudaVerifyKernel((FdtdStepDevice0<<<grid,threads>>>(size_x_gpu,size_y_gpu+1,
                                                        dataDeviceA[0],
                                                        dataDeviceB[0],
                                                        dataDeviceB[1])));
    cudaVerify(cudaSetDevice(1));
    // call kernel b->a, device 1 //
    cudaVerifyKernel((FdtdStepDevice1<<<grid,threads>>>(size_x_gpu,size_y_gpu+1,
                                                        dataDeviceA[1],
                                                        dataDeviceB[1],
                                                        dataDeviceB[0])));

    for(int dev=0;dev<2;dev++) {
      cudaVerify(cudaSetDevice(dev));
      cudaVerify(cudaDeviceSynchronize());
    }

  }

  double duration = getTimer(&timer);

  // Two steps (a->b and b->a) are executed per loop iteration; each step
  // costs 6 floating point operations per interior cell.
  const double steps = 2.*loops;
  fprintf(stderr,"size_x_cpu=%i size_y_cpu=%i duration/step=%e s, "
          "performance=%e flops\n",
          size_x_cpu,size_y_cpu,duration/steps,
          6.*steps*(size_x_cpu-2)*(size_y_cpu-2)/duration);

  for(int dev=0;dev<2;dev++) {
    cudaVerify(cudaSetDevice(dev));
    // copy results from gpu to cpu //
    // device 1 skips its local row 0, which belongs to device 0's half
    cudaVerify(cudaMemcpy(dataA+dev*size_x_gpu*size_y_gpu,
                          dataDeviceA[dev]+dev*size_x_gpu,
                          dataSizeDevice,
                          cudaMemcpyDeviceToHost));
    cudaVerify(cudaMemcpy(dataB+dev*size_x_gpu*size_y_gpu,
                          dataDeviceB[dev]+dev*size_x_gpu,
                          dataSizeDevice,
                          cudaMemcpyDeviceToHost));
  }

  // dump a small window around the initial disturbance, across the split //
  for(int i=size_y_gpu-5;i<size_y_gpu+6;i++) {
    for(int j=size_x_cpu/2-3;j<=size_x_cpu/2+3;j++) {
      fprintf(stderr,"%e ",dataA[i*size_x_cpu+j]);
    }
    fprintf(stderr,"\n");
  }

  cudaVerify(cudaSetDevice(0));

  // free cpu memory //
  cudaVerify(cudaFreeHost(dataA));
  cudaVerify(cudaFreeHost(dataB));

  for(int dev=0;dev<2;dev++) {
    cudaVerify(cudaSetDevice(dev));
    // free gpu memory //
    cudaVerify(cudaFree(dataDeviceA[dev]));
    cudaVerify(cudaFree(dataDeviceB[dev]));
  }

  // exit //
  return 0;
}
