#include <stdio.h>

#include "cuda_utils.h"

// CPU reference implementation of one explicit 1D diffusion step.
//
// For every index k in [0, size) the output is
//   dataOut[k] = 0.5 * dataIn[k] + 0.25 * (dataIn[k-1] + dataIn[k+1]).
//
// dataIn  : pointer to the first cell to update; the loop also reads
//           dataIn[-1] and dataIn[size], so the caller must pass a pointer
//           that has one valid element before it and one after the range.
// dataOut : receives `size` results, written to dataOut[0..size-1].
// size    : number of cells to update; nothing is done when size <= 0.
void DiffusionReference(float* dataIn,float* dataOut,int size)
{
  int cell = 0;
  while (cell < size) {
    // Load the three stencil inputs before storing the result.
    const float left   = dataIn[cell-1];
    const float centre = dataIn[cell];
    const float right  = dataIn[cell+1];

    dataOut[cell] = 0.5f * centre + 0.25f * ( left + right );
    ++cell;
  }
}

// GPU kernel performing one explicit 1D diffusion step.
//
// For every index k in [0, size) the output is
//   dataOut[k] = 0.5 * dataIn[k] + 0.25 * (dataIn[k-1] + dataIn[k+1]).
//
// Work distribution: a 1D grid-stride loop. Each thread starts at its global
// x index and advances by the total number of launched threads, so any 1D
// grid/block configuration covers the whole range.
//
// dataIn  : pointer to the first cell to update; dataIn[-1] and dataIn[size]
//           are read as well, so the caller must pass a pointer with one
//           valid element on either side of the range.
// dataOut : receives `size` results, written to dataOut[0..size-1].
// size    : number of cells to update.
void __global__ Diffusion(float* dataIn,float* dataOut,int size)
{
  // Total number of threads in the launch (x dimension only).
  const unsigned int totalThreads = blockDim.x*gridDim.x;

  int cell = threadIdx.x+blockIdx.x*blockDim.x;
  while (cell < size) {
    const float left   = dataIn[cell-1];
    const float centre = dataIn[cell];
    const float right  = dataIn[cell+1];

    dataOut[cell] = 0.5f * centre + 0.25f * ( left + right );
    cell += totalThreads;
  }
}

// Driver: fills a random 1D field, runs one diffusion step on the GPU, times
// the kernel, and checks the GPU result against the CPU reference.
//
// The field has `size` cells. Cells 0 and size-1 act as boundary values that
// are only read; the size-2 interior cells are updated. Result k (for k in
// [0, size-2)) corresponds to field cell k+1.
//
// Returns 0 after the comparison has run (whether or not it matched) and 1
// if the host reference buffers could not be allocated.
int main()
{
  srand48(time(NULL));

  // define field size //
  const int size = 10000000;

  float* dataA;
  float* dataB;
  float* dataDeviceA;
  float* dataDeviceB;

  // size_t so the byte count cannot overflow int if size is enlarged //
  const size_t dataSize=sizeof(float)*size;

  // allocate memory on cpu //
  cudaVerify(cudaMallocHost((void**)&dataA,dataSize));
  cudaVerify(cudaMallocHost((void**)&dataB,dataSize));

  // for reference calculation //
  float* dataReferenceA = (float*)malloc(dataSize);
  float* dataReferenceB = (float*)malloc(dataSize);
  if (dataReferenceA==NULL || dataReferenceB==NULL) {
    fprintf(stderr,"failed to allocate host reference buffers\n");
    free(dataReferenceA);
    free(dataReferenceB);
    cudaVerify(cudaFreeHost(dataA));
    cudaVerify(cudaFreeHost(dataB));
    return 1;
  }

  // initialize field //
  // The boundary cells (0 and size-1) are read by the stencil on both the
  // GPU and the CPU, so they must hold the same defined values in dataA and
  // dataReferenceA; initialize the complete range, not just the interior.
  for(int i=0;i<size;i++) {
    dataReferenceA[i]=dataA[i]=drand48();
    dataB[i]=0;
  }

  // allocate memory on gpu //
  cudaVerify(cudaMalloc((void**)&dataDeviceA,dataSize));
  cudaVerify(cudaMalloc((void**)&dataDeviceB,dataSize));

  // copy input data from cpu to gpu //
  cudaVerify(cudaMemcpy(dataDeviceA,dataA,dataSize,cudaMemcpyHostToDevice));
  cudaVerify(cudaMemcpy(dataDeviceB,dataB,dataSize,cudaMemcpyHostToDevice));

  int gridSize=80;
  int blockSize=128;

  Timer timer;
  initTimer(&timer);
  // The input pointer is offset by one so the kernel may read index -1 and
  // index size-2 (the two boundary cells) while updating size-2 cells.
  cudaVerifyKernel((Diffusion<<<gridSize,blockSize>>>(dataDeviceA+1,
                                                      dataDeviceB,size-2)));
  // Wait for the kernel so the timer covers its execution, and surface any
  // asynchronous execution error instead of silently ignoring it.
  cudaVerify(cudaDeviceSynchronize());
  double duration=getTimer(&timer);
  // 4 flops and 8 bytes (one float read, one float written) per cell;
  // computed in double to avoid int overflow for larger fields.
  printf("Time=%e s, performance=%e flops memory bandwidth = %e bytes/s\n",
         duration,4.0*size/duration,8.0*size/duration);

  // copy results from gpu to cpu //
  cudaVerify(cudaMemcpy(dataB,dataDeviceB,dataSize,cudaMemcpyDeviceToHost));

  DiffusionReference(dataReferenceA+1,dataReferenceB,size-2);

  // compare the size-2 computed cells; report only the first mismatch //
  // NOTE(review): the comparison is bit-exact; this assumes host and device
  // evaluate the stencil to identical float results - confirm for the
  // compilers/flags in use, or switch to a tolerance.
  bool success=true;
  for(int i=0;i<size-2;i++) {
    if (dataReferenceB[i]!=dataB[i]) {
        printf("computation result is wrong: position=(%i) reference=%.12e"
               " cuda result=%.12e\n",
               i,dataReferenceB[i],dataB[i]);
        success=false;
        break;
      }
  }
  if (success) {
    printf("computation result is correct.\n");
  }
  free(dataReferenceA);
  free(dataReferenceB);

  // free cpu memory //
  cudaVerify(cudaFreeHost(dataA));
  cudaVerify(cudaFreeHost(dataB));

  // free gpu memory //
  cudaVerify(cudaFree(dataDeviceA));
  cudaVerify(cudaFree(dataDeviceB));

  // exit //
  return 0;
}
