#include"cuda_utils.h"

#include<stdio.h>
#include<stdlib.h>

/* Element type of the field arrays handled in this file (single precision). */
typedef float FloatType;

/*
 * Print a window of the field around its centre to stderr.
 *
 * The window nominally spans 11 rows by 11 columns centred on
 * (sizeY/2, sizeX/2); it is clamped to the field so that fields with fewer
 * than 11 rows or columns do not cause out-of-bounds reads.
 *
 * field : row-major array of sizeX*sizeY values (row stride is sizeX)
 * sizeX : number of columns
 * sizeY : number of rows
 */
void printField(FloatType *field,int sizeX,int sizeY) {
  int rowFirst=sizeY/2-5;
  int rowLast =sizeY/2+5;
  int colFirst=sizeX/2-5;
  int colLast =sizeX/2+5;

  /* Clamp the window to the valid index range of the field. */
  if(rowFirst<0) rowFirst=0;
  if(colFirst<0) colFirst=0;
  if(rowLast>sizeY-1) rowLast=sizeY-1;
  if(colLast>sizeX-1) colLast=sizeX-1;

  for(int i=rowFirst;i<=rowLast;i++) {
    for(int j=colFirst;j<=colLast;j++) {
      fprintf(stderr,"%g ",field[i*sizeX+j]);
    }
    fprintf(stderr,"\n");
  }
}

/*
 * Apply one 5-point weighted-average stencil sweep from dataIn to dataOut.
 *
 * Every interior cell of dataOut becomes 0.6 times the matching dataIn cell
 * plus 0.1 times each of its four direct neighbours (the weights sum to 1).
 * The outermost rows and columns of dataOut are not written.
 *
 * sizeX, sizeY : number of columns / rows; both arrays are row-major with
 *                row stride sizeX
 * dataOut      : destination field
 * dataIn       : source field
 *
 * Both arrays must already be present in device memory (see the `present`
 * clause). The kernel is enqueued on async queue 1, so the caller has to
 * wait on that queue before using the result.
 */
void FdtdStep(int sizeX, int sizeY,
              FloatType* dataOut, FloatType* dataIn) {
  /* collapse(2) hands both tightly nested loops to the compiler as one
     independent iteration space; without it only the row loop is named in
     the directive and the scheduling of the column loop is left to the
     compiler's own dependence analysis. Each iteration writes a distinct
     dataOut element, so the iterations are independent as long as dataIn
     and dataOut do not overlap. */
  #pragma acc parallel loop collapse(2) \
    present(dataIn[0:sizeX*sizeY],dataOut[0:sizeX*sizeY]) async(1)
  for(int i=1;i<sizeY-1;i++) {
    for(int j=1;j<sizeX-1;j++) {
      int index=i*sizeX+j;
      dataOut[index]= 0.6f*dataIn[index]
                     +0.1f*( dataIn[index-1]    +dataIn[index+1]
                            +dataIn[index-sizeX]+dataIn[index+sizeX]);
    }
  }
}

/*
 * Driver: sets up a 4096x4096 field with a single unit value at the centre,
 * runs `steps` pairs of stencil sweeps (A->B, then B->A) inside an OpenACC
 * data region, prints the centre of the resulting field and reports timing.
 *
 * Returns 0 on success, EXIT_FAILURE if the host buffers cannot be allocated.
 */
int main() {

  int sizeX=4096;
  int sizeY=4096;
  int steps=10;
  int size=sizeX*sizeY;

  /* Zero-initialised host buffers; check the allocation before touching them. */
  FloatType* dataA = calloc((size_t)size,sizeof(FloatType));
  FloatType* dataB = calloc((size_t)size,sizeof(FloatType));
  if(dataA==NULL || dataB==NULL) {
    fprintf(stderr,"failed to allocate two fields of %i elements\n",size);
    free(dataA);
    free(dataB);
    return EXIT_FAILURE;
  }

  /* Single unit value in the centre cell of the field. */
  dataA[sizeX/2+sizeY/2*sizeX]=1.0f;

  double duration;

  /* dataB is transferred with copyin rather than create: FdtdStep writes only
     interior cells, so the boundary of dataB is never written on the device,
     yet it is read by the B->A sweep. copyin brings the zeroed host values
     over; create would leave those device cells uninitialised. */
  #pragma acc data copy(dataA[0:sizeX*sizeY]) copyin(dataB[0:sizeX*sizeY])
  {
    /* Timer, initTimer and getTimer are not defined in this file
       (presumably provided by cuda_utils.h). The timer is started inside the
       data region, so the transfers at region entry/exit are not measured. */
    Timer timer;
    initTimer(&timer);
    for(int s=0;s<steps;s++) {
      FdtdStep(sizeX,sizeY,dataB,dataA);
      FdtdStep(sizeX,sizeY,dataA,dataB);
    }
    /* The sweeps are enqueued on async queue 1; wait for them to finish
       before reading the timer and before dataA is copied back. */
    #pragma acc wait(1)
    duration=getTimer(&timer);
  }

  printField(dataA,sizeX,sizeY);

  free(dataA);
  free(dataB);

  /* 6 floating-point operations per interior cell per sweep (2 multiplies,
     4 additions), two sweeps per step. */
  fprintf(stderr,"size=%i duration/step=%e s, performance=%e flops\n",
          sizeX,duration/steps,2.*6.*steps*(sizeX-2)*(sizeY-2)/duration);

  return 0;
}
