#include"cuda_utils.h"

#include<stdio.h>
#include<stdlib.h>

// TODO: create the group and its mappings.                        //
// Which arrays need to be mapped to the same memory location      //
// so that no copies are needed inside the loop over s?            //

// Element type of the field arrays handled by the routines in this file.
typedef float FloatType;

// Writes a window of up to 11x11 values centred on (sizeX/2, sizeY/2) to
// stderr, one grid row per output line, each value formatted with "%g ".
//
//   field : row-major grid, element (row i, column j) at field[i*sizeX+j]
//   sizeX : number of columns (row stride of field)
//   sizeY : number of rows
//
// The window is clamped to the grid so that grids smaller than 11 cells in
// either direction are printed in full instead of being read out of bounds.
void printField(FloatType *field,int sizeX,int sizeY) {
  int rowFirst=sizeY/2-5;
  int rowLast =sizeY/2+5;
  int colFirst=sizeX/2-5;
  int colLast =sizeX/2+5;

  // Clamp to valid indices; an empty extent leaves first>last and the
  // corresponding loop body is simply skipped.
  if(rowFirst<0)       rowFirst=0;
  if(rowLast >sizeY-1) rowLast =sizeY-1;
  if(colFirst<0)       colFirst=0;
  if(colLast >sizeX-1) colLast =sizeX-1;

  for(int i=rowFirst;i<=rowLast;i++) {
    for(int j=colFirst;j<=colLast;j++) {
      fprintf(stderr,"%g ",field[i*sizeX+j]);
    }
    fprintf(stderr,"\n");
  }
}

// TODO: make function hmpp codelet //
// One sweep of a 5-point stencil over the interior of a row-major grid.
//
//   n       : not used by this routine
//   sizeX   : number of columns (row stride)
//   sizeY   : number of rows
//   dataOut : destination grid; only interior cells are written
//   dataIn  : source grid
//
// Every cell with 1 <= row <= sizeY-2 and 1 <= col <= sizeX-2 receives
// 0.6 * its own value plus 0.1 * the sum of its four direct neighbours.
// Cells on the outermost rows and columns of dataOut are left untouched.
void FdtdStep(int n, int sizeX, int sizeY, FloatType* dataOut, FloatType* dataIn) {
  (void)n;

  const int lastRow = sizeY-1;
  const int lastCol = sizeX-1;

  int row = 1;
  while(row < lastRow) {
    const int rowStart = row*sizeX;
    for(int col = 1; col < lastCol; ++col) {
      const int centre = rowStart+col;

      // Load the five stencil taps before storing the result.
      const FloatType self  = dataIn[centre];
      const FloatType west  = dataIn[centre-1];
      const FloatType east  = dataIn[centre+1];
      const FloatType north = dataIn[centre-sizeX];
      const FloatType south = dataIn[centre+sizeX];

      // Neighbour summation order is kept fixed so rounding is reproducible.
      dataOut[centre] = 0.6f*self + 0.1f*(west + east + north + south);
    }
    ++row;
  }
}

int main() {

  int sizeX=4096;
  int sizeY=4096;
  int steps=10;
  int size=sizeX*sizeY;

  FloatType* dataA = calloc(1,size*sizeof(FloatType));
  FloatType* dataB = calloc(1,size*sizeof(FloatType));

  dataA[sizeX/2+sizeY/2*sizeX]=1.0;

  // TODO: allocate device memory and copy data in //

  Timer timer;
  initTimer(&timer);
  for(int s=0;s<steps;s++) {
    //  TODO: make hmpp callsite              //
    //  which option avoids unnecessary copy? //
    FdtdStep(size,sizeX,sizeY,dataB,dataA);
    //TODO: make hmpp callsite                //
    FdtdStep(size,sizeX,sizeY,dataA,dataB);
  }
  double duration=getTimer(&timer);

  // TODO: copy data back and free device memory //

  printField(dataA,sizeX,sizeY);
  free(dataA);
  free(dataB);

  fprintf(stderr,"size=%i duration/step=%e s, performance=%e flops\n",
          sizeX,duration/steps,2.*6.*steps*(sizeX-2)*(sizeY-2)/duration);

  return 0;
}
