#include "cuda_utils.h"
#include <stdio.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__<110
#error compute capability 1.1 is required for atomic operations
#endif

// One-shot barrier across all thread blocks of the running grid.
//
// syncval: pointer to an int in global memory that must be 0 when the
//          kernel starts. Each block adds 1 to it on arrival, so the
//          counter is NOT reset and the barrier can be used only once per
//          zeroing of *syncval.
//
// Every thread of every block must call this function (it contains
// __syncthreads()). The waiting is a spin loop, so all blocks of the grid
// have to be resident on the device at the same time; if some blocks can
// only start after others finish, the waiting blocks never see the final
// count and the kernel hangs.
__device__ inline void __syncblocks(int* syncval) {

  // All threads of this block must have finished their preceding work
  // before the block is allowed to report its arrival.
  __syncthreads();

  // A single thread per block talks to the global counter, so the counter
  // reaches exactly the number of blocks.
  if (threadIdx.x==0 && threadIdx.y==0 && threadIdx.z==0) {
    const int numBlocks=gridDim.x*gridDim.y*gridDim.z;

    // Make this block's global-memory writes visible device-wide before
    // the arrival is signalled.
    __threadfence();
    atomicAdd(syncval,1);

    // Adding 0 is an atomic read of the counter: it always goes to memory,
    // so the loop cannot spin on a stale register copy.
    while (atomicAdd(syncval,0)<numBlocks) {
      // busy-wait until the last block has arrived
    }

    // Order the observation of the full count before any later reads.
    __threadfence();
  }

  // Hold back the remaining threads of the block until the elected thread
  // has seen every block arrive.
  __syncthreads();

}

// Two-phase kernel over arrays of `size` ints.
//
//   phase 1: dataX[i] = i
//   phase 2: dataY[i] = dataX[size-i-1]
//
// dataX, dataY: arrays with at least `size` elements each.
// syncval:      counter handed unchanged to __syncblocks().
// size:         number of elements to process.
//
// Each thread starts at its global 1D index and advances by the total
// number of launched threads, so any 1D launch configuration covers the
// whole range. Phase 2 reads elements that were, in general, written by
// other blocks in phase 1; the result is therefore only correct if the
// __syncblocks() call between the phases really makes all blocks wait for
// each other.
__global__ void calc(int* dataX,int* dataY,int* syncval,int size) {

  // Same index arithmetic for both phases: first element handled by this
  // thread and the distance to its next element.
  const unsigned int firstIdx=threadIdx.x+blockDim.x*blockIdx.x;
  const unsigned int step=blockDim.x*gridDim.x;

  // phase 1: fill dataX with its own indices
  for(int idx=firstIdx;idx<size;idx+=step) {
    dataX[idx]=idx;
  }

  // wait point between the phases
  __syncblocks(syncval);

  // phase 2: dataY becomes dataX in reverse order
  for(int idx=firstIdx;idx<size;idx+=step) {
    const int mirrored=size-idx-1;
    dataY[idx]=dataX[mirrored];
  }

}

// Host driver: allocates two int vectors on host (pinned) and device,
// zeroes them, runs the calc kernel once, times it, and checks that the
// second vector came back as vectorSize-1, vectorSize-2, ..., 0.
//
// Returns 0 when the device result is correct and 1 otherwise.
int main() {

  const int vectorSize=100000000;
  // Launch configuration. Named so that they do not shadow the CUDA
  // built-ins blockDim/gridDim.
  const int threadsPerBlock=64;
  const int blocksPerGrid=112;

  // The buffers hold ints, so size them with sizeof(int).
  const size_t vectorBytes=(size_t)vectorSize*sizeof(int);

  int* vector_a_cpu=NULL;
  int* vector_a_gpu=NULL;
  int* vector_b_cpu=NULL;
  int* vector_b_gpu=NULL;
  int* syncval=NULL;

  // Every runtime call is checked: an unnoticed allocation or copy failure
  // would otherwise only show up later as a wrong result or a crash.
  cudaVerify(cudaMallocHost((void**)&vector_a_cpu,vectorBytes));
  cudaVerify(cudaMallocHost((void**)&vector_b_cpu,vectorBytes));
  cudaVerify(cudaMalloc((void**)&vector_a_gpu,vectorBytes));
  cudaVerify(cudaMalloc((void**)&vector_b_gpu,vectorBytes));
  cudaVerify(cudaMalloc((void**)&syncval,sizeof(int)));
  // The counter passed to the kernel starts at zero.
  cudaVerify(cudaMemset(syncval,0,sizeof(int)));

  for(int i=0;i<vectorSize;i++) {
    vector_a_cpu[i]=0;
    vector_b_cpu[i]=0;
  }

  // Overwrite both device vectors with zeros so that leftovers from an
  // earlier run cannot be mistaken for a correct result.
  cudaVerify(cudaMemcpy(vector_a_gpu,vector_a_cpu,vectorBytes,
                        cudaMemcpyHostToDevice));
  cudaVerify(cudaMemcpy(vector_b_gpu,vector_b_cpu,vectorBytes,
                        cudaMemcpyHostToDevice));

  Timer timer;
  initTimer(&timer);

  cudaVerifyKernel((calc<<<blocksPerGrid,threadsPerBlock>>>
                    (vector_a_gpu,vector_b_gpu,syncval,
                     vectorSize)));

  // Kernel launches are asynchronous: wait for completion before reading
  // the timer. NOTE: cudaThreadSynchronize() is deprecated in the CUDA
  // runtime in favour of cudaDeviceSynchronize(); kept here for
  // compatibility with the toolkit this file was written against.
  cudaVerify(cudaThreadSynchronize());

  double duration=getTimer(&timer);

  cudaVerify(cudaMemcpy(vector_b_cpu,vector_b_gpu,vectorBytes,
                        cudaMemcpyDeviceToHost));

  // verify results //
  bool success=true;
  for(int i=0;i<vectorSize;i++) {
    if (vector_b_cpu[i]!=vectorSize-i-1) {
      printf("computation result is wrong: index=%i expected result=%i "
             "cuda result=%i\n", i, vectorSize-i-1, vector_b_cpu[i]);
      success=false;
      break;
    }
  }
  if (success) {
    printf("computation result is correct.\n");
  }

  // The kernel touches each element three times (write X, read X, write Y).
  fprintf(stderr,"duration=%e s, memory bandwidth = %e bytes/s\n",
          duration,3.*vectorBytes/duration);

  cudaVerify(cudaFree(vector_a_gpu));
  cudaVerify(cudaFree(vector_b_gpu));
  cudaVerify(cudaFree(syncval));
  // Pinned host memory has to be released with cudaFreeHost.
  cudaVerify(cudaFreeHost(vector_a_cpu));
  cudaVerify(cudaFreeHost(vector_b_cpu));

  // Report a wrong result through the exit status as well.
  return success ? 0 : 1;
}
