#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#include "cuda_utils.h"

// CUDA Kernel //

void __global__ VectorSum(float* a,float* b,float* c,int size) {
  for(int i=threadIdx.x+blockIdx.x*blockDim.x; i<size; 
      i+=blockDim.x*gridDim.x) {
    c[i]=a[i]+b[i];
  }
}


// CPU code //

int main() {

  srand48(time(NULL));

  // define vector size //
  const int vector_size = 10000000;

  // allocate memory on cpu //
  float* vector_a_cpu;
  float* vector_b_cpu;
  float* vector_c_cpu;
  cudaVerify(cudaMallocHost((void**)&vector_a_cpu,vector_size*sizeof(float)));
  cudaVerify(cudaMallocHost((void**)&vector_b_cpu,vector_size*sizeof(float)));
  cudaVerify(cudaMallocHost((void**)&vector_c_cpu,vector_size*sizeof(float)));

  // initialize host vectors //
  for(int i=0;i<vector_size;i++) {
    vector_a_cpu[i]=drand48();
    vector_b_cpu[i]=drand48();
    vector_c_cpu[i]=0;
  }

  // allocate memory on gpu //
  float* vector_a_gpu;
  float* vector_b_gpu;
  float* vector_c_gpu;
  cudaVerify(cudaMalloc((void**)&vector_a_gpu,vector_size*sizeof(float)));
  cudaVerify(cudaMalloc((void**)&vector_b_gpu,vector_size*sizeof(float)));
  cudaVerify(cudaMalloc((void**)&vector_c_gpu,vector_size*sizeof(float)));

  // initialize gpu output vector to zero //
  cudaVerify(cudaMemcpy(vector_c_gpu,vector_c_cpu,vector_size*sizeof(float),
                       cudaMemcpyHostToDevice));

  Timer timer;
  initTimer(&timer);

  // copy input vectors from cpu to gpu //
  cudaVerify(cudaMemcpy(vector_a_gpu,vector_a_cpu,vector_size*sizeof(float),
                       cudaMemcpyHostToDevice));
  cudaVerify(cudaMemcpy(vector_b_gpu,vector_b_cpu,vector_size*sizeof(float),
                       cudaMemcpyHostToDevice));

  double durationMemoryCopyIn = getAndResetTimer(&timer);

  // cuda kernel call //
  cudaVerifyKernel((VectorSum<<<64,128>>>(vector_a_gpu,vector_b_gpu,
                                          vector_c_gpu,vector_size)));

  cudaVerify(cudaThreadSynchronize());
  double durationKernel = getAndResetTimer(&timer);

  // copy output vector from gpu to cpu //
  cudaVerify(cudaMemcpy(vector_c_cpu,vector_c_gpu,vector_size*sizeof(float),
                        cudaMemcpyDeviceToHost));

  double durationMemoryCopyOut = getAndResetTimer(&timer);

  // verify results //
  bool success=true;
  for(int i=0;i<vector_size;i++) {
    if (fabsf(vector_c_cpu[i]-(vector_a_cpu[i]+vector_b_cpu[i]))>1e-5) {
      printf("computation result is wrong: index=%i a[i]+b[i]=%g"
             " cuda result=%g\n",
             i,vector_a_cpu[i]+vector_b_cpu[i],vector_c_cpu[i]);
      success=false;
      break;
    }
  }
  if (success) {
    printf("computation result is correct.\n");
  }

  printf("duration (copy in)          = %e seconds\n"
         "memory bandwidth (copy in)  = %e bytes/s\n",
         durationMemoryCopyIn,vector_size*2.*sizeof(float)/durationMemoryCopyIn);
  printf("duration (kernel)           = %e seconds, performance = %e FLOPS\n"
         "memory bandwidth (kernel)   = %e bytes/s\n",
         durationKernel,vector_size/durationKernel,
         vector_size*3.*sizeof(float)/durationKernel);
  printf("duration (copy out)         = %e seconds\n"
         "memory bandwidth (copy out) = %e bytes/s\n",
         durationMemoryCopyOut,vector_size*sizeof(float)/durationMemoryCopyOut);

  // free gpu memory //
  cudaVerify(cudaFree(vector_a_gpu));
  cudaVerify(cudaFree(vector_b_gpu));
  cudaVerify(cudaFree(vector_c_gpu));

  // free cpu memory //
  cudaVerify(cudaFreeHost(vector_a_cpu));
  cudaVerify(cudaFreeHost(vector_b_cpu));
  cudaVerify(cudaFreeHost(vector_c_cpu));

  // exit //
  return 0;
}
