#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdbool.h>
#include <math.h>
#include <time.h>

#include "cuda_utils.h"

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__<200
#error compute capability 2.0 is required for atomic operations
#endif

// Launch configuration that main() passes to the ScalarProduct kernel.
// Threads per block (second <<<...>>> argument).
const int blockSize=64;
// Number of blocks in the grid (first <<<...>>> argument).
const int gridSize=64;

// CUDA Kernel //
// Adds the scalar (dot) product of a and b to *result.
//   a, b   : device-accessible arrays holding at least `size` floats each
//   result : device-accessible pointer to one float accumulator; the kernel
//            only adds to it, so the caller must initialize it before launch
//   size   : number of elements to process (values <= 0 leave *result untouched)
// Only the x dimension of the launch configuration is used. Each thread
// starts at its global index and advances by the total thread count, so any
// 1D grid/block size covers the whole input.
// Requires compute capability 2.0+ for atomicAdd on float.
// Note: the summation order is not fixed, so results may differ slightly
// between runs because of floating-point rounding.
void __global__ ScalarProduct(float* a,float* b,float* result,int size) {
  // 64-bit index arithmetic so that i+stride cannot overflow for sizes
  // close to INT_MAX.
  const long long stride=(long long)blockDim.x*gridDim.x;
  long long i=threadIdx.x+(long long)blockIdx.x*blockDim.x;

  // threads without any element have nothing to contribute //
  if (i>=size) return;

  // accumulate in a register: one atomic per thread instead of one per
  // element avoids serializing every multiply-add on a single address //
  float partial=0.0f;
  for(; i<size; i+=stride) {
    partial+=a[i]*b[i];
  }
  atomicAdd(result,partial);
}

// CPU code //
// Host driver: fills two host vectors with drand48() values, copies them to
// the GPU, runs the ScalarProduct kernel once, prints timing figures for the
// kernel, and compares the GPU result with a double-precision sum computed
// on the CPU. Always returns 0; a mismatch is only reported on stderr.
// cudaVerify / cudaVerifyKernel / Timer come from cuda_utils.h; presumably
// they check CUDA error codes and measure wall-clock time (confirm there).
int main() {

  srand48(time(NULL));

  // define vector size //
  const int vector_size = 1000000;
  const size_t vector_bytes = vector_size*sizeof(float);

  // allocate memory on cpu //
  float* vector_a_cpu;
  float* vector_b_cpu;
  cudaVerify(cudaMallocHost((void**)&vector_a_cpu,vector_bytes));
  cudaVerify(cudaMallocHost((void**)&vector_b_cpu,vector_bytes));
  float result_cpu = 0;

  // initialize vectors //
  for(int i=0;i<vector_size;i++) {
    vector_a_cpu[i]=(float)drand48();
    vector_b_cpu[i]=(float)drand48();
  }

  // allocate memory on gpu //
  float* vector_a_gpu;
  float* vector_b_gpu;
  float* result_gpu;
  cudaVerify(cudaMalloc((void**)&vector_a_gpu,vector_bytes));
  cudaVerify(cudaMalloc((void**)&vector_b_gpu,vector_bytes));
  // the kernel accumulates into a single float, so one element suffices //
  cudaVerify(cudaMalloc((void**)&result_gpu,sizeof(float)));

  // copy input vector from cpu to gpu //
  cudaVerify(cudaMemcpy(vector_a_gpu,vector_a_cpu,vector_bytes,
                        cudaMemcpyHostToDevice));
  cudaVerify(cudaMemcpy(vector_b_gpu,vector_b_cpu,vector_bytes,
                        cudaMemcpyHostToDevice));
  // the kernel only adds to the accumulator, so it must start at zero //
  cudaVerify(cudaMemcpy(result_gpu,&result_cpu,sizeof(float),
                        cudaMemcpyHostToDevice));

  Timer timer;
  initTimer(&timer);

  // cuda kernel call //
  cudaVerifyKernel((ScalarProduct<<<gridSize,blockSize>>>(vector_a_gpu,
                                                          vector_b_gpu,
                                                          result_gpu,
                                                          vector_size)));
  // kernel launches are asynchronous: wait for completion before reading
  // the timer (cudaDeviceSynchronize replaces the deprecated
  // cudaThreadSynchronize) //
  cudaVerify(cudaDeviceSynchronize());

  double duration = getTimer(&timer);

  // 2 floating point operations and 2 float loads per vector element //
  fprintf(stderr,"duration = %e seconds, performance = %e FLOPS\n"
          "memory bandwidth = %e bytes/s\n",
          duration,2.*vector_size/duration,
          sizeof(float)*2.*vector_size/duration);

  // copy output vector from gpu to cpu //
  cudaVerify(cudaMemcpy(&result_cpu,result_gpu,sizeof(float),
                        cudaMemcpyDeviceToHost));

  // verify result //
  // reference sum in double precision; compared with a relative tolerance.
  // fabs (not fabsf) keeps the comparison in double precision. //
  double sum=0;
  for(int i=0;i<vector_size;i++) {
    sum+=vector_a_cpu[i]*vector_b_cpu[i];
  }
  if (fabs(sum-result_cpu)>fabs(sum+result_cpu)*1e-5) {
    fprintf(stderr,"computation result is wrong: expected=%g cuda result=%g\n"
                   "note: small deviations are not unexpected for this implementation\n"
                   "      because of roundoff errors.\n",
            sum,result_cpu);
  } else {
    fprintf(stderr,"computation result is correct.\n");
  }
  // free gpu memory //
  cudaVerify(cudaFree(vector_a_gpu));
  cudaVerify(cudaFree(vector_b_gpu));
  cudaVerify(cudaFree(result_gpu));

  // free cpu memory //
  cudaVerify(cudaFreeHost(vector_a_cpu));
  cudaVerify(cudaFreeHost(vector_b_cpu));

  // exit //
  return 0;
}
