#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdbool.h>
#include <math.h>

#include "cuda_utils.h"

// CUDA Kernel //
// Computes the scalar (dot) product of the vectors a and b and stores it
// in *result.
//
// Launch layout: one-dimensional thread block of any size. The kernel does
// not look at blockIdx, so every block of a multi-block launch recomputes
// the complete product and stores the same value; launch it with a single
// block.
// Shared memory: 32 floats, statically allocated.
//
// Parameters:
//   a, b    input vectors with at least `size` elements each; dereferenced
//           on the device
//   result  location that receives the product; written by thread 0 only
//   size    number of elements to process
void __global__ ScalarProduct(float* a,float* b,float* result,int size) {
  __shared__ float sums[32];

  // per-thread partial sum: thread t handles elements t, t+blockDim.x, ... //
  float sum=0;
  for(int i=threadIdx.x; i<size; i+=blockDim.x) {
    sum+=a[i]*b[i];
  }

  // The first 32 threads store their partial sum in their own slot. When //
  // the block has fewer than 32 threads the slots nobody owns are zeroed, //
  // so the tree reduction below never reads uninitialized shared memory. //
  for(int slot=threadIdx.x; slot<32; slot+=blockDim.x) {
    sums[slot]=(slot==(int)threadIdx.x) ? sum : 0.0f;
  }
  __syncthreads();

  // Fold every further group of 32 threads into the 32 slots, one group //
  // per round, so that no two threads update the same slot between two //
  // barriers. blockDim.x is the same for all threads, hence every thread //
  // reaches every barrier. //
  for(int base=32; base<(int)blockDim.x; base+=32) {
    if ((int)threadIdx.x>=base && (int)threadIdx.x<base+32)
      sums[threadIdx.x-base]+=sum;
    __syncthreads();
  }

  // tree reduction in shared memory: 32 -> 16 -> 8 -> 4 partial sums //
  for(int half=16; half>=4; half>>=1) {
    if ((int)threadIdx.x<half)
      sums[threadIdx.x]+=sums[threadIdx.x+half];
    __syncthreads();
  }

  // thread 0 adds the last four partial sums and publishes the result //
  if (threadIdx.x==0)
    *result=(sums[0]+sums[1])+(sums[2]+sums[3]);
}

// CPU code //
// Host driver: fills two random vectors, computes their scalar product on
// the GPU with ScalarProduct, reports timing figures on stderr and compares
// the GPU result against a double-precision sum computed on the CPU.
// Always returns 0; a mismatch is only reported on stderr.
int main() {

  // seed the drand48() generator from the current time //
  srand48(time(NULL));

  // define vector size //
  const int vector_size = 1000000;
  const size_t vector_bytes = vector_size*sizeof(float);

  // allocate page-locked memory on cpu //
  float* vector_a_cpu;
  float* vector_b_cpu;
  cudaVerify(cudaMallocHost((void**)&vector_a_cpu,vector_bytes));
  cudaVerify(cudaMallocHost((void**)&vector_b_cpu,vector_bytes));
  float result_cpu = 0;

  // initialize vectors with random values from drand48() //
  for(int i=0;i<vector_size;i++) {
    vector_a_cpu[i]=(float)drand48();
    vector_b_cpu[i]=(float)drand48();
  }

  // allocate memory on gpu //
  float* vector_a_gpu;
  float* vector_b_gpu;
  float* result_gpu;
  cudaVerify(cudaMalloc((void**)&vector_a_gpu,vector_bytes));
  cudaVerify(cudaMalloc((void**)&vector_b_gpu,vector_bytes));
  cudaVerify(cudaMalloc((void**)&result_gpu,sizeof(float)));

  // copy input vectors and the zeroed result from cpu to gpu //
  cudaVerify(cudaMemcpy(vector_a_gpu,vector_a_cpu,vector_bytes,
                        cudaMemcpyHostToDevice));
  cudaVerify(cudaMemcpy(vector_b_gpu,vector_b_cpu,vector_bytes,
                        cudaMemcpyHostToDevice));
  cudaVerify(cudaMemcpy(result_gpu,&result_cpu,sizeof(float),
                        cudaMemcpyHostToDevice));

  Timer timer;
  initTimer(&timer);

  // cuda kernel call: a single block of 64 threads //
  cudaVerifyKernel((ScalarProduct<<<1,64>>>(vector_a_gpu,vector_b_gpu,
                                            result_gpu,vector_size)));
  // The launch is asynchronous: wait for the kernel to finish before the //
  // timer is read. cudaDeviceSynchronize() replaces the deprecated //
  // cudaThreadSynchronize(). //
  cudaVerify(cudaDeviceSynchronize());

  // NOTE(review): getTimer() is assumed to return the seconds elapsed //
  // since initTimer(), as the message below prints it -- confirm in //
  // cuda_utils.h. //
  double duration = getTimer(&timer);

  // two floating point operations and two float loads per element //
  fprintf(stderr,"duration = %e seconds, performance = %e FLOPS\n"
          "memory bandwidth = %e bytes/s\n",
          duration,2.*vector_size/duration,
          sizeof(float)*2.*vector_size/duration);

  // copy result from gpu to cpu //
  cudaVerify(cudaMemcpy(&result_cpu,result_gpu,sizeof(float),
                        cudaMemcpyDeviceToHost));

  // verify result against a double precision reference sum //
  double sum=0;
  for(int i=0;i<vector_size;i++) {
    sum+=vector_a_cpu[i]*vector_b_cpu[i];
  }
  // relative comparison carried out in double precision: fabs() instead //
  // of fabsf() keeps the reference sum from being narrowed to float //
  if (fabs(sum-result_cpu)>fabs(sum+result_cpu)*1e-5) {
    fprintf(stderr,"computation result is wrong: expected=%g cuda result=%g\n",
            sum,result_cpu);
  } else {
    fprintf(stderr,"computation result is correct.\n");
  }
  // free gpu memory //
  cudaVerify(cudaFree(vector_a_gpu));
  cudaVerify(cudaFree(vector_b_gpu));
  cudaVerify(cudaFree(result_gpu));

  // free cpu memory, checked like every other CUDA call //
  cudaVerify(cudaFreeHost(vector_a_cpu));
  cudaVerify(cudaFreeHost(vector_b_cpu));

  // exit //
  return 0;
}
