#include<malloc.h>
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<time.h>

#include "cuda_utils.h"

// CUDA Kernel //

// Element-wise scaling kernel: c[i] = a[i] * b for every i in [0, size).
//
// Parameters:
//   a    - input vector (only read), at least `size` floats
//   b    - scalar factor
//   c    - output vector (only written), at least `size` floats
//   size - number of elements to process; size <= 0 does nothing
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so every
// element is processed even when the grid is smaller than the vector.
// Each thread reads a[i] before writing c[i] for the same i, so a and c
// may point to the same buffer.
void __global__ eval(float* a,float b,float* c,int size) {
  // 64-bit arithmetic: blockIdx.x*blockDim.x is evaluated in 32-bit unsigned
  // otherwise and may wrap before being compared against size.
  const long long stride=(long long)gridDim.x*blockDim.x;
  for(long long index=threadIdx.x+(long long)blockIdx.x*blockDim.x;
      index<size;
      index+=stride) {
    c[index]=a[index]*b;
  }
}


// CPU code //
// Host driver: fills a vector with random values, multiplies it by a random
// scalar on the GPU with the eval kernel, and compares the GPU result with
// the same product computed on the CPU.
//
// Returns EXIT_SUCCESS when every element matches within tolerance and
// EXIT_FAILURE otherwise, so callers and scripts can detect a wrong result.
//
// NOTE(review): cudaVerify / cudaVerifyKernel come from cuda_utils.h and are
// presumably error-checking wrappers -- confirm their behavior there.
int main() {

  srand48(time(NULL));

  // define vector size //
  const int vector_size = 10000;
  const size_t vector_bytes = vector_size*sizeof(float);

  // number of threads per block //
  const int block_dim = 32;

  // allocate memory on cpu (cudaMallocHost returns page-locked memory) //
  float* vector_a_cpu;
  float* vector_c_cpu;
  cudaVerify(cudaMallocHost((void**)&vector_a_cpu,vector_bytes));
  cudaVerify(cudaMallocHost((void**)&vector_c_cpu,vector_bytes));

  // drand48() returns a double; narrow explicitly to float //
  const float b=(float)drand48();
  // initialize cpu input vector, and cpu output vector to 0 //
  for(int i=0;i<vector_size;i++) {
    vector_a_cpu[i]=(float)drand48();
    vector_c_cpu[i]=0.0f;
  }

  // allocate memory on gpu //
  float* vector_a_gpu;
  float* vector_c_gpu;
  cudaVerify(cudaMalloc((void**)&vector_a_gpu,vector_bytes));
  cudaVerify(cudaMalloc((void**)&vector_c_gpu,vector_bytes));

  // copy input vector to gpu //
  cudaVerify(cudaMemcpy(vector_a_gpu,vector_a_cpu,
                        vector_bytes,cudaMemcpyHostToDevice));
  // initialize gpu output vector to 0 //
  // (all-zero bytes are 0.0f, so no host-to-device copy is needed) //
  cudaVerify(cudaMemset(vector_c_gpu,0,vector_bytes));

  // one thread per element, rounded up to whole blocks //
  int grid_dim = (vector_size+block_dim-1)/block_dim;

  // the maximum grid x-dimension depends on the device, so query it //
  // instead of assuming a fixed limit //
  int device = 0;
  int max_grid_dim = 0;
  cudaVerify(cudaGetDevice(&device));
  cudaVerify(cudaDeviceGetAttribute(&max_grid_dim,cudaDevAttrMaxGridDimX,
                                    device));

  if (grid_dim>max_grid_dim) {
    fprintf(stderr,"warning: grid dimension must not exceed %d on this device.\n"
            "         current value = %d. Kernel execution will fail.\n",
            max_grid_dim,grid_dim);
  }

  // cuda kernel call //
  cudaVerifyKernel((eval<<<grid_dim,block_dim>>>(vector_a_gpu,b,vector_c_gpu,
                                                 vector_size)));

  // copy output vector from gpu to cpu //
  cudaVerify(cudaMemcpy(vector_c_cpu,vector_c_gpu,
                        vector_bytes,cudaMemcpyDeviceToHost));

  // verify results //
  const float tolerance=1e-5f*b;
  bool success=true;
  for(int i=0;i<vector_size;i++) {
    const float expected=vector_a_cpu[i]*b;
    const float error=fabsf(expected-vector_c_cpu[i]);
    // written as !(error<=tolerance) so that a NaN result is reported as //
    // wrong; any comparison with NaN is false //
    if (!(error<=tolerance)) {
      printf("computation result is wrong: index=%i expected result=%g "
             "cuda result=%g\n", i, expected, vector_c_cpu[i]);
      success=false;
      break;
    }
  }
  if (success) {
    printf("computation result is correct.\n");
  }

  // free gpu memory //
  cudaVerify(cudaFree(vector_a_gpu));
  cudaVerify(cudaFree(vector_c_gpu));

  // free cpu memory //
  cudaVerify(cudaFreeHost(vector_a_cpu));
  cudaVerify(cudaFreeHost(vector_c_cpu));

  // exit: report a failed verification through the process status //
  return success ? EXIT_SUCCESS : EXIT_FAILURE;
}
