#define _SVID_SOURCE
#include <malloc.h>
#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "cuda_utils.h"

// Dot product of the first `size` elements of `a` and `b`.
// Each element product is formed from two floats; the running total is kept
// in a double and narrowed to float only when returned.
// A non-positive `size` yields 0.
float ScalarProduct(float* a,float* b,int size) {
  const float* lhs = a;
  const float* rhs = b;
  double total = 0.0;

  for (int left = size; left > 0; --left) {
    total += *lhs * *rhs;
    ++lhs;
    ++rhs;
  }

  return (float)total;
}
// CPU code //
// Host-only benchmark driver.
// Fills two vectors of vector_size floats with drand48() values, times a
// single call to ScalarProduct, prints throughput figures to stderr, and then
// re-accumulates the same products in a local double to verify the result.
// Returns EXIT_FAILURE if either vector cannot be allocated, 0 otherwise
// (a failed verification is only reported on stderr).
int main() {

  // seed the drand48 generator from the wall clock //
  srand48(time(NULL));

  // define vector size //
  const int vector_size = 1000000;
  const size_t vector_bytes = vector_size*sizeof(float);

  // allocate memory //
  float* vector_a = (float*)malloc(vector_bytes);
  float* vector_b = (float*)malloc(vector_bytes);
  if (vector_a == NULL || vector_b == NULL) {
    // without this check the initialization loop would dereference NULL //
    fprintf(stderr,"failed to allocate two vectors of %zu bytes\n",
            vector_bytes);
    free(vector_a);  // free(NULL) is a no-op, so both calls are safe //
    free(vector_b);
    return EXIT_FAILURE;
  }
  float result = 0;

  // initialize vectors //
  for(int i=0;i<vector_size;i++) {
    vector_a[i]=drand48();
    vector_b[i]=drand48();
  }

  // Timer, initTimer and getTimer are not defined in this file
  // (presumably declared in cuda_utils.h -- confirm). The value returned by
  // getTimer is reported below as seconds.
  Timer timer;
  initTimer(&timer);

  // call computation function
  result=ScalarProduct(vector_a,vector_b,vector_size);

  double duration = getTimer(&timer);

  // 2 floating-point operations (multiply + add) and 2 float loads are
  // counted per element //
  fprintf(stderr,"duration = %e seconds, performance = %e FLOPS\n"
          "memory bandwidth = %e bytes/s\n",
          duration,2.*vector_size/duration,
          sizeof(float)*2.*vector_size/duration);

  // verify result //
  double sum=0;
  for(int i=0;i<vector_size;i++) {
    sum+=vector_a[i]*vector_b[i];
  }
  // fabs (double) instead of fabsf: the operands are double, and fabsf would
  // narrow them to float before the relative-tolerance comparison //
  if (fabs(sum-result)>fabs(sum+result)*1e-5) {
    fprintf(stderr,"computation result is wrong: expected=%g cuda result=%g\n",
            sum,result);
  } else {
    fprintf(stderr,"computation result is correct.\n");
  }
  // free memory //
  free(vector_a);
  free(vector_b);

  // exit //
  return 0;
}
