#define _SVID_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <malloc.h>
#include <stdbool.h>
#include <time.h>

#include "cuda_utils.h"

// CPU vector sum (plain host function, not a CUDA kernel) //

// Element-wise sum of two float arrays, computed by a sequential host loop.
//
//   a, b : inputs; elements 0 .. size-1 of each are read
//   c    : output; c[k] is set to a[k] + b[k] for every k in 0 .. size-1
//   size : number of elements to process; nothing is touched when size <= 0
void VectorSum(float* a,float* b,float* c,int size) {
  int k = 0;
  while (k < size) {
    const float lhs = a[k];
    const float rhs = b[k];
    c[k] = lhs + rhs;
    ++k;
  }
}


// CPU code //

// Host-side driver: fills two vectors with drand48() values, times one call
// to VectorSum over them, re-checks every element of the result and prints
// the elapsed time together with an additions-per-second figure.
//
// Returns 0 after a completed run (the verification outcome is only reported
// on stdout), or EXIT_FAILURE when one of the host buffers cannot be
// allocated.
int main() {

  // seed the drand48() generator from the wall clock //
  srand48(time(NULL));

  // define vector size //
  const int vector_size = 10000000;
  const size_t vector_bytes = (size_t)vector_size*sizeof(float);

  // allocate memory on cpu //
  float* vector_a=(float*)malloc(vector_bytes);
  float* vector_b=(float*)malloc(vector_bytes);
  float* vector_c=(float*)malloc(vector_bytes);

  // each request is ~40 MB and may fail; stop here rather than write //
  // through a NULL pointer in the loops below                        //
  if (vector_a==NULL || vector_b==NULL || vector_c==NULL) {
    fprintf(stderr,"failed to allocate %zu bytes per vector\n",vector_bytes);
    // free(NULL) is a no-op, so releasing all three is safe //
    free(vector_a);
    free(vector_b);
    free(vector_c);
    return EXIT_FAILURE;
  }

  // initialize vectors //
  for(int i=0;i<vector_size;i++) {
    vector_a[i]=drand48();
    vector_b[i]=drand48();
  }

  // Timer, initTimer and getTimer are not defined in this file (presumably //
  // they come from cuda_utils.h); getTimer is assumed to return the seconds //
  // elapsed since initTimer, as the report below labels it -- TODO confirm  //
  Timer timer;
  initTimer(&timer);

  // vector operation call //
  VectorSum(vector_a,vector_b,vector_c,vector_size);

  double duration = getTimer(&timer);

  // verify results: recompute each sum and stop at the first mismatch //
  bool success=true;
  for(int i=0;i<vector_size;i++) {
    if (fabsf(vector_c[i]-(vector_a[i]+vector_b[i]))>1e-5) {
      printf("computation result is wrong: index=%i a[i]+b[i]=%g"
             " cuda result=%g\n",
             i,vector_a[i]+vector_b[i],vector_c[i]);
      success=false;
      break;
    }
  }
  if (success) {
    printf("computation result is correct.\n");
  }

  // one addition per element, so elements/second is reported as FLOPS //
  printf("duration=%le seconds, performance = %le FLOPS\n",
         duration,vector_size/duration);

  // free host memory //
  free(vector_a);
  free(vector_b);
  free(vector_c);

  // exit //
  return 0;
}
