#define _SVID_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <malloc.h>
#include <stdbool.h>
#include <time.h>

#include "cuda_utils.h"

/*
 * Scales a random float vector by a random scalar inside an OpenACC
 * parallel loop, reports the time spent in the loop together with the
 * derived memory bandwidth, and then re-computes every element on the
 * host to verify the device result.
 *
 * Returns 0 when the result verifies, EXIT_FAILURE when host memory
 * cannot be allocated or when a mismatching element is found.
 */
int main() {

  // seed the drand48() generator from the wall clock //
  srand48(time(NULL));

  // define vector size //
  const int vector_size = 8000000;

  // allocate memory //
  float* vector_a = (float*)malloc(vector_size*sizeof(float));
  float* vector_c = (float*)malloc(vector_size*sizeof(float));
  if (vector_a == NULL || vector_c == NULL) {
    // malloc may fail for buffers of this size; bail out instead of
    // dereferencing NULL in the initialisation loop below
    fprintf(stderr,"failed to allocate %zu bytes of host memory\n",
            2*vector_size*sizeof(float));
    free(vector_a);
    free(vector_c);
    return EXIT_FAILURE;
  }

  // scalar factor; drawn before the vector so the random sequence is
  // consumed in the same order as before //
  float b=drand48();

  // initialize input vector with random values and output vector to 0 //
  for(int i=0;i<vector_size;i++) {
    vector_a[i]=drand48();
    vector_c[i]=0;
  }

  double duration;

  // execute computation on device //
  // vector_a is copied in at region entry and vector_c is copied out at
  // region exit; the timer is started and read inside the region, so
  // those two transfers are not part of the measured duration
  #pragma acc data copyin(vector_a[0:vector_size]) copyout(vector_c[0:vector_size])
  {
    // Timer, initTimer and getTimer come from cuda_utils.h.
    // NOTE(review): getTimer presumably returns elapsed seconds (the
    // report below prints it as "s") -- confirm against cuda_utils.h
    Timer timer;
    initTimer(&timer);
    // note: this brace is only a workaround for Cray compiler bug
    {
    #pragma acc parallel loop async(1)
    for(int i=0;i<vector_size;i++) {
      vector_c[i]=vector_a[i]*b;
    }
    }
    // the loop was queued asynchronously on queue 1; wait for it to
    // finish before reading the timer
    #pragma acc wait(1)
    duration = getTimer(&timer);
  }

  // one float read plus one float written per element => factor 2 //
  fprintf(stderr,"duration=%g s, memory bandwidth = %g bytes/s\n",
          duration,vector_size*sizeof(float)*2./duration);

  // verify results //
  // recompute each element on the host and compare with a tolerance
  // that scales with b; stop at the first mismatch
  bool success=true;
  for(int i=0;i<vector_size;i++) {
    if (fabsf(vector_a[i]*b-vector_c[i])>1e-5*b) {
      printf("computation result is wrong: index=%i expected result=%g "
             "actual result=%g\n", i, vector_a[i]*b,vector_c[i]);
      success=false;
      break;
    }
  }
  if (success) {
    printf("computation result is correct.\n");
  }

  // free memory //
  free(vector_a);
  free(vector_c);

  // exit //
  // report a failed verification through the exit status so callers
  // (scripts, CI) can detect it
  return success ? 0 : EXIT_FAILURE;
}
