#define _SVID_SOURCE

#include <CL/cl.h>
#include "opencl_utils.h"

#include <stdio.h>
#include <malloc.h>
#include <math.h>
#include <stdlib.h>
#include <stdbool.h>
#include <time.h>

// Compile-time guard: stop the build with an explanatory message unless the //
// compiler reports a C++11-level __cplusplus value or defines the older     //
// GCC macro __GXX_EXPERIMENTAL_CXX0X__ (set by gcc's '-std=c++0x' mode).    //
#if __cplusplus <201100L && !defined __GXX_EXPERIMENTAL_CXX0X__
#error this source requires a C++11 compiler (for gcc try '-std=c++0x')
#endif

int main(void) {

  int vector_size=10000000;

  srand48(time(NULL));
  
  // get platform and device information //
  cl_platform_id platform=NULL;
  cl_device_id device=NULL;   
  cl_uint deviceCount;
  cl_uint platformCount;
  cl_int result=0;
  openclVerify(clGetPlatformIDs(1,&platform,&platformCount));
  openclVerify(clGetDeviceIDs(platform,CL_DEVICE_TYPE_DEFAULT,1, 
                              &device,&deviceCount));

  // create OpenCL context //
  cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,&result);
  openclVerifyVal(result);
  
  // create command queue //
  cl_command_queue command_queue 
    = clCreateCommandQueue(context,device,0,&result);
  openclVerifyVal(result);

  // allocate host vectors //
  float* vector_a_cpu=(float*)malloc(vector_size*sizeof(float));
  float* vector_b_cpu=(float*)malloc(vector_size*sizeof(float));
  float* vector_c_cpu=(float*)malloc(vector_size*sizeof(float));

  // create memory buffers on the device for each vector //
  cl_mem vector_a_gpu=clCreateBuffer(context, CL_MEM_READ_ONLY, 
                                     vector_size*sizeof(float),NULL,&result);
  openclVerifyVal(result);
  cl_mem vector_b_gpu=clCreateBuffer(context, CL_MEM_READ_ONLY,
                                     vector_size*sizeof(float),NULL,&result);
  openclVerifyVal(result);
  cl_mem vector_c_gpu=clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
                                     vector_size*sizeof(float),NULL,&result);
  openclVerifyVal(result);

  char dummy=0;
  // copy something to input vector a and b (just to warm up framework) //
  openclVerify(clEnqueueWriteBuffer(command_queue,vector_a_gpu,CL_TRUE,0,
                                    sizeof(char),&dummy,0,NULL,NULL));
  openclVerify(clEnqueueWriteBuffer(command_queue,vector_b_gpu,CL_TRUE,0,
                                    sizeof(char),&dummy,0,NULL,NULL));

  // initialize cpu input vectors //
  for(int i=0;i<vector_size;i++) {
    vector_a_cpu[i]=drand48();
    vector_b_cpu[i]=drand48();
  }
  // initialize cpu output vector to 0 //
  for(int i=0;i<vector_size;i++) {
    vector_c_cpu[i]=0;
  }

  Timer timer;
  initTimer(&timer);

  // copy input vectors a and b to gpu //
  openclVerify(clEnqueueWriteBuffer(command_queue,vector_a_gpu,CL_TRUE,0,
                                    vector_size*sizeof(float),vector_a_cpu,
                                    0,NULL,NULL));
  openclVerify(clEnqueueWriteBuffer(command_queue,vector_b_gpu,CL_TRUE,0, 
                                    vector_size*sizeof(float),vector_b_cpu, 
                                    0, NULL, NULL));

  double durationMemoryCopyIn = getTimer(&timer);

  // create program object, read kernel source file and compile it //
  cl_program program 
    =openclCreateProgramFromSourceFile(context,device,"exercise16_kernel.cl");
  
  // create kernel object //
  cl_kernel kernel=clCreateKernel(program,"vector_add",&result);
  openclVerifyVal(result);

  // this is the 'block size' //
  size_t local_item_size=128;
  // global item size ('block_size*grid_size')   //
  // must be evenly divisible by local_item_size //
  size_t global_item_size=(vector_size+local_item_size-1)
                           /local_item_size*local_item_size; 

  // launch the kernel (just to warm up the framework) //
  openclCallKernel<cl_mem,cl_mem,cl_mem,float>
    (command_queue,kernel,1,&global_item_size,&local_item_size,
    vector_a_gpu,vector_b_gpu,vector_c_gpu,vector_size);
  openclVerify(clFinish(command_queue));

  initTimer(&timer);

  // launch the kernel (for measurement) //
  openclCallKernel<cl_mem,cl_mem,cl_mem,float>
    (command_queue,kernel,1,&global_item_size,&local_item_size,
    vector_a_gpu,vector_b_gpu,vector_c_gpu,vector_size);
  openclVerify(clFinish(command_queue));

  double durationKernel = getAndResetTimer(&timer);

  // copy output vector from gpu to cpu //
  openclVerify(clEnqueueReadBuffer(command_queue,vector_c_gpu,CL_TRUE,0, 
                                   vector_size*sizeof(float),vector_c_cpu,
                                   0,NULL,NULL));

  double durationMemoryCopyOut = getTimer(&timer);

  // verify results //
  bool success=true;
  for(int i=0;i<vector_size;i++) {
    if (fabsf(vector_a_cpu[i]+vector_b_cpu[i]-vector_c_cpu[i])>1e-5) {
      printf("computation result is wrong: index=%i expected result=%g "
             "cuda result=%g\n", i, 
             vector_a_cpu[i]+vector_b_cpu[i],vector_c_cpu[i]);
      success=false;
      break;
    }
  }
  if (success) {
    printf("computation result is correct.\n");
  }

  printf("duration (copy in)          = %e seconds\n"
         "memory bandwidth (copy in)  = %e bytes/s\n",
         durationMemoryCopyIn,vector_size*2.*sizeof(float)/durationMemoryCopyIn);
  printf("duration (kernel)           = %e seconds, performance = %e FLOPS\n"
         "memory bandwidth (kernel)   = %e bytes/s\n",
         durationKernel,vector_size/durationKernel,
         vector_size*3.*sizeof(float)/durationKernel);
  printf("duration (copy out)         = %e seconds\n"
         "memory bandwidth (copy out) = %e bytes/s\n",
         durationMemoryCopyOut,vector_size*sizeof(float)/durationMemoryCopyOut);

  // clean up //
  openclVerify(clFinish(command_queue));
  openclVerify(clReleaseKernel(kernel));
  openclVerify(clReleaseProgram(program));
  openclVerify(clReleaseMemObject(vector_a_gpu));
  openclVerify(clReleaseMemObject(vector_b_gpu));
  openclVerify(clReleaseMemObject(vector_c_gpu));
  openclVerify(clReleaseCommandQueue(command_queue));
  openclVerify(clReleaseContext(context));
  free(vector_a_cpu);
  free(vector_b_cpu);
  free(vector_c_cpu);
  return 0;
}
