#define _SVID_SOURCE

#include <CL/cl.h>
#include "opencl_utils.h"

#include <stdio.h>
#include <malloc.h>
#include <math.h>
#include <stdlib.h>
#include <stdbool.h>
#include <time.h>

/*
 * Host driver for an OpenCL matrix-transpose benchmark.
 *
 * Fills a size_y_a x size_x_a row-major float matrix with random values,
 * runs the "matrix_transpose" kernel from exercise17_kernel.cl on the
 * default device of the first platform, checks on the host that
 * b[j][i] == a[i][j] for every element, and prints the duration of one
 * timed kernel launch together with the derived memory bandwidth.
 *
 * Returns 0 after a completed run; calls exit(1) if the matrix extents are
 * not multiples of the work-group size, if no OpenCL platform is reported,
 * or if a host buffer cannot be allocated.
 * NOTE(review): the exit status is 0 even when the result check fails --
 * confirm whether callers rely on that.
 */
int main(void) {

  // define matrix size //
  const int size_x_a = 4096;
  const int size_y_a = 2048*3;

  // the output matrix has the extents of the input swapped //
  const int size_x_b = size_y_a;
  const int size_y_b = size_x_a;

  const int group_size=16;

  if (size_x_a%group_size!=0 || size_y_a%group_size!=0) {
    printf("error: matrix size must be evenly divisible by block size.\n"
           "Aborting.\n");
    exit(1);
  }

  const int matrix_size=size_x_a*size_y_a;
  // byte size shared by both host buffers and both device buffers //
  const size_t matrix_bytes=(size_t)matrix_size*sizeof(float);

  srand48(time(NULL));

  // get platform and device information //
  cl_platform_id platform=NULL;
  cl_device_id device=NULL;
  cl_uint deviceCount=0;
  cl_uint platformCount=0;
  cl_int result=0;
  openclVerify(clGetPlatformIDs(1,&platform,&platformCount));
  // do not hand a NULL platform to clGetDeviceIDs //
  if (platformCount==0 || platform==NULL) {
    printf("error: no OpenCL platform found.\n"
           "Aborting.\n");
    exit(1);
  }
  openclVerify(clGetDeviceIDs(platform,CL_DEVICE_TYPE_DEFAULT,1,
                              &device,&deviceCount));

  // create OpenCL context //
  cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,&result);
  openclVerifyVal(result);

  // create command queue //
  cl_command_queue command_queue
    = clCreateCommandQueue(context,device,0,&result);
  openclVerifyVal(result);

  // allocate host matrices //
  float *matrix_a_cpu=malloc(matrix_bytes);
  float *matrix_b_cpu=malloc(matrix_bytes);
  if (matrix_a_cpu==NULL || matrix_b_cpu==NULL) {
    printf("error: could not allocate host memory.\n"
           "Aborting.\n");
    free(matrix_a_cpu);
    free(matrix_b_cpu);
    exit(1);
  }

  // initialize input matrix with random values //
  for(int i=0;i<size_y_a;i++) {
    for(int j=0;j<size_x_a;j++) {
      matrix_a_cpu[i*size_x_a+j]=drand48();
    }
  }
  // initialize cpu output matrix to 0 //
  for(int i=0;i<size_y_b;i++) {
    for(int j=0;j<size_x_b;j++) {
      matrix_b_cpu[i*size_x_b+j]=0;
    }
  }

  // create memory buffers on the device for each matrix //
  cl_mem matrix_a_gpu=clCreateBuffer(context,CL_MEM_READ_ONLY,
                                     matrix_bytes,NULL,&result);
  openclVerifyVal(result);
  cl_mem matrix_b_gpu=clCreateBuffer(context,CL_MEM_WRITE_ONLY,
                                     matrix_bytes,NULL,&result);
  openclVerifyVal(result);

  // copy input matrix to gpu (blocking write) //
  openclVerify(clEnqueueWriteBuffer(command_queue,matrix_a_gpu,CL_TRUE,0,
                                    matrix_bytes,matrix_a_cpu,
                                    0,NULL,NULL));

  // create program object, read kernel source file and compile it //
  cl_program program
    =openclCreateProgramFromSourceFile(context,device,"exercise17_kernel.cl");

  // create kernel object //
  cl_kernel kernel=clCreateKernel(program,"matrix_transpose",&result);
  openclVerifyVal(result);

  // set kernel arguments: input, output, input width, input height //
  openclVerify(clSetKernelArg(kernel,0,sizeof(cl_mem),&matrix_a_gpu));
  openclVerify(clSetKernelArg(kernel,1,sizeof(cl_mem),&matrix_b_gpu));
  openclVerify(clSetKernelArg(kernel,2,sizeof(int),&size_x_a));
  openclVerify(clSetKernelArg(kernel,3,sizeof(int),&size_y_a));

  // this is the 'block size' //
  size_t local_item_size[2]={group_size,group_size};
  // global item size ('block_size*grid_size') //
  size_t global_item_size[2]={size_x_b,size_y_b};

  // launch the kernel (just to warm up the framework) //
  openclVerify(clEnqueueNDRangeKernel(command_queue,kernel,2,NULL,
                                      global_item_size,local_item_size,
                                      0,NULL,NULL));
  openclVerify(clFinish(command_queue));

  Timer timer;
  initTimer(&timer);

  // launch the kernel (for measurement) //
  openclVerify(clEnqueueNDRangeKernel(command_queue,kernel,2,NULL,
                                      global_item_size,local_item_size,
                                      0,NULL,NULL));

  // wait for completion so the timer covers the whole kernel run //
  openclVerify(clFinish(command_queue));
  double durationKernel = getAndResetTimer(&timer);

  // copy output matrix from gpu to cpu (blocking read) //
  openclVerify(clEnqueueReadBuffer(command_queue,matrix_b_gpu,CL_TRUE,0,
                                   matrix_bytes,matrix_b_cpu,
                                   0,NULL,NULL));

  // verify results: element (i,j) of a must equal element (j,i) of b; //
  // only the first mismatch is reported //
  bool success=true;
  for(int i=0;i<size_y_a && success;i++) {
    for(int j=0;j<size_x_a;j++) {
      const float expected=matrix_a_cpu[i*size_x_a+j];
      const float actual=matrix_b_cpu[j*size_x_b+i];
      if (expected!=actual) {
        printf("computation result is wrong: position=(%i,%i) reference=%.12e"
               " cuda result=%.12e\n",
               i,j,expected,actual);
        success=false;
        break;
      }
    }
  }
  if (success) {
    printf("computation result is correct.\n");
  }

  // every element is read once and written once, hence the factor 2 //
  printf("duration=%le seconds, memory bandwidth = %le bytes/s\n",
         durationKernel,matrix_size*2.*sizeof(float)/durationKernel);

  // clean up //
  openclVerify(clFinish(command_queue));
  openclVerify(clReleaseKernel(kernel));
  openclVerify(clReleaseProgram(program));
  openclVerify(clReleaseMemObject(matrix_a_gpu));
  openclVerify(clReleaseMemObject(matrix_b_gpu));
  openclVerify(clReleaseCommandQueue(command_queue));
  openclVerify(clReleaseContext(context));
  free(matrix_a_cpu);
  free(matrix_b_cpu);
  return 0;
}
