// naive version //
//
// Transposes a into b with one work-item per element and no staging.
//
//   a        : input matrix, read row-major as size_y_a rows of size_x_a floats.
//   b        : output matrix, written row-major as size_x_a rows of size_y_a
//              floats.
//   size_x_a : row length (number of columns) of a.
//   size_y_a : number of rows of a.
//
// Launch over a 2D range in which dimension 0 spans the rows of a and
// dimension 1 spans its columns. Work-items that fall outside the matrix do
// nothing, so the global size may be rounded up past the matrix dimensions.
//
// Work-items adjacent in dimension 0 write neighbouring elements of b but
// read elements of a that are size_x_a floats apart.
__kernel void matrix_transpose_naive(__global const float* a, __global float* b,
                                     int size_x_a, int size_y_a) {
  int index_x = (int)get_global_id(0);  // row of a, column of b
  int index_y = (int)get_global_id(1);  // column of a, row of b

  // Skip padding work-items so a rounded-up global size never touches memory
  // outside either matrix.
  if (index_x >= size_y_a || index_y >= size_x_a) {
    return;
  }

  b[index_y * size_y_a + index_x] = a[index_x * size_x_a + index_y];
}

// local memory version (local memory is what CUDA calls shared memory) //
//
// Transposes a into b one 16x16 tile per work-group. Each tile is staged in
// __local memory so that both the global read of a and the global write of b
// step through consecutive addresses along work-item dimension 0.
//
//   a        : input matrix, read row-major as size_y_a rows of size_x_a floats.
//   b        : output matrix, written row-major as size_x_a rows of size_y_a
//              floats.
//   size_x_a : row length (number of columns) of a.
//   size_y_a : number of rows of a.
//
// Must be launched with a 16x16 work-group: the tile indexing below is
// hard-wired to TILE_DIM. Dimension 0 of the global range spans the rows of a
// (columns of b) and dimension 1 spans the columns of a (rows of b).
// Work-items outside the matrix skip their load and store but still reach the
// barrier, so partial tiles at the matrix edges are handled.
__kernel void matrix_transpose(__global const float* a, __global float* b,
                               int size_x_a, int size_y_a) {
  enum { TILE_DIM = 16 };

  // The extra column changes the stride of the transposed store below from 16
  // to 17 floats. On typical GPU hardware this avoids local-memory bank
  // conflicts; it has no effect on the result.
  __local float tile[TILE_DIM][TILE_DIM + 1];

  int item_x = (int)get_local_id(0);
  int item_y = (int)get_local_id(1);
  int tile_origin_x = (int)get_group_id(0) * TILE_DIM;
  int tile_origin_y = (int)get_group_id(1) * TILE_DIM;

  // Load: item_x walks along a row of a. The element is stored transposed
  // inside the tile.
  int row_a = tile_origin_x + item_y;
  int col_a = tile_origin_y + item_x;
  if (row_a < size_y_a && col_a < size_x_a) {
    tile[item_x][item_y] = a[row_a * size_x_a + col_a];
  }

  // Every work-item in the group must reach the barrier, which is why the
  // bounds checks guard the accesses instead of returning early.
  barrier(CLK_LOCAL_MEM_FENCE);

  // Store: item_x walks along a row of b. The tile element read here was
  // loaded by the work-item with swapped local ids, whose load guard is
  // equivalent to this store guard, so the value is always initialised.
  int row_b = tile_origin_y + item_y;
  int col_b = tile_origin_x + item_x;
  if (row_b < size_x_a && col_b < size_y_a) {
    b[row_b * size_y_a + col_b] = tile[item_y][item_x];
  }
}
