#include <stdio.h>
#include <stdlib.h>

#include "cuda_utils.h"

/* Human-readable names for the device compute mode, indexed by the integer
 * value of cudaDeviceProp::computeMode (0 = default, 1 = exclusive,
 * 2 = prohibited, 3 = exclusive-process).
 * The entries point at string literals, so both the characters and the
 * pointers themselves are declared const. */
static const char* const computeModeStrings[] =
  { "default (multiple processes/threads per device)",
    "exclusive", "prohibited", "exclusive-process"};

/* Query the properties of one CUDA device and print them to stdout,
 * one "label: value" line per property.
 *
 * device: ordinal of the device to query, passed unchanged to
 *         cudaGetDeviceProperties.
 *
 * The query is wrapped in cudaVerify (declared in cuda_utils.h); it is
 * presumably responsible for reporting a failed call -- confirm there. */
void printDeviceProperties(int device) {

  cudaDeviceProp properties;
  cudaVerify(cudaGetDeviceProperties(&properties,device));

  printf("device name: %s\n",properties.name);
  /* The integer fields of cudaDeviceProp are plain int, hence %d. */
  printf("compute capability: %d.%d\n",properties.major,properties.minor);
  printf("number of multiprocessors: %d\n",properties.multiProcessorCount);
  /* size_t fields are widened to unsigned long long and printed with %llu;
   * the 'L' length modifier is only defined for long double conversions. */
  printf("global memory: %llu bytes\n",(unsigned long long)properties.totalGlobalMem);
  printf("size of L2 cache: %d bytes\n",properties.l2CacheSize);
  printf("maximum shared memory per block: %llu bytes\n",
         (unsigned long long)properties.sharedMemPerBlock);
  printf("maximum number of registers per block: %d\n",properties.regsPerBlock);
  printf("warp size: %d\n",properties.warpSize);
  printf("maximum number of threads per block: %d\n",properties.maxThreadsPerBlock);
  printf("maximum block dimensions : (%d,%d,%d)\n",properties.maxThreadsDim[0],
         properties.maxThreadsDim[1],properties.maxThreadsDim[2]);
  printf("maximum grid dimensions: (%d,%d,%d)\n",properties.maxGridSize[0],
         properties.maxGridSize[1],properties.maxGridSize[2]);
  printf("maximum number of threads per multiprocessor: %d\n",
         properties.maxThreadsPerMultiProcessor);

  /* Clock rates are reported in kHz; dividing by 1000 yields MHz. */
  printf("GPU clock frequency: %f MHz\n",properties.clockRate/1000.);
  printf("memory clock frequency: %f MHz\n",properties.memoryClockRate/1000.);
  printf("memory bus width: %d bits\n",properties.memoryBusWidth);
  /* kHz * bits * (2 transfers per clock) / (8 bits per byte) * 1e-6 = GB/s.
   * The factor 2 assumes double-data-rate memory. */
  printf("peak memory bandwidth: %f GB/s\n",
         2./8*1e-6*properties.memoryClockRate*properties.memoryBusWidth);
  printf("total constant memory: %llu bytes\n",
         (unsigned long long)properties.totalConstMem);

  printf("can execute multiple kernels concurrently: %s\n",
         properties.concurrentKernels?"yes":"no");
  /* deviceOverlap is deprecated in favour of asyncEngineCount: at least one
   * copy engine means a copy can overlap kernel execution, two or more mean
   * copies in both directions can overlap as well. */
  printf("can overlap data transfer and computation: %s\n",
         properties.asyncEngineCount>=1?"yes":"no");
  printf("can overlap data transfers from/to device: %s\n",
         properties.asyncEngineCount>=2?"yes":"no");
  printf("can map host memory: %s\n",properties.canMapHostMemory?"yes":"no");
  printf("unified address space enabled: %s\n",
         properties.unifiedAddressing?"yes":"no");

  /* Guard the table lookup so an unexpected compute mode value cannot
   * index past the end of computeModeStrings. */
  const int modeCount =
    (int)(sizeof(computeModeStrings)/sizeof(computeModeStrings[0]));
  const char* modeName = "unknown";
  if (properties.computeMode>=0 && properties.computeMode<modeCount) {
    modeName = computeModeStrings[properties.computeMode];
  }
  printf("compute mode: %s\n",modeName);

  printf("kernel timeout enabled: %s\n",
         properties.kernelExecTimeoutEnabled?"yes":"no");
  printf("ECC memory enabled: %s\n",properties.ECCEnabled?"yes":"no");
  printf("is integrated device: %s\n",properties.integrated?"yes":"no");
}

/* Print whether `device` can directly access the memory of `devicePeer`.
 *
 * device:     ordinal of the device that would perform the access.
 * devicePeer: ordinal of the device whose memory would be accessed.
 *
 * The query's return code is passed through cudaVerify (from cuda_utils.h),
 * like the other runtime calls in this file, so a failed query is no longer
 * silently reported as "no". */
void printDeviceCanAccessPeer(int device,int devicePeer) {
  int result=0;
  cudaVerify(cudaDeviceCanAccessPeer(&result,device,devicePeer));
  printf("can access memory of peer device %i: %s\n",devicePeer,result?"yes":"no");
}

/* Entry point: for every CUDA device, print its properties followed by its
 * peer-access capability towards each other device. */
int main() {

  int deviceCount;
  cudaVerify(cudaGetDeviceCount(&deviceCount));

  for(int dev=0;dev<deviceCount;++dev) {
    printf("--- device %i: ---\n",dev);
    printDeviceProperties(dev);

    /* Report peer access towards every other device. */
    for(int peer=0;peer<deviceCount;++peer) {
      if (peer==dev) {
        continue;  /* a device is not its own peer */
      }
      printDeviceCanAccessPeer(dev,peer);
    }

    /* Blank line separates the reports of consecutive devices. */
    printf("\n");
  }

  return 0;
}
