benchmarks/streams/CUDAVersion.cu

/*
  STREAM benchmark implementation in CUDA.

    COPY:       a(i) = b(i)
    SCALE:      a(i) = q*b(i)
    SUM:        a(i) = b(i) + c(i)
    TRIAD:      a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in single precision.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
/*
 * Help text passed to PetscInitialize() in main().
 * Declared const because it is initialized from string literals, which
 * must not be reachable through a pointer to non-const char.
 */
static const char help[] = "Single-Precision STREAM Benchmark implementation in CUDA\n"
                           "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";
*403adfb6SMatthew G Knepley
*403adfb6SMatthew G Knepley#include <petscconf.h>
*403adfb6SMatthew G Knepley#include <petscsys.h>
*403adfb6SMatthew G Knepley#include <petsctime.h>
*403adfb6SMatthew G Knepley
/* Number of float elements in each of the benchmark arrays. */
#define N 2000000
/* Number of timed repetitions of every kernel. */
#define NTIMES 10

/* Two-argument minimum / maximum; note that each argument is evaluated twice. */
#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif

/* Divisor used by the host verification routines to scale a relative error. */
const float flt_eps = 1.192092896e-07f;
*403adfb6SMatthew G Knepley
/*
 * Device kernel: store value into a[0 .. len-1].
 *
 * Each thread starts at its global 1D index and advances by the total number
 * of threads in the launch, so any 1D grid/block configuration covers the
 * whole array.
 */
__global__ void set_array(float *a,  float value, size_t len)
{
  /*
   * Widen to size_t before multiplying: blockIdx.x, blockDim.x and gridDim.x
   * are 32-bit unsigned values, so their product would otherwise wrap before
   * being converted to size_t.
   */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) {
    a[idx] = value;
  }
}
*403adfb6SMatthew G Knepley
/*
 * Device kernel: b[i] = a[i] for i in [0, len).
 *
 * a is only read, b is only written. Each thread starts at its global 1D
 * index and advances by the total number of threads in the launch.
 */
__global__ void STREAM_Copy(const float *a, float *b, size_t len)
{
  /* Promote to size_t first so the 32-bit built-in products cannot wrap. */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) {
    b[idx] = a[idx];
  }
}
*403adfb6SMatthew G Knepley
/*
 * Device kernel: b[i] = a[i] for i in [0, len), one element per thread.
 *
 * There is no loop: the launch must provide at least len threads. If it
 * provides fewer, every thread returns immediately and b is left untouched.
 */
__global__ void STREAM_Copy_Optimized(const float *a, float *b, size_t len)
{
  /*
   * Total thread count, computed in size_t so that the product of the 32-bit
   * built-ins cannot wrap and wrongly trigger (or skip) the early return.
   */
  const size_t nThreads = (size_t)blockDim.x * gridDim.x;

  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (nThreads < len) return;

  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}
*403adfb6SMatthew G Knepley
/*
 * Device kernel: b[i] = scale * a[i] for i in [0, len).
 *
 * a is only read, b is only written. Each thread starts at its global 1D
 * index and advances by the total number of threads in the launch.
 */
__global__ void STREAM_Scale(const float *a, float *b, float scale,  size_t len)
{
  /* Promote to size_t first so the 32-bit built-in products cannot wrap. */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) {
    b[idx] = scale* a[idx];
  }
}
*403adfb6SMatthew G Knepley
/*
 * Device kernel: c[i] = a[i] + b[i] for i in [0, len).
 *
 * a and b are only read, c is only written. Each thread starts at its global
 * 1D index and advances by the total number of threads in the launch.
 */
__global__ void STREAM_Add(const float *a, const float *b, float *c,  size_t len)
{
  /* Promote to size_t first so the 32-bit built-in products cannot wrap. */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) {
    c[idx] = a[idx]+b[idx];
  }
}
*403adfb6SMatthew G Knepley
/*
 * Device kernel: c[i] = a[i] + scalar * b[i] for i in [0, len).
 *
 * a and b are only read, c is only written. Each thread starts at its global
 * 1D index and advances by the total number of threads in the launch.
 */
__global__ void STREAM_Triad(const float *a, const float *b, float *c, float scalar, size_t len)
{
  /* Promote to size_t first so the 32-bit built-in products cannot wrap. */
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) {
    c[idx] = a[idx]+scalar*b[idx];
  }
}
*403adfb6SMatthew G Knepley
*403adfb6SMatthew G Knepley/* Host side verification routines */
/*
 * Host check of the copy kernels: compares b[i] against the expected a[i].
 *
 * Returns true if a mismatch is found (relative error, divided by flt_eps,
 * larger than 2) and false if all len elements agree. Scanning stops at the
 * first mismatch.
 */
bool STREAM_Copy_verify(float *a, float *b, size_t len) {
  for (size_t idx = 0; idx < len; idx++) {
    const float expectedResult = a[idx];

    /* Exact agreement; also keeps 0/0 out of the relative error below. */
    if (b[idx] == expectedResult) continue;

    const float diffResultExpected = (b[idx] - expectedResult);
    const float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /*
     * element-wise relative error determination; written as a negated <=
     * so that a NaN error (NaN in either array) is reported as a mismatch
     * instead of silently passing.
     */
    if (!(relErrorULPS <= 2.f)) return true;
  }

  return false;
}
*403adfb6SMatthew G Knepley
/*
 * Host check of the scale kernel: compares b[i] against scale * a[i].
 *
 * Returns true if a mismatch is found (relative error, divided by flt_eps,
 * larger than 2) and false if all len elements agree. Scanning stops at the
 * first mismatch.
 */
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) {
  for (size_t idx = 0; idx < len; idx++) {
    const float expectedResult = scale*a[idx];

    /* Exact agreement; also keeps 0/0 out of the relative error below. */
    if (b[idx] == expectedResult) continue;

    const float diffResultExpected = (b[idx] - expectedResult);
    const float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /*
     * element-wise relative error determination; written as a negated <=
     * so that a NaN error (NaN in either array) is reported as a mismatch
     * instead of silently passing.
     */
    if (!(relErrorULPS <= 2.f)) return true;
  }

  return false;
}
*403adfb6SMatthew G Knepley
/*
 * Host check of the add kernel: compares c[i] against a[i] + b[i].
 *
 * Returns true if a mismatch is found (relative error, divided by flt_eps,
 * larger than 2) and false if all len elements agree. Scanning stops at the
 * first mismatch.
 */
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) {
  for (size_t idx = 0; idx < len; idx++) {
    const float expectedResult = a[idx] + b[idx];

    /* Exact agreement; also keeps 0/0 out of the relative error below. */
    if (c[idx] == expectedResult) continue;

    const float diffResultExpected = (c[idx] - expectedResult);
    const float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /*
     * element-wise relative error determination; written as a negated <=
     * so that a NaN error (NaN in any array) is reported as a mismatch
     * instead of silently passing.
     */
    if (!(relErrorULPS <= 2.f)) return true;
  }

  return false;
}
*403adfb6SMatthew G Knepley
/*
 * Host check of the triad kernel: compares c[i] against a[i] + scalar * b[i].
 *
 * Returns true if a mismatch is found (relative error, divided by flt_eps,
 * larger than 3) and false if all len elements agree. Scanning stops at the
 * first mismatch.
 */
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) {
  for (size_t idx = 0; idx < len; idx++) {
    const float expectedResult = a[idx] + scalar*b[idx];

    /* Exact agreement; also keeps 0/0 out of the relative error below. */
    if (c[idx] == expectedResult) continue;

    const float diffResultExpected = (c[idx] - expectedResult);
    const float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /*
     * element-wise relative error determination; written as a negated <=
     * so that a NaN error (NaN in any array) is reported as a mismatch
     * instead of silently passing.
     */
    if (!(relErrorULPS <= 3.f)) return true;
  }

  return false;
}
*403adfb6SMatthew G Knepley
/* Prototypes of the host driver routines defined further down in this file. */

/* Selects the CUDA device, reports it, and runs the benchmark on it. */
PetscErrorCode setupStream(PetscInt device, PetscBool cpuTiming);

/* Times and verifies the STREAM kernels with the given block size. */
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);

/* Prints the rate and timing summary for the collected per-kernel times. */
PetscErrorCode printResultsReadable(float times[][NTIMES]);
*403adfb6SMatthew G Knepley
/*
 * Program entry point.
 *
 * Initializes PETSc, reads the -device and -cputiming options, runs the
 * benchmark through setupStream() and prints PASSES/FAILED unless
 * setupStream() returned a negative code (its "could not run" result, for
 * which it has already printed a message).
 *
 * Returns 0 on success or when the benchmark could not be run, and 1 when
 * the benchmark or PetscFinalize() reported an error.
 */
int main(int argc, char *argv[])
{
  PetscInt       device    = 0;
  PetscBool      cpuTiming = PETSC_FALSE;
  PetscErrorCode ierr, status;

  ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "[Single-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
    ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);

  /* Keep the benchmark status separate so later calls cannot overwrite it. */
  status = setupStream(device, cpuTiming);
  if (status >= 0) {
    ierr = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (status == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr);
  }
  ierr = PetscFinalize();

  /* Report a failed run through the exit status instead of always returning 0. */
  return (status > 0 || ierr) ? 1 : 0;
}
*403adfb6SMatthew G Knepley
*403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////////
*403adfb6SMatthew G Knepley//Run the appropriate tests
*403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////////
/*
 * Select the CUDA device, report it, choose a block size and run the benchmark.
 *
 * deviceNum  - requested CUDA device; an out-of-range value falls back to
 *              device 0 with a warning
 * cpuTiming  - forwarded to runStream() to request the CPU timer
 *
 * Returns 0 on success, -1000 if no CUDA device is available, -1 if the
 * device properties cannot be queried, or the error code of a failing
 * PETSc/CUDA call.
 */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  int            deviceCount         = 0;
  cudaDeviceProp deviceProp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device; a failing query is treated the same as "no devices"
  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) deviceCount = 0;
  if (deviceCount <= 0) {
    ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
    PetscFunctionReturn(-1000);
  }

  if (deviceNum >= deviceCount || deviceNum < 0) {
    /* PetscInt may be wider than int, so cast explicitly for %d */
    ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0);CHKERRQ(ierr);
    deviceNum = 0;
  }

  ierr = cudaSetDevice((int)deviceNum);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) == cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", (int)deviceNum, deviceProp.name);CHKERRQ(ierr);
  } else {
    /* The format consumes one integer; it must be supplied */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum);CHKERRQ(ierr);
    PetscFunctionReturn(-1);
  }

  if (deviceProp.major == 2 && deviceProp.minor == 1) {
    iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  } else {
    iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
  }

  /*cutilSafeCall(cudaSetDeviceFlags(cudaDeviceBlockingSync));*/

  if (cpuTiming) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
  }

  ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
*403adfb6SMatthew G Knepley
*403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////
*403adfb6SMatthew G Knepley// runStream
*403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////
/*
 * Time and verify the STREAM kernels on the current device.
 *
 * iNumThreadsPerBlock - number of threads per block used for every launch
 * bDontUseGPUTiming   - if true, record host timer values instead of CUDA
 *                       event timings
 *
 * Each of the five kernels (Copy, Copy Optimized, Scale, Add, Triad) is
 * timed NTIMES times on arrays of N floats; the per-run times are stored in
 * milliseconds. Afterwards every kernel is run once more on freshly
 * initialized arrays, the results are copied to the host and checked, and
 * the timing summary is printed.
 *
 * Returns 0 on success or the error code of a failing PETSc/CUDA call.
 * Host allocation failures and verification failures terminate the process
 * via exit().
 */
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float          *d_a, *d_b, *d_c;
  float          *h_a, *h_b, *h_c;
  int            k;
  float          times[5][NTIMES];
  const float    scalar = 3.0f;
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t    start, stop;
  bool           errorSTREAMkernel = true;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration: enough blocks for one thread per element */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1;

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",(unsigned int)N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
  /* Kernel launches return nothing; pick up launch failures explicitly */
  ierr = cudaGetLastError();CHKERRQ(ierr);

  /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

  /*
   * times[][] is kept in msec: cudaEventElapsedTime() reports msec, while
   * the host timer accumulates seconds and is therefore scaled by 1.0e3.
   */
  ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */

  for(k = 0; k < NTIMES; ++k) {
    /* Copy */
    cpuTimer = 0.0; /* reset on every pass so the previous Triad time is not included */
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[0][k] = (float)(1.0e3 * cpuTimer);
    } else {
      ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
    }

    /* Copy Optimized */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[1][k] = (float)(1.0e3 * cpuTimer);
    } else {
      ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
    }

    /* Scale */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[2][k] = (float)(1.0e3 * cpuTimer);
    } else {
      ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
    }

    /* Add */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[3][k] = (float)(1.0e3 * cpuTimer);
    } else {
      ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
    }

    /* Triad */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[4][k] = (float)(1.0e3 * cpuTimer);
    } else {
      ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
    }

    /* Checked once per pass, outside the timed regions */
    ierr = cudaGetLastError();CHKERRQ(ierr);
  }

  /* verify kernels */
  if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }
  if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaGetLastError();CHKERRQ(ierr);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaGetLastError();CHKERRQ(ierr);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaGetLastError();CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaGetLastError();CHKERRQ(ierr);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaGetLastError();CHKERRQ(ierr);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* report the timings gathered in the main loop */
  ierr = printResultsReadable(times);CHKERRQ(ierr);

  //clean up timers
  ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
  ierr = cudaEventDestroy( start );CHKERRQ(ierr);

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);

  /* Free the host verification buffers */
  free(h_a);
  free(h_b);
  free(h_c);
  PetscFunctionReturn(0);
}
*403adfb6SMatthew G Knepley
*403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////
*403adfb6SMatthew G Knepley//Print Results to Screen and File
*403adfb6SMatthew G Knepley///////////////////////////////////////////////////////////////////////////
/*
 * Print a summary table for the five timed kernels.
 *
 * times - times[j][k] is the time of kernel j in repetition k, in msec
 *         (rows: Copy, Copy Opt., Scale, Add, Triad)
 *
 * The first repetition (k = 0) is skipped. For each kernel the table shows
 * the rate in MB/s computed from the minimum time, followed by the average,
 * minimum and maximum time in seconds.
 *
 * Returns 0 on success or the error code of a failing PetscPrintf().
 */
PetscErrorCode printResultsReadable(float times[][NTIMES]) {
  PetscErrorCode ierr;
  PetscInt       j, k;
  float	avgtime[5] = {0., 0., 0., 0., 0.};
  float	maxtime[5] = {0., 0., 0., 0., 0.};
  float	mintime[5] = {1e30,1e30,1e30,1e30,1e30};
  /* const: the entries point at string literals */
  const char *const label[5] = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Add:       ", "Triad:     "};
  /* Bytes moved per kernel: two arrays for Copy/Copy Opt./Scale, three for Add/Triad */
  float	bytes_per_kernel[5] = {
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N
  };

  PetscFunctionBegin;
  /* --- SUMMARY --- */
  for(k = 1; k < NTIMES; ++k) { /* note -- skip first iteration */
    for(j = 0; j < 5; ++j) {
      const float seconds = 1.e-03f * times[j][k]; /* msec -> sec */

      avgtime[j] = avgtime[j] + seconds;
      mintime[j] = MIN(mintime[j], seconds);
      maxtime[j] = MAX(maxtime[j], seconds);
    }
  }

  ierr = PetscPrintf(PETSC_COMM_SELF, "Function    Rate (MB/s)    Avg time      Min time      Max time\n");CHKERRQ(ierr);

  for(j = 0; j < 5; ++j) {
     /* the sum above covers NTIMES-1 repetitions */
     avgtime[j] = avgtime[j]/(float)(NTIMES-1);
     ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f  %11.6f  %12.6f  %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}