1*403adfb6SMatthew G Knepley /* 2*403adfb6SMatthew G Knepley STREAM benchmark implementation in CUDA. 3*403adfb6SMatthew G Knepley 4*403adfb6SMatthew G Knepley COPY: a(i) = b(i) 5*403adfb6SMatthew G Knepley SCALE: a(i) = q*b(i) 6*403adfb6SMatthew G Knepley SUM: a(i) = b(i) + c(i) 7*403adfb6SMatthew G Knepley TRIAD: a(i) = b(i) + q*c(i) 8*403adfb6SMatthew G Knepley 9*403adfb6SMatthew G Knepley It measures the memory system on the device. 10*403adfb6SMatthew G Knepley The implementation is in single precision. 11*403adfb6SMatthew G Knepley 12*403adfb6SMatthew G Knepley Code based on the code developed by John D. McCalpin 13*403adfb6SMatthew G Knepley http://www.cs.virginia.edu/stream/FTP/Code/stream.c 14*403adfb6SMatthew G Knepley 15*403adfb6SMatthew G Knepley Written by: Massimiliano Fatica, NVIDIA Corporation 16*403adfb6SMatthew G Knepley Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010 17*403adfb6SMatthew G Knepley Extensive Revisions, 4 December 2010 18*403adfb6SMatthew G Knepley Modified for PETSc by: Matthew G. Knepley 14 Aug 2011 19*403adfb6SMatthew G Knepley 20*403adfb6SMatthew G Knepley User interface motivated by bandwidthTest NVIDIA SDK example. 21*403adfb6SMatthew G Knepley */ 22*403adfb6SMatthew G Knepley static char *help = "Single-Precision STREAM Benchmark implementation in CUDA\n" 23*403adfb6SMatthew G Knepley "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n"; 24*403adfb6SMatthew G Knepley 25*403adfb6SMatthew G Knepley #include <petscconf.h> 26*403adfb6SMatthew G Knepley #include <petscsys.h> 27*403adfb6SMatthew G Knepley #include <petsctime.h> 28*403adfb6SMatthew G Knepley 29*403adfb6SMatthew G Knepley #define N 2000000 30*403adfb6SMatthew G Knepley #define NTIMES 10 31*403adfb6SMatthew G Knepley 32*403adfb6SMatthew G Knepley # ifndef MIN 33*403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y)?(x):(y)) 34*403adfb6SMatthew G Knepley # endif 35*403adfb6SMatthew G Knepley # ifndef MAX 36*403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y)?(x):(y)) 37*403adfb6SMatthew G Knepley # endif 38*403adfb6SMatthew G Knepley 39*403adfb6SMatthew G Knepley const float flt_eps = 1.192092896e-07f; 40*403adfb6SMatthew G Knepley 41*403adfb6SMatthew G Knepley __global__ void set_array(float *a, float value, size_t len) 42*403adfb6SMatthew G Knepley { 43*403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 44*403adfb6SMatthew G Knepley while (idx < len) { 45*403adfb6SMatthew G Knepley a[idx] = value; 46*403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 47*403adfb6SMatthew G Knepley } 48*403adfb6SMatthew G Knepley } 49*403adfb6SMatthew G Knepley 50*403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len) 51*403adfb6SMatthew G Knepley { 52*403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 53*403adfb6SMatthew G Knepley while (idx < len) { 54*403adfb6SMatthew G Knepley b[idx] = a[idx]; 55*403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 56*403adfb6SMatthew G Knepley } 57*403adfb6SMatthew G Knepley } 58*403adfb6SMatthew G Knepley 59*403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len) 60*403adfb6SMatthew G Knepley { 61*403adfb6SMatthew G Knepley /* 62*403adfb6SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 63*403adfb6SMatthew G Knepley * vector index space else return. 64*403adfb6SMatthew G Knepley */ 65*403adfb6SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 66*403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 67*403adfb6SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 68*403adfb6SMatthew G Knepley } 69*403adfb6SMatthew G Knepley 70*403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale, size_t len) 71*403adfb6SMatthew G Knepley { 72*403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 73*403adfb6SMatthew G Knepley while (idx < len) { 74*403adfb6SMatthew G Knepley b[idx] = scale* a[idx]; 75*403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 76*403adfb6SMatthew G Knepley } 77*403adfb6SMatthew G Knepley } 78*403adfb6SMatthew G Knepley 79*403adfb6SMatthew G Knepley __global__ void STREAM_Add( float *a, float *b, float *c, size_t len) 80*403adfb6SMatthew G Knepley { 81*403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 82*403adfb6SMatthew G Knepley while (idx < len) { 83*403adfb6SMatthew G Knepley c[idx] = a[idx]+b[idx]; 84*403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 85*403adfb6SMatthew G Knepley } 86*403adfb6SMatthew G Knepley } 87*403adfb6SMatthew G Knepley 88*403adfb6SMatthew G Knepley __global__ void STREAM_Triad( float *a, float *b, float *c, float scalar, size_t len) 89*403adfb6SMatthew G Knepley { 90*403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 91*403adfb6SMatthew G Knepley while (idx < len) { 92*403adfb6SMatthew G Knepley c[idx] = a[idx]+scalar*b[idx]; 93*403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 94*403adfb6SMatthew G Knepley } 95*403adfb6SMatthew G Knepley } 96*403adfb6SMatthew G Knepley 97*403adfb6SMatthew G Knepley /* Host side verification routines */ 98*403adfb6SMatthew G Knepley bool STREAM_Copy_verify(float *a, float *b, size_t len) { 99*403adfb6SMatthew G Knepley size_t idx; 100*403adfb6SMatthew G Knepley bool bDifferent = false; 101*403adfb6SMatthew G Knepley 102*403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 103*403adfb6SMatthew G Knepley float expectedResult = a[idx]; 104*403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 105*403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 106*403adfb6SMatthew G Knepley /* element-wise relative error determination */ 107*403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 108*403adfb6SMatthew G Knepley } 109*403adfb6SMatthew G Knepley 110*403adfb6SMatthew G Knepley return bDifferent; 111*403adfb6SMatthew G Knepley } 112*403adfb6SMatthew G Knepley 113*403adfb6SMatthew G Knepley bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) { 114*403adfb6SMatthew G Knepley size_t idx; 115*403adfb6SMatthew G Knepley bool bDifferent = false; 116*403adfb6SMatthew G Knepley 117*403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 118*403adfb6SMatthew G Knepley float expectedResult = scale*a[idx]; 119*403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 120*403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 121*403adfb6SMatthew G Knepley /* element-wise relative error determination */ 122*403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 123*403adfb6SMatthew G Knepley } 124*403adfb6SMatthew G Knepley 125*403adfb6SMatthew G Knepley return bDifferent; 126*403adfb6SMatthew G Knepley } 127*403adfb6SMatthew G Knepley 128*403adfb6SMatthew G Knepley bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) { 129*403adfb6SMatthew G Knepley size_t idx; 130*403adfb6SMatthew G Knepley bool bDifferent = false; 131*403adfb6SMatthew G Knepley 132*403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 133*403adfb6SMatthew G Knepley float expectedResult = a[idx] + b[idx]; 134*403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 135*403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 136*403adfb6SMatthew G Knepley /* element-wise relative error determination */ 137*403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 138*403adfb6SMatthew G Knepley } 139*403adfb6SMatthew G Knepley 140*403adfb6SMatthew G Knepley return bDifferent; 141*403adfb6SMatthew G Knepley } 142*403adfb6SMatthew G Knepley 143*403adfb6SMatthew G Knepley bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) { 144*403adfb6SMatthew G Knepley size_t idx; 145*403adfb6SMatthew G Knepley bool bDifferent = false; 146*403adfb6SMatthew G Knepley 147*403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 148*403adfb6SMatthew G Knepley float expectedResult = a[idx] + scalar*b[idx]; 149*403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 150*403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 151*403adfb6SMatthew G Knepley /* element-wise relative error determination */ 152*403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 3.f); 153*403adfb6SMatthew G Knepley } 154*403adfb6SMatthew G Knepley 155*403adfb6SMatthew G Knepley return bDifferent; 156*403adfb6SMatthew G Knepley } 157*403adfb6SMatthew G Knepley 158*403adfb6SMatthew G Knepley /* forward declarations */ 159*403adfb6SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool cpuTiming); 160*403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 161*403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]); 162*403adfb6SMatthew G Knepley 163*403adfb6SMatthew G Knepley int main(int argc, char *argv[]) 164*403adfb6SMatthew G Knepley { 165*403adfb6SMatthew G Knepley PetscInt device = 0; 166*403adfb6SMatthew G Knepley PetscBool cpuTiming = PETSC_FALSE; 167*403adfb6SMatthew G Knepley PetscErrorCode ierr; 168*403adfb6SMatthew G Knepley 169*403adfb6SMatthew G Knepley ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr); 170*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "[Single-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr); 171*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr); 172*403adfb6SMatthew G Knepley 173*403adfb6SMatthew G Knepley ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr); 174*403adfb6SMatthew G Knepley ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr); 175*403adfb6SMatthew G Knepley ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr); 176*403adfb6SMatthew G Knepley ierr = PetscOptionsEnd(); 177*403adfb6SMatthew G Knepley 178*403adfb6SMatthew G Knepley ierr = setupStream(device, cpuTiming); 179*403adfb6SMatthew G Knepley if (ierr >= 0) { 180*403adfb6SMatthew G Knepley PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2); 181*403adfb6SMatthew G Knepley } 182*403adfb6SMatthew G Knepley PetscFinalize(); 183*403adfb6SMatthew G Knepley return 0; 184*403adfb6SMatthew G Knepley } 185*403adfb6SMatthew G Knepley 186*403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 187*403adfb6SMatthew G Knepley //Run the appropriate tests 188*403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 189*403adfb6SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool cpuTiming) 190*403adfb6SMatthew G Knepley { 191*403adfb6SMatthew G Knepley PetscInt iNumThreadsPerBlock = 128; 192*403adfb6SMatthew G Knepley PetscErrorCode ierr; 193*403adfb6SMatthew G Knepley 194*403adfb6SMatthew G Knepley PetscFunctionBegin; 195*403adfb6SMatthew G Knepley // Check device 196*403adfb6SMatthew G Knepley { 197*403adfb6SMatthew G Knepley int deviceCount; 198*403adfb6SMatthew G Knepley 199*403adfb6SMatthew G Knepley cudaGetDeviceCount(&deviceCount); 200*403adfb6SMatthew G Knepley if (deviceCount == 0) { 201*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr); 202*403adfb6SMatthew G Knepley return -1000; 203*403adfb6SMatthew G Knepley } 204*403adfb6SMatthew G Knepley 205*403adfb6SMatthew G Knepley if (deviceNum >= deviceCount || deviceNum < 0) { 206*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr); 207*403adfb6SMatthew G Knepley deviceNum = 0; 208*403adfb6SMatthew G Knepley } 209*403adfb6SMatthew G Knepley } 210*403adfb6SMatthew G Knepley 211*403adfb6SMatthew G Knepley cudaSetDevice(deviceNum); 212*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr); 213*403adfb6SMatthew G Knepley cudaDeviceProp deviceProp; 214*403adfb6SMatthew G Knepley if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) { 215*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", deviceNum, deviceProp.name);CHKERRQ(ierr); 216*403adfb6SMatthew G Knepley } else { 217*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n");CHKERRQ(ierr); 218*403adfb6SMatthew G Knepley return -1; 219*403adfb6SMatthew G Knepley } 220*403adfb6SMatthew G Knepley 221*403adfb6SMatthew G Knepley if (deviceProp.major == 2 && deviceProp.minor == 1) { 222*403adfb6SMatthew G Knepley iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */ 223*403adfb6SMatthew G Knepley } else { 224*403adfb6SMatthew G Knepley iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */ 225*403adfb6SMatthew G Knepley } 226*403adfb6SMatthew G Knepley 227*403adfb6SMatthew G Knepley /*cutilSafeCall(cudaSetDeviceFlags(cudaDeviceBlockingSync));*/ 228*403adfb6SMatthew G Knepley 229*403adfb6SMatthew G Knepley if (cpuTiming) { 230*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr); 231*403adfb6SMatthew G Knepley } 232*403adfb6SMatthew G Knepley 233*403adfb6SMatthew G Knepley ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr); 234*403adfb6SMatthew G Knepley PetscFunctionReturn(0); 235*403adfb6SMatthew G Knepley } 236*403adfb6SMatthew G Knepley 237*403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 238*403adfb6SMatthew G Knepley // runStream 239*403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 240*403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) 241*403adfb6SMatthew G Knepley { 242*403adfb6SMatthew G Knepley float *d_a, *d_b, *d_c; 243*403adfb6SMatthew G Knepley int k; 244*403adfb6SMatthew G Knepley float times[5][NTIMES]; 245*403adfb6SMatthew G Knepley float scalar; 246*403adfb6SMatthew G Knepley PetscErrorCode ierr; 247*403adfb6SMatthew G Knepley 248*403adfb6SMatthew G Knepley PetscFunctionBegin; 249*403adfb6SMatthew G Knepley /* Allocate memory on device */ 250*403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr); 251*403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr); 252*403adfb6SMatthew G Knepley ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr); 253*403adfb6SMatthew G Knepley 254*403adfb6SMatthew G Knepley /* Compute execution configuration */ 255*403adfb6SMatthew G Knepley 256*403adfb6SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 257*403adfb6SMatthew G Knepley dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */ 258*403adfb6SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x+=1; 259*403adfb6SMatthew G Knepley 260*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr); 261*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr); 262*403adfb6SMatthew G Knepley 263*403adfb6SMatthew G Knepley /* Initialize memory on the device */ 264*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 265*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 266*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 267*403adfb6SMatthew G Knepley 268*403adfb6SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 269*403adfb6SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 270*403adfb6SMatthew G Knepley cudaEvent_t start, stop; 271*403adfb6SMatthew G Knepley 272*403adfb6SMatthew G Knepley /* both timers report msec */ 273*403adfb6SMatthew G Knepley ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */ 274*403adfb6SMatthew G Knepley ierr = cudaEventCreate( &stop );CHKERRQ(ierr); /* gpu timer facility */ 275*403adfb6SMatthew G Knepley 276*403adfb6SMatthew G Knepley scalar=3.0f; 277*403adfb6SMatthew G Knepley for(k = 0; k < NTIMES; ++k) { 278*403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 279*403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 280*403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 281*403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 282*403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 283*403adfb6SMatthew G Knepley //get the the total elapsed time in ms 284*403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 285*403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 286*403adfb6SMatthew G Knepley times[0][k] = cpuTimer; 287*403adfb6SMatthew G Knepley } else { 288*403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[0][k], start, stop );CHKERRQ(ierr); 289*403adfb6SMatthew G Knepley } 290*403adfb6SMatthew G Knepley 291*403adfb6SMatthew G Knepley cpuTimer = 0.0; 292*403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 293*403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 294*403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 295*403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 296*403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 297*403adfb6SMatthew G Knepley //get the the total elapsed time in ms 298*403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 299*403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 300*403adfb6SMatthew G Knepley times[1][k] = cpuTimer; 301*403adfb6SMatthew G Knepley } else { 302*403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[1][k], start, stop );CHKERRQ(ierr); 303*403adfb6SMatthew G Knepley } 304*403adfb6SMatthew G Knepley 305*403adfb6SMatthew G Knepley cpuTimer = 0.0; 306*403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 307*403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 308*403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 309*403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 310*403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 311*403adfb6SMatthew G Knepley //get the the total elapsed time in ms 312*403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 313*403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 314*403adfb6SMatthew G Knepley times[2][k] = cpuTimer; 315*403adfb6SMatthew G Knepley } else { 316*403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[2][k], start, stop );CHKERRQ(ierr); 317*403adfb6SMatthew G Knepley } 318*403adfb6SMatthew G Knepley 319*403adfb6SMatthew G Knepley cpuTimer = 0.0; 320*403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 321*403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 322*403adfb6SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 323*403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 324*403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 325*403adfb6SMatthew G Knepley //get the the total elapsed time in ms 326*403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 327*403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 328*403adfb6SMatthew G Knepley times[3][k] = cpuTimer; 329*403adfb6SMatthew G Knepley } else { 330*403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[3][k], start, stop );CHKERRQ(ierr); 331*403adfb6SMatthew G Knepley } 332*403adfb6SMatthew G Knepley 333*403adfb6SMatthew G Knepley cpuTimer = 0.0; 334*403adfb6SMatthew G Knepley PetscTimeSubtract(cpuTimer); 335*403adfb6SMatthew G Knepley ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr); 336*403adfb6SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 337*403adfb6SMatthew G Knepley ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr); 338*403adfb6SMatthew G Knepley ierr = cudaEventSynchronize(stop);CHKERRQ(ierr); 339*403adfb6SMatthew G Knepley //get the the total elapsed time in ms 340*403adfb6SMatthew G Knepley PetscTimeAdd(cpuTimer); 341*403adfb6SMatthew G Knepley if (bDontUseGPUTiming) { 342*403adfb6SMatthew G Knepley times[4][k] = cpuTimer; 343*403adfb6SMatthew G Knepley } else { 344*403adfb6SMatthew G Knepley ierr = cudaEventElapsedTime( ×[4][k], start, stop );CHKERRQ(ierr); 345*403adfb6SMatthew G Knepley } 346*403adfb6SMatthew G Knepley 347*403adfb6SMatthew G Knepley } 348*403adfb6SMatthew G Knepley 349*403adfb6SMatthew G Knepley /* verify kernels */ 350*403adfb6SMatthew G Knepley float *h_a, *h_b, *h_c; 351*403adfb6SMatthew G Knepley bool errorSTREAMkernel = true; 352*403adfb6SMatthew G Knepley 353*403adfb6SMatthew G Knepley if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) { 354*403adfb6SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 355*403adfb6SMatthew G Knepley exit(1); 356*403adfb6SMatthew G Knepley } 357*403adfb6SMatthew G Knepley if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) { 358*403adfb6SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 359*403adfb6SMatthew G Knepley exit(1); 360*403adfb6SMatthew G Knepley } 361*403adfb6SMatthew G Knepley 362*403adfb6SMatthew G Knepley if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) { 363*403adfb6SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 364*403adfb6SMatthew G Knepley exit(1); 365*403adfb6SMatthew G Knepley } 366*403adfb6SMatthew G Knepley 367*403adfb6SMatthew G Knepley /* 368*403adfb6SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 369*403adfb6SMatthew G Knepley * device kernel output 370*403adfb6SMatthew G Knepley */ 371*403adfb6SMatthew G Knepley 372*403adfb6SMatthew G Knepley /* Initialize memory on the device */ 373*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 374*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 375*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 376*403adfb6SMatthew G Knepley 377*403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N); 378*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 379*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 380*403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 381*403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 382*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr); 383*403adfb6SMatthew G Knepley exit(-2000); 384*403adfb6SMatthew G Knepley } else { 385*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr); 386*403adfb6SMatthew G Knepley } 387*403adfb6SMatthew G Knepley 388*403adfb6SMatthew G Knepley /* Initialize memory on the device */ 389*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 390*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 391*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 392*403adfb6SMatthew G Knepley 393*403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N); 394*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 395*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 396*403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 397*403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 398*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr); 399*403adfb6SMatthew G Knepley exit(-3000); 400*403adfb6SMatthew G Knepley } else { 401*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr); 402*403adfb6SMatthew G Knepley } 403*403adfb6SMatthew G Knepley 404*403adfb6SMatthew G Knepley /* Initialize memory on the device */ 405*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 406*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 407*403adfb6SMatthew G Knepley 408*403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N); 409*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 410*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 411*403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N); 412*403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 413*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr); 414*403adfb6SMatthew G Knepley exit(-4000); 415*403adfb6SMatthew G Knepley } else { 416*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr); 417*403adfb6SMatthew G Knepley } 418*403adfb6SMatthew G Knepley 419*403adfb6SMatthew G Knepley /* Initialize memory on the device */ 420*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 421*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 422*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 423*403adfb6SMatthew G Knepley 424*403adfb6SMatthew G Knepley STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N); 425*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 426*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 427*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 428*403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N); 429*403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 430*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr); 431*403adfb6SMatthew G Knepley exit(-5000); 432*403adfb6SMatthew G Knepley } else { 433*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr); 434*403adfb6SMatthew G Knepley } 435*403adfb6SMatthew G Knepley 436*403adfb6SMatthew G Knepley /* Initialize memory on the device */ 437*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N); 438*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N); 439*403adfb6SMatthew G Knepley set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N); 440*403adfb6SMatthew G Knepley 441*403adfb6SMatthew G Knepley STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 442*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 443*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 444*403adfb6SMatthew G Knepley ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 445*403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N); 446*403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 447*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr); 448*403adfb6SMatthew G Knepley exit(-6000); 449*403adfb6SMatthew G Knepley } else { 450*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr); 451*403adfb6SMatthew G Knepley } 452*403adfb6SMatthew G Knepley 453*403adfb6SMatthew G Knepley /* continue from here */ 454*403adfb6SMatthew G Knepley printResultsReadable(times); 455*403adfb6SMatthew G Knepley 456*403adfb6SMatthew G Knepley //clean up timers 457*403adfb6SMatthew G Knepley ierr = cudaEventDestroy( stop );CHKERRQ(ierr); 458*403adfb6SMatthew G Knepley ierr = cudaEventDestroy( start );CHKERRQ(ierr); 459*403adfb6SMatthew G Knepley 460*403adfb6SMatthew G Knepley /* Free memory on device */ 461*403adfb6SMatthew G Knepley ierr = cudaFree(d_a);CHKERRQ(ierr); 462*403adfb6SMatthew G Knepley ierr = cudaFree(d_b);CHKERRQ(ierr); 463*403adfb6SMatthew G Knepley ierr = cudaFree(d_c);CHKERRQ(ierr); 464*403adfb6SMatthew G Knepley PetscFunctionReturn(0); 465*403adfb6SMatthew G Knepley } 466*403adfb6SMatthew G Knepley 467*403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 468*403adfb6SMatthew G Knepley //Print Results to Screen and File 469*403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 470*403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]) { 471*403adfb6SMatthew G Knepley PetscErrorCode ierr; 472*403adfb6SMatthew G Knepley PetscInt j, k; 473*403adfb6SMatthew G Knepley float avgtime[5] = {0., 0., 0., 0., 0.}; 474*403adfb6SMatthew G Knepley float maxtime[5] = {0., 0., 0., 0., 0.}; 475*403adfb6SMatthew G Knepley float mintime[5] = {1e30,1e30,1e30,1e30,1e30}; 476*403adfb6SMatthew G Knepley char *label[5] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Add: ", "Triad: "}; 477*403adfb6SMatthew G Knepley float bytes_per_kernel[5] = { 478*403adfb6SMatthew G Knepley 2. * sizeof(float) * N, 479*403adfb6SMatthew G Knepley 2. * sizeof(float) * N, 480*403adfb6SMatthew G Knepley 2. * sizeof(float) * N, 481*403adfb6SMatthew G Knepley 3. * sizeof(float) * N, 482*403adfb6SMatthew G Knepley 3. * sizeof(float) * N 483*403adfb6SMatthew G Knepley }; 484*403adfb6SMatthew G Knepley 485*403adfb6SMatthew G Knepley PetscFunctionBegin; 486*403adfb6SMatthew G Knepley /* --- SUMMARY --- */ 487*403adfb6SMatthew G Knepley for(k = 1; k < NTIMES; ++k) { /* note -- skip first iteration */ 488*403adfb6SMatthew G Knepley for(j = 0; j < 5; ++j) { 489*403adfb6SMatthew G Knepley avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); 490*403adfb6SMatthew G Knepley mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k])); 491*403adfb6SMatthew G Knepley maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k])); 492*403adfb6SMatthew G Knepley } 493*403adfb6SMatthew G Knepley } 494*403adfb6SMatthew G Knepley 495*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "Function Rate (MB/s) Avg time Min time Max time\n");CHKERRQ(ierr); 496*403adfb6SMatthew G Knepley 497*403adfb6SMatthew G Knepley for(j = 0; j < 5; ++j) { 498*403adfb6SMatthew G Knepley avgtime[j] = avgtime[j]/(float)(NTIMES-1); 499*403adfb6SMatthew G Knepley ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f %11.6f %12.6f %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr); 500*403adfb6SMatthew G Knepley } 501*403adfb6SMatthew G Knepley PetscFunctionReturn(0); 502*403adfb6SMatthew G Knepley } 503