/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in single precision.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static char help[] = "Single-Precision STREAM Benchmark implementation in CUDA\n"
"Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>

#define N        2000000
#define N_DOUBLE 8000000   /* NOTE(review): declared but unused below -- double tests also use N */
#define NTIMES   10

#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif

/* Machine epsilon for each precision; used to express verification errors in ulps */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with value; grid-stride loop, any launch configuration is valid. */
__global__ void set_array(float *a, float value, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    a[idx] = value;
    idx   += stride;
  }
}

/* Double-precision variant of set_array(). */
__global__ void set_array_double(double *a, double value, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    a[idx] = value;
    idx   += stride;
  }
}

/* STREAM Copy: b(i) = a(i); grid-stride loop. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx   += stride;
  }
}

/* Double-precision STREAM Copy; grid-stride loop. */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx   += stride;
  }
}

/* STREAM Copy, one element per thread.
   Requires gridDim.x*blockDim.x >= len, else the kernel is a no-op. */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /* Ensure the thread index space covers the vector index space, else return. */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* Double-precision variant of STREAM_Copy_Optimized(). */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /* Ensure the thread index space covers the vector index space, else return. */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* STREAM Scale: b(i) = scale*a(i); grid-stride loop. */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx   += stride;
  }
}

/* Double-precision STREAM Scale; grid-stride loop. */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx   += stride;
  }
}

/* STREAM Scale, one element per thread; thread space must cover len. */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  /* Ensure the thread index space covers the vector index space, else return. */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* Double-precision variant of STREAM_Scale_Optimized(). */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  /* Ensure the thread index space covers the vector index space, else return. */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* STREAM Add: c(i) = a(i) + b(i); grid-stride loop. */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx   += stride;
  }
}

/* Double-precision STREAM Add; grid-stride loop. */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx   += stride;
  }
}

/* STREAM Add, one element per thread; thread space must cover len. */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  /* Ensure the thread index space covers the vector index space, else return. */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* Double-precision variant of STREAM_Add_Optimized(). */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  /* Ensure the thread index space covers the vector index space, else return. */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* STREAM Triad: c(i) = a(i) + scalar*b(i); grid-stride loop. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx   += stride;
  }
}

/* Double-precision STREAM Triad; grid-stride loop. */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx    = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx   += stride;
  }
}

/* STREAM Triad, one element per thread; thread space must cover len. */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /* Ensure the thread index space covers the vector index space, else return. */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* Double-precision variant of STREAM_Triad_Optimized(). */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /* Ensure the thread index space covers the vector index space, else return. */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* Host side verification routines.
   Each returns true when some element differs from the reference by more than
   the stated number of ulps of element-wise relative error. */

/* Verify b(i) == a(i) to within 2 ulps. */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx];
    float diffResultExpected = b[idx] - expectedResult;
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }
  return bDifferent;
}

/* Verify b(i) == a(i) to within 2 ulps (double precision).
   Bug fix: use fabs()/dbl_eps instead of the float fabsf()/flt_eps. */
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = b[idx] - expectedResult;
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }
  return bDifferent;
}

/* Verify b(i) == scale*a(i) to within 2 ulps. */
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = scale*a[idx];
    float diffResultExpected = b[idx] - expectedResult;
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }
  return bDifferent;
}

/* Verify b(i) == scale*a(i) to within 2 ulps (double precision). */
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale*a[idx];
    double diffResultExpected = b[idx] - expectedResult;
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }
  return bDifferent;
}

/* Verify c(i) == a(i)+b(i) to within 2 ulps. */
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + b[idx];
    float diffResultExpected = c[idx] - expectedResult;
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }
  return bDifferent;
}

/* Verify c(i) == a(i)+b(i) to within 2 ulps (double precision). */
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = c[idx] - expectedResult;
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }
  return bDifferent;
}

/* Verify c(i) == a(i)+scalar*b(i) to within 3 ulps (two roundings). */
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + scalar*b[idx];
    float diffResultExpected = c[idx] - expectedResult;
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.f);
  }
  return bDifferent;
}

/* Verify c(i) == a(i)+scalar*b(i) to within 3 ulps (double precision). */
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar*b[idx];
    double diffResultExpected = c[idx] - expectedResult;
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }
  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES]);

/* Parse options, run the benchmark, and report overall pass/fail. */
int main(int argc, char *argv[])
{
  PetscInt       device    = 0;
  PetscBool      runDouble = PETSC_FALSE;
  PetscBool      cpuTiming = PETSC_FALSE;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();

  /* setupStream() returns a negative code on setup failure, 0 on pass */
  ierr = setupStream(device, runDouble, cpuTiming);
  if (ierr >= 0) {
    PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
  }
  PetscFinalize();
  return 0;
}

///////////////////////////////////////////////////////////////////////////////
//Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }
    if (deviceNum >= deviceCount || deviceNum < 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", deviceNum, deviceProp.name);CHKERRQ(ierr);
  } else {
    /* bug fix: the %d argument (deviceNum) was missing */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", deviceNum);CHKERRQ(ierr);
    return -1;
  }

  /* double precision requires compute capability >= 1.3 */
  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) {
    iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  } else {
    iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
  }

  if (cpuTiming) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
  }

  ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  if (runDouble) {
    /* NOTE(review): cudaSetDeviceFlags() normally must precede context creation;
       called here after kernels have already run it may fail -- confirm intent */
    ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);
    ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStream - time the eight single-precision kernels NTIMES times each,
// verify every kernel against a host reference, then print the results.
// Timing is either GPU-event based (ms) or CPU based when bDontUseGPUTiming.
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float         *d_a, *d_b, *d_c;
  int            k;
  float          times[8][NTIMES];
  float          scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration (1-D grid covering N, ceil-divide) */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t    start, stop;

  /* both timers report msec */
  ierr = cudaEventCreate(&start);CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate(&stop);CHKERRQ(ierr);  /* gpu timer facility */

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* Each section: reset the CPU timer (bug fix: it previously carried the last
       section's value into iterations k > 0), bracket the launch with events,
       then record whichever measurement was requested. */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[0][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[0][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[1][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[1][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[2][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[3][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[4][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[5][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[5][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[6][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[6][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[7][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[7][k], start, stop);CHKERRQ(ierr);
    }
  }

  /* verify kernels */
  float *h_a, *h_b, *h_c;
  bool   errorSTREAMkernel = true;

  if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }
  if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* print the timing results */
  printResultsReadable(times);

  /* clean up timers */
  ierr = cudaEventDestroy(stop);CHKERRQ(ierr);
  ierr = cudaEventDestroy(start);CHKERRQ(ierr);

  /* Free host verification buffers (leak fix) and device memory */
  free(h_a);
  free(h_b);
  free(h_c);
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStreamDouble - double-precision version of runStream()
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double        *d_a, *d_b, *d_c;
  int            k;
  float          times[8][NTIMES];
  double         scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);

  /* Compute execution configuration (1-D grid covering N, ceil-divide) */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t    start, stop;

  /* both timers report msec */
  ierr = cudaEventCreate(&start);CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate(&stop);CHKERRQ(ierr);  /* gpu timer facility */

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* Bug fixes relative to the original: the CPU timer is reset every section,
       PetscTimeAdd() is taken unconditionally (as in runStream), and the GPU
       elapsed times go into slots 0..7 matching the CPU-timing branch (the
       original reused slots 2, 3, 3, 4, 4 for the last five kernels). */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[0][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[0][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[1][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[1][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[2][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[3][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[4][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[5][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[5][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[6][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[6][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[7][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[7][k], start, stop);CHKERRQ(ierr);
    }
  }

  /* verify kernels */
  double *h_a, *h_b, *h_c;
  bool    errorSTREAMkernel = true;

  if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }
  if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* print the timing results */
  printResultsReadable(times);

  /* clean up timers */
  ierr = cudaEventDestroy(stop);CHKERRQ(ierr);
  ierr = cudaEventDestroy(start);CHKERRQ(ierr);

  /* Free host verification buffers (leak fix) and device memory */
  free(h_a);
  free(h_b);
  free(h_c);
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = 
cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*---------------------------------------------------------------------------
  printResultsReadable - Print a human-readable summary table of the STREAM
  timing samples collected by the benchmark loop.

  Input Parameter:
.  times - times[j][k] is the elapsed time in milliseconds of kernel j on
           repetition k (0 <= k < NTIMES), with kernel rows ordered:
           0=Copy, 1=Copy Opt., 2=Scale, 3=Scale Opt.,
           4=Add,  5=Add Opt.,  6=Triad, 7=Triad Opt.

  Notes:
  The first repetition (k = 0) is treated as a warm-up and excluded from the
  average/min/max statistics; the MB/s rate is computed from the minimum
  (best) time, per STREAM convention.

  NOTE(review): bytes_per_kernel uses sizeof(float); if a caller ever passes
  double-precision timings through this routine the reported MB/s would be
  understated by a factor of two -- confirm against the caller.
 ---------------------------------------------------------------------------*/
PetscErrorCode printResultsReadable(float times[][NTIMES]) {
  PetscErrorCode ierr;
  PetscInt       j, k;
  float          avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  /* String literals are const; binding them to plain char* is ill-formed in
     ISO C++11 and later, so the array must be const char*. */
  const char    *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "};
  /* Bytes moved per kernel invocation: Copy/Scale read one vector and write
     one (2 arrays); Add/Triad read two and write one (3 arrays). */
  float bytes_per_kernel[8] = {
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N
  };

  PetscFunctionBegin;
  /* --- SUMMARY --- accumulate statistics; skip k = 0 (warm-up iteration) */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      /* times[][] are in milliseconds; fold into seconds as we accumulate */
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }

  ierr = PetscPrintf(PETSC_COMM_SELF, "Function Rate (MB/s) Avg time Min time Max time\n");CHKERRQ(ierr);

  for (j = 0; j < 8; ++j) {
    /* NTIMES-1 samples contribute because the warm-up run was excluded */
    avgtime[j] = avgtime[j]/(float)(NTIMES-1);
    ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f %11.6f %12.6f %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}