/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in single precision (a double-precision run can be
  requested with -double).

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static const char help[] = "Single-Precision STREAM Benchmark implementation in CUDA\n"
                           "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>

#define N        2000000 /* vector length used by every kernel */
#define N_DOUBLE 8000000
#define NTIMES   10      /* timed repetitions per kernel */

#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif

/* Machine epsilons: verification error below is expressed in units of eps (ULPs) */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with value; grid-stride loop so any launch configuration covers the vector */
__global__ void set_array(float *a, float value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision variant of set_array() */
__global__ void set_array_double(double *a, double value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY kernel: b(i) = a(i), grid-stride loop */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/*
 * "Optimized" variants: exactly one element per thread, no stride loop.
 * Ensure size of thread index space is as large as or greater than
 * vector index space, else return without doing any work.
 */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* SCALE kernel: b(i) = scale*a(i), grid-stride loop */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale* a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale* a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SCALE, one element per thread; requires thread index space >= len */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale* a[idx];
}

__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale* a[idx];
}

/* SUM kernel: c(i) = a(i) + b(i), grid-stride loop */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SUM, one element per thread; requires thread index space >= len */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+b[idx];
}

__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+b[idx];
}

/* TRIAD kernel: c(i) = a(i) + scalar*b(i), grid-stride loop */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+scalar*b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+scalar*b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* TRIAD, one element per thread; requires thread index space >= len */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+scalar*b[idx];
}

__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+scalar*b[idx];
}

/*
 * Host side verification routines.
 * Each returns true when some element differs from the expected result by
 * more than a small number of ULPs (element-wise relative error).
 */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* bug fix: use double fabs() and the double epsilon; the original used
       fabsf()/flt_eps, measuring double results in *float* ULPs */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = scale*a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* bug fix: fabs()/dbl_eps instead of fabsf()/flt_eps (see Copy_verify_double) */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* bug fix: fabs()/dbl_eps instead of fabsf()/flt_eps (see Copy_verify_double) */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = a[idx] + scalar*b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.f);
  }

  return bDifferent;
}

bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* bug fix: fabs()/dbl_eps instead of fabsf()/flt_eps (see Copy_verify_double) */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES]);

int main(int argc, char *argv[])
{
  PetscInt       device    = 0;
  PetscBool      runDouble = PETSC_FALSE;
  PetscBool      cpuTiming = PETSC_FALSE;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();

  /* setupStream() deliberately returns negative codes on setup failure, so
     it is inspected directly rather than passed through CHKERRQ */
  ierr = setupStream(device, runDouble, cpuTiming);
  if (ierr >= 0) {
    PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
  }
  PetscFinalize();
  return 0;
}

///////////////////////////////////////////////////////////////////////////////
//Select the device, validate its capabilities, and run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", (int)deviceNum, deviceProp.name);CHKERRQ(ierr);
  } else {
    /* bug fix: the original passed no argument for the %d conversion */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum);CHKERRQ(ierr);
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) {
    iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  } else {
    iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
  }

  if (cpuTiming) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
  }

  ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  if (runDouble) {
    ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);
    ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStream: time and verify the eight single-precision kernels.
// times[i][k] holds the k-th repetition of kernel i (0=Copy, 1=Copy_Opt,
// 2=Scale, 3=Scale_Opt, 4=Add, 5=Add_Opt, 6=Triad, 7=Triad_Opt), in msec.
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int k;
  float times[8][NTIMES];
  float scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration (ceil-div so the tail is covered) */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1;

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t start, stop;

  /* both timers report msec */
  ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* bug fix: reset the cpu timer at the top of each iteration; the
       original only reset between sections, so with -cputiming the Copy
       row accumulated the previous iteration's Triad time */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[0][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[1][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[2][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[3][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[4][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[5][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[5][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[6][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[6][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[7][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[7][k], start, stop );CHKERRQ(ierr);
    }
  }

  /* verify kernels */
  float *h_a, *h_b, *h_c;
  bool errorSTREAMkernel = true;

  if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }
  if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* print timing summary */
  printResultsReadable(times);

  /* free host verification buffers (leaked in the original) */
  free(h_a);
  free(h_b);
  free(h_c);

  //clean up timers
  ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
  ierr = cudaEventDestroy( start );CHKERRQ(ierr);

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStreamDouble: double-precision twin of runStream(); same times[]
// row layout (0=Copy .. 7=Triad_Opt).
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int k;
  float times[8][NTIMES];
  double scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);

  /* Compute execution configuration (ceil-div so the tail is covered) */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1;

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t start, stop;

  /* both timers report msec */
  ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */

  scalar = 3.0;
  /* bug fix: the original stored GPU elapsed times for the later kernels in
     the wrong rows (Scale_Opt->times[2], Add->times[3], Add_Opt->times[3],
     Triad->times[4], Triad_Opt->times[4]), overwriting earlier results; each
     kernel now uses its own row 0..7, matching the single-precision run */
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[0][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[1][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[2][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[3][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[4][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[5][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[5][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[6][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[6][k], start, stop );CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[7][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[7][k], start, stop );CHKERRQ(ierr);
    }
  }

  /* verify kernels */
  double *h_a, *h_b, *h_c;
  bool errorSTREAMkernel = true;

  if ( (h_a = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ( (h_b = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }
  if ( (h_c = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* print timing summary */
  printResultsReadable(times);

  /* free host verification buffers (leaked in the original) */
  free(h_a);
  free(h_b);
  free(h_c);

  //clean up timers
  ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
  ierr = cudaEventDestroy( start );CHKERRQ(ierr);

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr =
cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
//Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
/*
  printResultsReadable - Print a STREAM-style bandwidth summary table.

  Input Parameter:
. times - per-kernel trial times in milliseconds; times[j][k] is the time
          recorded for kernel j (Copy, Copy Opt., Scale, Scale Opt., Add,
          Add Opt., Triad, Triad Opt.) on trial k

  Notes:
  Following the original STREAM benchmark, trial k = 0 is treated as a
  warm-up and excluded from the statistics, and the reported rate is
  computed from the minimum time over the remaining trials.

  Byte counts assume single-precision arrays of length N: Copy and Scale
  touch two arrays per element, Add and Triad touch three.
*/
PetscErrorCode printResultsReadable(float times[][NTIMES])
{
  PetscErrorCode ierr;
  PetscInt       j, k;
  float          avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  /* const char*: ISO C++ forbids binding a string literal to a non-const
     char*, which nvcc's host compiler rejects (or warns about) otherwise */
  const char    *label[8]   = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "};
  float bytes_per_kernel[8] = {
    2. * sizeof(float) * N,  /* Copy:      read 1 array, write 1 array  */
    2. * sizeof(float) * N,  /* Copy Opt.                               */
    2. * sizeof(float) * N,  /* Scale:     read 1 array, write 1 array  */
    2. * sizeof(float) * N,  /* Scale Opt.                              */
    3. * sizeof(float) * N,  /* Add:       read 2 arrays, write 1 array */
    3. * sizeof(float) * N,  /* Add Opt.                                */
    3. * sizeof(float) * N,  /* Triad:     read 2 arrays, write 1 array */
    3. * sizeof(float) * N   /* Triad Opt.                              */
  };

  PetscFunctionBegin;
  /* --- SUMMARY --- */
  for (k = 1; k < NTIMES; ++k) { /* note -- skip first iteration (warm-up) */
    for (j = 0; j < 8; ++j) {
      const float t = 1.e-03f * times[j][k]; /* convert ms -> s once */

      avgtime[j] = avgtime[j] + t;
      mintime[j] = MIN(mintime[j], t);
      maxtime[j] = MAX(maxtime[j], t);
    }
  }

  ierr = PetscPrintf(PETSC_COMM_SELF, "Function Rate (MB/s) Avg time Min time Max time\n");CHKERRQ(ierr);

  for (j = 0; j < 8; ++j) {
    avgtime[j] = avgtime[j]/(float)(NTIMES-1);
    /* Rate uses the best (minimum) time, as in the reference STREAM code */
    ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f %11.6f %12.6f %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}