/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in single precision.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
/* Bug fix: binding a string literal to a mutable 'char *' is a deprecated
   conversion in C++ and writing through it would be undefined behavior;
   use a const array instead (PetscInitialize takes a const char[]). */
static const char help[] = "Single-Precision STREAM Benchmark implementation in CUDA\n"
                           "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>

#define N        2000000 /* vector length used by the benchmark kernels */
#define N_DOUBLE 8000000
#define NTIMES   10      /* repetitions of each timed kernel */

#ifndef MIN
#define MIN(x,y) ((x)<(y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y) ? (x) : (y))
#endif

/* Machine epsilons used by the host-side verification routines below. */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with 'value'; grid-stride loop, correct for any grid size. */
__global__ void set_array(float *a, float value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision variant of set_array(). */
__global__ void set_array_double(double *a, double value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY: b(i) = a(i); grid-stride loop, correct for any grid size. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision COPY: b(i) = a(i). */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY, one element per thread: requires the grid to cover the whole vector;
   if it does not, the kernel deliberately returns having done no work. */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* Double-precision variant of STREAM_Copy_Optimized(). */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* SCALE: b(i) = scale * a(i); grid-stride loop. */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale* a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision SCALE: b(i) = scale * a(i). */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale* a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SCALE, one element per thread (grid must cover the vector, see above). */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale* a[idx];
}

/* Double-precision variant of STREAM_Scale_Optimized(). */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale* a[idx];
}

/* SUM: c(i) = a(i) + b(i); grid-stride loop. */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision SUM: c(i) = a(i) + b(i). */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SUM, one element per thread (grid must cover the vector, see above). */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+b[idx];
}

/* Double-precision variant of STREAM_Add_Optimized(). */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+b[idx];
}

/* TRIAD: c(i) = a(i) + scalar * b(i); grid-stride loop. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+scalar*b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision TRIAD: c(i) = a(i) + scalar * b(i). */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+scalar*b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* TRIAD, one element per thread (grid must cover the vector, see above). */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+scalar*b[idx];
}

/* Double-precision variant of STREAM_Triad_Optimized(). */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
215 */ 216 if (blockDim.x * gridDim.x < len) return; 217 size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 218 if (idx < len) c[idx] = a[idx]+scalar*b[idx]; 219 } 220 221 /* Host side verification routines */ 222 bool STREAM_Copy_verify(float *a, float *b, size_t len) 223 { 224 size_t idx; 225 bool bDifferent = false; 226 227 for (idx = 0; idx < len && !bDifferent; idx++) { 228 float expectedResult = a[idx]; 229 float diffResultExpected = (b[idx] - expectedResult); 230 float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 231 /* element-wise relative error determination */ 232 bDifferent = (relErrorULPS > 2.f); 233 } 234 235 return bDifferent; 236 } 237 238 bool STREAM_Copy_verify_double(double *a, double *b, size_t len) 239 { 240 size_t idx; 241 bool bDifferent = false; 242 243 for (idx = 0; idx < len && !bDifferent; idx++) { 244 double expectedResult = a[idx]; 245 double diffResultExpected = (b[idx] - expectedResult); 246 double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 247 /* element-wise relative error determination */ 248 bDifferent = (relErrorULPS > 2.); 249 } 250 251 return bDifferent; 252 } 253 254 bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) 255 { 256 size_t idx; 257 bool bDifferent = false; 258 259 for (idx = 0; idx < len && !bDifferent; idx++) { 260 float expectedResult = scale*a[idx]; 261 float diffResultExpected = (b[idx] - expectedResult); 262 float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 263 /* element-wise relative error determination */ 264 bDifferent = (relErrorULPS > 2.f); 265 } 266 267 return bDifferent; 268 } 269 270 bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) 271 { 272 size_t idx; 273 bool bDifferent = false; 274 275 for (idx = 0; idx < len && !bDifferent; idx++) { 276 double expectedResult = scale*a[idx]; 277 double diffResultExpected = (b[idx] - expectedResult); 278 double relErrorULPS = 
(fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 279 /* element-wise relative error determination */ 280 bDifferent = (relErrorULPS > 2.); 281 } 282 283 return bDifferent; 284 } 285 286 bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) 287 { 288 size_t idx; 289 bool bDifferent = false; 290 291 for (idx = 0; idx < len && !bDifferent; idx++) { 292 float expectedResult = a[idx] + b[idx]; 293 float diffResultExpected = (c[idx] - expectedResult); 294 float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 295 /* element-wise relative error determination */ 296 bDifferent = (relErrorULPS > 2.f); 297 } 298 299 return bDifferent; 300 } 301 302 bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) 303 { 304 size_t idx; 305 bool bDifferent = false; 306 307 for (idx = 0; idx < len && !bDifferent; idx++) { 308 double expectedResult = a[idx] + b[idx]; 309 double diffResultExpected = (c[idx] - expectedResult); 310 double relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 311 /* element-wise relative error determination */ 312 bDifferent = (relErrorULPS > 2.); 313 } 314 315 return bDifferent; 316 } 317 318 bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) 319 { 320 size_t idx; 321 bool bDifferent = false; 322 323 for (idx = 0; idx < len && !bDifferent; idx++) { 324 float expectedResult = a[idx] + scalar*b[idx]; 325 float diffResultExpected = (c[idx] - expectedResult); 326 float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps; 327 /* element-wise relative error determination */ 328 bDifferent = (relErrorULPS > 3.f); 329 } 330 331 return bDifferent; 332 } 333 334 bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) 335 { 336 size_t idx; 337 bool bDifferent = false; 338 339 for (idx = 0; idx < len && !bDifferent; idx++) { 340 double expectedResult = a[idx] + scalar*b[idx]; 341 double 
diffResultExpected = (c[idx] - expectedResult);
    /* Bug fix: measure in double-precision ulps with fabs()/dbl_eps; the
       original used fabsf()/flt_eps, a far looser tolerance than intended. */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES]);

/* Parse options (-device, -double, -cputiming), run the benchmark, and report
   an overall PASSES/FAILED verdict based on setupStream()'s return value. */
int main(int argc, char *argv[])
{
  PetscInt       device    = 0;           /* CUDA device index to use */
  PetscBool      runDouble = PETSC_FALSE; /* also run the double-precision suite */
  PetscBool      cpuTiming = PETSC_FALSE; /* time with the CPU clock instead of CUDA events */
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr); /* bug fix: return value was ignored */

  /* setupStream() encodes benchmark failure in its return value (negative on
     error), so it is intentionally not wrapped in CHKERRQ here. */
  ierr = setupStream(device, runDouble, cpuTiming);
  if (ierr >= 0) {
    PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
  }
  PetscFinalize();
  return 0;
}

///////////////////////////////////////////////////////////////////////////////
// Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
/* Select and validate the CUDA device, choose a block size for the detected
   architecture, then run the single- and (optionally) double-precision
   benchmarks. Returns 0 on success, a negative value on failure. */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      /* cast PetscInt to int so the argument matches the %d conversion even
         when PETSc is configured with 64-bit indices */
      ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", (int)deviceNum, deviceProp.name);CHKERRQ(ierr);
  } else {
    /* bug fix: the format string contains %d but no argument was supplied,
       so PetscPrintf read garbage from the varargs */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum);CHKERRQ(ierr);
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else                                                iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  if (cpuTiming) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
  }

  ierr =
runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  if (runDouble) {
    /* NOTE(review): cudaSetDeviceFlags() called after the device context is
       already active (cudaSetDevice/kernels above) typically fails with
       cudaErrorSetOnActiveProcess — confirm the flag actually takes effect. */
    ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);
    ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStream
///////////////////////////////////////////////////////////////////////////
/* Time the eight single-precision STREAM kernels NTIMES times each
   (rows of 'times': 0 Copy, 1 Copy_Optimized, 2 Scale, 3 Scale_Optimized,
   4 Add, 5 Add_Optimized, 6 Triad, 7 Triad_Optimized), verify the results
   on the host, and print a readable report.
   NOTE(review): when bDontUseGPUTiming is set the stored value comes from
   PetscTime (seconds), while cudaEventElapsedTime reports milliseconds —
   confirm printResultsReadable() normalizes the units. */
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float          *d_a, *d_b, *d_c;   /* device vectors */
  int            k;
  float          times[8][NTIMES];   /* per-kernel, per-iteration timings */
  float          scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration: enough blocks to cover N elements */
  dim3 dimBlock(iNumThreadsPerBlock);   /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);           /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1;

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t    start, stop;

  /* both timers report msec */
  ierr = cudaEventCreate(&start);CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate(&stop);CHKERRQ(ierr);  /* gpu timer facility */

  scalar=3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* Copy -> times[0] */
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[0][k], start, stop);CHKERRQ(ierr);
    }

    /* Copy (optimized) -> times[1] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[1][k], start, stop);CHKERRQ(ierr);
    }

    /* Scale -> times[2] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
    }

    /* Scale (optimized) -> times[3] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
    }

    /* Add -> times[4] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
    }

    /* Add (optimized) -> times[5] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[5][k], start, stop);CHKERRQ(ierr);
    }

    /* Triad -> times[6] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[6][k], start, stop);CHKERRQ(ierr);
    }

    /* Triad (optimized) -> times[7] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[7][k], start, stop);CHKERRQ(ierr);
    }
  }

  /* verify kernels */
  float *h_a, *h_b, *h_c;
  bool  errorSTREAMkernel = true;

  if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }
  if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_c, exiting ...\n"); /* typo fix: "Unalbe" */
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* continue from here */
  printResultsReadable(times);

  //clean up timers
  ierr = cudaEventDestroy(stop);CHKERRQ(ierr);
  ierr = cudaEventDestroy(start);CHKERRQ(ierr);

  /* bug fix: the host verification buffers were leaked */
  free(h_a);
  free(h_b);
  free(h_c);

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Double-precision counterpart of runStream(); same timing-row layout. */
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double         *d_a, *d_b, *d_c;   /* device vectors */
  int            k;
  float          times[8][NTIMES];
  double         scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);

  /* Compute execution configuration: enough blocks to cover N elements */
  dim3 dimBlock(iNumThreadsPerBlock);   /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);           /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1;

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times ---
     Bug fix: the GPU-timing branches below stored results into the wrong rows
     of 'times' (times[2]/[3]/[3]/[4]/[4] instead of [3]/[4]/[5]/[6]/[7]),
     overwriting earlier measurements and leaving rows 5-7 uninitialized when
     event timing was used; the indices now match the CPU-timing branches. */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t    start, stop;

  /* both timers report msec */
  ierr = cudaEventCreate(&start);CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate(&stop);CHKERRQ(ierr);  /* gpu timer facility */

  scalar=3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* Copy -> times[0] */
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(&cpuTimer);
      times[0][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[0][k], start, stop);CHKERRQ(ierr);
    }

    /* Copy (optimized) -> times[1] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(&cpuTimer);
      times[1][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[1][k], start, stop);CHKERRQ(ierr);
    }

    /* Scale -> times[2] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
    }

    /* Scale (optimized) -> times[3] (bug fix: was &times[2][k]) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
    }

    /* Add -> times[4] (bug fix: was &times[3][k]) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
    }

    /* Add (optimized) -> times[5] (bug fix: was &times[3][k]) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[5][k], start, stop);CHKERRQ(ierr);
    }

    /* Triad -> times[6] (bug fix: was &times[4][k]) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[6][k], start, stop);CHKERRQ(ierr);
    }

    /* Triad (optimized) -> times[7] (bug fix: was &times[4][k]) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[7][k], start, stop);CHKERRQ(ierr);
    }
  }

  /* verify kernels */
  double *h_a, *h_b, *h_c;
  bool   errorSTREAMkernel = true;

  if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }
  if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_c, exiting ...\n"); /* typo fix: "Unalbe" */
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(double) *
N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 922 ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 923 errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N); 924 if (errorSTREAMkernel) { 925 ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr); 926 exit(-5000); 927 } else { 928 ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr); 929 } 930 931 /* Initialize memory on the device */ 932 set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N); 933 set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N); 934 set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N); 935 936 STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N); 937 ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 938 ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 939 ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr); 940 errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N); 941 if (errorSTREAMkernel) { 942 ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr); 943 exit(-6000); 944 } else { 945 ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr); 946 } 947 948 /* continue from here */ 949 printResultsReadable(times); 950 951 //clean up timers 952 ierr = cudaEventDestroy(stop);CHKERRQ(ierr); 953 ierr = cudaEventDestroy(start);CHKERRQ(ierr); 954 955 /* Free memory on device */ 956 ierr = cudaFree(d_a);CHKERRQ(ierr); 957 ierr = cudaFree(d_b);CHKERRQ(ierr); 958 ierr = cudaFree(d_c);CHKERRQ(ierr); 959 PetscFunctionReturn(0); 960 } 961 962 /////////////////////////////////////////////////////////////////////////// 963 //Print Results to Screen and File 964 /////////////////////////////////////////////////////////////////////////// 965 
/*
  printResultsReadable - Print a human-readable summary table of the benchmark timings.

  Input Parameter:
. times - times[j][k] is the elapsed time in milliseconds of kernel j on trial k,
          for the 8 timed kernels in order: Copy, Copy Opt., Scale, Scale Opt.,
          Add, Add Opt., Triad, Triad Opt.

  Notes:
  The first trial (k = 0) is treated as a warm-up and excluded from the
  statistics, following the STREAM benchmark convention. Rates are reported
  in MB/s based on the best (minimum) time observed.
*/
PetscErrorCode printResultsReadable(float times[][NTIMES])
{
  PetscErrorCode ierr;
  PetscInt       j, k;
  float          avgtime[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
  float          maxtime[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
  float          mintime[8] = {1e30f, 1e30f, 1e30f, 1e30f, 1e30f, 1e30f, 1e30f, 1e30f};
  /* string literals are immutable: the array must be const char* (a plain
     char* here is a deprecated/ill-formed conversion in standard C++) */
  const char    *label[8]   = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "};
  /* Bytes moved per kernel launch: copy/scale read one vector and write one
     (2 transfers of N elements); add/triad read two and write one (3 transfers).
     NOTE(review): these use sizeof(float), yet the timing slots are also filled
     by the double-precision kernels elsewhere in this file -- confirm whether a
     sizeof(double) variant is intended for those runs. */
  float          bytes_per_kernel[8] = {
    2.f * sizeof(float) * N,
    2.f * sizeof(float) * N,
    2.f * sizeof(float) * N,
    2.f * sizeof(float) * N,
    3.f * sizeof(float) * N,
    3.f * sizeof(float) * N,
    3.f * sizeof(float) * N,
    3.f * sizeof(float) * N
  };

  PetscFunctionBegin;
  /* --- SUMMARY --- */
  for (k = 1; k < NTIMES; ++k) { /* note -- skip first (warm-up) iteration */
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); /* convert ms -> s */
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }

  ierr = PetscPrintf(PETSC_COMM_SELF, "Function  Rate (MB/s)    Avg time      Min time      Max time\n");CHKERRQ(ierr);

  for (j = 0; j < 8; ++j) {
    /* NTIMES-1 trials contribute because the warm-up pass is skipped above */
    avgtime[j] = avgtime[j] / (float)(NTIMES - 1);
    ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f  %11.6f  %12.6f  %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j] / mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}