/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in double precision with a single option.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>
#include <petscdevice.h>

#define N      10000000
#define NTIMES 10

#ifndef MIN
#define MIN(x,y) ((x)<(y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y) ? (x) : (y))
#endif

/* Machine epsilons used by the element-wise relative-error checks below */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with value; grid-stride loop so any launch configuration is valid */
__global__ void set_array(float *a, float value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision variant of set_array() */
__global__ void set_array_double(double *a, double value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY kernel: b(i) = a(i), grid-stride loop */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY kernel, double precision */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY kernel, one element per thread; requires a thread index space >= len */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* COPY kernel, one element per thread, double precision */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* SCALE kernel: b(i) = scale*a(i), grid-stride loop */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SCALE kernel, double precision */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SCALE kernel, one element per thread; requires a thread index space >= len */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* SCALE kernel, one element per thread, double precision */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* ADD kernel: c(i) = a(i) + b(i), grid-stride loop */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* ADD kernel, double precision */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* ADD kernel, one element per thread; requires a thread index space >= len */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* ADD kernel, one element per thread, double precision */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* TRIAD kernel: c(i) = a(i) + scalar*b(i), grid-stride loop */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* TRIAD kernel, double precision */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* TRIAD kernel, one element per thread; requires a thread index space >= len */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* TRIAD kernel, one element per thread, double precision */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* Host side verification routines.
   Each returns true when any element differs from the expected result by more
   than a small multiple of machine epsilon (relative error in "ULPS"). */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* was fabsf(): silently truncated the double operands to float */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = scale*a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* was fabsf()/flt_eps: wrong epsilon for a double-precision check */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* was fabsf()/flt_eps: wrong epsilon for a double-precision check */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + scalar*b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination; triad accumulates two rounding
       steps, hence the looser 3-ULP bound */
    bDifferent = (relErrorULPS > 3.f);
  }

  return bDifferent;
}

bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* was fabsf()/flt_eps: wrong epsilon for a double-precision check */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);

int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true: the GPU-event timing path is disabled below
  PetscErrorCode  ierr;

  /* must be set before the CUDA context is created */
  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0));
  PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  /* Report the outcome unconditionally: the original guarded this print with
     if (ierr), so success printed nothing and failure always printed "FAILED". */
  ierr = setupStream(device, runDouble, cpuTiming);
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  PetscCall(PetscFinalize());
  return 0;
}

///////////////////////////////////////////////////////////////////////////////
//Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    PetscCallCUDA(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  PetscCallCUDA(cudaSetDevice((int)deviceNum));
  // PetscCall(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n"));
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) != cudaSuccess) {
    /* the original format string had %d with no matching argument */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) {
    PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  } else {
    PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  }
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStream: time and verify the single-precision kernels
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES]; /* row = kernel (Copy, Copy Opt, ..., Triad Opt), col = trial */
  float  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void**)&d_a, sizeof(float)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_b, sizeof(float)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_c, sizeof(float)*N));

  /* Compute execution configuration */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1)        */
  if (N % dimBlock.x != 0) dimGrid.x += 1; /* ceil-div so the whole vector is covered */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    // PetscCallCUDA(cudaEventRecord(start, 0));
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    /* NOTE: a live cudaEventRecord(stop, 0) was removed here -- `stop` is never
       declared or created (the event path is commented out), so it could not compile */
    // PetscCallCUDA(cudaEventRecord(stop, 0));
    // PetscCallCUDA(cudaEventSynchronize(stop));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;
    else {
      // PetscCallCUDA(cudaEventElapsedTime(&times[4][k], start, stop));
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
    float *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStreamDouble: time and verify the double-precision kernels
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int     k;
  float   times[8][NTIMES]; /* row = kernel, col = trial; milliseconds */
  double  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void**)&d_a, sizeof(double)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_b, sizeof(double)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_c, sizeof(double)*N));

  /* Compute execution configuration */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1)        */
  if (N % dimBlock.x != 0) dimGrid.x += 1; /* ceil-div so the whole vector is covered */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(&cpuTimer);
      times[0][k] = cpuTimer*1.e3;
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(&cpuTimer);
      times[1][k] = cpuTimer*1.e3;
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
    double *h_a, *h_b, *h_c;
    bool    errorSTREAMkernel = true;

    if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device (Scale reads only b and writes c) */
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
//Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
/*
  times[j][k] is the wall-clock time in milliseconds of kernel j on trial k;
  bsize is the element size in bytes (sizeof(float) or sizeof(double)).
  Reports the Triad Optimized (j = 7) bandwidth, summed over MPI ranks.
*/
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt j, k;
  float    avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float    maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float    mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  // char *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "};
  /* bytes moved per kernel invocation: copy/scale touch 2 vectors, add/triad 3 */
  const float bytes_per_kernel[8] = {
    2.f * bsize * N,
    2.f * bsize * N,
    2.f * bsize * N,
    2.f * bsize * N,
    3.f * bsize * N,
    3.f * bsize * N,
    3.f * bsize * N,
    3.f * bsize * N
  };
  double rate, irate;
  int    rank, size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size));
  /* --- SUMMARY --- skip trial k = 0 (warm-up), as in the original stream.c;
     the average was already divided by NTIMES-1 but previously summed all NTIMES */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < 8; ++j) avgtime[j] = avgtime[j]/(float)(NTIMES-1);

  j     = 7; /* report the Triad Optimized kernel */
  irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j];
  PetscCallMPI(MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f Rate (MB/s)\n", size, rate);
      /* record the single-process rate so parallel runs can report speedup */
      fd = fopen("flops", "w");
      if (fd) {
        fprintf(fd, "%g\n", rate);
        fclose(fd);
      }
    } else {
      double prate = rate; /* fallback: speedup 1.0 if no baseline file exists */
      fd = fopen("flops", "r");
      if (fd) {
        if (fscanf(fd, "%lg", &prate) != 1) prate = rate;
        fclose(fd);
      }
      printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate/prate);
    }
  }

  PetscFunctionReturn(0);
}