/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in double precision with a single option.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>
#include <petscdevice_cuda.h>

#define N      10000000 /* vector length used by every kernel */
#define NTIMES 10       /* timed repetitions of each kernel */

#if !defined(MIN)
  #define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#if !defined(MAX)
  #define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif

/* Machine epsilon for each precision; verification expresses relative error in ULPs */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with value (single precision), grid-stride loop */
__global__ void set_array(float *a, float value, size_t len)
{
  /* cast before multiply: the 32-bit product blockIdx.x*blockDim.x could overflow for large grids */
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    a[idx] = value;
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* Fill a[0..len) with value (double precision), grid-stride loop */
__global__ void set_array_double(double *a, double value, size_t len)
{
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    a[idx] = value;
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* b(i) = a(i), grid-stride loop */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* b(i) = a(i), grid-stride loop, double precision */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* b(i) = a(i), one element per thread; requires the grid to cover the whole vector */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}

/* b(i) = a(i), one element per thread, double precision */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}

/* b(i) = scale*a(i), grid-stride loop */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* b(i) = scale*a(i), grid-stride loop, double precision */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* b(i) = scale*a(i), one element per thread */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* b(i) = scale*a(i), one element per thread, double precision */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* c(i) = a(i) + b(i), grid-stride loop */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* c(i) = a(i) + b(i), grid-stride loop, double precision */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* c(i) = a(i) + b(i), one element per thread */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* c(i) = a(i) + b(i), one element per thread, double precision */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* c(i) = a(i) + scalar*b(i), grid-stride loop */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* c(i) = a(i) + scalar*b(i), grid-stride loop, double precision */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx += (size_t)blockDim.x * gridDim.x;
  }
}

/* c(i) = a(i) + scalar*b(i), one element per thread */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* c(i) = a(i) + scalar*b(i), one element per thread, double precision */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* Host side verification routines.
   Each returns true when any element's relative error exceeds the ULP tolerance. */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* fix: use double-precision fabs and dbl_eps (fabsf would truncate to float) */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = scale * a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale * a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* fix: was fabsf/flt_eps, which measured double results against single-precision epsilon */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* fix: was fabsf/flt_eps, which measured double results against single-precision epsilon */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + scalar * b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination; triad tolerates one extra rounding (3 ULPs) */
    bDifferent = (relErrorULPS > 3.f);
  }

  return bDifferent;
}

bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar * b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* fix: was fabsf/flt_eps, which measured double results against single-precision epsilon */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination; triad tolerates one extra rounding (3 ULPs) */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);

int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscErrorCode  ierr;

  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0));
  PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  /* fix: print the summary unconditionally; the original guarded this with `if (ierr)`,
     which made the "PASSES" branch unreachable */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  PetscCall(PetscFinalize());
  return 0;
}

///////////////////////////////////////////////////////////////////////////////
// Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
/*
  setupStream - validates the requested device, picks a block size based on the
  compute capability, and dispatches the single- or double-precision benchmark.
*/
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    PetscCallCUDA(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      /* cast: PetscInt may be 64-bit, %d expects int */
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  PetscCallCUDA(cudaSetDevice((int)deviceNum));
  // PetscCall(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n"));
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
    /* fix: the format string had %d but no argument was supplied */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128;                                               /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  PetscFunctionReturn(PETSC_SUCCESS);
}

///////////////////////////////////////////////////////////////////////////
// runStream
///////////////////////////////////////////////////////////////////////////
/*
  runStream - times the single-precision Copy/Scale/Add/Triad kernels
  (plain and "Optimized" one-thread-per-element variants), verifies each
  kernel against a host reference, and reports bandwidth.

  Timing is CPU wall-clock (bDontUseGPUTiming must be true; GPU-event timing
  remains commented out below).
*/
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES];
  float  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(float) * N));

  /* Compute execution configuration: ceil(N / blockDim.x) blocks */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* fix: reset before the first segment too; previously iterations k>0
       carried the Triad-segment time of iteration k-1 into times[0][k] */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    // PetscCallCUDA(cudaEventRecord(start, 0));
    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    /* fix: the cudaEventRecord(stop, ...) call was live but `stop` is never
       declared/created; keep the whole event-timing path commented out */
    // PetscCallCUDA(cudaEventRecord(stop, 0));
    // PetscCallCUDA(cudaEventSynchronize(stop));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;
    else {
      // PetscCallCUDA(cudaEventElapsedTime(&times[4][k], start, stop));
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels */
    float *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }

    if ((h_c = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n"); /* fix typo "Unalbe" */
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     * (cudaMemcpy is synchronizing, so no explicit sync is needed after launch)
     */

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    /* note argument order: kernel computes a(i) = b(i) + scalar*c(i) with this call */
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }
  /* continue from here */
  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  runStreamDouble - double-precision counterpart of runStream; identical
  structure, kernels, verification, and timing scheme.
*/
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int     k;
  float   times[8][NTIMES];
  double  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(double) * N));

  /* Compute execution configuration: ceil(N / blockDim.x) blocks */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* fix: reset before the first segment too (see runStream) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms (same structure as runStream for consistency)
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels */
    double *h_a, *h_b, *h_c;
    bool    errorSTREAMkernel = true;

    if ((h_a = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }

    if ((h_c = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n"); /* fix typo "Unalbe" */
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device (d_a is not read by Scale, so only b and c are reset) */
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }
  /* continue from here */
  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));
  PetscFunctionReturn(PETSC_SUCCESS);
}

///////////////////////////////////////////////////////////////////////////
//Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
/*
  printResultsReadable - reduces the per-kernel timings to min/avg/max,
  computes the Triad-Optimized bandwidth (MB/s) summed over all ranks, and
  prints it; a single-rank run also writes the rate to the file "flops",
  which multi-rank runs read back to report parallel speedup.

  times - times[kernel][iteration] in milliseconds (kernels 0..7 in the
          order Copy, Copy Opt, Scale, Scale Opt, Add, Add Opt, Triad,
          Triad Opt)
  bsize - element size in bytes (sizeof(float) or sizeof(double))
*/
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt j, k;
  float    avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float    maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float    mintime[8] = {1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30};
  // char *label[8] = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
  /* bytes moved per kernel invocation: 2 vectors for copy/scale, 3 for add/triad */
  const float bytes_per_kernel[8] = {2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N};
  double      rate, irate;
  int         rank, size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size));
  /* --- SUMMARY ---
     fix: start at k = 1, discarding the first (warm-up) iteration per the
     STREAM convention; this matches the existing NTIMES-1 average divisor */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < (PetscInt)PETSC_STATIC_ARRAY_LENGTH(avgtime); ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < (PetscInt)PETSC_STATIC_ARRAY_LENGTH(avgtime); ++j) avgtime[j] = avgtime[j] / (float)(NTIMES - 1);
  /* report the Triad-Optimized kernel (index 7); rate in MB/s from best (min) time */
  j     = 7;
  irate = 1.0E-06 * bytes_per_kernel[j] / mintime[j];
  PetscCallMPI(MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f Rate (MB/s)\n", size, rate);
      fd = fopen("flops", "w");
      if (fd) { /* guard: fopen may fail (read-only cwd); skip the file rather than crash */
        fprintf(fd, "%g\n", rate);
        fclose(fd);
      }
    } else {
      double prate = rate; /* fallback: speedup 1.0 when no single-rank baseline exists */
      fd = fopen("flops", "r");
      if (fd) {
        if (fscanf(fd, "%lg", &prate) != 1) prate = rate;
        fclose(fd);
      }
      printf("%d %11.4f Rate (MB/s) %g\n", size, rate, rate / prate);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}