/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in double precision with a single option.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>
#include <petscdevice.h>

#define N      10000000
#define NTIMES 10

#ifndef MIN
  #define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
  #define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif

/* Machine epsilons: the unit in which the element-wise relative errors below are measured */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with value; grid-stride loop, so any launch configuration covers the array */
__global__ void set_array(float *a, float value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision variant of set_array() */
__global__ void set_array_double(double *a, double value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY kernel: b(i) = a(i); grid-stride loop */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY kernel, double precision: b(i) = a(i) */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY kernel, one element per thread (no loop); requires the grid to cover len elements */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* COPY kernel, double precision, one element per thread */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* SCALE kernel: b(i) = scale*a(i); grid-stride loop */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SCALE kernel, double precision: b(i) = scale*a(i) */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SCALE kernel, one element per thread; requires the grid to cover len elements */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* SCALE kernel, double precision, one element per thread */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* ADD (SUM) kernel: c(i) = a(i) + b(i); grid-stride loop */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* ADD (SUM) kernel, double precision: c(i) = a(i) + b(i) */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* ADD kernel, one element per thread; requires the grid to cover len elements */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* ADD kernel, double precision, one element per thread */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* TRIAD kernel: c(i) = a(i) + scalar*b(i); grid-stride loop */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* TRIAD kernel, double precision: c(i) = a(i) + scalar*b(i) */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* TRIAD kernel, one element per thread; requires the grid to cover len elements */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* TRIAD kernel, double precision, one element per thread */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* Host side verification routines.
   Each returns true when any element differs from the expected result by more
   than a small multiple of machine epsilon (element-wise relative error). */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* was fabsf(): truncated the double operands to float before comparing */
    double relErrorULPS       = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = scale * a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale * a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* was fabsf()/flt_eps: wrong precision and wrong epsilon for double data */
    double relErrorULPS       = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* was fabsf()/flt_eps: wrong precision and wrong epsilon for double data */
    double relErrorULPS       = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + scalar * b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.f);
  }

  return bDifferent;
}

bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar * b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* was fabsf()/flt_eps: wrong precision and wrong epsilon for double data */
    double relErrorULPS       = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);

int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscErrorCode  ierr;

  /* Device flags must be set before the CUDA context is created */
  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0));
  PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  /* Report the outcome unconditionally: the original only printed on failure,
     which made the "PASSES" branch unreachable */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  PetscCall(PetscFinalize());
  return 0;
}

///////////////////////////////////////////////////////////////////////////////
//Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
/*
  Validate the requested CUDA device, choose a block size for its
  architecture, and dispatch the single- or double-precision benchmark.

  deviceNum - requested CUDA device (falls back to device 0 if invalid)
  runDouble - run the double-precision benchmark instead of single
  cpuTiming - forwarded to the run routines as bDontUseGPUTiming
*/
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    PetscCallCUDA(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  PetscCallCUDA(cudaSetDevice((int)deviceNum));
  // PetscCall(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n"));
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) != cudaSuccess) {
    /* the original format string had a %d with no matching argument */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128;                                               /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStream
///////////////////////////////////////////////////////////////////////////
/*
  Run the single-precision STREAM kernels NTIMES times each, verify every
  kernel against a host-side reference, and print the measured bandwidth.

  iNumThreadsPerBlock - threads per CUDA block for every launch
  bDontUseGPUTiming   - when PETSC_TRUE (required; main() forces it) kernels
                        are timed on the host with PetscTime; times[][] is
                        only written in that case
*/
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES];
  float  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(float) * N));

  /* Compute execution configuration: one thread per element, rounded up */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times ---
     Timing pattern for every kernel: start host timer, launch, synchronize
     the stream so the kernel has completed, barrier across ranks, stop. */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    /* NOTE: a stray cudaEventRecord(stop, 0) was removed here; the event was
       never declared (the surrounding event-timing code is commented out) */
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels */
    float *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }

    if ((h_c = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    /* cudaMemcpy synchronizes with the preceding kernel launch */
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    /* Triad writes d_a = d_b + scalar*d_c */
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}

/*
  Double-precision variant of runStream(); identical structure with the
  _double kernels and verify routines.
*/
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int     k;
  float   times[8][NTIMES];
  double  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(double) * N));

  /* Compute execution configuration: one thread per element, rounded up */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels */
    double *h_a, *h_b, *h_c;
    bool    errorSTREAMkernel = true;

    if ((h_a = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }

    if ((h_c = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device (Scale reads only b; c is overwritten) */
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    /* Triad writes d_a = d_b + scalar*d_c */
    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
//Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
/*
  Summarize the timings: compute per-kernel avg/min/max (skipping the k=0
  warm-up iteration, as in the reference STREAM benchmark), convert the best
  Triad-Optimized time to a bandwidth in MB/s, sum the rate across ranks, and
  on rank 0 print it (writing/reading the "flops" file to compare parallel
  runs against a stored single-rank baseline).

  times - times[kernel][iteration] in milliseconds (8 kernels x NTIMES runs)
  bsize - bytes per element (sizeof(float) or sizeof(double))
*/
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt    j, k;
  float       avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       mintime[8] = {1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30};
  /* bytes moved per kernel invocation: Copy/Scale touch 2 arrays, Add/Triad 3 */
  const float bytes_per_kernel[8] = {
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N
  };
  double rate, irate;
  int    rank, size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size));
  /* --- SUMMARY --- skip the first (warm-up) iteration so the NTIMES-1
     divisor below matches the number of samples accumulated */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < 8; ++j) avgtime[j] = avgtime[j] / (float)(NTIMES - 1);

  /* Report the Triad-Optimized kernel (index 7) at its best time */
  j     = 7;
  irate = 1.0E-06 * bytes_per_kernel[j] / mintime[j];
  PetscCallMPI(MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f Rate (MB/s)\n", size, rate);
      fd = fopen("flops", "w");
      if (fd) {
        fprintf(fd, "%g\n", rate);
        fclose(fd);
      }
    } else {
      /* compare against the single-rank baseline stored in "flops", if present */
      double prate = 0.0;
      fd = fopen("flops", "r");
      if (fd) {
        if (fscanf(fd, "%lg", &prate) != 1) prate = 0.0;
        fclose(fd);
      }
      if (prate > 0.0) printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate / prate);
      else printf("%d %11.4f Rate (MB/s)\n", size, rate);
    }
  }

  PetscFunctionReturn(0);
}