/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in double precision with a single option.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>
#include <petscdevice.h>

#define N      10000000
#define NTIMES 10

#ifndef MIN
#define MIN(x,y) ((x)<(y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y) ? (x) : (y))
#endif

/* Machine epsilons; the verification routines below express the element-wise
   relative error in units of eps (ULPs) */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with value (single precision), grid-stride loop */
__global__ void set_array(float *a, float value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* Fill a[0..len) with value (double precision), grid-stride loop */
__global__ void set_array_double(double *a, double value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY: b(i) = a(i), grid-stride loop */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY: b(i) = a(i), double precision, grid-stride loop */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* COPY, one element per thread; requires a thread index space covering len */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* COPY, one element per thread, double precision */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* SCALE: b(i) = scale*a(i), grid-stride loop */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale* a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SCALE: b(i) = scale*a(i), double precision, grid-stride loop */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale* a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SCALE, one element per thread; requires a thread index space covering len */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale* a[idx];
}

/* SCALE, one element per thread, double precision */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale* a[idx];
}

/* SUM: c(i) = a(i) + b(i), grid-stride loop */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SUM: c(i) = a(i) + b(i), double precision, grid-stride loop */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* SUM, one element per thread; requires a thread index space covering len */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+b[idx];
}

/* SUM, one element per thread, double precision */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+b[idx];
}

/* TRIAD: c(i) = a(i) + scalar*b(i), grid-stride loop */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+scalar*b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* TRIAD: c(i) = a(i) + scalar*b(i), double precision, grid-stride loop */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx]+scalar*b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* TRIAD, one element per thread; requires a thread index space covering len */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+scalar*b[idx];
}

/* TRIAD, one element per thread, double precision */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx]+scalar*b[idx];
}

/* Host side verification routines.
   Each returns true when some element differs from the expected result by
   more than the stated tolerance (in units of machine epsilon). */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* bug fix: use fabs (not fabsf) on double arguments */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = scale*a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* bug fix: was fabsf/flt_eps — the ULP scaling must use dbl_eps for doubles */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* bug fix: was fabsf/flt_eps — the ULP scaling must use dbl_eps for doubles */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult = a[idx] + scalar*b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.f);
  }

  return bDifferent;
}

bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* bug fix: was fabsf/flt_eps — the ULP scaling must use dbl_eps for doubles */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);

int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscErrorCode  ierr;

  /* Block (rather than spin) the host thread while waiting on the device */
  CHKERRCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  CHKERRQ(PetscInitialize(&argc, &argv, 0, help));

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  CHKERRQ(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0));
  CHKERRQ(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
  ierr = PetscOptionsEnd();CHKERRQ(ierr);

  /* bug fix: the result line was only printed when ierr != 0, so the
     "PASSES" branch could never be reached; report the outcome always */
  ierr = setupStream(device, runDouble, cpuTiming);
  CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  CHKERRQ(PetscFinalize());
  return 0;
}

///////////////////////////////////////////////////////////////////////////////
//Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    CHKERRCUDA(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  CHKERRCUDA(cudaSetDevice((int)deviceNum));
  // CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n"));
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
    /* bug fix: the %d argument (deviceNum) was missing from the variadic list */
    CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) {
    CHKERRQ(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  } else {
    CHKERRQ(runStream(iNumThreadsPerBlock, cpuTiming));
  }
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStream
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int   k;
  float times[8][NTIMES];  /* milliseconds per kernel per repetition */
  float scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  CHKERRCUDA(cudaMalloc((void**)&d_a, sizeof(float)*N));
  CHKERRCUDA(cudaMalloc((void**)&d_b, sizeof(float)*N));
  CHKERRCUDA(cudaMalloc((void**)&d_c, sizeof(float)*N));

  /* Compute execution configuration */

  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1;

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar=3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* bug fix: reset the timer at the start of every iteration; previously
       the first section of iterations k > 0 accumulated the time left over
       from the last section of the previous iteration */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    // CHKERRCUDA(cudaEventRecord(start, 0));
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    /* bug fix: a live cudaEventRecord(stop, 0) referenced an undeclared
       event; the rest of the event-timing path is commented out, so keep
       it commented out as well */
    // CHKERRCUDA(cudaEventRecord(stop, 0));
    // CHKERRCUDA(cudaEventSynchronize(stop));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;
    // else CHKERRCUDA(cudaEventElapsedTime(&times[4][k], start, stop));

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
    float *h_a, *h_b, *h_c;
    bool errorSTREAMkernel = true;

    if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }

    if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }
  /* continue from here */
  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  CHKERRCUDA(cudaFree(d_a));
  CHKERRCUDA(cudaFree(d_b));
  CHKERRCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}

PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES];  /* milliseconds per kernel per repetition */
  double scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  CHKERRCUDA(cudaMalloc((void**)&d_a, sizeof(double)*N));
  CHKERRCUDA(cudaMalloc((void**)&d_b, sizeof(double)*N));
  CHKERRCUDA(cudaMalloc((void**)&d_c, sizeof(double)*N));

  /* Compute execution configuration */

  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1;

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar=3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* bug fix: reset the timer at the start of every iteration (see runStream) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
    double *h_a, *h_b, *h_c;
    bool errorSTREAMkernel = true;

    if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }

    if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device (d_a is not read by SCALE) */
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }
  /* continue from here */
  printResultsReadable(times,sizeof(double));

  /* Free memory on device */
  CHKERRCUDA(cudaFree(d_a));
  CHKERRCUDA(cudaFree(d_b));
  CHKERRCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
//Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
/*
  times - per-kernel, per-repetition wall times in milliseconds
  bsize - element size in bytes (sizeof(float) or sizeof(double))

  Reports the optimized-Triad bandwidth (MB/s), summed over ranks. Rank 0
  writes the single-process rate to the file "flops"; multi-process runs
  read that file back to report parallel speedup.
*/
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt    j, k;
  float       avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  // char *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "};
  /* bytes moved per kernel invocation: 2 arrays for Copy/Scale, 3 for Add/Triad */
  const float bytes_per_kernel[8] = {
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N
  };
  double rate,irate;
  int    rank,size;

  PetscFunctionBegin;
  CHKERRMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
  CHKERRMPI(MPI_Comm_size(MPI_COMM_WORLD,&size));
  /* --- SUMMARY --- skip the first (warm-up) iteration, per STREAM convention;
     bug fix: all NTIMES samples were summed while dividing by NTIMES-1 */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < 8; ++j) {
    avgtime[j] = avgtime[j]/(float)(NTIMES-1);
  }
  j = 7; /* report the optimized Triad kernel */
  irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j];
  CHKERRMPI(MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f Rate (MB/s)\n",size, rate);
      fd = fopen("flops","w");
      if (fd) { /* record the uniprocess rate for later speedup comparison */
        fprintf(fd,"%g\n",rate);
        fclose(fd);
      }
    } else {
      double prate = rate; /* fallback: speedup 1.0 if "flops" is unavailable */
      fd = fopen("flops","r");
      if (fd) {
        /* bug fix: fopen/fscanf results were unchecked (crash on missing file) */
        if (fscanf(fd,"%lg",&prate) != 1) prate = rate;
        fclose(fd);
      }
      printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate/prate);
    }
  }

  PetscFunctionReturn(0);
}