/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in double precision with a single option.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>

#define N      10000000
#define NTIMES 10

#ifndef MIN
#define MIN(x,y) ((x)<(y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y) ? (x) : (y))
#endif

/* Machine epsilons used by the verify routines to express absolute error in
   units-in-the-last-place (ULPs) of the expected result. */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with value using a grid-stride loop, so any 1-D launch
   configuration covers the whole array. */
__global__ void set_array(float *a, float value, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* Double-precision variant of set_array (requires SM13+ for doubles). */
__global__ void set_array_double(double *a, double value, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* COPY kernel, grid-stride form: b(i) = a(i). */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* COPY kernel, double precision, grid-stride form: b(i) = a(i). */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* COPY kernel, one-element-per-thread form.  Requires the launch to provide at
   least len threads; otherwise the whole kernel is a no-op. */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.  The cast avoids 32-bit overflow of the
   * unsigned blockDim.x * gridDim.x product before the size_t comparison.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* COPY kernel, double precision, one-element-per-thread form. */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* SCALE kernel, grid-stride form: b(i) = scale * a(i). */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* SCALE kernel, double precision, grid-stride form: b(i) = scale * a(i). */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* SCALE kernel, one-element-per-thread form. */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* SCALE kernel, double precision, one-element-per-thread form. */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* ADD kernel, grid-stride form: c(i) = a(i) + b(i). */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* ADD kernel, double precision, grid-stride form: c(i) = a(i) + b(i). */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* ADD kernel, one-element-per-thread form. */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* ADD kernel, double precision, one-element-per-thread form. */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* TRIAD kernel, grid-stride form: c(i) = a(i) + scalar * b(i). */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* TRIAD kernel, double precision, grid-stride form: c(i) = a(i) + scalar * b(i). */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx   += (size_t)blockDim.x * gridDim.x;
  }
}

/* TRIAD kernel, one-element-per-thread form. */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* TRIAD kernel, double precision, one-element-per-thread form. */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* Host side verification routines.
   Each returns true when any element of the device result differs from the
   host-computed expectation by more than the stated number of ULPs. */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* fabs (not fabsf): fabsf would round the double operands to float */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = scale*a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* dbl_eps (not flt_eps): the error must be measured in double-precision ULPs */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* dbl_eps (not flt_eps): the error must be measured in double-precision ULPs */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + scalar*b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.f);
  }

  return bDifferent;
}

bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* dbl_eps (not flt_eps): the error must be measured in double-precision ULPs */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);

int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true: only the CPU-timer path fills the times[][] array
  PetscErrorCode  ierr;

  /* Blocking sync lets cudaStreamSynchronize() yield the CPU; must be set
     before any other CUDA runtime call establishes the device context. */
  ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);

  ierr = PetscInitialize(&argc, &argv, 0, help);if (ierr) return ierr;

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  ierr = PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  {
    /* Report the overall outcome unconditionally.  The previous code printed
       only inside "if (ierr)", where the ternary could never yield "PASSES". */
    PetscErrorCode benchmarkErr = ierr;
    ierr = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (benchmarkErr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr);
  }
  ierr = PetscFinalize();
  return ierr;
}

///////////////////////////////////////////////////////////////////////////////
//Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
/* Validate the requested device, pick a block size suited to its architecture,
   and dispatch to the single- or double-precision benchmark driver. */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  // ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
    /* supply the argument for %d (it was missing, which is undefined behavior) */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum);CHKERRQ(ierr);
    return -1;
  }

  /* Doubles require compute capability 1.3 or later. */
  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) {
    ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  } else {
    ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStream
///////////////////////////////////////////////////////////////////////////
/* Single-precision benchmark driver: times the 8 kernel variants NTIMES
   times (CPU wall-clock, milliseconds), verifies each kernel once against a
   host reference, then reports bandwidth via printResultsReadable(). */
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int   k;
  float times[8][NTIMES] = {{0}};  /* zero-init so the array is never read uninitialized */
  float scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration: 1-D grid, ceil(N / blockDim) blocks */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* Reset the timer at the top of every iteration; previously the Copy
       timing for k >= 1 silently accumulated the prior iteration's
       Triad-Optimized time because only segments 2..8 reset cpuTimer. */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
    float *h_a, *h_b, *h_c;
    bool  errorSTREAMkernel = true;

    if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    /* Triad writes d_a = d_b + scalar*d_c, hence the (h_b, h_c, h_a) argument order below */
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Double-precision benchmark driver; mirrors runStream() but with the
   *_double kernels and verify routines. */
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES] = {{0}};  /* zero-init so the array is never read uninitialized */
  double scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);

  /* Compute execution configuration: 1-D grid, ceil(N / blockDim) blocks */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* Reset the timer at the top of every iteration (see runStream). */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(&cpuTimer);
      times[0][k] = cpuTimer*1.e3;
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    if (bDontUseGPUTiming) {
      PetscTimeAdd(&cpuTimer);
      times[1][k] = cpuTimer*1.e3;
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
    double *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
      exit(-3000);
    }

    /* Initialize memory on the device (d_a is unused by the Scale test) */
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    /* Triad writes d_a = d_b + scalar*d_c, hence the (h_b, h_c, h_a) argument order below */
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
//Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
/* Summarize the timing array: compute per-kernel avg/min/max over the timed
   iterations (skipping the warm-up run, per STREAM convention), then reduce
   the Triad-Optimized bandwidth across MPI ranks and report it.  bsize is the
   element size in bytes (sizeof(float) or sizeof(double)). */
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscErrorCode ierr;
  PetscInt       j, k;
  float avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  /* Bytes moved per kernel invocation: Copy/Scale read one and write one
     array (2x), Add/Triad read two and write one (3x). */
  const float bytes_per_kernel[8] = {
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N
  };
  double rate,irate;
  int rank,size;
  PetscFunctionBegin;
  ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);CHKERRQ(ierr);
  /* --- SUMMARY ---
     Start at k = 1: the first iteration is a warm-up and is excluded, which
     makes the (NTIMES-1) divisor below consistent (previously all NTIMES
     samples were summed but still divided by NTIMES-1). */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < 8; ++j) {
    avgtime[j] = avgtime[j]/(float)(NTIMES-1);
  }
  j = 7; /* report the Triad-Optimized kernel's best time */
  irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j];
  ierr  = MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);CHKERRQ(ierr);
  if (!rank) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f Rate (MB/s)\n",size, rate);
      /* Record the single-process rate so later multi-process runs can report speedup. */
      fd = fopen("flops","w");
      if (fd) {
        fprintf(fd,"%g\n",rate);
        fclose(fd);
      }
    } else {
      double prate;
      fd = fopen("flops","r");
      if (fd && fscanf(fd,"%lg",&prate) == 1) {
        printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate/prate);
      } else {
        /* No baseline from a prior single-process run; report the rate alone
           instead of dereferencing a NULL FILE* as the old code did. */
        printf("%d %11.4f Rate (MB/s)\n", size, rate);
      }
      if (fd) fclose(fd);
    }
  }

  PetscFunctionReturn(0);
}