/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in double precision with a single option.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>
#include <petscdevice_cuda.h>

#define N      10000000
#define NTIMES 10

#ifndef MIN
  #define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
  #define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif

/* Machine epsilons: host-side verification expresses the element-wise relative
   error in units of eps (a ULP-like measure) and compares against a small bound */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with value.  Grid-stride loop: correct for any grid size.
   The stride is computed in size_t to avoid 32-bit overflow on very large grids. */
__global__ void set_array(float *a, float value, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) a[idx] = value;
}

/* Double-precision variant of set_array */
__global__ void set_array_double(double *a, double value, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) a[idx] = value;
}

/* COPY kernel: b(i) = a(i) (note: destination is the SECOND argument) */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) b[idx] = a[idx];
}

__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) b[idx] = a[idx];
}

/* COPY kernel, "optimized" form: one element per thread, no loop.
   Requires the thread index space to cover the vector; otherwise the whole
   launch bails out (the product is computed in size_t so the guard cannot
   overflow and pass spuriously). */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* SCALE kernel: b(i) = scale * a(i) */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) b[idx] = scale * a[idx];
}

__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) b[idx] = scale * a[idx];
}

/* SCALE kernel, one element per thread; see STREAM_Copy_Optimized for the guard */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* ADD kernel: c(i) = a(i) + b(i) */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) c[idx] = a[idx] + b[idx];
}

__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) c[idx] = a[idx] + b[idx];
}

/* ADD kernel, one element per thread; see STREAM_Copy_Optimized for the guard */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* TRIAD kernel: c(i) = a(i) + scalar * b(i) */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) c[idx] = a[idx] + scalar * b[idx];
}

__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < len; idx += stride) c[idx] = a[idx] + scalar * b[idx];
}

/* TRIAD kernel, one element per thread; see STREAM_Copy_Optimized for the guard */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* Host side verification routines.
   Each returns true when some element differs from the expected value by more
   than a small multiple of machine epsilon (relative error in eps units).
   NOTE: assumes expectedResult != 0 — true for the constant test vectors used
   by the benchmark (2.0 and 0.5 initial values). */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* BUG FIX: use fabs() (double), not fabsf(), to avoid truncating to float */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = scale * a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale * a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* BUG FIX: was fabsf()/flt_eps — double results must be measured against
       dbl_eps with double-precision fabs(), otherwise the tolerance is wrong
       by ~9 orders of magnitude */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}

bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* BUG FIX: was fabsf()/flt_eps — see STREAM_Scale_verify_double */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}

bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + scalar * b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
    /* element-wise relative error determination; triad allows one extra ulp */
    bDifferent = (relErrorULPS > 3.f);
  }

  return bDifferent;
}

bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar * b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* BUG FIX: was fabsf()/flt_eps — see STREAM_Scale_verify_double */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination; triad allows one extra ulp */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);

int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscErrorCode  ierr;

  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0));
  PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  /* BUG FIX: the result line was printed only when ierr != 0, so "PASSES" was
     unreachable and only "FAILED" could ever appear; report the outcome always */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  PetscCall(PetscFinalize());
  return 0;
}

///////////////////////////////////////////////////////////////////////////////
//Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
/* Select and validate the CUDA device, pick a block size suited to the
   architecture, and dispatch to the single- or double-precision driver.
   Returns a nonzero code when no usable device is found. */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    PetscCallCUDA(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      /* cast: PetscInt may be 64-bit while %d expects int */
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  PetscCallCUDA(cudaSetDevice((int)deviceNum));
  // PetscCall(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n"));
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) != cudaSuccess) {
    /* BUG FIX: the %d conversion had no matching argument (undefined behavior) */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128;                                               /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  PetscFunctionReturn(PETSC_SUCCESS);
}

///////////////////////////////////////////////////////////////////////////
// runStream
///////////////////////////////////////////////////////////////////////////
/* Single-precision STREAM driver: times 8 kernels (plain and "optimized"
   variants of Copy/Scale/Add/Triad) NTIMES times, verifies each kernel
   against a host reference, and reports bandwidth.
   bDontUseGPUTiming selects CPU (wall-clock) timing; the caller always
   passes PETSC_TRUE, and GPU-event timing is not implemented. */
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES] = {{0}}; /* zero-init so no row is ever read uninitialized */
  float  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(float) * N));

  /* Compute execution configuration: ceil(N / blockDim) blocks */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
  PetscCallCUDA(cudaGetLastError()); /* surface any launch-configuration error */

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* Each section: reset timer, launch, synchronize (PetscCallCUDA now checks
       the synchronize result so asynchronous kernel faults are not swallowed),
       barrier so all ranks time the same interval, record milliseconds. */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    /* BUG FIX: a stray PetscCallCUDA(cudaEventRecord(stop, 0)) referenced an
       undeclared event 'stop' (the rest of the event-timing code is commented
       out); it has been removed */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels */
    float *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    /* triad writes into d_a: a = b + scalar*c */
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Double-precision STREAM driver; mirrors runStream (see its comments).
   The timing pattern of the first two sections has been unified with the
   remaining sections (timer is reset per section, so behavior is unchanged). */
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int     k;
  float   times[8][NTIMES] = {{0}}; /* zero-init so no row is ever read uninitialized */
  double  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(double) * N));

  /* Compute execution configuration: ceil(N / blockDim) blocks */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
  PetscCallCUDA(cudaGetLastError()); /* surface any launch-configuration error */

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels */
    double *h_a, *h_b, *h_c;
    bool    errorSTREAMkernel = true;

    if ((h_a = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device (d_a unused by Scale but reset for parity
       with the single-precision path) */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    /* triad writes into d_a: a = b + scalar*c */
    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));
  PetscFunctionReturn(PETSC_SUCCESS);
}

///////////////////////////////////////////////////////////////////////////
//Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
/* Reduce per-kernel timings (times[kernel][iteration], in ms) to avg/min/max
   and report the Triad-Optimized bandwidth, summed across MPI ranks.
   bsize is the element size in bytes (sizeof(float) or sizeof(double));
   Copy/Scale move 2 vectors per element, Add/Triad move 3. */
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt    j, k;
  float       avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       mintime[8] = {1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30};
  // char *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "};
  const float bytes_per_kernel[8] = {2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N};
  double      rate = 0.0, irate;
  int         rank, size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size));
  /* --- SUMMARY --- STREAM convention (stream.c): the first iteration is a
     warm-up and is excluded from all statistics.  BUG FIX: the sum previously
     included iteration 0 while still dividing by NTIMES-1. */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      const float t = 1.e-03f * times[j][k]; // millisec --> sec
      avgtime[j] += t;
      mintime[j] = MIN(mintime[j], t);
      maxtime[j] = MAX(maxtime[j], t);
    }
  }
  for (j = 0; j < 8; ++j) avgtime[j] = avgtime[j] / (float)(NTIMES - 1);
  j     = 7; /* report the Triad-Optimized kernel */
  irate = 1.0E-06 * bytes_per_kernel[j] / mintime[j];
  PetscCallMPI(MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f Rate (MB/s)\n", size, rate);
      fd = fopen("flops", "w");
      if (fd) { /* best effort: missing write permission should not crash the benchmark */
        fprintf(fd, "%g\n", rate);
        fclose(fd);
      }
    } else {
      double prate = rate; /* fallback: ratio 1.0 when no single-rank baseline exists */
      fd           = fopen("flops", "r");
      if (fd) {
        if (fscanf(fd, "%lg", &prate) != 1) prate = rate;
        fclose(fd);
      }
      printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate / prate);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}