xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision 4d86920da9ee67c835173a5dfffa1b3a52fd24ca)
1403adfb6SMatthew G Knepley /*
2403adfb6SMatthew G Knepley   STREAM benchmark implementation in CUDA.
3403adfb6SMatthew G Knepley 
4403adfb6SMatthew G Knepley     COPY:       a(i) = b(i)
5403adfb6SMatthew G Knepley     SCALE:      a(i) = q*b(i)
6403adfb6SMatthew G Knepley     SUM:        a(i) = b(i) + c(i)
7403adfb6SMatthew G Knepley     TRIAD:      a(i) = b(i) + q*c(i)
8403adfb6SMatthew G Knepley 
9403adfb6SMatthew G Knepley   It measures the memory system on the device.
1019816777SMark   The implementation is in double precision with a single option.
11403adfb6SMatthew G Knepley 
12403adfb6SMatthew G Knepley   Code based on the code developed by John D. McCalpin
13403adfb6SMatthew G Knepley   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14403adfb6SMatthew G Knepley 
15403adfb6SMatthew G Knepley   Written by: Massimiliano Fatica, NVIDIA Corporation
16403adfb6SMatthew G Knepley   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17403adfb6SMatthew G Knepley   Extensive Revisions, 4 December 2010
18403adfb6SMatthew G Knepley   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19403adfb6SMatthew G Knepley 
20403adfb6SMatthew G Knepley   User interface motivated by bandwidthTest NVIDIA SDK example.
21403adfb6SMatthew G Knepley */
2219816777SMark static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23403adfb6SMatthew G Knepley 
24403adfb6SMatthew G Knepley #include <petscconf.h>
25403adfb6SMatthew G Knepley #include <petscsys.h>
26403adfb6SMatthew G Knepley #include <petsctime.h>
270e6b6b59SJacob Faibussowitsch #include <petscdevice_cuda.h>
28403adfb6SMatthew G Knepley 
2919816777SMark #define N      10000000
30403adfb6SMatthew G Knepley #define NTIMES 10
31403adfb6SMatthew G Knepley 
32403adfb6SMatthew G Knepley #ifndef MIN
33403adfb6SMatthew G Knepley #define MIN(x, y) ((x) < (y) ? (x) : (y))
34403adfb6SMatthew G Knepley #endif
35403adfb6SMatthew G Knepley #ifndef MAX
36403adfb6SMatthew G Knepley #define MAX(x, y) ((x) > (y) ? (x) : (y))
37403adfb6SMatthew G Knepley #endif
38403adfb6SMatthew G Knepley 
39403adfb6SMatthew G Knepley const float  flt_eps = 1.192092896e-07f;
40caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16;
41403adfb6SMatthew G Knepley 
420e6b6b59SJacob Faibussowitsch __global__ void set_array(float *a, float value, size_t len) {
43403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
44403adfb6SMatthew G Knepley   while (idx < len) {
45403adfb6SMatthew G Knepley     a[idx] = value;
46403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
47403adfb6SMatthew G Knepley   }
48403adfb6SMatthew G Knepley }
49403adfb6SMatthew G Knepley 
500e6b6b59SJacob Faibussowitsch __global__ void set_array_double(double *a, double value, size_t len) {
51caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
52caccb7e3SMatthew G Knepley   while (idx < len) {
53caccb7e3SMatthew G Knepley     a[idx] = value;
54caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
55caccb7e3SMatthew G Knepley   }
56caccb7e3SMatthew G Knepley }
57caccb7e3SMatthew G Knepley 
580e6b6b59SJacob Faibussowitsch __global__ void STREAM_Copy(float *a, float *b, size_t len) {
59403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
60403adfb6SMatthew G Knepley   while (idx < len) {
61403adfb6SMatthew G Knepley     b[idx] = a[idx];
62403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
63403adfb6SMatthew G Knepley   }
64403adfb6SMatthew G Knepley }
65403adfb6SMatthew G Knepley 
660e6b6b59SJacob Faibussowitsch __global__ void STREAM_Copy_double(double *a, double *b, size_t len) {
67caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
68caccb7e3SMatthew G Knepley   while (idx < len) {
69caccb7e3SMatthew G Knepley     b[idx] = a[idx];
70caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
71caccb7e3SMatthew G Knepley   }
72caccb7e3SMatthew G Knepley }
73caccb7e3SMatthew G Knepley 
740e6b6b59SJacob Faibussowitsch __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len) {
75403adfb6SMatthew G Knepley   /*
76403adfb6SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
77403adfb6SMatthew G Knepley    * vector index space else return.
78403adfb6SMatthew G Knepley    */
79403adfb6SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
80403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
81403adfb6SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
82403adfb6SMatthew G Knepley }
83403adfb6SMatthew G Knepley 
840e6b6b59SJacob Faibussowitsch __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len) {
85caccb7e3SMatthew G Knepley   /*
86caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
87caccb7e3SMatthew G Knepley    * vector index space else return.
88caccb7e3SMatthew G Knepley    */
89caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
90caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
91caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
92caccb7e3SMatthew G Knepley }
93caccb7e3SMatthew G Knepley 
940e6b6b59SJacob Faibussowitsch __global__ void STREAM_Scale(float *a, float *b, float scale, size_t len) {
95403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
96403adfb6SMatthew G Knepley   while (idx < len) {
97403adfb6SMatthew G Knepley     b[idx] = scale * a[idx];
98403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
99403adfb6SMatthew G Knepley   }
100403adfb6SMatthew G Knepley }
101403adfb6SMatthew G Knepley 
1020e6b6b59SJacob Faibussowitsch __global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len) {
103caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
104caccb7e3SMatthew G Knepley   while (idx < len) {
105caccb7e3SMatthew G Knepley     b[idx] = scale * a[idx];
106caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
107caccb7e3SMatthew G Knepley   }
108caccb7e3SMatthew G Knepley }
109caccb7e3SMatthew G Knepley 
1100e6b6b59SJacob Faibussowitsch __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len) {
111caccb7e3SMatthew G Knepley   /*
112caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
113caccb7e3SMatthew G Knepley    * vector index space else return.
114caccb7e3SMatthew G Knepley    */
115caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
116caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
117caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale * a[idx];
118caccb7e3SMatthew G Knepley }
119caccb7e3SMatthew G Knepley 
1200e6b6b59SJacob Faibussowitsch __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len) {
121caccb7e3SMatthew G Knepley   /*
122caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
123caccb7e3SMatthew G Knepley    * vector index space else return.
124caccb7e3SMatthew G Knepley    */
125caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
126caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
127caccb7e3SMatthew G Knepley   if (idx < len) b[idx] = scale * a[idx];
128caccb7e3SMatthew G Knepley }
129caccb7e3SMatthew G Knepley 
1300e6b6b59SJacob Faibussowitsch __global__ void STREAM_Add(float *a, float *b, float *c, size_t len) {
131403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
132403adfb6SMatthew G Knepley   while (idx < len) {
133403adfb6SMatthew G Knepley     c[idx] = a[idx] + b[idx];
134403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
135403adfb6SMatthew G Knepley   }
136403adfb6SMatthew G Knepley }
137403adfb6SMatthew G Knepley 
1380e6b6b59SJacob Faibussowitsch __global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len) {
139caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
140caccb7e3SMatthew G Knepley   while (idx < len) {
141caccb7e3SMatthew G Knepley     c[idx] = a[idx] + b[idx];
142caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
143caccb7e3SMatthew G Knepley   }
144caccb7e3SMatthew G Knepley }
145caccb7e3SMatthew G Knepley 
1460e6b6b59SJacob Faibussowitsch __global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len) {
147caccb7e3SMatthew G Knepley   /*
148caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
149caccb7e3SMatthew G Knepley    * vector index space else return.
150caccb7e3SMatthew G Knepley    */
151caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
152caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
153caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx] + b[idx];
154caccb7e3SMatthew G Knepley }
155caccb7e3SMatthew G Knepley 
1560e6b6b59SJacob Faibussowitsch __global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len) {
157caccb7e3SMatthew G Knepley   /*
158caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
159caccb7e3SMatthew G Knepley    * vector index space else return.
160caccb7e3SMatthew G Knepley    */
161caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
162caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
163caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx] + b[idx];
164caccb7e3SMatthew G Knepley }
165caccb7e3SMatthew G Knepley 
1660e6b6b59SJacob Faibussowitsch __global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len) {
167403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
168403adfb6SMatthew G Knepley   while (idx < len) {
169403adfb6SMatthew G Knepley     c[idx] = a[idx] + scalar * b[idx];
170403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
171403adfb6SMatthew G Knepley   }
172403adfb6SMatthew G Knepley }
173403adfb6SMatthew G Knepley 
1740e6b6b59SJacob Faibussowitsch __global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len) {
175caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
176caccb7e3SMatthew G Knepley   while (idx < len) {
177caccb7e3SMatthew G Knepley     c[idx] = a[idx] + scalar * b[idx];
178caccb7e3SMatthew G Knepley     idx += blockDim.x * gridDim.x;
179caccb7e3SMatthew G Knepley   }
180caccb7e3SMatthew G Knepley }
181caccb7e3SMatthew G Knepley 
1820e6b6b59SJacob Faibussowitsch __global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len) {
183caccb7e3SMatthew G Knepley   /*
184caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
185caccb7e3SMatthew G Knepley    * vector index space else return.
186caccb7e3SMatthew G Knepley    */
187caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
188caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
189caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx] + scalar * b[idx];
190caccb7e3SMatthew G Knepley }
191caccb7e3SMatthew G Knepley 
1920e6b6b59SJacob Faibussowitsch __global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len) {
193caccb7e3SMatthew G Knepley   /*
194caccb7e3SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
195caccb7e3SMatthew G Knepley    * vector index space else return.
196caccb7e3SMatthew G Knepley    */
197caccb7e3SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
198caccb7e3SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
199caccb7e3SMatthew G Knepley   if (idx < len) c[idx] = a[idx] + scalar * b[idx];
200caccb7e3SMatthew G Knepley }
201caccb7e3SMatthew G Knepley 
202403adfb6SMatthew G Knepley /* Host side verification routines */
2030e6b6b59SJacob Faibussowitsch bool STREAM_Copy_verify(float *a, float *b, size_t len) {
204403adfb6SMatthew G Knepley   size_t idx;
205403adfb6SMatthew G Knepley   bool   bDifferent = false;
206403adfb6SMatthew G Knepley 
207403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
208403adfb6SMatthew G Knepley     float expectedResult     = a[idx];
209403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
210403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
211403adfb6SMatthew G Knepley     /* element-wise relative error determination */
212403adfb6SMatthew G Knepley     bDifferent               = (relErrorULPS > 2.f);
213403adfb6SMatthew G Knepley   }
214403adfb6SMatthew G Knepley 
215403adfb6SMatthew G Knepley   return bDifferent;
216403adfb6SMatthew G Knepley }
217403adfb6SMatthew G Knepley 
2180e6b6b59SJacob Faibussowitsch bool STREAM_Copy_verify_double(double *a, double *b, size_t len) {
219caccb7e3SMatthew G Knepley   size_t idx;
220caccb7e3SMatthew G Knepley   bool   bDifferent = false;
221caccb7e3SMatthew G Knepley 
222caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
223caccb7e3SMatthew G Knepley     double expectedResult     = a[idx];
224caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
22519816777SMark     double relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / dbl_eps;
226caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
227caccb7e3SMatthew G Knepley     bDifferent                = (relErrorULPS > 2.);
228caccb7e3SMatthew G Knepley   }
229caccb7e3SMatthew G Knepley 
230caccb7e3SMatthew G Knepley   return bDifferent;
231caccb7e3SMatthew G Knepley }
232caccb7e3SMatthew G Knepley 
2330e6b6b59SJacob Faibussowitsch bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) {
234403adfb6SMatthew G Knepley   size_t idx;
235403adfb6SMatthew G Knepley   bool   bDifferent = false;
236403adfb6SMatthew G Knepley 
237403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
238403adfb6SMatthew G Knepley     float expectedResult     = scale * a[idx];
239403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
240403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
241403adfb6SMatthew G Knepley     /* element-wise relative error determination */
242403adfb6SMatthew G Knepley     bDifferent               = (relErrorULPS > 2.f);
243403adfb6SMatthew G Knepley   }
244403adfb6SMatthew G Knepley 
245403adfb6SMatthew G Knepley   return bDifferent;
246403adfb6SMatthew G Knepley }
247403adfb6SMatthew G Knepley 
2480e6b6b59SJacob Faibussowitsch bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) {
249caccb7e3SMatthew G Knepley   size_t idx;
250caccb7e3SMatthew G Knepley   bool   bDifferent = false;
251caccb7e3SMatthew G Knepley 
252caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
253caccb7e3SMatthew G Knepley     double expectedResult     = scale * a[idx];
254caccb7e3SMatthew G Knepley     double diffResultExpected = (b[idx] - expectedResult);
255caccb7e3SMatthew G Knepley     double relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
256caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
257caccb7e3SMatthew G Knepley     bDifferent                = (relErrorULPS > 2.);
258caccb7e3SMatthew G Knepley   }
259caccb7e3SMatthew G Knepley 
260caccb7e3SMatthew G Knepley   return bDifferent;
261caccb7e3SMatthew G Knepley }
262caccb7e3SMatthew G Knepley 
2630e6b6b59SJacob Faibussowitsch bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) {
264403adfb6SMatthew G Knepley   size_t idx;
265403adfb6SMatthew G Knepley   bool   bDifferent = false;
266403adfb6SMatthew G Knepley 
267403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
268403adfb6SMatthew G Knepley     float expectedResult     = a[idx] + b[idx];
269403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
270403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
271403adfb6SMatthew G Knepley     /* element-wise relative error determination */
272403adfb6SMatthew G Knepley     bDifferent               = (relErrorULPS > 2.f);
273403adfb6SMatthew G Knepley   }
274403adfb6SMatthew G Knepley 
275403adfb6SMatthew G Knepley   return bDifferent;
276403adfb6SMatthew G Knepley }
277403adfb6SMatthew G Knepley 
2780e6b6b59SJacob Faibussowitsch bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) {
279caccb7e3SMatthew G Knepley   size_t idx;
280caccb7e3SMatthew G Knepley   bool   bDifferent = false;
281caccb7e3SMatthew G Knepley 
282caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
283caccb7e3SMatthew G Knepley     double expectedResult     = a[idx] + b[idx];
284caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
285caccb7e3SMatthew G Knepley     double relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
286caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
287caccb7e3SMatthew G Knepley     bDifferent                = (relErrorULPS > 2.);
288caccb7e3SMatthew G Knepley   }
289caccb7e3SMatthew G Knepley 
290caccb7e3SMatthew G Knepley   return bDifferent;
291caccb7e3SMatthew G Knepley }
292caccb7e3SMatthew G Knepley 
2930e6b6b59SJacob Faibussowitsch bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) {
294403adfb6SMatthew G Knepley   size_t idx;
295403adfb6SMatthew G Knepley   bool   bDifferent = false;
296403adfb6SMatthew G Knepley 
297403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
298403adfb6SMatthew G Knepley     float expectedResult     = a[idx] + scalar * b[idx];
299403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
300403adfb6SMatthew G Knepley     float relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
301403adfb6SMatthew G Knepley     /* element-wise relative error determination */
302403adfb6SMatthew G Knepley     bDifferent               = (relErrorULPS > 3.f);
303403adfb6SMatthew G Knepley   }
304403adfb6SMatthew G Knepley 
305403adfb6SMatthew G Knepley   return bDifferent;
306403adfb6SMatthew G Knepley }
307403adfb6SMatthew G Knepley 
3080e6b6b59SJacob Faibussowitsch bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) {
309caccb7e3SMatthew G Knepley   size_t idx;
310caccb7e3SMatthew G Knepley   bool   bDifferent = false;
311caccb7e3SMatthew G Knepley 
312caccb7e3SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
313caccb7e3SMatthew G Knepley     double expectedResult     = a[idx] + scalar * b[idx];
314caccb7e3SMatthew G Knepley     double diffResultExpected = (c[idx] - expectedResult);
315caccb7e3SMatthew G Knepley     double relErrorULPS       = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
316caccb7e3SMatthew G Knepley     /* element-wise relative error determination */
317caccb7e3SMatthew G Knepley     bDifferent                = (relErrorULPS > 3.);
318caccb7e3SMatthew G Knepley   }
319caccb7e3SMatthew G Knepley 
320caccb7e3SMatthew G Knepley   return bDifferent;
321caccb7e3SMatthew G Knepley }
322caccb7e3SMatthew G Knepley 
323403adfb6SMatthew G Knepley /* forward declarations */
324caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
325403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
326caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
32719816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
328403adfb6SMatthew G Knepley 
3290e6b6b59SJacob Faibussowitsch int main(int argc, char *argv[]) {
330403adfb6SMatthew G Knepley   PetscInt        device    = 0;
33119816777SMark   PetscBool       runDouble = PETSC_TRUE;
33219816777SMark   const PetscBool cpuTiming = PETSC_TRUE; // must be true
333403adfb6SMatthew G Knepley   PetscErrorCode  ierr;
334403adfb6SMatthew G Knepley 
3359566063dSJacob Faibussowitsch   PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));
33619816777SMark 
3379566063dSJacob Faibussowitsch   PetscCall(PetscInitialize(&argc, &argv, 0, help));
338403adfb6SMatthew G Knepley 
339d0609cedSBarry Smith   PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
3409566063dSJacob Faibussowitsch   PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0));
3419566063dSJacob Faibussowitsch   PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
342d0609cedSBarry Smith   PetscOptionsEnd();
343403adfb6SMatthew G Knepley 
344caccb7e3SMatthew G Knepley   ierr = setupStream(device, runDouble, cpuTiming);
3450e6b6b59SJacob Faibussowitsch   if (ierr) { PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED")); }
3469566063dSJacob Faibussowitsch   PetscCall(PetscFinalize());
347b122ec5aSJacob Faibussowitsch   return 0;
348403adfb6SMatthew G Knepley }
349403adfb6SMatthew G Knepley 
350403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
351403adfb6SMatthew G Knepley //Run the appropriate tests
352403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
3530e6b6b59SJacob Faibussowitsch PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming) {
354403adfb6SMatthew G Knepley   PetscInt iNumThreadsPerBlock = 128;
355403adfb6SMatthew G Knepley 
356403adfb6SMatthew G Knepley   PetscFunctionBegin;
357403adfb6SMatthew G Knepley   // Check device
358403adfb6SMatthew G Knepley   {
359403adfb6SMatthew G Knepley     int deviceCount;
360403adfb6SMatthew G Knepley 
361403adfb6SMatthew G Knepley     cudaGetDeviceCount(&deviceCount);
362403adfb6SMatthew G Knepley     if (deviceCount == 0) {
3639566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
364403adfb6SMatthew G Knepley       return -1000;
365403adfb6SMatthew G Knepley     }
366403adfb6SMatthew G Knepley 
367403adfb6SMatthew G Knepley     if (deviceNum >= deviceCount || deviceNum < 0) {
3689566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0));
369403adfb6SMatthew G Knepley       deviceNum = 0;
370403adfb6SMatthew G Knepley     }
371403adfb6SMatthew G Knepley   }
372403adfb6SMatthew G Knepley 
373403adfb6SMatthew G Knepley   cudaSetDevice(deviceNum);
3749566063dSJacob Faibussowitsch   // PetscCall(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n"));
375403adfb6SMatthew G Knepley   cudaDeviceProp deviceProp;
37619816777SMark   if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
3779566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n"));
378403adfb6SMatthew G Knepley     return -1;
379403adfb6SMatthew G Knepley   }
380403adfb6SMatthew G Knepley 
381caccb7e3SMatthew G Knepley   if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
3829566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
383caccb7e3SMatthew G Knepley     return -1;
384caccb7e3SMatthew G Knepley   }
3856f2b61bcSKarl Rupp   if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
3866f2b61bcSKarl Rupp   else iNumThreadsPerBlock = 128;                                                /* GF100 architecture / 32 CUDA Cores per MP */
387403adfb6SMatthew G Knepley 
3881baa6e33SBarry Smith   if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
3891baa6e33SBarry Smith   else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
3903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
391403adfb6SMatthew G Knepley }
392403adfb6SMatthew G Knepley 
393403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
394403adfb6SMatthew G Knepley // runStream
395403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
3960e6b6b59SJacob Faibussowitsch PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) {
397403adfb6SMatthew G Knepley   float *d_a, *d_b, *d_c;
398403adfb6SMatthew G Knepley   int    k;
399caccb7e3SMatthew G Knepley   float  times[8][NTIMES];
400403adfb6SMatthew G Knepley   float  scalar;
401403adfb6SMatthew G Knepley 
402403adfb6SMatthew G Knepley   PetscFunctionBegin;
403403adfb6SMatthew G Knepley   /* Allocate memory on device */
4049566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(float) * N));
4059566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(float) * N));
4069566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(float) * N));
407403adfb6SMatthew G Knepley 
408403adfb6SMatthew G Knepley   /* Compute execution configuration */
409403adfb6SMatthew G Knepley 
410403adfb6SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
411403adfb6SMatthew G Knepley   dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
412403adfb6SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x += 1;
413403adfb6SMatthew G Knepley 
414403adfb6SMatthew G Knepley   /* Initialize memory on the device */
415403adfb6SMatthew G Knepley   set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
416403adfb6SMatthew G Knepley   set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
417403adfb6SMatthew G Knepley   set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
418403adfb6SMatthew G Knepley 
419403adfb6SMatthew G Knepley   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
420403adfb6SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
421403adfb6SMatthew G Knepley 
422403adfb6SMatthew G Knepley   scalar = 3.0f;
423403adfb6SMatthew G Knepley   for (k = 0; k < NTIMES; ++k) {
4248563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
425403adfb6SMatthew G Knepley     STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
42619816777SMark     cudaStreamSynchronize(NULL);
4279566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
4288563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
42919816777SMark     if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec
430403adfb6SMatthew G Knepley 
431403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4328563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
433403adfb6SMatthew G Knepley     STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
43419816777SMark     cudaStreamSynchronize(NULL);
4359566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
436df3898eeSBarry Smith     //get the total elapsed time in ms
4378563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
43819816777SMark     if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;
439403adfb6SMatthew G Knepley 
440403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4418563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
442403adfb6SMatthew G Knepley     STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
44319816777SMark     cudaStreamSynchronize(NULL);
4449566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
445df3898eeSBarry Smith     //get the total elapsed time in ms
4468563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
44719816777SMark     if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;
448403adfb6SMatthew G Knepley 
449403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4508563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
451caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
45219816777SMark     cudaStreamSynchronize(NULL);
4539566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
454df3898eeSBarry Smith     //get the total elapsed time in ms
4558563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
45619816777SMark     if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;
457403adfb6SMatthew G Knepley 
458403adfb6SMatthew G Knepley     cpuTimer = 0.0;
4598563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
4609566063dSJacob Faibussowitsch     // PetscCallCUDA(cudaEventRecord(start, 0));
461caccb7e3SMatthew G Knepley     STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
46219816777SMark     cudaStreamSynchronize(NULL);
4639566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
4649566063dSJacob Faibussowitsch     PetscCallCUDA(cudaEventRecord(stop, 0));
4659566063dSJacob Faibussowitsch     // PetscCallCUDA(cudaEventSynchronize(stop));
466df3898eeSBarry Smith     //get the total elapsed time in ms
4678563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
46819816777SMark     if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;
4696f2b61bcSKarl Rupp     else {
4709566063dSJacob Faibussowitsch       // PetscCallCUDA(cudaEventElapsedTime(&times[4][k], start, stop));
471403adfb6SMatthew G Knepley     }
472403adfb6SMatthew G Knepley 
473caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
4748563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
475caccb7e3SMatthew G Knepley     STREAM_Add_Optimized<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
47619816777SMark     cudaStreamSynchronize(NULL);
4779566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
478df3898eeSBarry Smith     //get the total elapsed time in ms
4798563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
48019816777SMark     if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;
481caccb7e3SMatthew G Knepley 
482caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
4838563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
484caccb7e3SMatthew G Knepley     STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
48519816777SMark     cudaStreamSynchronize(NULL);
4869566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
487df3898eeSBarry Smith     //get the total elapsed time in ms
4888563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
48919816777SMark     if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;
490caccb7e3SMatthew G Knepley 
491caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
4928563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
493caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
49419816777SMark     cudaStreamSynchronize(NULL);
4959566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
496df3898eeSBarry Smith     //get the total elapsed time in ms
4978563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
49819816777SMark     if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
499caccb7e3SMatthew G Knepley   }
500caccb7e3SMatthew G Knepley 
50119816777SMark   if (1) { /* verify kernels */
502403adfb6SMatthew G Knepley     float *h_a, *h_b, *h_c;
503403adfb6SMatthew G Knepley     bool   errorSTREAMkernel = true;
504403adfb6SMatthew G Knepley 
505403adfb6SMatthew G Knepley     if ((h_a = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
506403adfb6SMatthew G Knepley       printf("Unable to allocate array h_a, exiting ...\n");
507403adfb6SMatthew G Knepley       exit(1);
508403adfb6SMatthew G Knepley     }
509403adfb6SMatthew G Knepley     if ((h_b = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
510403adfb6SMatthew G Knepley       printf("Unable to allocate array h_b, exiting ...\n");
511403adfb6SMatthew G Knepley       exit(1);
512403adfb6SMatthew G Knepley     }
513403adfb6SMatthew G Knepley 
514403adfb6SMatthew G Knepley     if ((h_c = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
515403adfb6SMatthew G Knepley       printf("Unalbe to allocate array h_c, exiting ...\n");
516403adfb6SMatthew G Knepley       exit(1);
517403adfb6SMatthew G Knepley     }
518403adfb6SMatthew G Knepley 
519403adfb6SMatthew G Knepley     /*
520403adfb6SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
521403adfb6SMatthew G Knepley    * device kernel output
522403adfb6SMatthew G Knepley    */
523403adfb6SMatthew G Knepley 
524403adfb6SMatthew G Knepley     /* Initialize memory on the device */
525403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
526403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
527403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
528403adfb6SMatthew G Knepley 
529403adfb6SMatthew G Knepley     STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
5309566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
5319566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
532403adfb6SMatthew G Knepley     errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
533403adfb6SMatthew G Knepley     if (errorSTREAMkernel) {
5349566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
535403adfb6SMatthew G Knepley       exit(-2000);
536403adfb6SMatthew G Knepley     }
537403adfb6SMatthew G Knepley 
538403adfb6SMatthew G Knepley     /* Initialize memory on the device */
539403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
540403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
541403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
542403adfb6SMatthew G Knepley 
543403adfb6SMatthew G Knepley     STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
5449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
5459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
546403adfb6SMatthew G Knepley     errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
547403adfb6SMatthew G Knepley     if (errorSTREAMkernel) {
5489566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
549403adfb6SMatthew G Knepley       exit(-3000);
550403adfb6SMatthew G Knepley     }
551403adfb6SMatthew G Knepley 
552403adfb6SMatthew G Knepley     /* Initialize memory on the device */
55319816777SMark     set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
554403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
555403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
556403adfb6SMatthew G Knepley 
557403adfb6SMatthew G Knepley     STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
5589566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
5599566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
560403adfb6SMatthew G Knepley     errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
561403adfb6SMatthew G Knepley     if (errorSTREAMkernel) {
5629566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
563403adfb6SMatthew G Knepley       exit(-4000);
564403adfb6SMatthew G Knepley     }
565403adfb6SMatthew G Knepley 
566403adfb6SMatthew G Knepley     /* Initialize memory on the device */
567403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
568403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
569403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
570403adfb6SMatthew G Knepley 
571403adfb6SMatthew G Knepley     STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
5729566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
5739566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
5749566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
575403adfb6SMatthew G Knepley     errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
576403adfb6SMatthew G Knepley     if (errorSTREAMkernel) {
5779566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
578403adfb6SMatthew G Knepley       exit(-5000);
579403adfb6SMatthew G Knepley     }
580403adfb6SMatthew G Knepley 
581403adfb6SMatthew G Knepley     /* Initialize memory on the device */
582403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
583403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
584403adfb6SMatthew G Knepley     set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
585403adfb6SMatthew G Knepley 
586403adfb6SMatthew G Knepley     STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
5879566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
5889566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
5899566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
590403adfb6SMatthew G Knepley     errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
591403adfb6SMatthew G Knepley     if (errorSTREAMkernel) {
5929566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
593403adfb6SMatthew G Knepley       exit(-6000);
594403adfb6SMatthew G Knepley     }
595403adfb6SMatthew G Knepley 
59619816777SMark     free(h_a);
59719816777SMark     free(h_b);
59819816777SMark     free(h_c);
59919816777SMark   }
600403adfb6SMatthew G Knepley   /* continue from here */
60119816777SMark   printResultsReadable(times, sizeof(float));
602403adfb6SMatthew G Knepley 
603403adfb6SMatthew G Knepley   /* Free memory on device */
6049566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(d_a));
6059566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(d_b));
6069566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(d_c));
6073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
608403adfb6SMatthew G Knepley }
609403adfb6SMatthew G Knepley 
6100e6b6b59SJacob Faibussowitsch PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) {
611caccb7e3SMatthew G Knepley   double *d_a, *d_b, *d_c;
612caccb7e3SMatthew G Knepley   int     k;
613caccb7e3SMatthew G Knepley   float   times[8][NTIMES];
614caccb7e3SMatthew G Knepley   double  scalar;
615caccb7e3SMatthew G Knepley 
616caccb7e3SMatthew G Knepley   PetscFunctionBegin;
617caccb7e3SMatthew G Knepley   /* Allocate memory on device */
6189566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(double) * N));
6199566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(double) * N));
6209566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(double) * N));
621caccb7e3SMatthew G Knepley 
622caccb7e3SMatthew G Knepley   /* Compute execution configuration */
623caccb7e3SMatthew G Knepley 
624caccb7e3SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
625caccb7e3SMatthew G Knepley   dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
626caccb7e3SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x += 1;
627caccb7e3SMatthew G Knepley 
628caccb7e3SMatthew G Knepley   /* Initialize memory on the device */
629caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
630caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
631caccb7e3SMatthew G Knepley   set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
632caccb7e3SMatthew G Knepley 
633caccb7e3SMatthew G Knepley   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
634caccb7e3SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
635caccb7e3SMatthew G Knepley 
636caccb7e3SMatthew G Knepley   scalar = 3.0;
637caccb7e3SMatthew G Knepley   for (k = 0; k < NTIMES; ++k) {
6388563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
639caccb7e3SMatthew G Knepley     STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
64019816777SMark     cudaStreamSynchronize(NULL);
6419566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
642df3898eeSBarry Smith     //get the total elapsed time in ms
643caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
6448563dfccSBarry Smith       PetscTimeAdd(&cpuTimer);
64519816777SMark       times[0][k] = cpuTimer * 1.e3;
646caccb7e3SMatthew G Knepley     }
647caccb7e3SMatthew G Knepley 
648caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6498563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
650caccb7e3SMatthew G Knepley     STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
65119816777SMark     cudaStreamSynchronize(NULL);
6529566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
653df3898eeSBarry Smith     //get the total elapsed time in ms
654caccb7e3SMatthew G Knepley     if (bDontUseGPUTiming) {
6558563dfccSBarry Smith       PetscTimeAdd(&cpuTimer);
65619816777SMark       times[1][k] = cpuTimer * 1.e3;
657caccb7e3SMatthew G Knepley     }
658caccb7e3SMatthew G Knepley 
659caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6608563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
661caccb7e3SMatthew G Knepley     STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
66219816777SMark     cudaStreamSynchronize(NULL);
6639566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
664df3898eeSBarry Smith     //get the total elapsed time in ms
6658563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
66619816777SMark     if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;
667caccb7e3SMatthew G Knepley 
668caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6698563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
670caccb7e3SMatthew G Knepley     STREAM_Scale_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
67119816777SMark     cudaStreamSynchronize(NULL);
6729566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
673df3898eeSBarry Smith     //get the total elapsed time in ms
6748563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
67519816777SMark     if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;
676caccb7e3SMatthew G Knepley 
677caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6788563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
679caccb7e3SMatthew G Knepley     STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
68019816777SMark     cudaStreamSynchronize(NULL);
6819566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
682df3898eeSBarry Smith     //get the total elapsed time in ms
6838563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
68419816777SMark     if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;
685caccb7e3SMatthew G Knepley 
686caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6878563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
688caccb7e3SMatthew G Knepley     STREAM_Add_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
68919816777SMark     cudaStreamSynchronize(NULL);
6909566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
691df3898eeSBarry Smith     //get the total elapsed time in ms
6928563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
69319816777SMark     if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;
694caccb7e3SMatthew G Knepley 
695caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
6968563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
697caccb7e3SMatthew G Knepley     STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
69819816777SMark     cudaStreamSynchronize(NULL);
6999566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
700df3898eeSBarry Smith     //get the total elapsed time in ms
7018563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
70219816777SMark     if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;
703caccb7e3SMatthew G Knepley 
704caccb7e3SMatthew G Knepley     cpuTimer = 0.0;
7058563dfccSBarry Smith     PetscTimeSubtract(&cpuTimer);
706caccb7e3SMatthew G Knepley     STREAM_Triad_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
70719816777SMark     cudaStreamSynchronize(NULL);
7089566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
709df3898eeSBarry Smith     //get the total elapsed time in ms
7108563dfccSBarry Smith     PetscTimeAdd(&cpuTimer);
71119816777SMark     if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
712caccb7e3SMatthew G Knepley   }
713caccb7e3SMatthew G Knepley 
71419816777SMark   if (1) { /* verify kernels */
715caccb7e3SMatthew G Knepley     double *h_a, *h_b, *h_c;
716caccb7e3SMatthew G Knepley     bool    errorSTREAMkernel = true;
717caccb7e3SMatthew G Knepley 
718caccb7e3SMatthew G Knepley     if ((h_a = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
719caccb7e3SMatthew G Knepley       printf("Unable to allocate array h_a, exiting ...\n");
720caccb7e3SMatthew G Knepley       exit(1);
721caccb7e3SMatthew G Knepley     }
722caccb7e3SMatthew G Knepley     if ((h_b = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
723caccb7e3SMatthew G Knepley       printf("Unable to allocate array h_b, exiting ...\n");
724caccb7e3SMatthew G Knepley       exit(1);
725caccb7e3SMatthew G Knepley     }
726caccb7e3SMatthew G Knepley 
727caccb7e3SMatthew G Knepley     if ((h_c = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
728caccb7e3SMatthew G Knepley       printf("Unalbe to allocate array h_c, exiting ...\n");
729caccb7e3SMatthew G Knepley       exit(1);
730caccb7e3SMatthew G Knepley     }
731caccb7e3SMatthew G Knepley 
732caccb7e3SMatthew G Knepley     /*
733caccb7e3SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
734caccb7e3SMatthew G Knepley    * device kernel output
735caccb7e3SMatthew G Knepley    */
736caccb7e3SMatthew G Knepley 
737caccb7e3SMatthew G Knepley     /* Initialize memory on the device */
738caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
739caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
740caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
741caccb7e3SMatthew G Knepley 
742caccb7e3SMatthew G Knepley     STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
7439566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
7449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
745caccb7e3SMatthew G Knepley     errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
746caccb7e3SMatthew G Knepley     if (errorSTREAMkernel) {
7479566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
748caccb7e3SMatthew G Knepley       exit(-2000);
749caccb7e3SMatthew G Knepley     }
750caccb7e3SMatthew G Knepley 
751caccb7e3SMatthew G Knepley     /* Initialize memory on the device */
752caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
753caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
754caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
755caccb7e3SMatthew G Knepley 
756caccb7e3SMatthew G Knepley     STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
7579566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
7589566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
759caccb7e3SMatthew G Knepley     errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
760caccb7e3SMatthew G Knepley     if (errorSTREAMkernel) {
7619566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
762caccb7e3SMatthew G Knepley       exit(-3000);
763caccb7e3SMatthew G Knepley     }
764caccb7e3SMatthew G Knepley 
765caccb7e3SMatthew G Knepley     /* Initialize memory on the device */
766caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
767caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
768caccb7e3SMatthew G Knepley 
769caccb7e3SMatthew G Knepley     STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
7709566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
7719566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
772caccb7e3SMatthew G Knepley     errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
773caccb7e3SMatthew G Knepley     if (errorSTREAMkernel) {
7749566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
775caccb7e3SMatthew G Knepley       exit(-4000);
776caccb7e3SMatthew G Knepley     }
777caccb7e3SMatthew G Knepley 
778caccb7e3SMatthew G Knepley     /* Initialize memory on the device */
779caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
780caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
781caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
782caccb7e3SMatthew G Knepley 
783caccb7e3SMatthew G Knepley     STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
7849566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
7859566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
7869566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
787caccb7e3SMatthew G Knepley     errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
788caccb7e3SMatthew G Knepley     if (errorSTREAMkernel) {
7899566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
790caccb7e3SMatthew G Knepley       exit(-5000);
791caccb7e3SMatthew G Knepley     }
792caccb7e3SMatthew G Knepley 
793caccb7e3SMatthew G Knepley     /* Initialize memory on the device */
794caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
795caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
796caccb7e3SMatthew G Knepley     set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
797caccb7e3SMatthew G Knepley 
798caccb7e3SMatthew G Knepley     STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
7999566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
8009566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
8019566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
802caccb7e3SMatthew G Knepley     errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
803caccb7e3SMatthew G Knepley     if (errorSTREAMkernel) {
8049566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
805caccb7e3SMatthew G Knepley       exit(-6000);
806caccb7e3SMatthew G Knepley     }
807caccb7e3SMatthew G Knepley 
80819816777SMark     free(h_a);
80919816777SMark     free(h_b);
81019816777SMark     free(h_c);
81119816777SMark   }
812caccb7e3SMatthew G Knepley   /* continue from here */
81319816777SMark   printResultsReadable(times, sizeof(double));
814caccb7e3SMatthew G Knepley 
815caccb7e3SMatthew G Knepley   /* Free memory on device */
8169566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(d_a));
8179566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(d_b));
8189566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(d_c));
8193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
820caccb7e3SMatthew G Knepley }
821caccb7e3SMatthew G Knepley 
822403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
823403adfb6SMatthew G Knepley //Print Results to Screen and File
824403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
8250e6b6b59SJacob Faibussowitsch PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize) {
826403adfb6SMatthew G Knepley   PetscErrorCode ierr;
827403adfb6SMatthew G Knepley   PetscInt       j, k;
828caccb7e3SMatthew G Knepley   float          avgtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
829caccb7e3SMatthew G Knepley   float          maxtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
830caccb7e3SMatthew G Knepley   float          mintime[8]          = {1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30};
83119816777SMark   // char           *label[8]           = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
8320e6b6b59SJacob Faibussowitsch   const float    bytes_per_kernel[8] = {2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N};
83319816777SMark   double         rate, irate;
83419816777SMark   int            rank, size;
835*4d86920dSPierre Jolivet 
836403adfb6SMatthew G Knepley   PetscFunctionBegin;
8379566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
8389566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size));
839403adfb6SMatthew G Knepley   /* --- SUMMARY --- */
84019816777SMark   for (k = 0; k < NTIMES; ++k) {
841caccb7e3SMatthew G Knepley     for (j = 0; j < 8; ++j) {
84219816777SMark       avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
843403adfb6SMatthew G Knepley       mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
844403adfb6SMatthew G Knepley       maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
845403adfb6SMatthew G Knepley     }
84619816777SMark   }
8470e6b6b59SJacob Faibussowitsch   for (j = 0; j < 8; ++j) { avgtime[j] = avgtime[j] / (float)(NTIMES - 1); }
84819816777SMark   j     = 7;
84919816777SMark   irate = 1.0E-06 * bytes_per_kernel[j] / mintime[j];
85019816777SMark   ierr  = MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
851dd400576SPatrick Sanan   if (rank == 0) {
85219816777SMark     FILE *fd;
85319816777SMark     if (size == 1) {
85419816777SMark       printf("%d %11.4f   Rate (MB/s)\n", size, rate);
85519816777SMark       fd = fopen("flops", "w");
85619816777SMark       fprintf(fd, "%g\n", rate);
85719816777SMark       fclose(fd);
85819816777SMark     } else {
85919816777SMark       double prate;
86019816777SMark       fd = fopen("flops", "r");
86119816777SMark       fscanf(fd, "%lg", &prate);
86219816777SMark       fclose(fd);
86319816777SMark       printf("%d %11.4f   Rate (MB/s) %g \n", size, rate, rate / prate);
86419816777SMark     }
86519816777SMark   }
8663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
867403adfb6SMatthew G Knepley }
868