xref: /petsc/src/ksp/ksp/tutorials/bench_kspsolve.c (revision 4e8208cbcbc709572b8abe32f33c78b69c819375)
1 /*
2 Poisson in 3D. Modeled by the PDE:
3 
4   - delta u(x,y,z) = f(x,y,z)
5 
6 With exact solution:
7 
8    u(x,y,z) = 1.0
9 
10 Example usage:
11 
12   Run on GPU (requires respective backends installed):
13     ./bench_kspsolve -mat_type aijcusparse
14     ./bench_kspsolve -mat_type aijhipsparse
15     ./bench_kspsolve -mat_type aijkokkos
16 
17   Test only MatMult:
18     ./bench_kspsolve -matmult
19 
20   Test MatMult over 1000 iterations:
21     ./bench_kspsolve -matmult -its 1000
22 
23   Change size of problem (e.g., use a 128x128x128 grid):
24     ./bench_kspsolve -n 128
25 */
26 static char help[] = "Solves 3D Laplacian with 27-point finite difference stencil.\n";
27 
28 #include <petscksp.h>
29 
30 typedef struct {
31   PetscMPIInt rank, size;
32   PetscInt    n;        /* global size in each dimension */
33   PetscInt    dim;      /* global size */
34   PetscInt    nnz;      /* local nonzeros */
35   PetscBool   matmult;  /* Do MatMult() only */
36   PetscBool   debug;    /* Debug flag for PreallocateCOO() */
37   PetscBool   splitksp; /* Split KSPSolve and PCSetUp */
38   PetscInt    its;      /* No of matmult_iterations */
39   PetscInt    Istart, Iend;
40 } AppCtx;
41 
PreallocateCOO(Mat A,PetscCtx ctx)42 static PetscErrorCode PreallocateCOO(Mat A, PetscCtx ctx)
43 {
44   AppCtx  *user = (AppCtx *)ctx;
45   PetscInt n = user->n, n2 = n * n, n1 = n - 1;
46   PetscInt xstart, ystart, zstart, xend, yend, zend, nm2 = n - 2, idx;
47   PetscInt nnz[] = {8, 12, 18, 27}; /* nnz for corner, edge, face, and center. */
48 
49   PetscFunctionBeginUser;
50   xstart = user->Istart % n;
51   ystart = (user->Istart / n) % n;
52   zstart = user->Istart / n2;
53   xend   = (user->Iend - 1) % n;
54   yend   = ((user->Iend - 1) / n) % n;
55   zend   = (user->Iend - 1) / n2;
56 
57   /* bottom xy-plane */
58   user->nnz = 0;
59   idx       = !zstart ? 0 : 1;
60   if (zstart == zend && (!zstart || zstart == n1)) idx = 0;
61   if (zstart == zend && (zstart && zstart < n1)) idx = 1;
62   if (!xstart && !ystart) // bottom left
63     user->nnz += 4 * nnz[idx] + 4 * nm2 * nnz[idx + 1] + nm2 * nm2 * nnz[idx + 2];
64   else if (xstart && xstart < n1 && !ystart) // bottom
65     user->nnz += 3 * nnz[idx] + (3 * nm2 + n1 - xstart) * nnz[idx + 1] + nm2 * nm2 * nnz[idx + 2];
66   else if (xstart == n1 && !ystart) // bottom right
67     user->nnz += 3 * nnz[idx] + (3 * nm2) * nnz[idx + 1] + nm2 * nm2 * nnz[idx + 2];
68   else if (!xstart && ystart && ystart < n1) // left
69     user->nnz += 2 * nnz[idx] + (nm2 + 2 * (n1 - ystart)) * nnz[idx + 1] + nm2 * (n1 - ystart) * nnz[idx + 2];
70   else if (xstart && xstart < n1 && ystart && ystart < n1) // center
71     user->nnz += 2 * nnz[idx] + (nm2 + (n1 - ystart) + (nm2 - ystart)) * nnz[idx + 1] + (nm2 * (nm2 - ystart) + (n1 - xstart)) * nnz[idx + 2];
72   else if (xstart == n1 && ystart && ystart < n1) // right
73     user->nnz += 2 * nnz[idx] + (nm2 + n1 - ystart + nm2 - ystart) * nnz[idx + 1] + nm2 * (nm2 - ystart) * nnz[idx + 2];
74   else if (ystart == n1 && !xstart) // top left
75     user->nnz += 2 * nnz[idx] + nm2 * nnz[idx + 1];
76   else if (ystart == n1 && xstart && xstart < n1) // top
77     user->nnz += nnz[idx] + (n1 - xstart) * nnz[idx + 1];
78   else if (xstart == n1 && ystart == n1) // top right
79     user->nnz += nnz[idx];
80 
81   /* top and middle plane the same */
82   if (zend == zstart) user->nnz = user->nnz - 4 * nnz[idx] - 4 * nm2 * nnz[idx + 1] - nm2 * nm2 * nnz[idx + 2];
83 
84   /* middle xy-planes */
85   if (zend - zstart > 1) user->nnz = user->nnz + (zend - zstart - 1) * (4 * nnz[1] + 4 * nnz[2] * nm2 + nnz[3] * nm2 * nm2);
86 
87   /* top xy-plane */
88   idx = zend == n1 ? 0 : 1;
89   if (zstart == zend && (!zend || zend == n1)) idx = 0;
90   if (zstart == zend && (zend && zend < n1)) idx = 1;
91   if (!xend && !yend) // bottom left
92     user->nnz += nnz[idx];
93   else if (xend && xend < n1 && !yend) // bottom
94     user->nnz += nnz[idx] + xend * nnz[idx + 1];
95   else if (xend == n1 && !yend) // bottom right
96     user->nnz += 2 * nnz[idx] + nm2 * nnz[idx + 1];
97   else if (!xend && yend && yend < n1) // left
98     user->nnz += 2 * nnz[idx] + (nm2 + yend + yend - 1) * nnz[idx + 1] + nm2 * (yend - 1) * nnz[idx + 2];
99   else if (xend && xend < n1 && yend && yend < n1) // center
100     user->nnz += 2 * nnz[idx] + (nm2 + yend + yend - 1) * nnz[idx + 1] + (nm2 * (yend - 1) + xend) * nnz[idx + 2];
101   else if (xend == n1 && yend && yend < n1) // right
102     user->nnz += 2 * nnz[idx] + (nm2 + 2 * yend) * nnz[idx + 1] + (nm2 * yend) * nnz[idx + 2];
103   else if (!xend && yend == n1) // top left
104     user->nnz += 3 * nnz[idx] + (3 * nm2) * nnz[idx + 1] + (nm2 * nm2) * nnz[idx + 2];
105   else if (xend && xend < n1 && yend == n1) // top
106     user->nnz += 3 * nnz[idx] + (3 * nm2 + xend) * nnz[idx + 1] + (nm2 * nm2) * nnz[idx + 2];
107   else if (xend == n1 && yend == n1) // top right
108     user->nnz += 4 * nnz[idx] + (4 * nm2) * nnz[idx + 1] + (nm2 * nm2) * nnz[idx + 2];
109   if (user->debug)
110     PetscCall(PetscPrintf(PETSC_COMM_SELF, "rank %d: xyzstart = %" PetscInt_FMT ",%" PetscInt_FMT ",%" PetscInt_FMT ", xvzend = %" PetscInt_FMT ",%" PetscInt_FMT ",%" PetscInt_FMT ", nnz = %" PetscInt_FMT "\n", user->rank, xstart, ystart, zstart, xend, yend, zend,
111                           user->nnz));
112   PetscFunctionReturn(PETSC_SUCCESS);
113 }
114 
FillCOO(Mat A,PetscCtx ctx)115 PetscErrorCode FillCOO(Mat A, PetscCtx ctx)
116 {
117   AppCtx      *user = (AppCtx *)ctx;
118   PetscInt     Ii, x, y, z, n = user->n, n2 = n * n, n1 = n - 1;
119   PetscInt     count = 0;
120   PetscInt    *coo_i, *coo_j;
121   PetscScalar *coo_v;
122   PetscScalar  h     = 1.0 / (n - 1);
123   PetscScalar  vcorn = -1.0 / 13 * h; //-1.0/12.0*h;
124   PetscScalar  vedge = -3.0 / 26 * h; //-1.0/6.0*h;
125   PetscScalar  vface = -3.0 / 13 * h; //-1.0e-9*h;
126   PetscScalar  vcent = 44.0 / 13 * h; //8.0/3.0*h;
127 
128   PetscFunctionBeginUser;
129   PetscCall(PetscCalloc3(user->nnz, &coo_i, user->nnz, &coo_j, user->nnz, &coo_v));
130   /* Fill COO arrays */
131   for (Ii = user->Istart; Ii < user->Iend; Ii++) {
132     x = Ii % n;
133     y = (Ii / n) % n;
134     z = Ii / n2;
135     if (x > 0 && y > 0 && z > 0) {
136       coo_i[count] = Ii;
137       coo_j[count] = Ii - 1 - n - n2;
138       coo_v[count] = vcorn;
139       count++;
140     }
141     if (x > 0 && y < n1 && z > 0) {
142       coo_i[count] = Ii;
143       coo_j[count] = Ii - 1 + n - n2;
144       coo_v[count] = vcorn;
145       count++;
146     }
147     if (x < n1 && y > 0 && z > 0) {
148       coo_i[count] = Ii;
149       coo_j[count] = Ii + 1 - n - n2;
150       coo_v[count] = vcorn;
151       count++;
152     }
153     if (x < n1 && y < n1 && z > 0) {
154       coo_i[count] = Ii;
155       coo_j[count] = Ii + 1 + n - n2;
156       coo_v[count] = vcorn;
157       count++;
158     }
159     if (x > 0 && y > 0 && z < n1) {
160       coo_i[count] = Ii;
161       coo_j[count] = Ii - 1 - n + n2;
162       coo_v[count] = vcorn;
163       count++;
164     }
165     if (x > 0 && y < n1 && z < n1) {
166       coo_i[count] = Ii;
167       coo_j[count] = Ii - 1 + n + n2;
168       coo_v[count] = vcorn;
169       count++;
170     }
171     if (x < n1 && y > 0 && z < n1) {
172       coo_i[count] = Ii;
173       coo_j[count] = Ii + 1 - n + n2;
174       coo_v[count] = vcorn;
175       count++;
176     }
177     if (x < n1 && y < n1 && z < n1) {
178       coo_i[count] = Ii;
179       coo_j[count] = Ii + 1 + n + n2;
180       coo_v[count] = vcorn;
181       count++;
182     }
183     /* 12 edges */
184     if (x > 0 && y > 0) {
185       coo_i[count] = Ii;
186       coo_j[count] = Ii - 1 - n;
187       coo_v[count] = vedge;
188       count++;
189     }
190     if (x > 0 && y < n1) {
191       coo_i[count] = Ii;
192       coo_j[count] = Ii - 1 + n;
193       coo_v[count] = vedge;
194       count++;
195     }
196     if (x < n1 && y > 0) {
197       coo_i[count] = Ii;
198       coo_j[count] = Ii + 1 - n;
199       coo_v[count] = vedge;
200       count++;
201     }
202     if (x < n1 && y < n1) {
203       coo_i[count] = Ii;
204       coo_j[count] = Ii + 1 + n;
205       coo_v[count] = vedge;
206       count++;
207     }
208     if (x > 0 && z > 0) {
209       coo_i[count] = Ii;
210       coo_j[count] = Ii - 1 - n2;
211       coo_v[count] = vedge;
212       count++;
213     }
214     if (x > 0 && z < n1) {
215       coo_i[count] = Ii;
216       coo_j[count] = Ii - 1 + n2;
217       coo_v[count] = vedge;
218       count++;
219     }
220     if (x < n1 && z > 0) {
221       coo_i[count] = Ii;
222       coo_j[count] = Ii + 1 - n2;
223       coo_v[count] = vedge;
224       count++;
225     }
226     if (x < n1 && z < n1) {
227       coo_i[count] = Ii;
228       coo_j[count] = Ii + 1 + n2;
229       coo_v[count] = vedge;
230       count++;
231     }
232     if (y > 0 && z > 0) {
233       coo_i[count] = Ii;
234       coo_j[count] = Ii - n - n2;
235       coo_v[count] = vedge;
236       count++;
237     }
238     if (y > 0 && z < n1) {
239       coo_i[count] = Ii;
240       coo_j[count] = Ii - n + n2;
241       coo_v[count] = vedge;
242       count++;
243     }
244     if (y < n1 && z > 0) {
245       coo_i[count] = Ii;
246       coo_j[count] = Ii + n - n2;
247       coo_v[count] = vedge;
248       count++;
249     }
250     if (y < n1 && z < n1) {
251       coo_i[count] = Ii;
252       coo_j[count] = Ii + n + n2;
253       coo_v[count] = vedge;
254       count++;
255     }
256     /* 6 faces */
257     if (x > 0) {
258       coo_i[count] = Ii;
259       coo_j[count] = Ii - 1;
260       coo_v[count] = vface;
261       count++;
262     }
263     if (x < n1) {
264       coo_i[count] = Ii;
265       coo_j[count] = Ii + 1;
266       coo_v[count] = vface;
267       count++;
268     }
269     if (y > 0) {
270       coo_i[count] = Ii;
271       coo_j[count] = Ii - n;
272       coo_v[count] = vface;
273       count++;
274     }
275     if (y < n1) {
276       coo_i[count] = Ii;
277       coo_j[count] = Ii + n;
278       coo_v[count] = vface;
279       count++;
280     }
281     if (z > 0) {
282       coo_i[count] = Ii;
283       coo_j[count] = Ii - n2;
284       coo_v[count] = vface;
285       count++;
286     }
287     if (z < n1) {
288       coo_i[count] = Ii;
289       coo_j[count] = Ii + n2;
290       coo_v[count] = vface;
291       count++;
292     }
293     /* Center */
294     coo_i[count] = Ii;
295     coo_j[count] = Ii;
296     coo_v[count] = vcent;
297     count++;
298   }
299   PetscCheck(count == user->nnz, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Expected %" PetscInt_FMT " nonzeros but got %" PetscInt_FMT " nonzeros in COO format", user->nnz, count);
300   PetscCall(MatSetPreallocationCOO(A, user->nnz, coo_i, coo_j));
301   PetscCall(MatSetValuesCOO(A, coo_v, INSERT_VALUES));
302   PetscCall(PetscFree3(coo_i, coo_j, coo_v));
303   PetscFunctionReturn(PETSC_SUCCESS);
304 }
305 
main(int argc,char ** argv)306 int main(int argc, char **argv)
307 {
308   AppCtx                     user;                               /* Application context */
309   Vec                        x, b, u;                            /* approx solution, RHS, and exact solution */
310   Mat                        A;                                  /* linear system matrix */
311   KSP                        ksp;                                /* linear solver context */
312   PC                         pc;                                 /* Preconditioner */
313   PetscReal                  norm;                               /* Error norm */
314   PetscInt                   nlocal     = PETSC_DECIDE, ksp_its; /* Number of KSP iterations */
315   PetscInt                   global_nnz = 0;                     /* Total number of nonzeros */
316   PetscLogDouble             time_start, time_mid1 = 0.0, time_mid2 = 0.0, time_end, time_avg, floprate;
317   PetscBool                  printTiming = PETSC_TRUE; /* If run in CI, do not print timing result */
318   PETSC_UNUSED PetscLogStage stage;
319 
320   PetscCall(PetscInitialize(&argc, &argv, NULL, help));
321   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &user.size));
322   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &user.rank));
323 
324   user.n        = 100;         /* Default grid points per dimension */
325   user.matmult  = PETSC_FALSE; /* Test MatMult only */
326   user.its      = 100;         /* Default no. of iterations for MatMult test */
327   user.debug    = PETSC_FALSE; /* Debug PreallocateCOO() */
328   user.splitksp = PETSC_FALSE; /* Split KSPSolve and PCSetUp */
329   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &user.n, NULL));
330   PetscCall(PetscOptionsGetInt(NULL, NULL, "-its", &user.its, NULL));
331   PetscCall(PetscOptionsGetBool(NULL, NULL, "-matmult", &user.matmult, NULL));
332   PetscCall(PetscOptionsGetBool(NULL, NULL, "-debug", &user.debug, NULL));
333   PetscCall(PetscOptionsGetBool(NULL, NULL, "-print_timing", &printTiming, NULL));
334   PetscCall(PetscOptionsGetBool(NULL, NULL, "-split_ksp", &user.splitksp, NULL));
335 
336   user.dim   = user.n * user.n * user.n;
337   global_nnz = 64 + 27 * (user.n - 2) * (user.n - 2) * (user.n - 2) + 108 * (user.n - 2) * (user.n - 2) + 144 * (user.n - 2);
338   PetscCheck(user.n >= 2, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Requires at least 2 grid points (-n 2), you specified -n %" PetscInt_FMT, user.n);
339   PetscCheck(user.dim >= user.size, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "MPI size (%d) exceeds the grid size %" PetscInt_FMT " (-n %" PetscInt_FMT ")", user.size, user.dim, user.n);
340   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "===========================================\n"));
341   if (user.matmult) PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Test: MatMult performance - Poisson\n"));
342   else PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Test: KSP performance - Poisson\n"));
343   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\tInput matrix: 27-pt finite difference stencil\n"));
344   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t-n %" PetscInt_FMT "\n", user.n));
345   if (user.matmult) PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t-its %" PetscInt_FMT "\n", user.its));
346   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\tDoFs = %" PetscInt_FMT "\n", user.dim));
347   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\tNumber of nonzeros = %" PetscInt_FMT "\n", global_nnz));
348 
349   /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
350   /*  Create the Vecs and Mat                                            */
351   /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
352   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\nStep1  - creating Vecs and Mat...\n"));
353   PetscCall(PetscLogStageRegister("Step1  - Vecs and Mat", &stage));
354   PetscCall(PetscLogStagePush(stage));
355   PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
356   PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, user.dim, user.dim));
357   PetscCall(MatSetFromOptions(A));
358 
359   /* cannot use MatGetOwnershipRange() because the matrix has yet to be preallocated; that happens in MatSetPreallocateCOO() */
360   PetscCall(PetscSplitOwnership(PetscObjectComm((PetscObject)A), &nlocal, &user.dim));
361   PetscCallMPI(MPI_Scan(&nlocal, &user.Istart, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
362   user.Istart -= nlocal;
363   user.Iend = user.Istart + nlocal;
364 
365   PetscCall(PreallocateCOO(A, &user)); /* Determine local number of nonzeros */
366   PetscCall(FillCOO(A, &user));        /* Fill COO Matrix */
367 #if !defined(PETSC_HAVE_HIP)           /* Due to errors in MatSolve_SeqAIJHIPSPARSE_ICC0() */
368   PetscCall(MatSetOption(A, MAT_SPD, PETSC_TRUE));
369   PetscCall(MatSetOption(A, MAT_SPD_ETERNAL, PETSC_TRUE));
370 #endif
371   PetscCall(MatCreateVecs(A, &u, &b));
372   if (!user.matmult) PetscCall(VecDuplicate(b, &x));
373   PetscCall(VecSet(u, 1.0));   /* Exact solution */
374   PetscCall(MatMult(A, u, b)); /* Compute RHS based on exact solution */
375   PetscCall(PetscLogStagePop());
376 
377   if (user.matmult) {
378     /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
379     /*  MatMult                                                            */
380     /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
381     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Step2  - running MatMult() %" PetscInt_FMT " times...\n", user.its));
382     PetscCall(PetscLogStageRegister("Step2  - MatMult", &stage));
383     PetscCall(PetscLogStagePush(stage));
384     PetscCall(PetscTime(&time_start));
385     for (int i = 0; i < user.its; i++) PetscCall(MatMult(A, u, b));
386     PetscCall(PetscTime(&time_end));
387     PetscCall(PetscLogStagePop());
388     /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
389     /*  Calculate Performance metrics                                      */
390     /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
391     time_avg = (time_end - time_start) / ((PetscLogDouble)user.its);
392     floprate = 2 * global_nnz / time_avg * 1e-9;
393     if (printTiming) {
394       PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\n%-15s%-7.5f seconds\n", "Average time:", time_avg));
395       PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%-15s%-9.3e Gflops/sec\n", "FOM:", floprate)); /* figure of merit */
396     }
397     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "===========================================\n"));
398   } else {
399     if (!user.splitksp) {
400       /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
401       /*  Solve the linear system of equations                               */
402       /*  Measure only time of PCSetUp() and KSPSolve()                      */
403       /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
404       PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Step2  - running KSPSolve()...\n"));
405       PetscCall(PetscLogStageRegister("Step2  - KSPSolve", &stage));
406       PetscCall(PetscLogStagePush(stage));
407       PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
408       PetscCall(KSPSetErrorIfNotConverged(ksp, PETSC_TRUE));
409       PetscCall(KSPSetOperators(ksp, A, A));
410       PetscCall(KSPSetFromOptions(ksp));
411       PetscCall(PetscTime(&time_start));
412       PetscCall(KSPSolve(ksp, b, x));
413       PetscCall(PetscTime(&time_end));
414       PetscCall(PetscLogStagePop());
415     } else {
416       /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
417       /*  Solve the linear system of equations                               */
418       /*  Measure only time of PCSetUp() and KSPSolve()                      */
419       /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
420       PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Step2a - running PCSetUp()...\n"));
421       PetscCall(PetscLogStageRegister("Step2a - PCSetUp", &stage));
422       PetscCall(PetscLogStagePush(stage));
423       PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
424       PetscCall(KSPSetErrorIfNotConverged(ksp, PETSC_TRUE));
425       PetscCall(KSPSetOperators(ksp, A, A));
426       PetscCall(KSPSetFromOptions(ksp));
427       PetscCall(KSPGetPC(ksp, &pc));
428       PetscCall(PetscTime(&time_start));
429       PetscCall(PCSetUp(pc));
430       PetscCall(PCSetUpOnBlocks(pc));
431       PetscCall(PetscTime(&time_mid1));
432       PetscCall(PetscLogStagePop());
433       PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Step2b - running KSPSolve()...\n"));
434       PetscCall(PetscLogStageRegister("Step2b - KSPSolve", &stage));
435       PetscCall(PetscLogStagePush(stage));
436       PetscCall(PetscTime(&time_mid2));
437       PetscCall(KSPSolve(ksp, b, x));
438       PetscCall(PetscTime(&time_end));
439       PetscCall(PetscLogStagePop());
440     }
441 
442     /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
443     /*  Calculate error norm                                               */
444     /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
445     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Step3  - calculating error norm...\n"));
446     PetscCall(PetscLogStageRegister("Step3  - Error norm", &stage));
447     PetscCall(PetscLogStagePush(stage));
448     PetscCall(VecAXPY(x, -1.0, u));
449     PetscCall(VecNorm(x, NORM_2, &norm));
450     PetscCall(PetscLogStagePop());
451 
452     /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
453     /*  Summary                                                            */
454     /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
455     PetscCall(KSPGetIterationNumber(ksp, &ksp_its));
456     if (printTiming) {
457       PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\n%-15s%-1.3e\n", "Error norm:", (double)norm));
458       PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%-15s%-3" PetscInt_FMT "\n", "KSP iters:", ksp_its));
459       if (user.splitksp) {
460         PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%-15s%-7.5f seconds\n", "PCSetUp:", time_mid1 - time_start));
461         PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%-15s%-7.5f seconds\n", "KSPSolve:", time_end - time_mid2));
462         PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%-15s%-7.5f seconds\n", "Total Solve:", time_end - time_start));
463       } else {
464         PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%-15s%-7.5f seconds\n", "KSPSolve:", time_end - time_start));
465       }
466       PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%-15s%-1.3e DoFs/sec\n", "FOM:", user.dim / (time_end - time_start))); /* figure of merit */
467     }
468     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "===========================================\n"));
469   }
470   /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
471   /*  Free up memory                                                     */
472   /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
473   if (!user.matmult) {
474     PetscCall(KSPDestroy(&ksp));
475     PetscCall(VecDestroy(&x));
476   }
477   PetscCall(VecDestroy(&u));
478   PetscCall(VecDestroy(&b));
479   PetscCall(MatDestroy(&A));
480   PetscCall(PetscFinalize());
481   return 0;
482 }
483 
484 /*TEST
485 
486   testset:
487     args: -print_timing false -matmult -its 10 -n 8
488     nsize: {{1 3}}
489     output_file: output/bench_kspsolve_matmult.out
490 
491     test:
492       suffix: matmult
493 
494     test:
495       suffix: hip_matmult
496       requires: hip
497       args: -mat_type aijhipsparse
498 
499     test:
500       suffix: cuda_matmult
501       requires: cuda
502       args: -mat_type aijcusparse
503 
504     test:
505       suffix: kok_matmult
506       requires: kokkos_kernels
507       args: -mat_type aijkokkos
508 
509   testset:
510     args: -print_timing false -its 10 -n 8
511     nsize: {{3 5}}
512     output_file: output/bench_kspsolve_ksp.out
513 
514     test:
515       suffix: ksp
516 
517     test:
518       suffix: nbr
519       requires: defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
520       args: -sf_type neighbor -sf_neighbor_persistent {{0 1}}
521 
522     test:
523       suffix: hip_ksp
524       requires: hip
525       args: -mat_type aijhipsparse -sub_pc_factor_mat_factor_on_host {{0 1}}
526 
527     test:
528       suffix: cuda_ksp
529       requires: cuda
530       args: -mat_type aijcusparse -sub_pc_factor_mat_factor_on_host {{0 1}}
531 
532     test:
533       suffix: kok_ksp_1
534       requires: kokkos_kernels
535       args: -mat_type aijkokkos -pc_type bjacobi -sub_pc_type {{ilu icc}}
536 
537     test:
538       suffix: kok_ksp_2
539       requires: kokkos_kernels
540       args: -mat_type aijkokkos -pc_type bjacobi -sub_pc_type {{ilu icc}} -sub_pc_factor_mat_factor_on_host -sub_pc_factor_mat_solve_on_host {{0 1}}
541 
542     test:
543       suffix: kok_hypre
544       requires: kokkos_kernels defined(PETSC_HAVE_HYPRE_DEVICE)
545       args: -mat_type aijkokkos -pc_type hypre
546 
547     test:
548       suffix: kok_nbr
549       requires: kokkos_kernels defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
550       args: -mat_type aijkokkos -sf_type neighbor -sf_neighbor_persistent {{0 1}}
551 
552 TEST*/
553