xref: /petsc/src/ksp/pc/impls/is/nn/nn.c (revision d8e47b638cf8f604a99e9678e1df24f82d959cd7)
1 #include <../src/ksp/pc/impls/is/nn/nn.h>
2 
3 /*
4    PCSetUp_NN - Prepares for the use of the NN preconditioner
5                     by setting data structures and options.
6 
7    Input Parameter:
8 .  pc - the preconditioner context
9 
10    Application Interface Routine: PCSetUp()
11 
12    Note:
13    The interface routine PCSetUp() is not usually called directly by
14    the user, but instead is called by PCApply() if necessary.
15 */
16 static PetscErrorCode PCSetUp_NN(PC pc)
17 {
18   PetscFunctionBegin;
19   if (!pc->setupcalled) {
20     /* Set up all the "iterative substructuring" common block */
21     PetscCall(PCISSetUp(pc, PETSC_TRUE, PETSC_TRUE));
22     /* Create the coarse matrix. */
23     PetscCall(PCNNCreateCoarseMatrix(pc));
24   }
25   PetscFunctionReturn(PETSC_SUCCESS);
26 }
27 
28 /*
29    PCApply_NN - Applies the NN preconditioner to a vector.
30 
31    Input Parameters:
32 +  pc - the preconditioner context
33 -  r - input vector (global)
34 
35    Output Parameter:
36 .  z - output vector (global)
37 
38    Application Interface Routine: PCApply()
39  */
40 static PetscErrorCode PCApply_NN(PC pc, Vec r, Vec z)
41 {
42   PC_IS      *pcis  = (PC_IS *)pc->data;
43   PetscScalar m_one = -1.0;
44   Vec         w     = pcis->vec1_global;
45 
46   PetscFunctionBegin;
47   /*
48     Dirichlet solvers.
49     Solving $ B_I^{(i)}r_I^{(i)} $ at each processor.
50     Storing the local results at vec2_D
51   */
52   PetscCall(VecScatterBegin(pcis->global_to_D, r, pcis->vec1_D, INSERT_VALUES, SCATTER_FORWARD));
53   PetscCall(VecScatterEnd(pcis->global_to_D, r, pcis->vec1_D, INSERT_VALUES, SCATTER_FORWARD));
54   PetscCall(KSPSolve(pcis->ksp_D, pcis->vec1_D, pcis->vec2_D));
55 
56   /*
57     Computing $ r_B - \sum_j \tilde R_j^T A_{BI}^{(j)} (B_I^{(j)}r_I^{(j)}) $ .
58     Storing the result in the interface portion of the global vector w.
59   */
60   PetscCall(MatMult(pcis->A_BI, pcis->vec2_D, pcis->vec1_B));
61   PetscCall(VecScale(pcis->vec1_B, m_one));
62   PetscCall(VecCopy(r, w));
63   PetscCall(VecScatterBegin(pcis->global_to_B, pcis->vec1_B, w, ADD_VALUES, SCATTER_REVERSE));
64   PetscCall(VecScatterEnd(pcis->global_to_B, pcis->vec1_B, w, ADD_VALUES, SCATTER_REVERSE));
65 
66   /*
67     Apply the interface preconditioner
68   */
69   PetscCall(PCNNApplyInterfacePreconditioner(pc, w, z, pcis->work_N, pcis->vec1_B, pcis->vec2_B, pcis->vec3_B, pcis->vec1_D, pcis->vec3_D, pcis->vec1_N, pcis->vec2_N));
70 
71   /*
72     Computing $ t_I^{(i)} = A_{IB}^{(i)} \tilde R_i z_B $
73     The result is stored in vec1_D.
74   */
75   PetscCall(VecScatterBegin(pcis->global_to_B, z, pcis->vec1_B, INSERT_VALUES, SCATTER_FORWARD));
76   PetscCall(VecScatterEnd(pcis->global_to_B, z, pcis->vec1_B, INSERT_VALUES, SCATTER_FORWARD));
77   PetscCall(MatMult(pcis->A_IB, pcis->vec1_B, pcis->vec1_D));
78 
79   /*
80     Dirichlet solvers.
81     Computing $ B_I^{(i)}t_I^{(i)} $ and sticking into the global vector the blocks
82     $ B_I^{(i)}r_I^{(i)} - B_I^{(i)}t_I^{(i)} $.
83   */
84   PetscCall(VecScatterBegin(pcis->global_to_D, pcis->vec2_D, z, INSERT_VALUES, SCATTER_REVERSE));
85   PetscCall(VecScatterEnd(pcis->global_to_D, pcis->vec2_D, z, INSERT_VALUES, SCATTER_REVERSE));
86   PetscCall(KSPSolve(pcis->ksp_D, pcis->vec1_D, pcis->vec2_D));
87   PetscCall(VecScale(pcis->vec2_D, m_one));
88   PetscCall(VecScatterBegin(pcis->global_to_D, pcis->vec2_D, z, ADD_VALUES, SCATTER_REVERSE));
89   PetscCall(VecScatterEnd(pcis->global_to_D, pcis->vec2_D, z, ADD_VALUES, SCATTER_REVERSE));
90   PetscFunctionReturn(PETSC_SUCCESS);
91 }
92 
93 /*
94    PCDestroy_NN - Destroys the private context for the NN preconditioner
95    that was created with PCCreate_NN().
96 
97    Input Parameter:
98 .  pc - the preconditioner context
99 
100    Application Interface Routine: PCDestroy()
101 */
102 static PetscErrorCode PCDestroy_NN(PC pc)
103 {
104   PC_NN *pcnn = (PC_NN *)pc->data;
105 
106   PetscFunctionBegin;
107   PetscCall(PCISReset(pc));
108 
109   PetscCall(MatDestroy(&pcnn->coarse_mat));
110   PetscCall(VecDestroy(&pcnn->coarse_x));
111   PetscCall(VecDestroy(&pcnn->coarse_b));
112   PetscCall(KSPDestroy(&pcnn->ksp_coarse));
113   if (pcnn->DZ_IN) {
114     PetscCall(PetscFree(pcnn->DZ_IN[0]));
115     PetscCall(PetscFree(pcnn->DZ_IN));
116   }
117 
118   /*
119       Free the private data structure that was hanging off the PC
120   */
121   PetscCall(PetscFree(pc->data));
122   PetscFunctionReturn(PETSC_SUCCESS);
123 }
124 
125 /*MC
126    PCNN - Balancing Neumann-Neumann for scalar elliptic PDEs.
127 
128    Options Database Keys:
129 +    -pc_nn_turn_off_first_balancing - do not balance the residual before solving the local Neumann problems
130                                        (this skips the first coarse grid solve in the preconditioner)
131 .    -pc_nn_turn_off_second_balancing - do not balance the solution solving the local Neumann problems
132                                        (this skips the second coarse grid solve in the preconditioner)
133 .    -pc_is_damp_fixed <fact> -
134 .    -pc_is_remove_nullspace_fixed -
135 .    -pc_is_set_damping_factor_floating <fact> -
136 .    -pc_is_not_damp_floating -
137 -    -pc_is_not_remove_nullspace_floating -
138 
139    Options Database prefixes for the subsolvers this preconditioner uses:
140 +  -nn_coarse_pc_ - for the coarse grid preconditioner
141 .  -is_localD_pc_ - for the Dirichlet subproblem preconditioner
142 -  -is_localN_pc_ - for the Neumann subproblem preconditioner
143 
144    Level: intermediate
145 
146    Notes:
147    The matrix used with this preconditioner must be of type `MATIS`
148 
149    Unlike more 'conventional' Neumann-Neumann preconditioners this iterates over ALL the
150    degrees of freedom, NOT just those on the interface (this allows the use of approximate solvers
151    on the subdomains; though in our experience using approximate solvers is slower.).
152 
153    Contributed by Paulo Goldfeld
154 
155 .seealso: [](ch_ksp), `PCCreate()`, `PCSetType()`, `PCType`, `PC`, `MATIS`, `PCBDDC`
156 M*/
157 
158 PETSC_EXTERN PetscErrorCode PCCreate_NN(PC pc)
159 {
160   PC_NN *pcnn;
161 
162   PetscFunctionBegin;
163   /*
164      Creates the private data structure for this preconditioner and
165      attach it to the PC object.
166   */
167   PetscCall(PetscNew(&pcnn));
168   pc->data = (void *)pcnn;
169 
170   PetscCall(PCISInitialize(pc));
171   pcnn->coarse_mat = NULL;
172   pcnn->coarse_x   = NULL;
173   pcnn->coarse_b   = NULL;
174   pcnn->ksp_coarse = NULL;
175   pcnn->DZ_IN      = NULL;
176 
177   /*
178       Set the pointers for the functions that are provided above.
179       Now when the user-level routines (such as PCApply(), PCDestroy(), etc.)
180       are called, they will automatically call these functions.  Note we
181       choose not to provide a couple of these functions since they are
182       not needed.
183   */
184   pc->ops->apply               = PCApply_NN;
185   pc->ops->applytranspose      = NULL;
186   pc->ops->setup               = PCSetUp_NN;
187   pc->ops->destroy             = PCDestroy_NN;
188   pc->ops->view                = NULL;
189   pc->ops->applyrichardson     = NULL;
190   pc->ops->applysymmetricleft  = NULL;
191   pc->ops->applysymmetricright = NULL;
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
195 /*
196    PCNNCreateCoarseMatrix -
197 */
198 PetscErrorCode PCNNCreateCoarseMatrix(PC pc)
199 {
200   MPI_Request  *send_request, *recv_request;
201   PetscInt      i, j, k;
202   PetscScalar  *mat;    /* Sub-matrix with this subdomain's contribution to the coarse matrix             */
203   PetscScalar **DZ_OUT; /* proc[k].DZ_OUT[i][] = bit of vector to be sent from processor k to processor i */
204 
205   PC_IS        *pcis     = (PC_IS *)pc->data;
206   PC_NN        *pcnn     = (PC_NN *)pc->data;
207   PetscMPIInt   n_neigh  = (PetscMPIInt)pcis->n_neigh;
208   PetscInt     *neigh    = pcis->neigh;
209   PetscInt     *n_shared = pcis->n_shared;
210   PetscInt    **shared   = pcis->shared;
211   PetscScalar **DZ_IN; /* Must be initialized after memory allocation. */
212 
213   PetscFunctionBegin;
214   /* Allocate memory for mat (the +1 is to handle the case n_neigh equal to zero) */
215   PetscCall(PetscMalloc1(n_neigh * n_neigh + 1, &mat));
216 
217   /* Allocate memory for DZ */
218   /* Notice that DZ_OUT[0] is allocated some space that is never used. */
219   /* This is just in order to DZ_OUT and DZ_IN to have exactly the same form. */
220   {
221     PetscInt size_of_Z = 0;
222     PetscCall(PetscMalloc((n_neigh + 1) * sizeof(PetscScalar *), &pcnn->DZ_IN));
223     DZ_IN = pcnn->DZ_IN;
224     PetscCall(PetscMalloc((n_neigh + 1) * sizeof(PetscScalar *), &DZ_OUT));
225     for (i = 0; i < n_neigh; i++) size_of_Z += n_shared[i];
226     PetscCall(PetscMalloc((size_of_Z + 1) * sizeof(PetscScalar), &DZ_IN[0]));
227     PetscCall(PetscMalloc((size_of_Z + 1) * sizeof(PetscScalar), &DZ_OUT[0]));
228   }
229   for (i = 1; i < n_neigh; i++) {
230     DZ_IN[i]  = DZ_IN[i - 1] + n_shared[i - 1];
231     DZ_OUT[i] = DZ_OUT[i - 1] + n_shared[i - 1];
232   }
233 
234   /* Set the values of DZ_OUT, in order to send this info to the neighbours */
235   /* First, set the auxiliary array pcis->work_N. */
236   PetscCall(PCISScatterArrayNToVecB(pc, pcis->work_N, pcis->D, INSERT_VALUES, SCATTER_REVERSE));
237   for (i = 1; i < n_neigh; i++) {
238     for (j = 0; j < n_shared[i]; j++) DZ_OUT[i][j] = pcis->work_N[shared[i][j]];
239   }
240 
241   /* Non-blocking send/receive the common-interface chunks of scaled nullspaces */
242   /* Notice that send_request[] and recv_request[] could have one less element. */
243   /* We make them longer to have request[i] corresponding to neigh[i].          */
244   {
245     PetscMPIInt tag;
246     PetscCall(PetscObjectGetNewTag((PetscObject)pc, &tag));
247     PetscCall(PetscMalloc2(n_neigh + 1, &send_request, n_neigh + 1, &recv_request));
248     for (i = 1; i < n_neigh; i++) {
249       PetscCallMPI(MPIU_Isend((void *)DZ_OUT[i], n_shared[i], MPIU_SCALAR, (PetscMPIInt)neigh[i], tag, PetscObjectComm((PetscObject)pc), &send_request[i]));
250       PetscCallMPI(MPIU_Irecv((void *)DZ_IN[i], n_shared[i], MPIU_SCALAR, (PetscMPIInt)neigh[i], tag, PetscObjectComm((PetscObject)pc), &recv_request[i]));
251     }
252   }
253 
254   /* Set DZ_IN[0][] (recall that neigh[0]==rank, always) */
255   for (j = 0; j < n_shared[0]; j++) DZ_IN[0][j] = pcis->work_N[shared[0][j]];
256 
257   /* Start computing with local D*Z while communication goes on.    */
258   /* Apply Schur complement. The result is "stored" in vec (more    */
259   /* precisely, vec points to the result, stored in pc_nn->vec1_B)  */
260   /* and also scattered to pcnn->work_N.                            */
261   PetscCall(PCNNApplySchurToChunk(pc, n_shared[0], shared[0], DZ_IN[0], pcis->work_N, pcis->vec1_B, pcis->vec2_B, pcis->vec1_D, pcis->vec2_D));
262 
263   /* Compute the first column, while completing the receiving. */
264   for (i = 0; i < n_neigh; i++) {
265     MPI_Status  stat;
266     PetscMPIInt ind = 0;
267     if (i > 0) {
268       PetscCallMPI(MPI_Waitany(n_neigh - 1, recv_request + 1, &ind, &stat));
269       ind++;
270     }
271     mat[ind * n_neigh + 0] = 0.0;
272     for (k = 0; k < n_shared[ind]; k++) mat[ind * n_neigh + 0] += DZ_IN[ind][k] * pcis->work_N[shared[ind][k]];
273   }
274 
275   /* Compute the remaining of the columns */
276   for (j = 1; j < n_neigh; j++) {
277     PetscCall(PCNNApplySchurToChunk(pc, n_shared[j], shared[j], DZ_IN[j], pcis->work_N, pcis->vec1_B, pcis->vec2_B, pcis->vec1_D, pcis->vec2_D));
278     for (i = 0; i < n_neigh; i++) {
279       mat[i * n_neigh + j] = 0.0;
280       for (k = 0; k < n_shared[i]; k++) mat[i * n_neigh + j] += DZ_IN[i][k] * pcis->work_N[shared[i][k]];
281     }
282   }
283 
284   /* Complete the sending. */
285   if (n_neigh > 1) {
286     MPI_Status *stat;
287     PetscCall(PetscMalloc1(n_neigh - 1, &stat));
288     if (n_neigh - 1) PetscCallMPI(MPI_Waitall(n_neigh - 1, &send_request[1], stat));
289     PetscCall(PetscFree(stat));
290   }
291 
292   /* Free the memory for the MPI requests */
293   PetscCall(PetscFree2(send_request, recv_request));
294 
295   /* Free the memory for DZ_OUT */
296   if (DZ_OUT) {
297     PetscCall(PetscFree(DZ_OUT[0]));
298     PetscCall(PetscFree(DZ_OUT));
299   }
300 
301   {
302     PetscMPIInt size;
303     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)pc), &size));
304     /* Create the global coarse vectors (rhs and solution). */
305     PetscCall(VecCreateMPI(PetscObjectComm((PetscObject)pc), 1, size, &pcnn->coarse_b));
306     PetscCall(VecDuplicate(pcnn->coarse_b, &pcnn->coarse_x));
307     /* Create and set the global coarse AIJ matrix. */
308     PetscCall(MatCreate(PetscObjectComm((PetscObject)pc), &pcnn->coarse_mat));
309     PetscCall(MatSetSizes(pcnn->coarse_mat, 1, 1, size, size));
310     PetscCall(MatSetType(pcnn->coarse_mat, MATAIJ));
311     PetscCall(MatSeqAIJSetPreallocation(pcnn->coarse_mat, 1, NULL));
312     PetscCall(MatMPIAIJSetPreallocation(pcnn->coarse_mat, 1, NULL, n_neigh, NULL));
313     PetscCall(MatSetOption(pcnn->coarse_mat, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
314     PetscCall(MatSetOption(pcnn->coarse_mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_FALSE));
315     PetscCall(MatSetValues(pcnn->coarse_mat, n_neigh, neigh, n_neigh, neigh, mat, ADD_VALUES));
316     PetscCall(MatAssemblyBegin(pcnn->coarse_mat, MAT_FINAL_ASSEMBLY));
317     PetscCall(MatAssemblyEnd(pcnn->coarse_mat, MAT_FINAL_ASSEMBLY));
318   }
319 
320   {
321     PetscMPIInt rank;
322     PetscScalar one = 1.0;
323     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)pc), &rank));
324     /* "Zero out" rows of not-purely-Neumann subdomains */
325     if (pcis->pure_neumann) { /* does NOT zero the row; create an empty index set. The reason is that MatZeroRows() is collective. */
326       PetscCall(MatZeroRows(pcnn->coarse_mat, 0, NULL, one, NULL, NULL));
327     } else { /* here it DOES zero the row, since it's not a floating subdomain. */
328       PetscInt row = (PetscInt)rank;
329       PetscCall(MatZeroRows(pcnn->coarse_mat, 1, &row, one, NULL, NULL));
330     }
331   }
332 
333   /* Create the coarse linear solver context */
334   {
335     PC  pc_ctx, inner_pc;
336     KSP inner_ksp;
337 
338     PetscCall(KSPCreate(PetscObjectComm((PetscObject)pc), &pcnn->ksp_coarse));
339     PetscCall(KSPSetNestLevel(pcnn->ksp_coarse, pc->kspnestlevel));
340     PetscCall(PetscObjectIncrementTabLevel((PetscObject)pcnn->ksp_coarse, (PetscObject)pc, 2));
341     PetscCall(KSPSetOperators(pcnn->ksp_coarse, pcnn->coarse_mat, pcnn->coarse_mat));
342     PetscCall(KSPGetPC(pcnn->ksp_coarse, &pc_ctx));
343     PetscCall(PCSetType(pc_ctx, PCREDUNDANT));
344     PetscCall(KSPSetType(pcnn->ksp_coarse, KSPPREONLY));
345     PetscCall(PCRedundantGetKSP(pc_ctx, &inner_ksp));
346     PetscCall(KSPGetPC(inner_ksp, &inner_pc));
347     PetscCall(PCSetType(inner_pc, PCLU));
348     PetscCall(KSPSetOptionsPrefix(pcnn->ksp_coarse, "nn_coarse_"));
349     PetscCall(KSPSetFromOptions(pcnn->ksp_coarse));
350     /* the vectors in the following line are dummy arguments, just telling the KSP the vector size. Values are not used */
351     PetscCall(KSPSetUp(pcnn->ksp_coarse));
352   }
353 
354   /* Free the memory for mat */
355   PetscCall(PetscFree(mat));
356 
357   /* for DEBUGGING, save the coarse matrix to a file. */
358   {
359     PetscBool flg = PETSC_FALSE;
360     PetscCall(PetscOptionsGetBool(NULL, NULL, "-pc_nn_save_coarse_matrix", &flg, NULL));
361     if (flg) {
362       PetscViewer viewer;
363       PetscCall(PetscViewerASCIIOpen(PETSC_COMM_WORLD, "coarse.m", &viewer));
364       PetscCall(PetscViewerPushFormat(viewer, PETSC_VIEWER_ASCII_MATLAB));
365       PetscCall(MatView(pcnn->coarse_mat, viewer));
366       PetscCall(PetscViewerPopFormat(viewer));
367       PetscCall(PetscViewerDestroy(&viewer));
368     }
369   }
370 
371   /*  Set the variable pcnn->factor_coarse_rhs. */
372   pcnn->factor_coarse_rhs = (pcis->pure_neumann) ? 1.0 : 0.0;
373 
374   /* See historical note 02, at the bottom of this file. */
375   PetscFunctionReturn(PETSC_SUCCESS);
376 }
377 
378 /*
379    PCNNApplySchurToChunk -
380 
381    Input parameters:
382 .  pcnn
383 .  n - size of chunk
384 .  idx - indices of chunk
385 .  chunk - values
386 
387    Output parameters:
388 .  array_N - result of Schur complement applied to chunk, scattered to big array
389 .  vec1_B  - result of Schur complement applied to chunk
390 .  vec2_B  - garbage (used as work space)
391 .  vec1_D  - garbage (used as work space)
392 .  vec2_D  - garbage (used as work space)
393 
394 */
395 PetscErrorCode PCNNApplySchurToChunk(PC pc, PetscInt n, PetscInt *idx, PetscScalar *chunk, PetscScalar *array_N, Vec vec1_B, Vec vec2_B, Vec vec1_D, Vec vec2_D)
396 {
397   PetscInt i;
398   PC_IS   *pcis = (PC_IS *)pc->data;
399 
400   PetscFunctionBegin;
401   PetscCall(PetscArrayzero(array_N, pcis->n));
402   for (i = 0; i < n; i++) array_N[idx[i]] = chunk[i];
403   PetscCall(PCISScatterArrayNToVecB(pc, array_N, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
404   PetscCall(PCISApplySchur(pc, vec2_B, vec1_B, (Vec)0, vec1_D, vec2_D));
405   PetscCall(PCISScatterArrayNToVecB(pc, array_N, vec1_B, INSERT_VALUES, SCATTER_REVERSE));
406   PetscFunctionReturn(PETSC_SUCCESS);
407 }
408 
409 /*
410    PCNNApplyInterfacePreconditioner - Apply the interface preconditioner, i.e.,
411                                       the preconditioner for the Schur complement.
412 
413    Input parameter:
414 .  r - global vector of interior and interface nodes. The values on the interior nodes are NOT used.
415 
416    Output parameters:
417 .  z - global vector of interior and interface nodes. The values on the interface are the result of
418        the application of the interface preconditioner to the interface part of r. The values on the
419        interior nodes are garbage.
420 .  work_N - array of local nodes (interior and interface, including ghosts); returns garbage (used as work space)
421 .  vec1_B - vector of local interface nodes (including ghosts); returns garbage (used as work space)
422 .  vec2_B - vector of local interface nodes (including ghosts); returns garbage (used as work space)
423 .  vec3_B - vector of local interface nodes (including ghosts); returns garbage (used as work space)
424 .  vec1_D - vector of local interior nodes; returns garbage (used as work space)
425 .  vec2_D - vector of local interior nodes; returns garbage (used as work space)
426 .  vec1_N - vector of local nodes (interior and interface, including ghosts); returns garbage (used as work space)
427 .  vec2_N - vector of local nodes (interior and interface, including ghosts); returns garbage (used as work space)
428 
429 */
430 PetscErrorCode PCNNApplyInterfacePreconditioner(PC pc, Vec r, Vec z, PetscScalar *work_N, Vec vec1_B, Vec vec2_B, Vec vec3_B, Vec vec1_D, Vec vec2_D, Vec vec1_N, Vec vec2_N)
431 {
432   PC_IS *pcis = (PC_IS *)pc->data;
433 
434   PetscFunctionBegin;
435   /*
436     First balancing step.
437   */
438   {
439     PetscBool flg = PETSC_FALSE;
440     PetscCall(PetscOptionsGetBool(NULL, NULL, "-pc_nn_turn_off_first_balancing", &flg, NULL));
441     if (!flg) {
442       PetscCall(PCNNBalancing(pc, r, (Vec)0, z, vec1_B, vec2_B, (Vec)0, vec1_D, vec2_D, work_N));
443     } else {
444       PetscCall(VecCopy(r, z));
445     }
446   }
447 
448   /*
449     Extract the local interface part of z and scale it by D
450   */
451   PetscCall(VecScatterBegin(pcis->global_to_B, z, vec1_B, INSERT_VALUES, SCATTER_FORWARD));
452   PetscCall(VecScatterEnd(pcis->global_to_B, z, vec1_B, INSERT_VALUES, SCATTER_FORWARD));
453   PetscCall(VecPointwiseMult(vec2_B, pcis->D, vec1_B));
454 
455   /* Neumann Solver */
456   PetscCall(PCISApplyInvSchur(pc, vec2_B, vec1_B, vec1_N, vec2_N));
457 
458   /*
459     Second balancing step.
460   */
461   {
462     PetscBool flg = PETSC_FALSE;
463     PetscCall(PetscOptionsGetBool(NULL, NULL, "-pc_turn_off_second_balancing", &flg, NULL));
464     if (!flg) {
465       PetscCall(PCNNBalancing(pc, r, vec1_B, z, vec2_B, vec3_B, (Vec)0, vec1_D, vec2_D, work_N));
466     } else {
467       PetscCall(VecPointwiseMult(vec2_B, pcis->D, vec1_B));
468       PetscCall(VecSet(z, 0.0));
469       PetscCall(VecScatterBegin(pcis->global_to_B, vec2_B, z, ADD_VALUES, SCATTER_REVERSE));
470       PetscCall(VecScatterEnd(pcis->global_to_B, vec2_B, z, ADD_VALUES, SCATTER_REVERSE));
471     }
472   }
473   PetscFunctionReturn(PETSC_SUCCESS);
474 }
475 
476 /*
477    PCNNBalancing - Computes z, as given in equations (15) and (16) (if the
478                    input argument u is provided), or s, as given in equations
479                    (12) and (13), if the input argument u is a null vector.
480                    Notice that the input argument u plays the role of u_i in
481                    equation (14). The equation numbers refer to [Man93].
482 
483    Input Parameters:
484 +  pcnn - NN preconditioner context.
485 .  r - MPI vector of all nodes (interior and interface). It's preserved.
486 -  u - (Optional) sequential vector of local interface nodes. It's preserved UNLESS vec3_B is null.
487 
488    Output Parameters:
489 +  z - MPI vector of interior and interface nodes. Returns s or z (see description above).
490 .  vec1_B - Sequential vector of local interface nodes. Workspace.
491 .  vec2_B - Sequential vector of local interface nodes. Workspace.
492 .  vec3_B - (Optional) sequential vector of local interface nodes. Workspace.
493 .  vec1_D - Sequential vector of local interior nodes. Workspace.
494 .  vec2_D - Sequential vector of local interior nodes. Workspace.
495 -  work_N - Array of all local nodes (interior and interface). Workspace.
496 
497 */
498 PetscErrorCode PCNNBalancing(PC pc, Vec r, Vec u, Vec z, Vec vec1_B, Vec vec2_B, Vec vec3_B, Vec vec1_D, Vec vec2_D, PetscScalar *work_N)
499 {
500   PetscInt     k;
501   PetscScalar  value;
502   PetscScalar *lambda;
503   PC_NN       *pcnn = (PC_NN *)pc->data;
504   PC_IS       *pcis = (PC_IS *)pc->data;
505 
506   PetscFunctionBegin;
507   PetscCall(PetscLogEventBegin(PC_ApplyCoarse, pc, 0, 0, 0));
508   if (u) {
509     if (!vec3_B) vec3_B = u;
510     PetscCall(VecPointwiseMult(vec1_B, pcis->D, u));
511     PetscCall(VecSet(z, 0.0));
512     PetscCall(VecScatterBegin(pcis->global_to_B, vec1_B, z, ADD_VALUES, SCATTER_REVERSE));
513     PetscCall(VecScatterEnd(pcis->global_to_B, vec1_B, z, ADD_VALUES, SCATTER_REVERSE));
514     PetscCall(VecScatterBegin(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
515     PetscCall(VecScatterEnd(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
516     PetscCall(PCISApplySchur(pc, vec2_B, vec3_B, (Vec)0, vec1_D, vec2_D));
517     PetscCall(VecScale(vec3_B, -1.0));
518     PetscCall(VecCopy(r, z));
519     PetscCall(VecScatterBegin(pcis->global_to_B, vec3_B, z, ADD_VALUES, SCATTER_REVERSE));
520     PetscCall(VecScatterEnd(pcis->global_to_B, vec3_B, z, ADD_VALUES, SCATTER_REVERSE));
521   } else {
522     PetscCall(VecCopy(r, z));
523   }
524   PetscCall(VecScatterBegin(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
525   PetscCall(VecScatterEnd(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
526   PetscCall(PCISScatterArrayNToVecB(pc, work_N, vec2_B, INSERT_VALUES, SCATTER_REVERSE));
527   for (k = 0, value = 0.0; k < pcis->n_shared[0]; k++) value += pcnn->DZ_IN[0][k] * work_N[pcis->shared[0][k]];
528   value *= pcnn->factor_coarse_rhs; /* This factor is set in CreateCoarseMatrix(). */
529   {
530     PetscMPIInt rank;
531     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)pc), &rank));
532     PetscCall(VecSetValue(pcnn->coarse_b, rank, value, INSERT_VALUES));
533     /*
534        Since we are only inserting local values (one value actually) we don't need to do the
535        reduction that tells us there is no data that needs to be moved. Hence we comment out these
536        PetscCall(VecAssemblyBegin(pcnn->coarse_b));
537        PetscCall(VecAssemblyEnd  (pcnn->coarse_b));
538     */
539   }
540   PetscCall(KSPSolve(pcnn->ksp_coarse, pcnn->coarse_b, pcnn->coarse_x));
541   if (!u) PetscCall(VecScale(pcnn->coarse_x, -1.0));
542   PetscCall(VecGetArray(pcnn->coarse_x, &lambda));
543   for (k = 0; k < pcis->n_shared[0]; k++) work_N[pcis->shared[0][k]] = *lambda * pcnn->DZ_IN[0][k];
544   PetscCall(VecRestoreArray(pcnn->coarse_x, &lambda));
545   PetscCall(PCISScatterArrayNToVecB(pc, work_N, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
546   PetscCall(VecSet(z, 0.0));
547   PetscCall(VecScatterBegin(pcis->global_to_B, vec2_B, z, ADD_VALUES, SCATTER_REVERSE));
548   PetscCall(VecScatterEnd(pcis->global_to_B, vec2_B, z, ADD_VALUES, SCATTER_REVERSE));
549   if (!u) {
550     PetscCall(VecScatterBegin(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
551     PetscCall(VecScatterEnd(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
552     PetscCall(PCISApplySchur(pc, vec2_B, vec1_B, (Vec)0, vec1_D, vec2_D));
553     PetscCall(VecCopy(r, z));
554   }
555   PetscCall(VecScatterBegin(pcis->global_to_B, vec1_B, z, ADD_VALUES, SCATTER_REVERSE));
556   PetscCall(VecScatterEnd(pcis->global_to_B, vec1_B, z, ADD_VALUES, SCATTER_REVERSE));
557   PetscCall(PetscLogEventEnd(PC_ApplyCoarse, pc, 0, 0, 0));
558   PetscFunctionReturn(PETSC_SUCCESS);
559 }
560 
561 /*  From now on, "footnotes" (or "historical notes").  */
562 /*
563    Historical note 01
564 
565    We considered the possibility of an alternative D_i that would still
566    provide a partition of unity (i.e., $ \sum_i  N_i D_i N_i^T = I $).
567    The basic principle was still the pseudo-inverse of the counting
568    function; the difference was that we would not count subdomains
569    that do not contribute to the coarse space (i.e., not pure-Neumann
570    subdomains).
571 
572    This turned out to be a bad idea:  we would solve trivial Neumann
573    problems in the not pure-Neumann subdomains, since we would be scaling
574    the balanced residual by zero.
575 */
576 
577 /*
578    Historical note 02
579 
580    We tried an alternative coarse problem, that would eliminate exactly a
581    constant error. Turned out not to improve the overall convergence.
582 */
583