xref: /petsc/src/ksp/pc/impls/bddc/bddcschurs.c (revision bf0c7fc2d7fb4e90d42e412b25194b878e6e442d) !
1 #include <petsc/private/pcbddcimpl.h>
2 #include <petsc/private/pcbddcprivateimpl.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <petscblaslapack.h>
5 
6 static inline PetscErrorCode PCBDDCAdjGetNextLayer_Private(PetscInt *, PetscInt, PetscBT, PetscInt *, PetscInt *, PetscInt *);
7 static PetscErrorCode        PCBDDCComputeExplicitSchur(Mat, PetscBool, MatReuse, Mat *);
8 static PetscErrorCode        PCBDDCReuseSolvers_Interior(PC, Vec, Vec);
9 static PetscErrorCode        PCBDDCReuseSolvers_Correction(PC, Vec, Vec);
10 
11 /* if v2 is not present, correction is done in-place */
12 PetscErrorCode PCBDDCReuseSolversBenignAdapt(PCBDDCReuseSolvers ctx, Vec v, Vec v2, PetscBool sol, PetscBool full)
13 {
14   PetscScalar *array;
15   PetscScalar *array2;
16 
17   PetscFunctionBegin;
18   if (!ctx->benign_n) PetscFunctionReturn(PETSC_SUCCESS);
19   if (sol && full) {
20     PetscInt n_I, size_schur;
21 
22     /* get sizes */
23     PetscCall(MatGetSize(ctx->benign_csAIB, &size_schur, NULL));
24     PetscCall(VecGetSize(v, &n_I));
25     n_I = n_I - size_schur;
26     /* get schur sol from array */
27     PetscCall(VecGetArray(v, &array));
28     PetscCall(VecPlaceArray(ctx->benign_dummy_schur_vec, array + n_I));
29     PetscCall(VecRestoreArray(v, &array));
30     /* apply interior sol correction */
31     PetscCall(MatMultTranspose(ctx->benign_csAIB, ctx->benign_dummy_schur_vec, ctx->benign_corr_work));
32     PetscCall(VecResetArray(ctx->benign_dummy_schur_vec));
33     PetscCall(MatMultAdd(ctx->benign_AIIm1ones, ctx->benign_corr_work, v, v));
34   }
35   if (v2) {
36     PetscInt nl;
37 
38     PetscCall(VecGetArrayRead(v, (const PetscScalar **)&array));
39     PetscCall(VecGetLocalSize(v2, &nl));
40     PetscCall(VecGetArray(v2, &array2));
41     PetscCall(PetscArraycpy(array2, array, nl));
42   } else {
43     PetscCall(VecGetArray(v, &array));
44     array2 = array;
45   }
46   if (!sol) { /* change rhs */
47     PetscInt n;
48     for (n = 0; n < ctx->benign_n; n++) {
49       PetscScalar     sum = 0.;
50       const PetscInt *cols;
51       PetscInt        nz, i;
52 
53       PetscCall(ISGetLocalSize(ctx->benign_zerodiag_subs[n], &nz));
54       PetscCall(ISGetIndices(ctx->benign_zerodiag_subs[n], &cols));
55       for (i = 0; i < nz - 1; i++) sum += array[cols[i]];
56 #if defined(PETSC_USE_COMPLEX)
57       sum = -(PetscRealPart(sum) / nz + PETSC_i * (PetscImaginaryPart(sum) / nz));
58 #else
59       sum = -sum / nz;
60 #endif
61       for (i = 0; i < nz - 1; i++) array2[cols[i]] += sum;
62       ctx->benign_save_vals[n] = array2[cols[nz - 1]];
63       array2[cols[nz - 1]]     = sum;
64       PetscCall(ISRestoreIndices(ctx->benign_zerodiag_subs[n], &cols));
65     }
66   } else {
67     PetscInt n;
68     for (n = 0; n < ctx->benign_n; n++) {
69       PetscScalar     sum = 0.;
70       const PetscInt *cols;
71       PetscInt        nz, i;
72       PetscCall(ISGetLocalSize(ctx->benign_zerodiag_subs[n], &nz));
73       PetscCall(ISGetIndices(ctx->benign_zerodiag_subs[n], &cols));
74       for (i = 0; i < nz - 1; i++) sum += array[cols[i]];
75 #if defined(PETSC_USE_COMPLEX)
76       sum = -(PetscRealPart(sum) / nz + PETSC_i * (PetscImaginaryPart(sum) / nz));
77 #else
78       sum = -sum / nz;
79 #endif
80       for (i = 0; i < nz - 1; i++) array2[cols[i]] += sum;
81       array2[cols[nz - 1]] = ctx->benign_save_vals[n];
82       PetscCall(ISRestoreIndices(ctx->benign_zerodiag_subs[n], &cols));
83     }
84   }
85   if (v2) {
86     PetscCall(VecRestoreArrayRead(v, (const PetscScalar **)&array));
87     PetscCall(VecRestoreArray(v2, &array2));
88   } else {
89     PetscCall(VecRestoreArray(v, &array));
90   }
91   if (!sol && full) {
92     Vec      usedv;
93     PetscInt n_I, size_schur;
94 
95     /* get sizes */
96     PetscCall(MatGetSize(ctx->benign_csAIB, &size_schur, NULL));
97     PetscCall(VecGetSize(v, &n_I));
98     n_I = n_I - size_schur;
99     /* compute schur rhs correction */
100     if (v2) {
101       usedv = v2;
102     } else {
103       usedv = v;
104     }
105     /* apply schur rhs correction */
106     PetscCall(MatMultTranspose(ctx->benign_AIIm1ones, usedv, ctx->benign_corr_work));
107     PetscCall(VecGetArrayRead(usedv, (const PetscScalar **)&array));
108     PetscCall(VecPlaceArray(ctx->benign_dummy_schur_vec, array + n_I));
109     PetscCall(VecRestoreArrayRead(usedv, (const PetscScalar **)&array));
110     PetscCall(MatMultAdd(ctx->benign_csAIB, ctx->benign_corr_work, ctx->benign_dummy_schur_vec, ctx->benign_dummy_schur_vec));
111     PetscCall(VecResetArray(ctx->benign_dummy_schur_vec));
112   }
113   PetscFunctionReturn(PETSC_SUCCESS);
114 }
115 
116 static PetscErrorCode PCBDDCReuseSolvers_Solve_Private(PC pc, Vec rhs, Vec sol, PetscBool transpose, PetscBool full)
117 {
118   PCBDDCReuseSolvers ctx;
119   PetscBool          copy = PETSC_FALSE;
120 
121   PetscFunctionBegin;
122   PetscCall(PCShellGetContext(pc, &ctx));
123   if (full) {
124     PetscCall(MatMumpsSetIcntl(ctx->F, 26, -1));
125 #if defined(PETSC_HAVE_MKL_PARDISO)
126     PetscCall(MatMkl_PardisoSetCntl(ctx->F, 70, 0));
127 #endif
128     copy = ctx->has_vertices;
129   } else { /* interior solver */
130     PetscCall(MatMumpsSetIcntl(ctx->F, 26, 0));
131 #if defined(PETSC_HAVE_MKL_PARDISO)
132     PetscCall(MatMkl_PardisoSetCntl(ctx->F, 70, 1));
133 #endif
134     copy = PETSC_TRUE;
135   }
136   /* copy rhs into factored matrix workspace */
137   if (copy) {
138     PetscInt     n;
139     PetscScalar *array, *array_solver;
140 
141     PetscCall(VecGetLocalSize(rhs, &n));
142     PetscCall(VecGetArrayRead(rhs, (const PetscScalar **)&array));
143     PetscCall(VecGetArray(ctx->rhs, &array_solver));
144     PetscCall(PetscArraycpy(array_solver, array, n));
145     PetscCall(VecRestoreArray(ctx->rhs, &array_solver));
146     PetscCall(VecRestoreArrayRead(rhs, (const PetscScalar **)&array));
147 
148     PetscCall(PCBDDCReuseSolversBenignAdapt(ctx, ctx->rhs, NULL, PETSC_FALSE, full));
149     if (transpose) {
150       PetscCall(MatSolveTranspose(ctx->F, ctx->rhs, ctx->sol));
151     } else {
152       PetscCall(MatSolve(ctx->F, ctx->rhs, ctx->sol));
153     }
154     PetscCall(PCBDDCReuseSolversBenignAdapt(ctx, ctx->sol, NULL, PETSC_TRUE, full));
155 
156     /* get back data to caller worskpace */
157     PetscCall(VecGetArrayRead(ctx->sol, (const PetscScalar **)&array_solver));
158     PetscCall(VecGetArray(sol, &array));
159     PetscCall(PetscArraycpy(array, array_solver, n));
160     PetscCall(VecRestoreArray(sol, &array));
161     PetscCall(VecRestoreArrayRead(ctx->sol, (const PetscScalar **)&array_solver));
162   } else {
163     if (ctx->benign_n) {
164       PetscCall(PCBDDCReuseSolversBenignAdapt(ctx, rhs, ctx->rhs, PETSC_FALSE, full));
165       if (transpose) {
166         PetscCall(MatSolveTranspose(ctx->F, ctx->rhs, sol));
167       } else {
168         PetscCall(MatSolve(ctx->F, ctx->rhs, sol));
169       }
170       PetscCall(PCBDDCReuseSolversBenignAdapt(ctx, sol, NULL, PETSC_TRUE, full));
171     } else {
172       if (transpose) {
173         PetscCall(MatSolveTranspose(ctx->F, rhs, sol));
174       } else {
175         PetscCall(MatSolve(ctx->F, rhs, sol));
176       }
177     }
178   }
179   /* restore defaults */
180   PetscCall(MatMumpsSetIcntl(ctx->F, 26, -1));
181 #if defined(PETSC_HAVE_MKL_PARDISO)
182   PetscCall(MatMkl_PardisoSetCntl(ctx->F, 70, 0));
183 #endif
184   PetscFunctionReturn(PETSC_SUCCESS);
185 }
186 
187 static PetscErrorCode PCBDDCReuseSolvers_Correction(PC pc, Vec rhs, Vec sol)
188 {
189   PetscFunctionBegin;
190   PetscCall(PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_FALSE, PETSC_TRUE));
191   PetscFunctionReturn(PETSC_SUCCESS);
192 }
193 
194 static PetscErrorCode PCBDDCReuseSolvers_CorrectionTranspose(PC pc, Vec rhs, Vec sol)
195 {
196   PetscFunctionBegin;
197   PetscCall(PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_TRUE, PETSC_TRUE));
198   PetscFunctionReturn(PETSC_SUCCESS);
199 }
200 
201 static PetscErrorCode PCBDDCReuseSolvers_Interior(PC pc, Vec rhs, Vec sol)
202 {
203   PetscFunctionBegin;
204   PetscCall(PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_FALSE, PETSC_FALSE));
205   PetscFunctionReturn(PETSC_SUCCESS);
206 }
207 
208 static PetscErrorCode PCBDDCReuseSolvers_InteriorTranspose(PC pc, Vec rhs, Vec sol)
209 {
210   PetscFunctionBegin;
211   PetscCall(PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_TRUE, PETSC_FALSE));
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 static PetscErrorCode PCBDDCReuseSolvers_View(PC pc, PetscViewer viewer)
216 {
217   PCBDDCReuseSolvers ctx;
218   PetscBool          isascii;
219 
220   PetscFunctionBegin;
221   PetscCall(PCShellGetContext(pc, &ctx));
222   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
223   if (isascii) PetscCall(PetscViewerPushFormat(viewer, PETSC_VIEWER_ASCII_INFO));
224   PetscCall(MatView(ctx->F, viewer));
225   if (isascii) PetscCall(PetscViewerPopFormat(viewer));
226   PetscFunctionReturn(PETSC_SUCCESS);
227 }
228 
229 static PetscErrorCode PCBDDCReuseSolversReset(PCBDDCReuseSolvers reuse)
230 {
231   PetscInt i;
232 
233   PetscFunctionBegin;
234   PetscCall(MatDestroy(&reuse->F));
235   PetscCall(VecDestroy(&reuse->sol));
236   PetscCall(VecDestroy(&reuse->rhs));
237   PetscCall(PCDestroy(&reuse->interior_solver));
238   PetscCall(PCDestroy(&reuse->correction_solver));
239   PetscCall(ISDestroy(&reuse->is_R));
240   PetscCall(ISDestroy(&reuse->is_B));
241   PetscCall(VecScatterDestroy(&reuse->correction_scatter_B));
242   PetscCall(VecDestroy(&reuse->sol_B));
243   PetscCall(VecDestroy(&reuse->rhs_B));
244   for (i = 0; i < reuse->benign_n; i++) PetscCall(ISDestroy(&reuse->benign_zerodiag_subs[i]));
245   PetscCall(PetscFree(reuse->benign_zerodiag_subs));
246   PetscCall(PetscFree(reuse->benign_save_vals));
247   PetscCall(MatDestroy(&reuse->benign_csAIB));
248   PetscCall(MatDestroy(&reuse->benign_AIIm1ones));
249   PetscCall(VecDestroy(&reuse->benign_corr_work));
250   PetscCall(VecDestroy(&reuse->benign_dummy_schur_vec));
251   PetscFunctionReturn(PETSC_SUCCESS);
252 }
253 
254 static PetscErrorCode PCBDDCReuseSolvers_Destroy(PC pc)
255 {
256   PCBDDCReuseSolvers ctx;
257 
258   PetscFunctionBegin;
259   PetscCall(PCShellGetContext(pc, &ctx));
260   PetscCall(PCBDDCReuseSolversReset(ctx));
261   PetscCall(PetscFree(ctx));
262   PetscCall(PCShellSetContext(pc, NULL));
263   PetscFunctionReturn(PETSC_SUCCESS);
264 }
265 
266 static PetscErrorCode PCBDDCComputeExplicitSchur(Mat M, PetscBool issym, MatReuse reuse, Mat *S)
267 {
268   Mat         B, C, D, Bd, Cd, AinvBd;
269   KSP         ksp;
270   PC          pc;
271   PetscBool   isLU, isILU, isCHOL, Bdense, Cdense;
272   PetscReal   fill = 2.0;
273   PetscInt    n_I;
274   PetscMPIInt size;
275 
276   PetscFunctionBegin;
277   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)M), &size));
278   PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not for parallel matrices");
279   if (reuse == MAT_REUSE_MATRIX) {
280     PetscBool Sdense;
281 
282     PetscCall(PetscObjectTypeCompare((PetscObject)*S, MATSEQDENSE, &Sdense));
283     PetscCheck(Sdense, PetscObjectComm((PetscObject)M), PETSC_ERR_SUP, "S should dense");
284   }
285   PetscCall(MatSchurComplementGetSubMatrices(M, NULL, NULL, &B, &C, &D));
286   PetscCall(MatSchurComplementGetKSP(M, &ksp));
287   PetscCall(KSPGetPC(ksp, &pc));
288   PetscCall(PetscObjectTypeCompare((PetscObject)pc, PCLU, &isLU));
289   PetscCall(PetscObjectTypeCompare((PetscObject)pc, PCILU, &isILU));
290   PetscCall(PetscObjectTypeCompare((PetscObject)pc, PCCHOLESKY, &isCHOL));
291   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &Bdense));
292   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &Cdense));
293   PetscCall(MatGetSize(B, &n_I, NULL));
294   if (n_I) {
295     if (!Bdense) {
296       PetscCall(MatConvert(B, MATSEQDENSE, MAT_INITIAL_MATRIX, &Bd));
297     } else {
298       Bd = B;
299     }
300 
301     if (isLU || isILU || isCHOL) {
302       Mat fact;
303       PetscCall(KSPSetUp(ksp));
304       PetscCall(PCFactorGetMatrix(pc, &fact));
305       PetscCall(MatDuplicate(Bd, MAT_DO_NOT_COPY_VALUES, &AinvBd));
306       PetscCall(MatMatSolve(fact, Bd, AinvBd));
307     } else {
308       PetscBool ex = PETSC_TRUE;
309 
310       if (ex) {
311         Mat Ainvd;
312 
313         PetscCall(PCComputeOperator(pc, MATDENSE, &Ainvd));
314         PetscCall(MatMatMult(Ainvd, Bd, MAT_INITIAL_MATRIX, fill, &AinvBd));
315         PetscCall(MatDestroy(&Ainvd));
316       } else {
317         Vec          sol, rhs;
318         PetscScalar *arrayrhs, *arraysol;
319         PetscInt     i, nrhs, n;
320 
321         PetscCall(MatDuplicate(Bd, MAT_DO_NOT_COPY_VALUES, &AinvBd));
322         PetscCall(MatGetSize(Bd, &n, &nrhs));
323         PetscCall(MatDenseGetArray(Bd, &arrayrhs));
324         PetscCall(MatDenseGetArray(AinvBd, &arraysol));
325         PetscCall(KSPGetSolution(ksp, &sol));
326         PetscCall(KSPGetRhs(ksp, &rhs));
327         for (i = 0; i < nrhs; i++) {
328           PetscCall(VecPlaceArray(rhs, arrayrhs + i * n));
329           PetscCall(VecPlaceArray(sol, arraysol + i * n));
330           PetscCall(KSPSolve(ksp, rhs, sol));
331           PetscCall(VecResetArray(rhs));
332           PetscCall(VecResetArray(sol));
333         }
334         PetscCall(MatDenseRestoreArray(Bd, &arrayrhs));
335         PetscCall(MatDenseRestoreArray(AinvBd, &arrayrhs));
336       }
337     }
338     if (!Bdense & !issym) PetscCall(MatDestroy(&Bd));
339 
340     if (!issym) {
341       if (!Cdense) {
342         PetscCall(MatConvert(C, MATSEQDENSE, MAT_INITIAL_MATRIX, &Cd));
343       } else {
344         Cd = C;
345       }
346       PetscCall(MatMatMult(Cd, AinvBd, reuse, fill, S));
347       if (!Cdense) PetscCall(MatDestroy(&Cd));
348     } else {
349       PetscCall(MatTransposeMatMult(Bd, AinvBd, reuse, fill, S));
350       if (!Bdense) PetscCall(MatDestroy(&Bd));
351     }
352     PetscCall(MatDestroy(&AinvBd));
353   }
354 
355   if (D) {
356     Mat       Dd;
357     PetscBool Ddense;
358 
359     PetscCall(PetscObjectTypeCompare((PetscObject)D, MATSEQDENSE, &Ddense));
360     if (!Ddense) {
361       PetscCall(MatConvert(D, MATSEQDENSE, MAT_INITIAL_MATRIX, &Dd));
362     } else {
363       Dd = D;
364     }
365     if (n_I) {
366       PetscCall(MatAYPX(*S, -1.0, Dd, SAME_NONZERO_PATTERN));
367     } else {
368       if (reuse == MAT_INITIAL_MATRIX) {
369         PetscCall(MatDuplicate(Dd, MAT_COPY_VALUES, S));
370       } else {
371         PetscCall(MatCopy(Dd, *S, SAME_NONZERO_PATTERN));
372       }
373     }
374     if (!Ddense) PetscCall(MatDestroy(&Dd));
375   } else {
376     PetscCall(MatScale(*S, -1.0));
377   }
378   PetscFunctionReturn(PETSC_SUCCESS);
379 }
380 
381 PetscErrorCode PCBDDCSubSchursSetUp(PCBDDCSubSchurs sub_schurs, Mat Ain, Mat Sin, PetscBool exact_schur, PetscInt xadj[], PetscInt adjncy[], PetscInt nlayers, Vec scaling, PetscBool compute_Stilda, PetscBool reuse_solvers, PetscBool benign_trick, PetscInt benign_n, PetscInt benign_p0_lidx[], IS benign_zerodiag_subs[], Mat change, IS change_primal)
382 {
383   Mat          F, A_II, A_IB, A_BI, A_BB, AE_II;
384   Mat          S_all;
385   Vec          gstash, lstash;
386   VecScatter   sstash;
387   IS           is_I, is_I_layer;
388   IS           all_subsets, all_subsets_mult, all_subsets_n;
389   PetscScalar *stasharray, *Bwork;
390   PetscInt    *all_local_idx_N, *all_local_subid_N = NULL;
391   PetscInt    *auxnum1, *auxnum2;
392   PetscInt    *local_subs = sub_schurs->graph->local_subs;
393   PetscInt     i, subset_size, max_subset_size, n_local_subs = sub_schurs->graph->n_local_subs;
394   PetscInt     n_B, extra, local_size, global_size;
395   PetscInt     local_stash_size;
396   PetscBLASInt B_N, B_ierr, B_lwork, *pivots;
397   MPI_Comm     comm_n;
398   PetscBool    deluxe   = PETSC_TRUE;
399   PetscBool    use_potr = PETSC_FALSE, use_sytr = PETSC_FALSE;
400   PetscViewer  matl_dbg_viewer = NULL;
401   PetscBool    flg, multi_element = sub_schurs->graph->multi_element;
402 
403   PetscFunctionBegin;
404   PetscCall(MatDestroy(&sub_schurs->A));
405   PetscCall(MatDestroy(&sub_schurs->S));
406   if (Ain) {
407     PetscCall(PetscObjectReference((PetscObject)Ain));
408     sub_schurs->A = Ain;
409   }
410 
411   PetscCall(PetscObjectReference((PetscObject)Sin));
412   sub_schurs->S = Sin;
413   if (sub_schurs->schur_explicit) sub_schurs->schur_explicit = (PetscBool)(!!sub_schurs->A);
414 
415   /* preliminary checks */
416   PetscCheck(sub_schurs->schur_explicit || !compute_Stilda, PetscObjectComm((PetscObject)sub_schurs->l2gmap), PETSC_ERR_SUP, "Adaptive selection of constraints requires MUMPS and/or MKL_PARDISO");
417 
418   if (benign_trick) sub_schurs->is_posdef = PETSC_FALSE;
419 
420   /* debug (MATLAB) */
421   if (sub_schurs->debug) {
422     PetscMPIInt size, rank;
423     PetscInt    nr, *print_schurs_ranks, print_schurs = PETSC_FALSE;
424 
425     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &size));
426     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &rank));
427     nr = size;
428     PetscCall(PetscMalloc1(nr, &print_schurs_ranks));
429     PetscOptionsBegin(PetscObjectComm((PetscObject)sub_schurs->l2gmap), sub_schurs->prefix, "BDDC sub_schurs options", "PC");
430     PetscCall(PetscOptionsIntArray("-sub_schurs_debug_ranks", "Ranks to debug (all if the option is not used)", NULL, print_schurs_ranks, &nr, &flg));
431     if (!flg) print_schurs = PETSC_TRUE;
432     else {
433       print_schurs = PETSC_FALSE;
434       for (i = 0; i < nr; i++)
435         if (print_schurs_ranks[i] == rank) {
436           print_schurs = PETSC_TRUE;
437           break;
438         }
439     }
440     PetscOptionsEnd();
441     PetscCall(PetscFree(print_schurs_ranks));
442     if (print_schurs) {
443       char filename[256];
444 
445       PetscCall(PetscSNPrintf(filename, sizeof(filename), "sub_schurs_Schur_r%d.m", PetscGlobalRank));
446       PetscCall(PetscViewerASCIIOpen(PETSC_COMM_SELF, filename, &matl_dbg_viewer));
447       PetscCall(PetscViewerPushFormat(matl_dbg_viewer, PETSC_VIEWER_ASCII_MATLAB));
448     }
449   }
450 
451   /* DEBUG: turn on/off multi-element code path */
452   PetscCall(PetscOptionsGetBool(NULL, sub_schurs->prefix, "-sub_schurs_multielement_code", &multi_element, NULL));
453   if (n_local_subs == 0) multi_element = PETSC_FALSE;
454 
455   /* restrict work on active processes */
456   if (sub_schurs->restrict_comm) {
457     PetscSubcomm subcomm;
458     PetscMPIInt  color, rank;
459 
460     color = 0;
461     if (!sub_schurs->n_subs) color = 1; /* this can happen if we are in a multilevel case or if the subdomain is disconnected */
462     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &rank));
463     PetscCall(PetscSubcommCreate(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &subcomm));
464     PetscCall(PetscSubcommSetNumber(subcomm, 2));
465     PetscCall(PetscSubcommSetTypeGeneral(subcomm, color, rank));
466     PetscCall(PetscCommDuplicate(PetscSubcommChild(subcomm), &comm_n, NULL));
467     PetscCall(PetscSubcommDestroy(&subcomm));
468     if (!sub_schurs->n_subs) {
469       PetscCall(PetscCommDestroy(&comm_n));
470       PetscFunctionReturn(PETSC_SUCCESS);
471     }
472   } else {
473     PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &comm_n, NULL));
474   }
475 
476   /* get Schur complement matrices */
477   if (!sub_schurs->schur_explicit) {
478     Mat       tA_IB, tA_BI, tA_BB;
479     PetscBool isseqsbaij;
480     PetscCall(MatSchurComplementGetSubMatrices(sub_schurs->S, &A_II, NULL, &tA_IB, &tA_BI, &tA_BB));
481     PetscCall(PetscObjectTypeCompare((PetscObject)tA_BB, MATSEQSBAIJ, &isseqsbaij));
482     if (isseqsbaij) {
483       PetscCall(MatConvert(tA_BB, MATSEQAIJ, MAT_INITIAL_MATRIX, &A_BB));
484       PetscCall(MatConvert(tA_IB, MATSEQAIJ, MAT_INITIAL_MATRIX, &A_IB));
485       PetscCall(MatConvert(tA_BI, MATSEQAIJ, MAT_INITIAL_MATRIX, &A_BI));
486     } else {
487       PetscCall(PetscObjectReference((PetscObject)tA_BB));
488       A_BB = tA_BB;
489       PetscCall(PetscObjectReference((PetscObject)tA_IB));
490       A_IB = tA_IB;
491       PetscCall(PetscObjectReference((PetscObject)tA_BI));
492       A_BI = tA_BI;
493     }
494   } else {
495     A_II = NULL;
496     A_IB = NULL;
497     A_BI = NULL;
498     A_BB = NULL;
499   }
500   S_all = NULL;
501 
502   /* determine interior problems */
503   PetscCall(ISGetLocalSize(sub_schurs->is_I, &i));
504   if (nlayers >= 0 && i) { /* Interior problems can be different from the original one */
505     PetscBT         touched;
506     const PetscInt *idx_B;
507     PetscInt        n_I, n_B, n_local_dofs, n_prev_added, j, layer, *local_numbering;
508 
509     PetscCheck(xadj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot request layering without adjacency");
510     /* get sizes */
511     PetscCall(ISGetLocalSize(sub_schurs->is_I, &n_I));
512     PetscCall(ISGetLocalSize(sub_schurs->is_B, &n_B));
513 
514     PetscCall(PetscMalloc1(n_I + n_B, &local_numbering));
515     PetscCall(PetscBTCreate(n_I + n_B, &touched));
516     PetscCall(PetscBTMemzero(n_I + n_B, touched));
517 
518     /* all boundary dofs must be skipped when adding layers */
519     PetscCall(ISGetIndices(sub_schurs->is_B, &idx_B));
520     for (j = 0; j < n_B; j++) PetscCall(PetscBTSet(touched, idx_B[j]));
521     PetscCall(PetscArraycpy(local_numbering, idx_B, n_B));
522     PetscCall(ISRestoreIndices(sub_schurs->is_B, &idx_B));
523 
524     /* add prescribed number of layers of dofs */
525     n_local_dofs = n_B;
526     n_prev_added = n_B;
527     for (layer = 0; layer < nlayers; layer++) {
528       PetscInt n_added = 0;
529       if (n_local_dofs == n_I + n_B) break;
530       PetscCheck(n_local_dofs <= n_I + n_B, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Error querying layer %" PetscInt_FMT ". Out of bound access (%" PetscInt_FMT " > %" PetscInt_FMT ")", layer, n_local_dofs, n_I + n_B);
531       PetscCall(PCBDDCAdjGetNextLayer_Private(local_numbering + n_local_dofs, n_prev_added, touched, xadj, adjncy, &n_added));
532       n_prev_added = n_added;
533       n_local_dofs += n_added;
534       if (!n_added) break;
535     }
536     PetscCall(PetscBTDestroy(&touched));
537 
538     /* IS for I layer dofs in original numbering */
539     PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)sub_schurs->is_I), n_local_dofs - n_B, local_numbering + n_B, PETSC_COPY_VALUES, &is_I_layer));
540     PetscCall(PetscFree(local_numbering));
541     PetscCall(ISSort(is_I_layer));
542     /* IS for I layer dofs in I numbering */
543     if (!sub_schurs->schur_explicit) {
544       ISLocalToGlobalMapping ItoNmap;
545       PetscCall(ISLocalToGlobalMappingCreateIS(sub_schurs->is_I, &ItoNmap));
546       PetscCall(ISGlobalToLocalMappingApplyIS(ItoNmap, IS_GTOLM_DROP, is_I_layer, &is_I));
547       PetscCall(ISLocalToGlobalMappingDestroy(&ItoNmap));
548 
549       /* II block */
550       PetscCall(MatCreateSubMatrix(A_II, is_I, is_I, MAT_INITIAL_MATRIX, &AE_II));
551     }
552   } else {
553     PetscInt n_I;
554 
555     /* IS for I dofs in original numbering */
556     PetscCall(PetscObjectReference((PetscObject)sub_schurs->is_I));
557     is_I_layer = sub_schurs->is_I;
558 
559     /* IS for I dofs in I numbering (strided 1) */
560     if (!sub_schurs->schur_explicit) {
561       PetscCall(ISGetSize(sub_schurs->is_I, &n_I));
562       PetscCall(ISCreateStride(PetscObjectComm((PetscObject)sub_schurs->is_I), n_I, 0, 1, &is_I));
563 
564       /* II block is the same */
565       PetscCall(PetscObjectReference((PetscObject)A_II));
566       AE_II = A_II;
567     }
568   }
569 
570   /* Get info on subset sizes and sum of all subsets sizes */
571   max_subset_size = 0;
572   local_size      = 0;
573   for (i = 0; i < sub_schurs->n_subs; i++) {
574     PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
575     max_subset_size = PetscMax(subset_size, max_subset_size);
576     local_size += subset_size;
577   }
578 
579   /* Work arrays for local indices */
580   extra = 0;
581   PetscCall(ISGetLocalSize(sub_schurs->is_B, &n_B));
582   if (sub_schurs->schur_explicit && is_I_layer) PetscCall(ISGetLocalSize(is_I_layer, &extra));
583   PetscCall(PetscMalloc1(n_B + extra, &all_local_idx_N));
584   if (multi_element) PetscCall(PetscMalloc1(n_B + extra, &all_local_subid_N));
585   if (extra) {
586     const PetscInt *idxs;
587     PetscCall(ISGetIndices(is_I_layer, &idxs));
588     PetscCall(PetscArraycpy(all_local_idx_N, idxs, extra));
589     if (multi_element)
590       for (PetscInt j = 0; j < extra; j++) all_local_subid_N[j] = local_subs[idxs[j]];
591     PetscCall(ISRestoreIndices(is_I_layer, &idxs));
592   }
593   PetscCall(PetscMalloc1(sub_schurs->n_subs, &auxnum1));
594   PetscCall(PetscMalloc1(sub_schurs->n_subs, &auxnum2));
595 
596   /* Get local indices in local numbering */
597   local_size       = 0;
598   local_stash_size = 0;
599   for (i = 0; i < sub_schurs->n_subs; i++) {
600     const PetscInt *idxs;
601 
602     PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
603     PetscCall(ISGetIndices(sub_schurs->is_subs[i], &idxs));
604     /* start (smallest in global ordering) and multiplicity */
605     auxnum1[i] = idxs[0];
606     auxnum2[i] = subset_size * subset_size;
607     /* subset indices in local numbering */
608     PetscCall(PetscArraycpy(all_local_idx_N + local_size + extra, idxs, subset_size));
609     if (multi_element)
610       for (PetscInt j = 0; j < subset_size; j++) all_local_subid_N[j + local_size + extra] = local_subs[idxs[j]];
611     PetscCall(ISRestoreIndices(sub_schurs->is_subs[i], &idxs));
612     local_size += subset_size;
613     local_stash_size += subset_size * subset_size;
614   }
615 
616   /* allocate extra workspace needed only for GETRI or SYTRF when inverting the blocks or the entire Schur complement */
617   use_potr = use_sytr = PETSC_FALSE;
618   if (benign_trick || (sub_schurs->is_hermitian && sub_schurs->is_posdef)) {
619     use_potr = PETSC_TRUE;
620   } else if (sub_schurs->is_symmetric) {
621     use_sytr = PETSC_TRUE;
622   }
623   if (local_size && !use_potr && compute_Stilda) {
624     PetscScalar  lwork, dummyscalar = 0.;
625     PetscBLASInt dummyint = 0;
626 
627     B_lwork = -1;
628     PetscCall(PetscBLASIntCast(local_size, &B_N));
629     PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
630     if (use_sytr) {
631       PetscCallBLAS("LAPACKsytrf", LAPACKsytrf_("L", &B_N, &dummyscalar, &B_N, &dummyint, &lwork, &B_lwork, &B_ierr));
632       PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in query to SYTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
633     } else {
634       PetscCallBLAS("LAPACKgetri", LAPACKgetri_(&B_N, &dummyscalar, &B_N, &dummyint, &lwork, &B_lwork, &B_ierr));
635       PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in query to GETRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
636     }
637     PetscCall(PetscFPTrapPop());
638     PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(lwork), &B_lwork));
639     PetscCall(PetscMalloc2(B_lwork, &Bwork, B_N, &pivots));
640   } else {
641     Bwork  = NULL;
642     pivots = NULL;
643   }
644 
645   /* prepare data for summing up properly schurs on subsets */
646   PetscCall(ISCreateGeneral(comm_n, sub_schurs->n_subs, auxnum1, PETSC_OWN_POINTER, &all_subsets_n));
647   PetscCall(ISLocalToGlobalMappingApplyIS(sub_schurs->l2gmap, all_subsets_n, &all_subsets));
648   PetscCall(ISDestroy(&all_subsets_n));
649   PetscCall(ISCreateGeneral(comm_n, sub_schurs->n_subs, auxnum2, PETSC_OWN_POINTER, &all_subsets_mult));
650   PetscCall(ISRenumber(all_subsets, all_subsets_mult, &global_size, &all_subsets_n));
651   PetscCall(ISDestroy(&all_subsets));
652   PetscCall(ISDestroy(&all_subsets_mult));
653   PetscCall(ISGetLocalSize(all_subsets_n, &i));
654   PetscCheck(i == local_stash_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Invalid size of new subset! %" PetscInt_FMT " != %" PetscInt_FMT, i, local_stash_size);
655   PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, local_stash_size, NULL, &lstash));
656   PetscCall(VecCreateMPI(comm_n, PETSC_DECIDE, global_size, &gstash));
657   PetscCall(VecScatterCreate(lstash, NULL, gstash, all_subsets_n, &sstash));
658   PetscCall(ISDestroy(&all_subsets_n));
659 
660   /* subset indices in local boundary numbering */
661   if (!sub_schurs->is_Ej_all) {
662     PetscInt *all_local_idx_B;
663 
664     PetscCall(PetscMalloc1(local_size, &all_local_idx_B));
665     PetscCall(ISGlobalToLocalMappingApply(sub_schurs->BtoNmap, IS_GTOLM_DROP, local_size, all_local_idx_N + extra, &subset_size, all_local_idx_B));
666     PetscCheck(subset_size == local_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Error in sub_schurs serial (BtoNmap)! %" PetscInt_FMT " != %" PetscInt_FMT, subset_size, local_size);
667     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, local_size, all_local_idx_B, PETSC_OWN_POINTER, &sub_schurs->is_Ej_all));
668   }
669 
670   if (change) {
671     ISLocalToGlobalMapping BtoS;
672     IS                     change_primal_B;
673     IS                     change_primal_all;
674 
675     PetscCheck(!sub_schurs->change_primal_sub, PETSC_COMM_SELF, PETSC_ERR_PLIB, "This should not happen");
676     PetscCheck(!sub_schurs->change, PETSC_COMM_SELF, PETSC_ERR_PLIB, "This should not happen");
677     PetscCall(PetscMalloc1(sub_schurs->n_subs, &sub_schurs->change_primal_sub));
678     for (i = 0; i < sub_schurs->n_subs; i++) {
679       ISLocalToGlobalMapping NtoS;
680       PetscCall(ISLocalToGlobalMappingCreateIS(sub_schurs->is_subs[i], &NtoS));
681       PetscCall(ISGlobalToLocalMappingApplyIS(NtoS, IS_GTOLM_DROP, change_primal, &sub_schurs->change_primal_sub[i]));
682       PetscCall(ISLocalToGlobalMappingDestroy(&NtoS));
683     }
684     PetscCall(ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, change_primal, &change_primal_B));
685     PetscCall(ISLocalToGlobalMappingCreateIS(sub_schurs->is_Ej_all, &BtoS));
686     PetscCall(ISGlobalToLocalMappingApplyIS(BtoS, IS_GTOLM_DROP, change_primal_B, &change_primal_all));
687     PetscCall(ISLocalToGlobalMappingDestroy(&BtoS));
688     PetscCall(ISDestroy(&change_primal_B));
689     PetscCall(PetscMalloc1(sub_schurs->n_subs, &sub_schurs->change));
690     for (i = 0; i < sub_schurs->n_subs; i++) {
691       Mat change_sub;
692 
693       PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
694       PetscCall(KSPCreate(PETSC_COMM_SELF, &sub_schurs->change[i]));
695       PetscCall(KSPSetNestLevel(sub_schurs->change[i], 1)); /* do not seem to have direct access to a PC from which to get the level of nests */
696       PetscCall(KSPSetType(sub_schurs->change[i], KSPPREONLY));
697       if (!sub_schurs->change_with_qr) {
698         PetscCall(MatCreateSubMatrix(change, sub_schurs->is_subs[i], sub_schurs->is_subs[i], MAT_INITIAL_MATRIX, &change_sub));
699       } else {
700         Mat change_subt;
701         PetscCall(MatCreateSubMatrix(change, sub_schurs->is_subs[i], sub_schurs->is_subs[i], MAT_INITIAL_MATRIX, &change_subt));
702         PetscCall(MatConvert(change_subt, MATSEQDENSE, MAT_INITIAL_MATRIX, &change_sub));
703         PetscCall(MatDestroy(&change_subt));
704       }
705       PetscCall(KSPSetOperators(sub_schurs->change[i], change_sub, change_sub));
706       PetscCall(MatDestroy(&change_sub));
707       PetscCall(KSPSetOptionsPrefix(sub_schurs->change[i], sub_schurs->prefix));
708       PetscCall(KSPAppendOptionsPrefix(sub_schurs->change[i], "sub_schurs_change_"));
709     }
710     PetscCall(ISDestroy(&change_primal_all));
711   }
712 
713   /* Local matrix of all local Schur on subsets (transposed) */
714   if (!sub_schurs->S_Ej_all) {
715     Mat          T;
716     PetscScalar *v;
717     PetscInt    *ii, *jj;
718     PetscInt     cum, i, j, k;
719 
720     /* MatSeqAIJSetPreallocation + MatSetValues is slow for these kind of matrices (may have large blocks)
721        Allocate properly a representative matrix and duplicate */
722     PetscCall(PetscMalloc3(local_size + 1, &ii, local_stash_size, &jj, local_stash_size, &v));
723     ii[0] = 0;
724     cum   = 0;
725     for (i = 0; i < sub_schurs->n_subs; i++) {
726       PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
727       for (j = 0; j < subset_size; j++) {
728         const PetscInt row = cum + j;
729         PetscInt       col = cum;
730 
731         ii[row + 1] = ii[row] + subset_size;
732         for (k = ii[row]; k < ii[row + 1]; k++) {
733           jj[k] = col;
734           col++;
735         }
736       }
737       cum += subset_size;
738     }
739     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, local_size, local_size, ii, jj, v, &T));
740     PetscCall(MatDuplicate(T, MAT_DO_NOT_COPY_VALUES, &sub_schurs->S_Ej_all));
741     PetscCall(MatDestroy(&T));
742     PetscCall(PetscFree3(ii, jj, v));
743   }
744   /* matrices for deluxe scaling and adaptive selection */
745   if (compute_Stilda) {
746     if (!sub_schurs->sum_S_Ej_tilda_all) PetscCall(MatDuplicate(sub_schurs->S_Ej_all, MAT_DO_NOT_COPY_VALUES, &sub_schurs->sum_S_Ej_tilda_all));
747     if (!sub_schurs->sum_S_Ej_inv_all && deluxe) PetscCall(MatDuplicate(sub_schurs->S_Ej_all, MAT_DO_NOT_COPY_VALUES, &sub_schurs->sum_S_Ej_inv_all));
748   }
749 
750   /* Compute Schur complements explicitly */
751   F = NULL;
752   if (!sub_schurs->schur_explicit) {
753     /* this code branch is used when MatFactor with Schur complement support is not present or when explicitly requested;
754        it is not efficient, unless the economic version of the scaling is used */
755     Mat          S_Ej_expl;
756     PetscScalar *work;
757     PetscInt     j, *dummy_idx;
758     PetscBool    Sdense;
759 
760     PetscCall(PetscMalloc2(max_subset_size, &dummy_idx, max_subset_size * max_subset_size, &work));
761     local_size = 0;
762     for (i = 0; i < sub_schurs->n_subs; i++) {
763       IS  is_subset_B;
764       Mat AE_EE, AE_IE, AE_EI, S_Ej;
765 
766       /* subsets in original and boundary numbering */
767       PetscCall(ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, sub_schurs->is_subs[i], &is_subset_B));
768       /* EE block */
769       PetscCall(MatCreateSubMatrix(A_BB, is_subset_B, is_subset_B, MAT_INITIAL_MATRIX, &AE_EE));
770       /* IE block */
771       PetscCall(MatCreateSubMatrix(A_IB, is_I, is_subset_B, MAT_INITIAL_MATRIX, &AE_IE));
772       /* EI block */
773       if (sub_schurs->is_symmetric) {
774         PetscCall(MatCreateTranspose(AE_IE, &AE_EI));
775       } else if (sub_schurs->is_hermitian) {
776         PetscCall(MatCreateHermitianTranspose(AE_IE, &AE_EI));
777       } else {
778         PetscCall(MatCreateSubMatrix(A_BI, is_subset_B, is_I, MAT_INITIAL_MATRIX, &AE_EI));
779       }
780       PetscCall(ISDestroy(&is_subset_B));
781       PetscCall(MatCreateSchurComplement(AE_II, AE_II, AE_IE, AE_EI, AE_EE, &S_Ej));
782       PetscCall(MatDestroy(&AE_EE));
783       PetscCall(MatDestroy(&AE_IE));
784       PetscCall(MatDestroy(&AE_EI));
785       if (AE_II == A_II) { /* we can reuse the same ksp */
786         KSP ksp;
787         PetscCall(MatSchurComplementGetKSP(sub_schurs->S, &ksp));
788         PetscCall(MatSchurComplementSetKSP(S_Ej, ksp));
789       } else { /* build new ksp object which inherits ksp and pc types from the original one */
790         KSP       origksp, schurksp;
791         PC        origpc, schurpc;
792         KSPType   ksp_type;
793         PetscInt  n_internal;
794         PetscBool ispcnone;
795 
796         PetscCall(MatSchurComplementGetKSP(sub_schurs->S, &origksp));
797         PetscCall(MatSchurComplementGetKSP(S_Ej, &schurksp));
798         PetscCall(KSPGetType(origksp, &ksp_type));
799         PetscCall(KSPSetType(schurksp, ksp_type));
800         PetscCall(KSPGetPC(schurksp, &schurpc));
801         PetscCall(KSPGetPC(origksp, &origpc));
802         PetscCall(PetscObjectTypeCompare((PetscObject)origpc, PCNONE, &ispcnone));
803         if (!ispcnone) {
804           PCType pc_type;
805           PetscCall(PCGetType(origpc, &pc_type));
806           PetscCall(PCSetType(schurpc, pc_type));
807         } else {
808           PetscCall(PCSetType(schurpc, PCLU));
809         }
810         PetscCall(ISGetSize(is_I, &n_internal));
811         if (!n_internal) { /* UMFPACK gives error with 0 sized problems */
812           MatSolverType solver = NULL;
813           PetscCall(PCFactorGetMatSolverType(origpc, &solver));
814           if (solver) PetscCall(PCFactorSetMatSolverType(schurpc, solver));
815         }
816         PetscCall(KSPSetUp(schurksp));
817       }
818       PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
819       PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &S_Ej_expl));
820       PetscCall(PCBDDCComputeExplicitSchur(S_Ej, sub_schurs->is_symmetric, MAT_REUSE_MATRIX, &S_Ej_expl));
821       PetscCall(PetscObjectTypeCompare((PetscObject)S_Ej_expl, MATSEQDENSE, &Sdense));
822       PetscCheck(Sdense, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented for sparse matrices");
823       for (j = 0; j < subset_size; j++) dummy_idx[j] = local_size + j;
824       PetscCall(MatSetValues(sub_schurs->S_Ej_all, subset_size, dummy_idx, subset_size, dummy_idx, work, INSERT_VALUES));
825       PetscCall(MatDestroy(&S_Ej));
826       PetscCall(MatDestroy(&S_Ej_expl));
827       local_size += subset_size;
828     }
829     PetscCall(PetscFree2(dummy_idx, work));
830     /* free */
831     PetscCall(ISDestroy(&is_I));
832     PetscCall(MatDestroy(&AE_II));
833     PetscCall(PetscFree(all_local_idx_N));
834   } else {
835     Mat                A, cs_AIB_mat = NULL, benign_AIIm1_ones_mat = NULL;
836     Mat               *gdswA;
837     Vec                Dall = NULL;
838     IS                 is_A_all, *is_p_r = NULL, is_schur;
839     MatType            Stype;
840     PetscScalar       *work, *S_data, *schur_factor, infty = PETSC_MAX_REAL;
841     PetscScalar       *SEj_arr = NULL, *SEjinv_arr = NULL;
842     const PetscScalar *rS_data;
843     PetscInt           n, n_I, size_schur, size_active_schur, cum, cum2;
844     PetscBool          economic, solver_S, S_lower_triangular = PETSC_FALSE;
845     PetscBool          schur_has_vertices, factor_workaround;
846     PetscBool          use_cholesky;
847 #if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
848     PetscBool oldpin;
849 #endif
850     /* multi-element */
851     IS *is_sub_all = NULL, *is_sub_schur_all = NULL, *is_sub_schur = NULL;
852 
853     /* get sizes */
854     n_I = 0;
855     if (is_I_layer) PetscCall(ISGetLocalSize(is_I_layer, &n_I));
856     economic = PETSC_FALSE;
857     PetscCall(ISGetLocalSize(sub_schurs->is_I, &cum));
858     if (cum != n_I) economic = PETSC_TRUE;
859     PetscCall(MatGetLocalSize(sub_schurs->A, &n, NULL));
860     size_active_schur = local_size;
861 
862     /* import scaling vector (wrong formulation if we have 3D edges) */
863     if (scaling && compute_Stilda) {
864       const PetscScalar *array;
865       PetscScalar       *array2;
866       const PetscInt    *idxs;
867       PetscInt           i;
868 
869       PetscCall(ISGetIndices(sub_schurs->is_Ej_all, &idxs));
870       PetscCall(VecCreateSeq(PETSC_COMM_SELF, size_active_schur, &Dall));
871       PetscCall(VecGetArrayRead(scaling, &array));
872       PetscCall(VecGetArray(Dall, &array2));
873       for (i = 0; i < size_active_schur; i++) array2[i] = array[idxs[i]];
874       PetscCall(VecRestoreArray(Dall, &array2));
875       PetscCall(VecRestoreArrayRead(scaling, &array));
876       PetscCall(ISRestoreIndices(sub_schurs->is_Ej_all, &idxs));
877       deluxe = PETSC_FALSE;
878     }
879 
880     /* size active schurs does not count any dirichlet or vertex dof on the interface */
881     factor_workaround  = PETSC_FALSE;
882     schur_has_vertices = PETSC_FALSE;
883     cum                = n_I + size_active_schur;
884     if (sub_schurs->is_dir) {
885       const PetscInt *idxs;
886       PetscInt        n_dir;
887 
888       PetscCall(ISGetLocalSize(sub_schurs->is_dir, &n_dir));
889       PetscCall(ISGetIndices(sub_schurs->is_dir, &idxs));
890       PetscCall(PetscArraycpy(all_local_idx_N + cum, idxs, n_dir));
891       if (multi_element)
892         for (PetscInt j = 0; j < n_dir; j++) all_local_subid_N[j + cum] = local_subs[idxs[j]];
893       PetscCall(ISRestoreIndices(sub_schurs->is_dir, &idxs));
894       cum += n_dir;
895       if (!sub_schurs->gdsw) factor_workaround = PETSC_TRUE;
896     }
897     /* include the primal vertices in the Schur complement */
898     if (exact_schur && sub_schurs->is_vertices && (compute_Stilda || benign_n)) {
899       PetscInt n_v;
900 
901       PetscCall(ISGetLocalSize(sub_schurs->is_vertices, &n_v));
902       if (n_v) {
903         const PetscInt *idxs;
904 
905         PetscCall(ISGetIndices(sub_schurs->is_vertices, &idxs));
906         PetscCall(PetscArraycpy(all_local_idx_N + cum, idxs, n_v));
907         if (multi_element)
908           for (PetscInt j = 0; j < n_v; j++) all_local_subid_N[j + cum] = local_subs[idxs[j]];
909         PetscCall(ISRestoreIndices(sub_schurs->is_vertices, &idxs));
910         cum += n_v;
911         if (!sub_schurs->gdsw) factor_workaround = PETSC_TRUE;
912         schur_has_vertices = PETSC_TRUE;
913       }
914     }
915     size_schur = cum - n_I;
916     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, cum, all_local_idx_N, PETSC_OWN_POINTER, &is_A_all));
917 #if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
918     oldpin = sub_schurs->A->boundtocpu;
919     PetscCall(MatBindToCPU(sub_schurs->A, PETSC_TRUE));
920 #endif
921     if (cum == n) {
922       PetscCall(ISSetPermutation(is_A_all));
923       PetscCall(MatPermute(sub_schurs->A, is_A_all, is_A_all, &A));
924     } else {
925       PetscCall(MatCreateSubMatrix(sub_schurs->A, is_A_all, is_A_all, MAT_INITIAL_MATRIX, &A));
926     }
927 #if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
928     PetscCall(MatBindToCPU(sub_schurs->A, oldpin));
929 #endif
930     PetscCall(MatSetOptionsPrefixFactor(A, sub_schurs->prefix));
931     PetscCall(MatAppendOptionsPrefixFactor(A, "sub_schurs_"));
932     /* subsets ordered last */
933     PetscCall(ISCreateStride(PETSC_COMM_SELF, size_schur, n_I, 1, &is_schur));
934 
935     if (multi_element) {
936       PetscInt *idx_sub;
937 
938       PetscCall(PetscMalloc3(n_local_subs, &is_sub_all, n_local_subs, &is_sub_schur_all, n_local_subs, &is_sub_schur));
939       PetscCall(PetscMalloc1(n + size_schur, &idx_sub));
940       for (PetscInt sub = 0; sub < n_local_subs; sub++) {
941         PetscInt size_sub = 0, size_schur_sub = 0, size_I_sub;
942 
943         for (PetscInt j = 0; j < n_I; j++)
944           if (all_local_subid_N[j] == sub) idx_sub[size_sub++] = j;
945         size_I_sub = size_sub;
946         for (PetscInt j = n_I; j < n_I + size_schur; j++)
947           if (all_local_subid_N[j] == sub) {
948             idx_sub[size_sub++]           = j;
949             idx_sub[n + size_schur_sub++] = j - n_I;
950           }
951 
952         PetscCall(ISCreateGeneral(PETSC_COMM_SELF, size_sub, idx_sub, PETSC_COPY_VALUES, &is_sub_all[sub]));
953         PetscCall(ISCreateGeneral(PETSC_COMM_SELF, size_schur_sub, idx_sub + n, PETSC_COPY_VALUES, &is_sub_schur[sub]));
954         PetscCall(ISCreateStride(PETSC_COMM_SELF, size_schur_sub, size_I_sub, 1, &is_sub_schur_all[sub]));
955       }
956       PetscCall(PetscFree(idx_sub));
957     }
958 
959     /* if we actually change the basis for the pressures, LDL^T factors will use a lot of memory
960        this is a workaround */
961     if (benign_n) {
962       Vec                    v, benign_AIIm1_ones;
963       ISLocalToGlobalMapping N_to_reor;
964       IS                     is_p0, is_p0_p;
965       PetscScalar           *cs_AIB, *AIIm1_data;
966       PetscInt               sizeA;
967 
968       PetscCall(ISLocalToGlobalMappingCreateIS(is_A_all, &N_to_reor));
969       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, benign_n, benign_p0_lidx, PETSC_COPY_VALUES, &is_p0));
970       PetscCall(ISGlobalToLocalMappingApplyIS(N_to_reor, IS_GTOLM_DROP, is_p0, &is_p0_p));
971       PetscCall(ISDestroy(&is_p0));
972       PetscCall(MatCreateVecs(A, &v, &benign_AIIm1_ones));
973       PetscCall(VecGetSize(v, &sizeA));
974       PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, sizeA, benign_n, NULL, &benign_AIIm1_ones_mat));
975       PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, size_schur, benign_n, NULL, &cs_AIB_mat));
976       PetscCall(MatDenseGetArray(cs_AIB_mat, &cs_AIB));
977       PetscCall(MatDenseGetArray(benign_AIIm1_ones_mat, &AIIm1_data));
978       PetscCall(PetscMalloc1(benign_n, &is_p_r));
979       /* compute colsum of A_IB restricted to pressures */
980       for (i = 0; i < benign_n; i++) {
981         const PetscScalar *array;
982         const PetscInt    *idxs;
983         PetscInt           j, nz;
984 
985         PetscCall(ISGlobalToLocalMappingApplyIS(N_to_reor, IS_GTOLM_DROP, benign_zerodiag_subs[i], &is_p_r[i]));
986         PetscCall(ISGetLocalSize(is_p_r[i], &nz));
987         PetscCall(ISGetIndices(is_p_r[i], &idxs));
988         for (j = 0; j < nz; j++) AIIm1_data[idxs[j] + sizeA * i] = 1.;
989         PetscCall(ISRestoreIndices(is_p_r[i], &idxs));
990         PetscCall(VecPlaceArray(benign_AIIm1_ones, AIIm1_data + sizeA * i));
991         PetscCall(MatMult(A, benign_AIIm1_ones, v));
992         PetscCall(VecResetArray(benign_AIIm1_ones));
993         PetscCall(VecGetArrayRead(v, &array));
994         for (j = 0; j < size_schur; j++) {
995 #if defined(PETSC_USE_COMPLEX)
996           cs_AIB[i * size_schur + j] = (PetscRealPart(array[j + n_I]) / nz + PETSC_i * (PetscImaginaryPart(array[j + n_I]) / nz));
997 #else
998           cs_AIB[i * size_schur + j] = array[j + n_I] / nz;
999 #endif
1000         }
1001         PetscCall(VecRestoreArrayRead(v, &array));
1002       }
1003       PetscCall(MatDenseRestoreArray(cs_AIB_mat, &cs_AIB));
1004       PetscCall(MatDenseRestoreArray(benign_AIIm1_ones_mat, &AIIm1_data));
1005       PetscCall(VecDestroy(&v));
1006       PetscCall(VecDestroy(&benign_AIIm1_ones));
1007       PetscCall(MatSetOption(A, MAT_KEEP_NONZERO_PATTERN, PETSC_FALSE));
1008       PetscCall(MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_FALSE));
1009       PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
1010       PetscCall(MatZeroRowsColumnsIS(A, is_p0_p, 1.0, NULL, NULL));
1011       PetscCall(ISDestroy(&is_p0_p));
1012       PetscCall(ISLocalToGlobalMappingDestroy(&N_to_reor));
1013     }
1014     PetscCall(MatSetOption(A, MAT_SYMMETRIC, sub_schurs->is_symmetric));
1015     PetscCall(MatSetOption(A, MAT_HERMITIAN, sub_schurs->is_hermitian));
1016     PetscCall(MatSetOption(A, MAT_SPD, sub_schurs->is_posdef));
1017 
1018     /* for complexes, symmetric and hermitian at the same time implies null imaginary part */
1019     use_cholesky = (PetscBool)((use_potr || use_sytr) && sub_schurs->is_hermitian && sub_schurs->is_symmetric);
1020 
1021     /* when using the benign subspace trick, the local Schur complements are SPD */
1022     /* MKL_PARDISO does not handle well the computation of a Schur complement from a symmetric indefinite factorization
1023        Use LU and adapt pivoting perturbation (still, solution is not as accurate as with using MUMPS) */
1024     if (benign_trick) {
1025       sub_schurs->is_posdef = PETSC_TRUE;
1026       PetscCall(PetscStrcmp(sub_schurs->mat_solver_type, MATSOLVERMKL_PARDISO, &flg));
1027       if (flg) use_cholesky = PETSC_FALSE;
1028     }
1029     if (sub_schurs->mat_factor_type == MAT_FACTOR_NONE) sub_schurs->mat_factor_type = use_cholesky ? MAT_FACTOR_CHOLESKY : MAT_FACTOR_LU;
1030 
1031     if (n_I && !multi_element) {
1032       char      stype[64];
1033       PetscBool gpu = PETSC_FALSE;
1034 
1035       PetscCall(MatGetFactor(A, sub_schurs->mat_solver_type, sub_schurs->mat_factor_type, &F));
1036       PetscCheck(F, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatGetFactor not supported by matrix instance of type %s. Rerun with \"-info :mat | grep MatGetFactor_\" for additional information", ((PetscObject)A)->type_name);
1037       PetscCall(MatSetErrorIfFailure(A, PETSC_TRUE));
1038 #if defined(PETSC_HAVE_MKL_PARDISO)
1039       if (benign_trick) PetscCall(MatMkl_PardisoSetCntl(F, 10, 10));
1040 #endif
1041       PetscCall(MatFactorSetSchurIS(F, is_schur));
1042 
1043       /* factorization step */
1044       switch (sub_schurs->mat_factor_type) {
1045       case MAT_FACTOR_CHOLESKY:
1046         PetscCall(MatCholeskyFactorSymbolic(F, A, NULL, NULL));
1047         /* be sure that icntl 19 is not set by command line */
1048         PetscCall(MatMumpsSetIcntl(F, 19, 2));
1049         PetscCall(MatCholeskyFactorNumeric(F, A, NULL));
1050         S_lower_triangular = PETSC_TRUE;
1051         break;
1052       case MAT_FACTOR_LU:
1053         PetscCall(MatLUFactorSymbolic(F, A, NULL, NULL, NULL));
1054         /* be sure that icntl 19 is not set by command line */
1055         PetscCall(MatMumpsSetIcntl(F, 19, 3));
1056         PetscCall(MatLUFactorNumeric(F, A, NULL));
1057         break;
1058       default:
1059         SETERRQ(PetscObjectComm((PetscObject)F), PETSC_ERR_SUP, "Unsupported factor type %s", MatFactorTypes[sub_schurs->mat_factor_type]);
1060       }
1061       PetscCall(MatViewFromOptions(F, (PetscObject)A, "-mat_factor_view"));
1062 
1063       if (matl_dbg_viewer) {
1064         Mat S;
1065         IS  is;
1066 
1067         PetscCall(PetscObjectSetName((PetscObject)A, "A"));
1068         PetscCall(MatView(A, matl_dbg_viewer));
1069         PetscCall(MatFactorCreateSchurComplement(F, &S, NULL));
1070         PetscCall(PetscObjectSetName((PetscObject)S, "S"));
1071         PetscCall(MatView(S, matl_dbg_viewer));
1072         PetscCall(MatDestroy(&S));
1073         PetscCall(ISCreateStride(PETSC_COMM_SELF, n_I, 0, 1, &is));
1074         PetscCall(PetscObjectSetName((PetscObject)is, "I"));
1075         PetscCall(ISView(is, matl_dbg_viewer));
1076         PetscCall(ISDestroy(&is));
1077         PetscCall(ISCreateStride(PETSC_COMM_SELF, size_schur, n_I, 1, &is));
1078         PetscCall(PetscObjectSetName((PetscObject)is, "B"));
1079         PetscCall(ISView(is, matl_dbg_viewer));
1080         PetscCall(ISDestroy(&is));
1081         PetscCall(PetscObjectSetName((PetscObject)is_A_all, "IA"));
1082         PetscCall(ISView(is_A_all, matl_dbg_viewer));
1083         for (i = 0, cum = 0; i < sub_schurs->n_subs; i++) {
1084           IS   is;
1085           char name[16];
1086 
1087           PetscCall(PetscSNPrintf(name, sizeof(name), "IE%" PetscInt_FMT, i));
1088           PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
1089           PetscCall(ISCreateStride(PETSC_COMM_SELF, subset_size, cum, 1, &is));
1090           PetscCall(PetscObjectSetName((PetscObject)is, name));
1091           PetscCall(ISView(is, matl_dbg_viewer));
1092           PetscCall(ISDestroy(&is));
1093           if (sub_schurs->change) {
1094             Mat T;
1095 
1096             PetscCall(PetscSNPrintf(name, sizeof(name), "TE%" PetscInt_FMT, i));
1097             PetscCall(KSPGetOperators(sub_schurs->change[i], &T, NULL));
1098             PetscCall(PetscObjectSetName((PetscObject)T, name));
1099             PetscCall(MatView(T, matl_dbg_viewer));
1100             PetscCall(PetscSNPrintf(name, sizeof(name), "ITE%" PetscInt_FMT, i));
1101             PetscCall(PetscObjectSetName((PetscObject)sub_schurs->change_primal_sub[i], name));
1102             PetscCall(ISView(sub_schurs->change_primal_sub[i], matl_dbg_viewer));
1103           }
1104           cum += subset_size;
1105         }
1106         PetscCall(PetscViewerFlush(matl_dbg_viewer));
1107       }
1108 
1109       /* get explicit Schur Complement computed during numeric factorization */
1110       PetscCall(MatFactorGetSchurComplement(F, &S_all, NULL));
1111       PetscCall(PetscStrncpy(stype, MATSEQDENSE, sizeof(stype)));
1112 #if defined(PETSC_HAVE_CUDA)
1113       PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &gpu, MATSEQAIJVIENNACL, MATSEQAIJCUSPARSE, ""));
1114 #endif
1115       if (gpu) PetscCall(PetscStrncpy(stype, MATSEQDENSECUDA, sizeof(stype)));
1116       PetscCall(PetscOptionsGetString(NULL, sub_schurs->prefix, "-sub_schurs_schur_mat_type", stype, sizeof(stype), NULL));
1117       PetscCall(MatConvert(S_all, stype, MAT_INPLACE_MATRIX, &S_all));
1118       PetscCall(MatSetOption(S_all, MAT_SPD, sub_schurs->is_posdef));
1119       PetscCall(MatSetOption(S_all, MAT_HERMITIAN, sub_schurs->is_hermitian));
1120       PetscCall(MatGetType(S_all, &Stype));
1121 
1122       /* we can reuse the solvers if we are not using the economic version */
1123       reuse_solvers = (PetscBool)(reuse_solvers && !economic && !sub_schurs->graph->multi_element);
1124       if (!sub_schurs->gdsw) {
1125         factor_workaround = (PetscBool)(reuse_solvers && factor_workaround);
1126         if (!sub_schurs->is_posdef && factor_workaround && compute_Stilda && size_active_schur) reuse_solvers = factor_workaround = PETSC_FALSE;
1127       }
1128       solver_S = PETSC_TRUE;
1129 
1130       /* update the Schur complement with the change of basis on the pressures */
1131       if (benign_n) {
1132         const PetscScalar *cs_AIB;
1133         PetscScalar       *S_data, *AIIm1_data;
1134         Mat                S2 = NULL, S3 = NULL; /* dbg */
1135         PetscScalar       *S2_data, *S3_data;    /* dbg */
1136         Vec                v, benign_AIIm1_ones;
1137         PetscInt           sizeA;
1138 
1139         PetscCall(MatDenseGetArray(S_all, &S_data));
1140         PetscCall(MatCreateVecs(A, &v, &benign_AIIm1_ones));
1141         PetscCall(VecGetSize(v, &sizeA));
1142         PetscCall(MatMumpsSetIcntl(F, 26, 0));
1143 #if defined(PETSC_HAVE_MKL_PARDISO)
1144         PetscCall(MatMkl_PardisoSetCntl(F, 70, 1));
1145 #endif
1146         PetscCall(MatDenseGetArrayRead(cs_AIB_mat, &cs_AIB));
1147         PetscCall(MatDenseGetArray(benign_AIIm1_ones_mat, &AIIm1_data));
1148         if (matl_dbg_viewer) {
1149           PetscCall(MatDuplicate(S_all, MAT_DO_NOT_COPY_VALUES, &S2));
1150           PetscCall(MatDuplicate(S_all, MAT_DO_NOT_COPY_VALUES, &S3));
1151           PetscCall(MatDenseGetArray(S2, &S2_data));
1152           PetscCall(MatDenseGetArray(S3, &S3_data));
1153         }
1154         for (i = 0; i < benign_n; i++) {
1155           PetscScalar    *array, sum = 0., one = 1., *sums;
1156           const PetscInt *idxs;
1157           PetscInt        k, j, nz;
1158           PetscBLASInt    B_k, B_n;
1159 
1160           PetscCall(PetscCalloc1(benign_n, &sums));
1161           PetscCall(VecPlaceArray(benign_AIIm1_ones, AIIm1_data + sizeA * i));
1162           PetscCall(VecCopy(benign_AIIm1_ones, v));
1163           PetscCall(MatSolve(F, v, benign_AIIm1_ones));
1164           PetscCall(MatMult(A, benign_AIIm1_ones, v));
1165           PetscCall(VecResetArray(benign_AIIm1_ones));
1166           /* p0 dofs (eliminated) are excluded from the sums */
1167           for (k = 0; k < benign_n; k++) {
1168             PetscCall(ISGetLocalSize(is_p_r[k], &nz));
1169             PetscCall(ISGetIndices(is_p_r[k], &idxs));
1170             for (j = 0; j < nz - 1; j++) sums[k] -= AIIm1_data[idxs[j] + sizeA * i];
1171             PetscCall(ISRestoreIndices(is_p_r[k], &idxs));
1172           }
1173           PetscCall(VecGetArrayRead(v, (const PetscScalar **)&array));
1174           if (matl_dbg_viewer) {
1175             Vec  vv;
1176             char name[16];
1177 
1178             PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, size_schur, array + n_I, &vv));
1179             PetscCall(PetscSNPrintf(name, sizeof(name), "Pvs%" PetscInt_FMT, i));
1180             PetscCall(PetscObjectSetName((PetscObject)vv, name));
1181             PetscCall(VecView(vv, matl_dbg_viewer));
1182           }
1183           /* perform sparse rank updates on symmetric Schur (TODO: move outside of the loop?) */
1184           /* cs_AIB already scaled by 1./nz */
1185           B_k = 1;
1186           PetscCall(PetscBLASIntCast(size_schur, &B_n));
1187           for (k = 0; k < benign_n; k++) {
1188             sum = sums[k];
1189 
1190             if (PetscAbsScalar(sum) == 0.0) continue;
1191             if (k == i) {
1192               if (B_n) PetscCallBLAS("BLASsyrk", BLASsyrk_("L", "N", &B_n, &B_k, &sum, cs_AIB + i * size_schur, &B_n, &one, S_data, &B_n));
1193               if (matl_dbg_viewer && B_n) PetscCallBLAS("BLASsyrk", BLASsyrk_("L", "N", &B_n, &B_k, &sum, cs_AIB + i * size_schur, &B_n, &one, S3_data, &B_n));
1194             } else { /* XXX Is it correct to use symmetric rank-2 update with half of the sum? */
1195               sum /= 2.0;
1196               if (B_n) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, cs_AIB + k * size_schur, &B_n, cs_AIB + i * size_schur, &B_n, &one, S_data, &B_n));
1197               if (matl_dbg_viewer && B_n) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, cs_AIB + k * size_schur, &B_n, cs_AIB + i * size_schur, &B_n, &one, S3_data, &B_n));
1198             }
1199           }
1200           sum = 1.;
1201           if (B_n) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, array + n_I, &B_n, cs_AIB + i * size_schur, &B_n, &one, S_data, &B_n));
1202           if (matl_dbg_viewer && B_n) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, array + n_I, &B_n, cs_AIB + i * size_schur, &B_n, &one, S2_data, &B_n));
1203           PetscCall(VecRestoreArrayRead(v, (const PetscScalar **)&array));
1204           /* set p0 entry of AIIm1_ones to zero */
1205           PetscCall(ISGetLocalSize(is_p_r[i], &nz));
1206           PetscCall(ISGetIndices(is_p_r[i], &idxs));
1207           for (j = 0; j < benign_n; j++) AIIm1_data[idxs[nz - 1] + sizeA * j] = 0.;
1208           PetscCall(ISRestoreIndices(is_p_r[i], &idxs));
1209           PetscCall(PetscFree(sums));
1210         }
1211         PetscCall(VecDestroy(&benign_AIIm1_ones));
1212         if (matl_dbg_viewer) {
1213           PetscCall(MatDenseRestoreArray(S2, &S2_data));
1214           PetscCall(MatDenseRestoreArray(S3, &S3_data));
1215         }
1216         if (!S_lower_triangular) { /* I need to expand the upper triangular data (column-oriented) */
1217           PetscInt k, j;
1218           for (k = 0; k < size_schur; k++) {
1219             for (j = k; j < size_schur; j++) S_data[j * size_schur + k] = PetscConj(S_data[k * size_schur + j]);
1220           }
1221         }
1222 
1223         /* restore defaults */
1224         PetscCall(MatMumpsSetIcntl(F, 26, -1));
1225 #if defined(PETSC_HAVE_MKL_PARDISO)
1226         PetscCall(MatMkl_PardisoSetCntl(F, 70, 0));
1227 #endif
1228         PetscCall(MatDenseRestoreArrayRead(cs_AIB_mat, &cs_AIB));
1229         PetscCall(MatDenseRestoreArray(benign_AIIm1_ones_mat, &AIIm1_data));
1230         PetscCall(VecDestroy(&v));
1231         PetscCall(MatDenseRestoreArray(S_all, &S_data));
1232         if (matl_dbg_viewer) {
1233           Mat S;
1234 
1235           PetscCall(MatFactorRestoreSchurComplement(F, &S_all, MAT_FACTOR_SCHUR_UNFACTORED));
1236           PetscCall(MatFactorCreateSchurComplement(F, &S, NULL));
1237           PetscCall(PetscObjectSetName((PetscObject)S, "Sb"));
1238           PetscCall(MatView(S, matl_dbg_viewer));
1239           PetscCall(MatDestroy(&S));
1240           PetscCall(PetscObjectSetName((PetscObject)S2, "S2P"));
1241           PetscCall(MatView(S2, matl_dbg_viewer));
1242           PetscCall(PetscObjectSetName((PetscObject)S3, "S3P"));
1243           PetscCall(MatView(S3, matl_dbg_viewer));
1244           PetscCall(PetscObjectSetName((PetscObject)cs_AIB_mat, "cs"));
1245           PetscCall(MatView(cs_AIB_mat, matl_dbg_viewer));
1246           PetscCall(MatFactorGetSchurComplement(F, &S_all, NULL));
1247         }
1248         PetscCall(MatDestroy(&S2));
1249         PetscCall(MatDestroy(&S3));
1250       }
1251       if (!reuse_solvers) {
1252         for (i = 0; i < benign_n; i++) PetscCall(ISDestroy(&is_p_r[i]));
1253         PetscCall(PetscFree(is_p_r));
1254         PetscCall(MatDestroy(&cs_AIB_mat));
1255         PetscCall(MatDestroy(&benign_AIIm1_ones_mat));
1256       }
1257     } else if (multi_element) { /* MUMPS does not support sparse Schur complements. Loop over local subs */
1258       PetscInt       *nnz;
1259       const PetscInt *idxs;
1260       PetscInt        size_schur_sub;
1261 
1262       PetscCall(PetscCalloc1(size_schur, &nnz));
1263       for (PetscInt sub = 0; sub < n_local_subs; sub++) {
1264         PetscCall(ISGetLocalSize(is_sub_schur[sub], &size_schur_sub));
1265         PetscCall(ISGetIndices(is_sub_schur[sub], &idxs));
1266         for (PetscInt j = 0; j < size_schur_sub; j++) nnz[idxs[j]] = size_schur_sub;
1267         PetscCall(ISRestoreIndices(is_sub_schur[sub], &idxs));
1268       }
1269       PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, size_schur, size_schur, 0, nnz, &S_all));
1270       PetscCall(MatSetOption(S_all, MAT_ROW_ORIENTED, sub_schurs->is_hermitian));
1271       PetscCall(PetscFree(nnz));
1272 
1273       for (PetscInt sub = 0; sub < n_local_subs; sub++) {
1274         Mat                Asub, Ssub;
1275         const PetscScalar *vals;
1276         PetscInt           size_all_sub;
1277 
1278         F = NULL;
1279         PetscCall(ISGetLocalSize(is_sub_schur[sub], &size_schur_sub));
1280         PetscCall(ISGetLocalSize(is_sub_all[sub], &size_all_sub));
1281         PetscCall(MatCreateSubMatrix(A, is_sub_all[sub], is_sub_all[sub], MAT_INITIAL_MATRIX, &Asub));
1282         if (size_schur_sub == size_all_sub) {
1283           /* we can't use MatFactor when size_schur == size_of_the_problem */
1284           PetscCall(MatConvert(Asub, MATDENSE, MAT_INITIAL_MATRIX, &Ssub));
1285         } else {
1286           PetscCall(MatGetFactor(Asub, sub_schurs->mat_solver_type, sub_schurs->mat_factor_type, &F));
1287           PetscCheck(F, PetscObjectComm((PetscObject)Asub), PETSC_ERR_SUP, "MatGetFactor not supported by matrix instance of type %s. Rerun with \"-info :mat | grep MatGetFactor_\" for additional information", ((PetscObject)Asub)->type_name);
1288           PetscCall(MatSetErrorIfFailure(Asub, PETSC_TRUE));
1289 #if defined(PETSC_HAVE_MKL_PARDISO)
1290           if (benign_trick) PetscCall(MatMkl_PardisoSetCntl(F, 10, 10));
1291 #endif
1292           /* subsets ordered last */
1293           PetscCall(MatFactorSetSchurIS(F, is_sub_schur_all[sub]));
1294 
1295           /* factorization step */
1296           switch (sub_schurs->mat_factor_type) {
1297           case MAT_FACTOR_CHOLESKY:
1298             PetscCall(MatCholeskyFactorSymbolic(F, Asub, NULL, NULL));
1299             /* be sure that icntl 19 is not set by command line */
1300             PetscCall(MatMumpsSetIcntl(F, 19, 2));
1301             PetscCall(MatCholeskyFactorNumeric(F, Asub, NULL));
1302             S_lower_triangular = PETSC_TRUE;
1303             break;
1304           case MAT_FACTOR_LU:
1305             PetscCall(MatLUFactorSymbolic(F, Asub, NULL, NULL, NULL));
1306             /* be sure that icntl 19 is not set by command line */
1307             PetscCall(MatMumpsSetIcntl(F, 19, 3));
1308             PetscCall(MatLUFactorNumeric(F, Asub, NULL));
1309             break;
1310           default:
1311             SETERRQ(PetscObjectComm((PetscObject)F), PETSC_ERR_SUP, "Unsupported factor type %s", MatFactorTypes[sub_schurs->mat_factor_type]);
1312           }
1313           PetscCall(MatFactorCreateSchurComplement(F, &Ssub, NULL));
1314         }
1315         PetscCall(MatDestroy(&Asub));
1316         PetscCall(MatDenseGetArrayRead(Ssub, &vals));
1317         PetscCall(ISGetIndices(is_sub_schur[sub], &idxs));
1318         PetscCall(MatSetValues(S_all, size_schur_sub, idxs, size_schur_sub, idxs, vals, INSERT_VALUES));
1319         PetscCall(ISRestoreIndices(is_sub_schur[sub], &idxs));
1320         PetscCall(MatDenseRestoreArrayRead(Ssub, &vals));
1321         PetscCall(MatDestroy(&Ssub));
1322         PetscCall(MatDestroy(&F));
1323       }
1324       PetscCall(MatAssemblyBegin(S_all, MAT_FINAL_ASSEMBLY));
1325       PetscCall(MatAssemblyEnd(S_all, MAT_FINAL_ASSEMBLY));
1326       PetscCall(MatSetOption(S_all, MAT_SPD, sub_schurs->is_posdef));
1327       PetscCall(MatSetOption(S_all, MAT_HERMITIAN, sub_schurs->is_hermitian));
1328       Stype             = MATDENSE;
1329       reuse_solvers     = PETSC_FALSE;
1330       factor_workaround = PETSC_FALSE;
1331       solver_S          = PETSC_FALSE;
1332     } else { /* we can't use MatFactor when size_schur == size_of_the_problem */
1333       PetscCall(MatConvert(A, MATSEQDENSE, MAT_INITIAL_MATRIX, &S_all));
1334       PetscCall(MatGetType(S_all, &Stype));
1335       reuse_solvers     = PETSC_FALSE; /* TODO: why we can't reuse the solvers here? */
1336       factor_workaround = PETSC_FALSE;
1337       solver_S          = PETSC_FALSE;
1338     }
1339 
1340     if (reuse_solvers) {
1341       Mat                A_II, pA_II, Afake;
1342       Vec                vec1_B;
1343       PCBDDCReuseSolvers msolv_ctx;
1344       PetscInt           n_R;
1345 
1346       if (sub_schurs->reuse_solver) {
1347         PetscCall(PCBDDCReuseSolversReset(sub_schurs->reuse_solver));
1348       } else {
1349         PetscCall(PetscNew(&sub_schurs->reuse_solver));
1350       }
1351       msolv_ctx = sub_schurs->reuse_solver;
1352       PetscCall(MatSchurComplementGetSubMatrices(sub_schurs->S, &A_II, &pA_II, NULL, NULL, NULL));
1353       PetscCall(PetscObjectReference((PetscObject)F));
1354       msolv_ctx->F = F;
1355       PetscCall(MatCreateVecs(F, &msolv_ctx->sol, NULL));
1356       /* currently PETSc has no support for MatSolve(F,x,x), so cheat and let rhs and sol share the same memory */
1357       {
1358         PetscScalar *array;
1359         PetscInt     n;
1360 
1361         PetscCall(VecGetLocalSize(msolv_ctx->sol, &n));
1362         PetscCall(VecGetArray(msolv_ctx->sol, &array));
1363         PetscCall(VecCreateSeqWithArray(PetscObjectComm((PetscObject)msolv_ctx->sol), 1, n, array, &msolv_ctx->rhs));
1364         PetscCall(VecRestoreArray(msolv_ctx->sol, &array));
1365       }
1366       msolv_ctx->has_vertices = schur_has_vertices;
1367 
1368       /* interior solver */
1369       PetscCall(PCCreate(PetscObjectComm((PetscObject)A_II), &msolv_ctx->interior_solver));
1370       PetscCall(PCSetOperators(msolv_ctx->interior_solver, A_II, pA_II));
1371       PetscCall(PCSetType(msolv_ctx->interior_solver, PCSHELL));
1372       PetscCall(PCShellSetName(msolv_ctx->interior_solver, "Interior solver (w/o Schur factorization)"));
1373       PetscCall(PCShellSetContext(msolv_ctx->interior_solver, msolv_ctx));
1374       PetscCall(PCShellSetView(msolv_ctx->interior_solver, PCBDDCReuseSolvers_View));
1375       PetscCall(PCShellSetApply(msolv_ctx->interior_solver, PCBDDCReuseSolvers_Interior));
1376       PetscCall(PCShellSetApplyTranspose(msolv_ctx->interior_solver, PCBDDCReuseSolvers_InteriorTranspose));
1377       if (sub_schurs->gdsw) PetscCall(PCShellSetDestroy(msolv_ctx->interior_solver, PCBDDCReuseSolvers_Destroy));
1378 
1379       /* correction solver */
1380       if (!sub_schurs->gdsw) {
1381         PetscCall(PCCreate(PetscObjectComm((PetscObject)A_II), &msolv_ctx->correction_solver));
1382         PetscCall(PCSetType(msolv_ctx->correction_solver, PCSHELL));
1383         PetscCall(PCShellSetName(msolv_ctx->correction_solver, "Correction solver (with Schur factorization)"));
1384         PetscCall(PCShellSetContext(msolv_ctx->correction_solver, msolv_ctx));
1385         PetscCall(PCShellSetView(msolv_ctx->interior_solver, PCBDDCReuseSolvers_View));
1386         PetscCall(PCShellSetApply(msolv_ctx->correction_solver, PCBDDCReuseSolvers_Correction));
1387         PetscCall(PCShellSetApplyTranspose(msolv_ctx->correction_solver, PCBDDCReuseSolvers_CorrectionTranspose));
1388 
1389         /* scatter and vecs for Schur complement solver */
1390         PetscCall(MatCreateVecs(S_all, &msolv_ctx->sol_B, &msolv_ctx->rhs_B));
1391         PetscCall(MatCreateVecs(sub_schurs->S, &vec1_B, NULL));
1392         if (!schur_has_vertices) {
1393           PetscCall(ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, is_A_all, &msolv_ctx->is_B));
1394           PetscCall(VecScatterCreate(vec1_B, msolv_ctx->is_B, msolv_ctx->sol_B, NULL, &msolv_ctx->correction_scatter_B));
1395           PetscCall(PetscObjectReference((PetscObject)is_A_all));
1396           msolv_ctx->is_R = is_A_all;
1397         } else {
1398           IS              is_B_all;
1399           const PetscInt *idxs;
1400           PetscInt        dual, n_v, n;
1401 
1402           PetscCall(ISGetLocalSize(sub_schurs->is_vertices, &n_v));
1403           dual = size_schur - n_v;
1404           PetscCall(ISGetLocalSize(is_A_all, &n));
1405           PetscCall(ISGetIndices(is_A_all, &idxs));
1406           PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)is_A_all), dual, idxs + n_I, PETSC_COPY_VALUES, &is_B_all));
1407           PetscCall(ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, is_B_all, &msolv_ctx->is_B));
1408           PetscCall(ISDestroy(&is_B_all));
1409           PetscCall(ISCreateStride(PetscObjectComm((PetscObject)is_A_all), dual, 0, 1, &is_B_all));
1410           PetscCall(VecScatterCreate(vec1_B, msolv_ctx->is_B, msolv_ctx->sol_B, is_B_all, &msolv_ctx->correction_scatter_B));
1411           PetscCall(ISDestroy(&is_B_all));
1412           PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)is_A_all), n - n_v, idxs, PETSC_COPY_VALUES, &msolv_ctx->is_R));
1413           PetscCall(ISRestoreIndices(is_A_all, &idxs));
1414         }
1415         PetscCall(ISGetLocalSize(msolv_ctx->is_R, &n_R));
1416         PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, n_R, n_R, 0, NULL, &Afake));
1417         PetscCall(MatAssemblyBegin(Afake, MAT_FINAL_ASSEMBLY));
1418         PetscCall(MatAssemblyEnd(Afake, MAT_FINAL_ASSEMBLY));
1419         PetscCall(PCSetOperators(msolv_ctx->correction_solver, Afake, Afake));
1420         PetscCall(MatDestroy(&Afake));
1421         PetscCall(VecDestroy(&vec1_B));
1422       }
1423       /* communicate benign info to solver context */
1424       if (benign_n) {
1425         PetscScalar *array;
1426 
1427         msolv_ctx->benign_n             = benign_n;
1428         msolv_ctx->benign_zerodiag_subs = is_p_r;
1429         PetscCall(PetscMalloc1(benign_n, &msolv_ctx->benign_save_vals));
1430         msolv_ctx->benign_csAIB = cs_AIB_mat;
1431         PetscCall(MatCreateVecs(cs_AIB_mat, &msolv_ctx->benign_corr_work, NULL));
1432         PetscCall(VecGetArray(msolv_ctx->benign_corr_work, &array));
1433         PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, size_schur, array, &msolv_ctx->benign_dummy_schur_vec));
1434         PetscCall(VecRestoreArray(msolv_ctx->benign_corr_work, &array));
1435         msolv_ctx->benign_AIIm1ones = benign_AIIm1_ones_mat;
1436       }
1437     } else {
1438       if (sub_schurs->reuse_solver) PetscCall(PCBDDCReuseSolversReset(sub_schurs->reuse_solver));
1439       PetscCall(PetscFree(sub_schurs->reuse_solver));
1440     }
1441     PetscCall(MatDestroy(&A));
1442     PetscCall(ISDestroy(&is_A_all));
1443 
1444     /* Work arrays */
1445     PetscCall(PetscMalloc1(max_subset_size * max_subset_size, &work));
1446 
1447     /* S_Ej_all */
1448     PetscInt *idx_work = NULL;
1449     cum = cum2 = 0;
1450     if (!multi_element) PetscCall(MatDenseGetArrayRead(S_all, &rS_data));
1451     else PetscCall(PetscMalloc1(max_subset_size, &idx_work));
1452     PetscCall(MatSeqAIJGetArray(sub_schurs->S_Ej_all, &SEj_arr));
1453     if (sub_schurs->sum_S_Ej_inv_all) PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_inv_all, &SEjinv_arr));
1454     if (sub_schurs->gdsw) PetscCall(MatCreateSubMatrices(sub_schurs->A, sub_schurs->n_subs, sub_schurs->is_subs, sub_schurs->is_subs, MAT_INITIAL_MATRIX, &gdswA));
1455     for (i = 0; i < sub_schurs->n_subs; i++) {
1456       /* get S_E (or K^i_EE for GDSW) */
1457       PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
1458       if (sub_schurs->gdsw) {
1459         Mat T;
1460 
1461         PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &T));
1462         PetscCall(MatConvert(gdswA[i], MATDENSE, MAT_REUSE_MATRIX, &T));
1463         PetscCall(MatDestroy(&T));
1464       } else {
1465         if (multi_element) { /* transpose copy to workspace */
1466           // XXX CSR directly?
1467           for (PetscInt j = 0; j < subset_size; j++) idx_work[j] = cum + j;
1468           PetscCall(MatGetValues(S_all, subset_size, idx_work, subset_size, idx_work, work));
1469           if (!sub_schurs->is_hermitian) {
1470             for (PetscInt k = 0; k < subset_size; k++) {
1471               for (PetscInt j = k; j < subset_size; j++) {
1472                 PetscScalar t             = work[k * subset_size + j];
1473                 work[k * subset_size + j] = work[j * subset_size + k];
1474                 work[j * subset_size + k] = t;
1475               }
1476             }
1477           }
1478         } else if (S_lower_triangular) { /* I need to expand the upper triangular data (column-oriented) */
1479           for (PetscInt k = 0; k < subset_size; k++) {
1480             for (PetscInt j = k; j < subset_size; j++) {
1481               work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1482               work[j * subset_size + k] = PetscConj(rS_data[cum2 + k * size_schur + j]);
1483             }
1484           }
1485         } else { /* just copy to workspace */
1486           for (PetscInt k = 0; k < subset_size; k++) {
1487             for (PetscInt j = 0; j < subset_size; j++) work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1488           }
1489         }
1490       }
1491       /* insert S_E values */
1492       if (sub_schurs->change) {
1493         Mat change_sub, SEj, T;
1494 
1495         /* change basis */
1496         PetscCall(KSPGetOperators(sub_schurs->change[i], &change_sub, NULL));
1497         PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &SEj));
1498         if (!sub_schurs->change_with_qr) { /* currently there's no support for PtAP with P SeqAIJ */
1499           Mat T2;
1500           PetscCall(MatTransposeMatMult(change_sub, SEj, MAT_INITIAL_MATRIX, 1.0, &T2));
1501           PetscCall(MatMatMult(T2, change_sub, MAT_INITIAL_MATRIX, 1.0, &T));
1502           PetscCall(MatConvert(T, MATSEQDENSE, MAT_INPLACE_MATRIX, &T));
1503           PetscCall(MatDestroy(&T2));
1504         } else {
1505           PetscCall(MatPtAP(SEj, change_sub, MAT_INITIAL_MATRIX, 1.0, &T));
1506         }
1507         PetscCall(MatCopy(T, SEj, SAME_NONZERO_PATTERN));
1508         PetscCall(MatDestroy(&T));
1509         PetscCall(MatZeroRowsColumnsIS(SEj, sub_schurs->change_primal_sub[i], 1.0, NULL, NULL));
1510         PetscCall(MatDestroy(&SEj));
1511       }
1512       PetscCall(PetscArraycpy(SEj_arr, work, subset_size * subset_size));
1513       if (compute_Stilda) {
1514         if (deluxe) { /* if adaptivity is requested, invert S_E blocks */
1515           Mat                M;
1516           const PetscScalar *vals;
1517           PetscBool          isdense, isdensecuda;
1518 
1519           PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &M));
1520           PetscCall(MatSetOption(M, MAT_SPD, sub_schurs->is_posdef));
1521           PetscCall(MatSetOption(M, MAT_HERMITIAN, sub_schurs->is_hermitian));
1522           if (!PetscBTLookup(sub_schurs->is_edge, i)) PetscCall(MatSetType(M, Stype));
1523           PetscCall(PetscObjectTypeCompare((PetscObject)M, MATSEQDENSE, &isdense));
1524           PetscCall(PetscObjectTypeCompare((PetscObject)M, MATSEQDENSECUDA, &isdensecuda));
1525           switch (sub_schurs->mat_factor_type) {
1526           case MAT_FACTOR_CHOLESKY:
1527             PetscCall(MatCholeskyFactor(M, NULL, NULL));
1528             break;
1529           case MAT_FACTOR_LU:
1530             PetscCall(MatLUFactor(M, NULL, NULL, NULL));
1531             break;
1532           default:
1533             SETERRQ(PetscObjectComm((PetscObject)F), PETSC_ERR_SUP, "Unsupported factor type %s", MatFactorTypes[sub_schurs->mat_factor_type]);
1534           }
1535           if (isdense) {
1536             PetscCall(MatSeqDenseInvertFactors_Private(M));
1537 #if defined(PETSC_HAVE_CUDA)
1538           } else if (isdensecuda) {
1539             PetscCall(MatSeqDenseCUDAInvertFactors_Internal(M));
1540 #endif
1541           } else SETERRQ(PetscObjectComm((PetscObject)M), PETSC_ERR_SUP, "Not implemented for type %s", Stype);
1542           PetscCall(MatDenseGetArrayRead(M, &vals));
1543           PetscCall(PetscArraycpy(SEjinv_arr, vals, subset_size * subset_size));
1544           PetscCall(MatDenseRestoreArrayRead(M, &vals));
1545           PetscCall(MatDestroy(&M));
1546         } else if (scaling) { /* not using deluxe */
1547           Mat          SEj;
1548           Vec          D;
1549           PetscScalar *array;
1550 
1551           PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &SEj));
1552           PetscCall(VecGetArray(Dall, &array));
1553           PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, subset_size, array + cum, &D));
1554           PetscCall(VecRestoreArray(Dall, &array));
1555           PetscCall(VecShift(D, -1.));
1556           PetscCall(MatDiagonalScale(SEj, D, D));
1557           PetscCall(MatDestroy(&SEj));
1558           PetscCall(VecDestroy(&D));
1559           PetscCall(PetscArraycpy(SEj_arr, work, subset_size * subset_size));
1560         }
1561       }
1562       cum += subset_size;
1563       cum2 += subset_size * (size_schur + 1);
1564       SEj_arr += subset_size * subset_size;
1565       if (SEjinv_arr) SEjinv_arr += subset_size * subset_size;
1566     }
1567     if (sub_schurs->gdsw) PetscCall(MatDestroySubMatrices(sub_schurs->n_subs, &gdswA));
1568     if (!multi_element) PetscCall(MatDenseRestoreArrayRead(S_all, &rS_data));
1569     PetscCall(MatSeqAIJRestoreArray(sub_schurs->S_Ej_all, &SEj_arr));
1570     if (sub_schurs->sum_S_Ej_inv_all) PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_inv_all, &SEjinv_arr));
1571     if (solver_S) PetscCall(MatFactorRestoreSchurComplement(F, &S_all, MAT_FACTOR_SCHUR_UNFACTORED));
1572 
1573     /* may prevent from unneeded copies, since MUMPS or MKL_Pardiso always use CPU memory
1574        however, preliminary tests indicate using GPUs is still faster in the solve phase */
1575 #if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
1576     if (reuse_solvers) {
1577       Mat                  St;
1578       MatFactorSchurStatus st;
1579 
1580       flg = PETSC_FALSE;
1581       PetscCall(PetscOptionsGetBool(NULL, sub_schurs->prefix, "-sub_schurs_schur_pin_to_cpu", &flg, NULL));
1582       PetscCall(MatFactorGetSchurComplement(F, &St, &st));
1583       PetscCall(MatBindToCPU(St, flg));
1584       PetscCall(MatFactorRestoreSchurComplement(F, &St, st));
1585     }
1586 #endif
1587 
1588     schur_factor = NULL;
1589     if (compute_Stilda && size_active_schur) {
1590       if (sub_schurs->n_subs == 1 && size_schur == size_active_schur && deluxe) { /* we already computed the inverse */
1591         PetscCall(MatSeqAIJGetArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
1592         PetscCall(PetscArraycpy(SEjinv_arr, work, size_schur * size_schur));
1593         PetscCall(MatSeqAIJRestoreArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
1594       } else {
1595         Mat S_all_inv = NULL;
1596 
1597         if (solver_S && !sub_schurs->gdsw) {
1598           /* for adaptive selection we need S^-1; for solver reusage we need S_\Delta\Delta^-1.
1599              The latter is not the principal subminor for S^-1. However, the factors can be reused since S_\Delta\Delta is the leading principal submatrix of S */
1600           if (factor_workaround) { /* invert without calling MatFactorInvertSchurComplement, since we are hacking */
1601             PetscScalar *data;
1602             PetscInt     nd = 0;
1603 
1604             PetscCheck(use_potr, PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor update not yet implemented for non SPD matrices");
1605             PetscCall(MatFactorGetSchurComplement(F, &S_all_inv, NULL));
1606             PetscCall(MatDenseGetArray(S_all_inv, &data));
1607             if (sub_schurs->is_dir) { /* dirichlet dofs could have different scalings */
1608               PetscCall(ISGetLocalSize(sub_schurs->is_dir, &nd));
1609             }
1610 
1611             /* factor and invert activedofs and vertices (dirichlet dofs does not contribute) */
1612             if (schur_has_vertices) {
1613               Mat          M;
1614               PetscScalar *tdata;
1615               PetscInt     nv = 0, news;
1616 
1617               PetscCall(ISGetLocalSize(sub_schurs->is_vertices, &nv));
1618               news = size_active_schur + nv;
1619               PetscCall(PetscCalloc1(news * news, &tdata));
1620               for (i = 0; i < size_active_schur; i++) {
1621                 PetscCall(PetscArraycpy(tdata + i * (news + 1), data + i * (size_schur + 1), size_active_schur - i));
1622                 PetscCall(PetscArraycpy(tdata + i * (news + 1) + size_active_schur - i, data + i * size_schur + size_active_schur + nd, nv));
1623               }
1624               for (i = 0; i < nv; i++) {
1625                 PetscInt k = i + size_active_schur;
1626                 PetscCall(PetscArraycpy(tdata + k * (news + 1), data + (k + nd) * (size_schur + 1), nv - i));
1627               }
1628 
1629               PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, news, news, tdata, &M));
1630               PetscCall(MatSetOption(M, MAT_SPD, PETSC_TRUE));
1631               PetscCall(MatCholeskyFactor(M, NULL, NULL));
1632               /* save the factors */
1633               cum = 0;
1634               PetscCall(PetscMalloc1((size_active_schur * (size_active_schur + 1)) / 2 + nd, &schur_factor));
1635               for (i = 0; i < size_active_schur; i++) {
1636                 PetscCall(PetscArraycpy(schur_factor + cum, tdata + i * (news + 1), size_active_schur - i));
1637                 cum += size_active_schur - i;
1638               }
1639               for (i = 0; i < nd; i++) schur_factor[cum + i] = PetscSqrtReal(PetscRealPart(data[(i + size_active_schur) * (size_schur + 1)]));
1640               S_all_inv->ops->solve             = M->ops->solve;
1641               S_all_inv->ops->matsolve          = M->ops->matsolve;
1642               S_all_inv->ops->solvetranspose    = M->ops->solvetranspose;
1643               S_all_inv->ops->matsolvetranspose = M->ops->matsolvetranspose;
1644               S_all_inv->factortype             = MAT_FACTOR_CHOLESKY;
1645               PetscCall(MatSeqDenseInvertFactors_Private(M));
1646               /* move back just the active dofs to the Schur complement */
1647               for (i = 0; i < size_active_schur; i++) PetscCall(PetscArraycpy(data + i * size_schur, tdata + i * news, size_active_schur));
1648               PetscCall(PetscFree(tdata));
1649               PetscCall(MatDestroy(&M));
1650             } else { /* we can factorize and invert just the activedofs */
1651               Mat          M;
1652               PetscScalar *aux;
1653 
1654               PetscCall(PetscMalloc1(nd, &aux));
1655               for (i = 0; i < nd; i++) aux[i] = 1.0 / data[(i + size_active_schur) * (size_schur + 1)];
1656               PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, size_active_schur, size_active_schur, data, &M));
1657               PetscCall(MatDenseSetLDA(M, size_schur));
1658               PetscCall(MatSetOption(M, MAT_SPD, PETSC_TRUE));
1659               PetscCall(MatCholeskyFactor(M, NULL, NULL));
1660               PetscCall(MatSeqDenseInvertFactors_Private(M));
1661               PetscCall(MatDestroy(&M));
1662               PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, size_schur, nd, data + size_active_schur * size_schur, &M));
1663               PetscCall(MatZeroEntries(M));
1664               PetscCall(MatDestroy(&M));
1665               PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, nd, size_schur, data + size_active_schur, &M));
1666               PetscCall(MatDenseSetLDA(M, size_schur));
1667               PetscCall(MatZeroEntries(M));
1668               PetscCall(MatDestroy(&M));
1669               for (i = 0; i < nd; i++) data[(i + size_active_schur) * (size_schur + 1)] = aux[i];
1670               PetscCall(PetscFree(aux));
1671             }
1672             PetscCall(MatDenseRestoreArray(S_all_inv, &data));
1673           } else { /* use MatFactor calls to invert S */
1674             PetscCall(MatFactorInvertSchurComplement(F));
1675             PetscCall(MatFactorGetSchurComplement(F, &S_all_inv, NULL));
1676           }
1677         } else if (!sub_schurs->gdsw) { /* we need to invert explicitly since we are not using MatFactor for S */
1678           if (multi_element) {
1679             PetscCall(MatDuplicate(S_all, MAT_DO_NOT_COPY_VALUES, &S_all_inv));
1680             PetscCall(MatSetOption(S_all_inv, MAT_ROW_ORIENTED, sub_schurs->is_hermitian));
1681             for (PetscInt sub = 0; sub < n_local_subs; sub++) {
1682               const PetscScalar *vals;
1683               const PetscInt    *idxs;
1684               PetscInt           size_schur_sub;
1685               Mat                M;
1686 
1687               PetscCall(MatCreateSubMatrix(S_all, is_sub_schur[sub], is_sub_schur[sub], MAT_INITIAL_MATRIX, &M));
1688               PetscCall(MatConvert(M, MATDENSE, MAT_INPLACE_MATRIX, &M));
1689               PetscCall(MatSetOption(M, MAT_SPD, sub_schurs->is_posdef));
1690               PetscCall(MatSetOption(M, MAT_HERMITIAN, sub_schurs->is_hermitian));
1691               switch (sub_schurs->mat_factor_type) {
1692               case MAT_FACTOR_CHOLESKY:
1693                 PetscCall(MatCholeskyFactor(M, NULL, NULL));
1694                 break;
1695               case MAT_FACTOR_LU:
1696                 PetscCall(MatLUFactor(M, NULL, NULL, NULL));
1697                 break;
1698               default:
1699                 SETERRQ(PetscObjectComm((PetscObject)F), PETSC_ERR_SUP, "Unsupported factor type %s", MatFactorTypes[sub_schurs->mat_factor_type]);
1700               }
1701               PetscCall(MatSeqDenseInvertFactors_Private(M));
1702               PetscCall(MatDenseGetArrayRead(M, &vals));
1703               PetscCall(ISGetLocalSize(is_sub_schur[sub], &size_schur_sub));
1704               PetscCall(ISGetIndices(is_sub_schur[sub], &idxs));
1705               PetscCall(MatSetValues(S_all_inv, size_schur_sub, idxs, size_schur_sub, idxs, vals, INSERT_VALUES));
1706               PetscCall(ISRestoreIndices(is_sub_schur[sub], &idxs));
1707               PetscCall(MatDenseRestoreArrayRead(M, &vals));
1708               PetscCall(MatDestroy(&M));
1709             }
1710             PetscCall(MatAssemblyBegin(S_all_inv, MAT_FINAL_ASSEMBLY));
1711             PetscCall(MatAssemblyEnd(S_all_inv, MAT_FINAL_ASSEMBLY));
1712           } else {
1713             PetscCall(PetscObjectReference((PetscObject)S_all));
1714             S_all_inv = S_all;
1715             PetscCall(MatDenseGetArray(S_all_inv, &S_data));
1716             PetscCall(PetscBLASIntCast(size_schur, &B_N));
1717             PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
1718             if (use_potr) {
1719               PetscCallBLAS("LAPACKpotrf", LAPACKpotrf_("L", &B_N, S_data, &B_N, &B_ierr));
1720               PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in POTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
1721               PetscCallBLAS("LAPACKpotri", LAPACKpotri_("L", &B_N, S_data, &B_N, &B_ierr));
1722               PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in POTRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
1723             } else if (use_sytr) {
1724               PetscCallBLAS("LAPACKsytrf", LAPACKsytrf_("L", &B_N, S_data, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
1725               PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in SYTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
1726               PetscCallBLAS("LAPACKsytri", LAPACKsytri_("L", &B_N, S_data, &B_N, pivots, Bwork, &B_ierr));
1727               PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in SYTRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
1728             } else {
1729               PetscCallBLAS("LAPACKgetrf", LAPACKgetrf_(&B_N, &B_N, S_data, &B_N, pivots, &B_ierr));
1730               PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in GETRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
1731               PetscCallBLAS("LAPACKgetri", LAPACKgetri_(&B_N, S_data, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
1732               PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in GETRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
1733             }
1734             PetscCall(PetscLogFlops(1.0 * size_schur * size_schur * size_schur));
1735             PetscCall(PetscFPTrapPop());
1736             PetscCall(MatDenseRestoreArray(S_all_inv, &S_data));
1737           }
1738         } else if (sub_schurs->gdsw) {
1739           Mat      tS, tX, SEj, S_II, S_IE, S_EE;
1740           KSP      pS_II;
1741           PC       pS_II_pc;
1742           IS       EE, II;
1743           PetscInt nS;
1744 
1745           PetscCall(MatFactorCreateSchurComplement(F, &tS, NULL));
1746           PetscCall(MatGetSize(tS, &nS, NULL));
1747           PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
1748           for (i = 0, cum = 0; i < sub_schurs->n_subs; i++) { /* naive implementation */
1749             PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
1750             PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, SEjinv_arr, &SEj));
1751 
1752             PetscCall(ISCreateStride(PETSC_COMM_SELF, subset_size, cum, 1, &EE));
1753             PetscCall(ISComplement(EE, 0, nS, &II));
1754             PetscCall(MatCreateSubMatrix(tS, II, II, MAT_INITIAL_MATRIX, &S_II));
1755             PetscCall(MatCreateSubMatrix(tS, II, EE, MAT_INITIAL_MATRIX, &S_IE));
1756             PetscCall(MatCreateSubMatrix(tS, EE, EE, MAT_INITIAL_MATRIX, &S_EE));
1757             PetscCall(ISDestroy(&II));
1758             PetscCall(ISDestroy(&EE));
1759 
1760             PetscCall(KSPCreate(PETSC_COMM_SELF, &pS_II));
1761             PetscCall(KSPSetNestLevel(pS_II, 1)); /* do not have direct access to a PC to provide the level of nesting of the KSP */
1762             PetscCall(KSPSetType(pS_II, KSPPREONLY));
1763             PetscCall(KSPGetPC(pS_II, &pS_II_pc));
1764             PetscCall(PCSetType(pS_II_pc, PCSVD));
1765             PetscCall(KSPSetOptionsPrefix(pS_II, sub_schurs->prefix));
1766             PetscCall(KSPAppendOptionsPrefix(pS_II, "pseudo_"));
1767             PetscCall(KSPSetOperators(pS_II, S_II, S_II));
1768             PetscCall(MatDestroy(&S_II));
1769             PetscCall(KSPSetFromOptions(pS_II));
1770             PetscCall(KSPSetUp(pS_II));
1771             PetscCall(MatDuplicate(S_IE, MAT_DO_NOT_COPY_VALUES, &tX));
1772             PetscCall(KSPMatSolve(pS_II, S_IE, tX));
1773             PetscCall(KSPDestroy(&pS_II));
1774 
1775             PetscCall(MatTransposeMatMult(S_IE, tX, MAT_REUSE_MATRIX, PETSC_DETERMINE, &SEj));
1776             PetscCall(MatDestroy(&S_IE));
1777             PetscCall(MatDestroy(&tX));
1778             PetscCall(MatAYPX(SEj, -1, S_EE, SAME_NONZERO_PATTERN));
1779             PetscCall(MatDestroy(&S_EE));
1780 
1781             PetscCall(MatDestroy(&SEj));
1782             cum += subset_size;
1783             SEjinv_arr += subset_size * subset_size;
1784           }
1785           PetscCall(MatDestroy(&tS));
1786           PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
1787         }
1788         /* S_Ej_tilda_all */
1789         cum = cum2 = 0;
1790         rS_data    = NULL;
1791         if (S_all_inv && !multi_element) PetscCall(MatDenseGetArrayRead(S_all_inv, &rS_data));
1792         PetscCall(MatSeqAIJGetArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
1793         for (i = 0; i < sub_schurs->n_subs; i++) {
1794           PetscInt j;
1795 
1796           PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
1797           /* get (St^-1)_E */
1798           /* Unless we are changing the variables, I don't need to expand to upper triangular since St^-1
1799              will be properly accessed later during adaptive selection */
1800           if (multi_element) { /* transpose copy to workspace */
1801             // XXX CSR directly?
1802             for (PetscInt j = 0; j < subset_size; j++) idx_work[j] = cum + j;
1803             PetscCall(MatGetValues(S_all_inv, subset_size, idx_work, subset_size, idx_work, work));
1804             if (!sub_schurs->is_hermitian) {
1805               for (PetscInt k = 0; k < subset_size; k++) {
1806                 for (PetscInt j = k; j < subset_size; j++) {
1807                   PetscScalar t             = work[k * subset_size + j];
1808                   work[k * subset_size + j] = work[j * subset_size + k];
1809                   work[j * subset_size + k] = t;
1810                 }
1811               }
1812             }
1813           } else if (rS_data) {
1814             if (S_lower_triangular) {
1815               PetscInt k;
1816               if (sub_schurs->change) {
1817                 for (k = 0; k < subset_size; k++) {
1818                   for (j = k; j < subset_size; j++) {
1819                     work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1820                     work[j * subset_size + k] = work[k * subset_size + j];
1821                   }
1822                 }
1823               } else {
1824                 for (k = 0; k < subset_size; k++) {
1825                   for (j = k; j < subset_size; j++) work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1826                 }
1827               }
1828             } else {
1829               PetscInt k;
1830               for (k = 0; k < subset_size; k++) {
1831                 for (j = 0; j < subset_size; j++) work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
1832               }
1833             }
1834           }
1835           if (sub_schurs->change) {
1836             Mat         change_sub, SEj, T;
1837             PetscScalar val = sub_schurs->gdsw ? PETSC_SMALL : 1. / PETSC_SMALL;
1838 
1839             /* change basis */
1840             PetscCall(KSPGetOperators(sub_schurs->change[i], &change_sub, NULL));
1841             PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, (rS_data || multi_element) ? work : SEjinv_arr, &SEj));
1842             if (!sub_schurs->change_with_qr) { /* currently there's no support for PtAP with P SeqAIJ */
1843               Mat T2;
1844               PetscCall(MatTransposeMatMult(change_sub, SEj, MAT_INITIAL_MATRIX, 1.0, &T2));
1845               PetscCall(MatMatMult(T2, change_sub, MAT_INITIAL_MATRIX, 1.0, &T));
1846               PetscCall(MatDestroy(&T2));
1847               PetscCall(MatConvert(T, MATSEQDENSE, MAT_INPLACE_MATRIX, &T));
1848             } else {
1849               PetscCall(MatPtAP(SEj, change_sub, MAT_INITIAL_MATRIX, 1.0, &T));
1850             }
1851             PetscCall(MatCopy(T, SEj, SAME_NONZERO_PATTERN));
1852             PetscCall(MatDestroy(&T));
1853             PetscCall(MatZeroRowsColumnsIS(SEj, sub_schurs->change_primal_sub[i], val, NULL, NULL));
1854             PetscCall(MatDestroy(&SEj));
1855           }
1856           if (rS_data || multi_element) PetscCall(PetscArraycpy(SEjinv_arr, work, subset_size * subset_size));
1857           cum += subset_size;
1858           cum2 += subset_size * (size_schur + 1);
1859           SEjinv_arr += subset_size * subset_size;
1860         }
1861         PetscCall(MatSeqAIJRestoreArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
1862         if (S_all_inv) {
1863           if (!multi_element) PetscCall(MatDenseRestoreArrayRead(S_all_inv, &rS_data));
1864           if (solver_S) {
1865             if (schur_has_vertices) {
1866               PetscCall(MatFactorRestoreSchurComplement(F, &S_all_inv, MAT_FACTOR_SCHUR_FACTORED));
1867             } else {
1868               PetscCall(MatFactorRestoreSchurComplement(F, &S_all_inv, MAT_FACTOR_SCHUR_INVERTED));
1869             }
1870           }
1871         }
1872         PetscCall(MatDestroy(&S_all_inv));
1873       }
1874 
1875       /* move back factors if needed */
1876       if (schur_has_vertices && factor_workaround && !sub_schurs->gdsw) {
1877         Mat          S_tmp;
1878         PetscInt     nd = 0;
1879         PetscScalar *data;
1880 
1881         PetscCheck(use_potr, PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor update not yet implemented for non SPD matrices");
1882         PetscCheck(solver_S, PETSC_COMM_SELF, PETSC_ERR_PLIB, "This should not happen");
1883         PetscCall(MatFactorGetSchurComplement(F, &S_tmp, NULL));
1884         PetscCall(MatDenseGetArray(S_tmp, &data));
1885         PetscCall(PetscArrayzero(data, size_schur * size_schur));
1886 
1887         if (S_lower_triangular) {
1888           cum = 0;
1889           for (i = 0; i < size_active_schur; i++) {
1890             PetscCall(PetscArraycpy(data + i * (size_schur + 1), schur_factor + cum, size_active_schur - i));
1891             cum += size_active_schur - i;
1892           }
1893         } else {
1894           PetscCall(PetscArraycpy(data, schur_factor, size_schur * size_schur));
1895         }
1896         if (sub_schurs->is_dir) {
1897           PetscCall(ISGetLocalSize(sub_schurs->is_dir, &nd));
1898           for (i = 0; i < nd; i++) data[(i + size_active_schur) * (size_schur + 1)] = schur_factor[cum + i];
1899         }
1900         /* workaround: since I cannot modify the matrices used inside the solvers for the forward and backward substitutions,
1901              set the diagonal entry of the Schur factor to a very large value */
1902         for (i = size_active_schur + nd; i < size_schur; i++) data[i * (size_schur + 1)] = infty;
1903         PetscCall(MatDenseRestoreArray(S_tmp, &data));
1904         PetscCall(MatFactorRestoreSchurComplement(F, &S_tmp, MAT_FACTOR_SCHUR_FACTORED));
1905       }
1906     } else if (factor_workaround && !sub_schurs->gdsw) { /* we need to eliminate any unneeded coupling */
1907       PetscScalar *data;
1908       PetscInt     nd = 0;
1909 
1910       if (sub_schurs->is_dir) { /* dirichlet dofs could have different scalings */
1911         PetscCall(ISGetLocalSize(sub_schurs->is_dir, &nd));
1912       }
1913       PetscCall(MatFactorGetSchurComplement(F, &S_all, NULL));
1914       PetscCall(MatDenseGetArray(S_all, &data));
1915       for (i = 0; i < size_active_schur; i++) PetscCall(PetscArrayzero(data + i * size_schur + size_active_schur, size_schur - size_active_schur));
1916       for (i = size_active_schur + nd; i < size_schur; i++) {
1917         PetscCall(PetscArrayzero(data + i * size_schur + size_active_schur, size_schur - size_active_schur));
1918         data[i * (size_schur + 1)] = infty;
1919       }
1920       PetscCall(MatDenseRestoreArray(S_all, &data));
1921       PetscCall(MatFactorRestoreSchurComplement(F, &S_all, MAT_FACTOR_SCHUR_UNFACTORED));
1922     }
1923     PetscCall(PetscFree(idx_work));
1924     PetscCall(PetscFree(work));
1925     PetscCall(PetscFree(schur_factor));
1926     PetscCall(VecDestroy(&Dall));
1927     PetscCall(ISDestroy(&is_schur));
1928     if (multi_element) {
1929       for (PetscInt sub = 0; sub < n_local_subs; sub++) {
1930         PetscCall(ISDestroy(&is_sub_all[sub]));
1931         PetscCall(ISDestroy(&is_sub_schur_all[sub]));
1932         PetscCall(ISDestroy(&is_sub_schur[sub]));
1933       }
1934       PetscCall(PetscFree3(is_sub_all, is_sub_schur_all, is_sub_schur));
1935     }
1936   }
1937   PetscCall(ISDestroy(&is_I_layer));
1938   PetscCall(MatDestroy(&S_all));
1939   PetscCall(MatDestroy(&A_BB));
1940   PetscCall(MatDestroy(&A_IB));
1941   PetscCall(MatDestroy(&A_BI));
1942   PetscCall(MatDestroy(&F));
1943 
1944   PetscCall(MatAssemblyBegin(sub_schurs->S_Ej_all, MAT_FINAL_ASSEMBLY));
1945   PetscCall(MatAssemblyEnd(sub_schurs->S_Ej_all, MAT_FINAL_ASSEMBLY));
1946   if (compute_Stilda) {
1947     PetscCall(MatAssemblyBegin(sub_schurs->sum_S_Ej_tilda_all, MAT_FINAL_ASSEMBLY));
1948     PetscCall(MatAssemblyEnd(sub_schurs->sum_S_Ej_tilda_all, MAT_FINAL_ASSEMBLY));
1949     if (deluxe) {
1950       PetscCall(MatAssemblyBegin(sub_schurs->sum_S_Ej_inv_all, MAT_FINAL_ASSEMBLY));
1951       PetscCall(MatAssemblyEnd(sub_schurs->sum_S_Ej_inv_all, MAT_FINAL_ASSEMBLY));
1952     }
1953   }
1954 
1955   /* Get local part of (\sum_j S_Ej) */
1956   if (!sub_schurs->sum_S_Ej_all) PetscCall(MatDuplicate(sub_schurs->S_Ej_all, MAT_DO_NOT_COPY_VALUES, &sub_schurs->sum_S_Ej_all));
1957   PetscCall(VecSet(gstash, 0.0));
1958   PetscCall(MatSeqAIJGetArray(sub_schurs->S_Ej_all, &stasharray));
1959   PetscCall(VecPlaceArray(lstash, stasharray));
1960   PetscCall(VecScatterBegin(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
1961   PetscCall(VecScatterEnd(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
1962   PetscCall(MatSeqAIJRestoreArray(sub_schurs->S_Ej_all, &stasharray));
1963   PetscCall(VecResetArray(lstash));
1964   PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_all, &stasharray));
1965   PetscCall(VecPlaceArray(lstash, stasharray));
1966   PetscCall(VecScatterBegin(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
1967   PetscCall(VecScatterEnd(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
1968   PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_all, &stasharray));
1969   PetscCall(VecResetArray(lstash));
1970 
1971   /* Get local part of (\sum_j S^-1_Ej) (\sum_j St^-1_Ej) */
1972   if (compute_Stilda) {
1973     PetscCall(VecSet(gstash, 0.0));
1974     PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_tilda_all, &stasharray));
1975     PetscCall(VecPlaceArray(lstash, stasharray));
1976     PetscCall(VecScatterBegin(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
1977     PetscCall(VecScatterEnd(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
1978     PetscCall(VecScatterBegin(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
1979     PetscCall(VecScatterEnd(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
1980     PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_tilda_all, &stasharray));
1981     PetscCall(VecResetArray(lstash));
1982     if (deluxe) {
1983       PetscCall(VecSet(gstash, 0.0));
1984       PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_inv_all, &stasharray));
1985       PetscCall(VecPlaceArray(lstash, stasharray));
1986       PetscCall(VecScatterBegin(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
1987       PetscCall(VecScatterEnd(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
1988       PetscCall(VecScatterBegin(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
1989       PetscCall(VecScatterEnd(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
1990       PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_inv_all, &stasharray));
1991       PetscCall(VecResetArray(lstash));
1992     } else if (!sub_schurs->gdsw) {
1993       PetscScalar *array;
1994       PetscInt     cum;
1995 
1996       PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_tilda_all, &array));
1997       cum = 0;
1998       for (i = 0; i < sub_schurs->n_subs; i++) {
1999         PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
2000         PetscCall(PetscBLASIntCast(subset_size, &B_N));
2001         PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
2002         if (use_potr) {
2003           PetscCallBLAS("LAPACKpotrf", LAPACKpotrf_("L", &B_N, array + cum, &B_N, &B_ierr));
2004           PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in POTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
2005           PetscCallBLAS("LAPACKpotri", LAPACKpotri_("L", &B_N, array + cum, &B_N, &B_ierr));
2006           PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in POTRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
2007         } else if (use_sytr) {
2008           PetscCallBLAS("LAPACKsytrf", LAPACKsytrf_("L", &B_N, array + cum, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
2009           PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in SYTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
2010           PetscCallBLAS("LAPACKsytri", LAPACKsytri_("L", &B_N, array + cum, &B_N, pivots, Bwork, &B_ierr));
2011           PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in SYTRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
2012         } else {
2013           PetscCallBLAS("LAPACKgetrf", LAPACKgetrf_(&B_N, &B_N, array + cum, &B_N, pivots, &B_ierr));
2014           PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in GETRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
2015           PetscCallBLAS("LAPACKgetri", LAPACKgetri_(&B_N, array + cum, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
2016           PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in GETRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
2017         }
2018         PetscCall(PetscLogFlops(1.0 * subset_size * subset_size * subset_size));
2019         PetscCall(PetscFPTrapPop());
2020         cum += subset_size * subset_size;
2021       }
2022       PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_tilda_all, &array));
2023       PetscCall(PetscObjectReference((PetscObject)sub_schurs->sum_S_Ej_all));
2024       PetscCall(MatDestroy(&sub_schurs->sum_S_Ej_inv_all));
2025       sub_schurs->sum_S_Ej_inv_all = sub_schurs->sum_S_Ej_all;
2026     }
2027   }
2028   PetscCall(VecDestroy(&lstash));
2029   PetscCall(VecDestroy(&gstash));
2030   PetscCall(VecScatterDestroy(&sstash));
2031 
2032   if (matl_dbg_viewer) {
2033     if (sub_schurs->S_Ej_all) {
2034       PetscCall(PetscObjectSetName((PetscObject)sub_schurs->S_Ej_all, "SE"));
2035       PetscCall(MatView(sub_schurs->S_Ej_all, matl_dbg_viewer));
2036     }
2037     if (sub_schurs->sum_S_Ej_all) {
2038       PetscCall(PetscObjectSetName((PetscObject)sub_schurs->sum_S_Ej_all, "SSE"));
2039       PetscCall(MatView(sub_schurs->sum_S_Ej_all, matl_dbg_viewer));
2040     }
2041     if (sub_schurs->sum_S_Ej_inv_all) {
2042       PetscCall(PetscObjectSetName((PetscObject)sub_schurs->sum_S_Ej_inv_all, "SSEm"));
2043       PetscCall(MatView(sub_schurs->sum_S_Ej_inv_all, matl_dbg_viewer));
2044     }
2045     if (sub_schurs->sum_S_Ej_tilda_all) {
2046       PetscCall(PetscObjectSetName((PetscObject)sub_schurs->sum_S_Ej_tilda_all, "SSEt"));
2047       PetscCall(MatView(sub_schurs->sum_S_Ej_tilda_all, matl_dbg_viewer));
2048     }
2049   }
2050 
2051   /* when not explicit, we need to set the factor type */
2052   if (sub_schurs->mat_factor_type == MAT_FACTOR_NONE) sub_schurs->mat_factor_type = sub_schurs->is_hermitian ? MAT_FACTOR_CHOLESKY : MAT_FACTOR_LU;
2053 
2054   /* free workspace */
2055   if (matl_dbg_viewer) PetscCall(PetscViewerFlush(matl_dbg_viewer));
2056   if (sub_schurs->debug) PetscCallMPI(MPI_Barrier(comm_n));
2057   PetscCall(PetscViewerDestroy(&matl_dbg_viewer));
2058   PetscCall(PetscFree2(Bwork, pivots));
2059   PetscCall(PetscCommDestroy(&comm_n));
2060   PetscCall(PetscFree(all_local_subid_N));
2061   PetscFunctionReturn(PETSC_SUCCESS);
2062 }
2063 
2064 PetscErrorCode PCBDDCSubSchursInit(PCBDDCSubSchurs sub_schurs, const char *prefix, IS is_I, IS is_B, PCBDDCGraph graph, ISLocalToGlobalMapping BtoNmap, PetscBool copycc, PetscBool gdsw)
2065 {
2066   IS       *faces, *edges, *all_cc, vertices;
2067   PetscInt  s, i, n_faces, n_edges, n_all_cc;
2068   PetscBool is_sorted, ispardiso, ismumps;
2069 
2070   PetscFunctionBegin;
2071   PetscCall(ISSorted(is_I, &is_sorted));
2072   PetscCheck(is_sorted, PetscObjectComm((PetscObject)is_I), PETSC_ERR_PLIB, "IS for I dofs should be shorted");
2073   PetscCall(ISSorted(is_B, &is_sorted));
2074   PetscCheck(is_sorted, PetscObjectComm((PetscObject)is_B), PETSC_ERR_PLIB, "IS for B dofs should be shorted");
2075 
2076   /* reset any previous data */
2077   PetscCall(PCBDDCSubSchursReset(sub_schurs));
2078 
2079   sub_schurs->gdsw  = gdsw;
2080   sub_schurs->graph = graph;
2081 
2082   /* get index sets for faces and edges (already sorted by global ordering) */
2083   PetscCall(PCBDDCGraphGetCandidatesIS(graph, &n_faces, &faces, &n_edges, &edges, &vertices));
2084   n_all_cc = n_faces + n_edges;
2085   PetscCall(PetscBTCreate(n_all_cc, &sub_schurs->is_edge));
2086   PetscCall(PetscMalloc1(n_all_cc, &all_cc));
2087   n_all_cc = 0;
2088   for (i = 0; i < n_faces; i++) {
2089     PetscCall(ISGetSize(faces[i], &s));
2090     if (!s) continue;
2091     if (copycc) {
2092       PetscCall(ISDuplicate(faces[i], &all_cc[n_all_cc]));
2093     } else {
2094       PetscCall(PetscObjectReference((PetscObject)faces[i]));
2095       all_cc[n_all_cc] = faces[i];
2096     }
2097     n_all_cc++;
2098   }
2099   for (i = 0; i < n_edges; i++) {
2100     PetscCall(ISGetSize(edges[i], &s));
2101     if (!s) continue;
2102     if (copycc) {
2103       PetscCall(ISDuplicate(edges[i], &all_cc[n_all_cc]));
2104     } else {
2105       PetscCall(PetscObjectReference((PetscObject)edges[i]));
2106       all_cc[n_all_cc] = edges[i];
2107     }
2108     PetscCall(PetscBTSet(sub_schurs->is_edge, n_all_cc));
2109     n_all_cc++;
2110   }
2111   PetscCall(PetscObjectReference((PetscObject)vertices));
2112   sub_schurs->is_vertices = vertices;
2113   PetscCall(PCBDDCGraphRestoreCandidatesIS(graph, &n_faces, &faces, &n_edges, &edges, &vertices));
2114   sub_schurs->is_dir = NULL;
2115   PetscCall(PCBDDCGraphGetDirichletDofsB(graph, &sub_schurs->is_dir));
2116 
2117   /* Determine if MatFactor can be used */
2118   PetscCall(PetscStrallocpy(prefix, &sub_schurs->prefix));
2119 #if defined(PETSC_HAVE_MUMPS)
2120   PetscCall(PetscStrncpy(sub_schurs->mat_solver_type, MATSOLVERMUMPS, sizeof(sub_schurs->mat_solver_type)));
2121 #elif defined(PETSC_HAVE_MKL_PARDISO)
2122   PetscCall(PetscStrncpy(sub_schurs->mat_solver_type, MATSOLVERMKL_PARDISO, sizeof(sub_schurs->mat_solver_type)));
2123 #else
2124   PetscCall(PetscStrncpy(sub_schurs->mat_solver_type, MATSOLVERPETSC, sizeof(sub_schurs->mat_solver_type)));
2125 #endif
2126   sub_schurs->mat_factor_type = MAT_FACTOR_NONE;
2127   sub_schurs->is_hermitian    = PetscDefined(USE_COMPLEX) ? PETSC_FALSE : PETSC_TRUE; /* Hermitian Cholesky is not supported by PETSc and external packages */
2128   sub_schurs->is_posdef       = PETSC_TRUE;
2129   sub_schurs->is_symmetric    = PETSC_TRUE;
2130   sub_schurs->debug           = PETSC_FALSE;
2131   sub_schurs->restrict_comm   = PETSC_FALSE;
2132   PetscOptionsBegin(PetscObjectComm((PetscObject)graph->l2gmap), sub_schurs->prefix, "BDDC sub_schurs options", "PC");
2133   PetscCall(PetscOptionsString("-sub_schurs_mat_solver_type", "Specific direct solver to use", NULL, sub_schurs->mat_solver_type, sub_schurs->mat_solver_type, sizeof(sub_schurs->mat_solver_type), NULL));
2134   PetscCall(PetscOptionsEnum("-sub_schurs_mat_factor_type", "Factor type to use. Use MAT_FACTOR_NONE for automatic selection", NULL, MatFactorTypes, (PetscEnum)sub_schurs->mat_factor_type, (PetscEnum *)&sub_schurs->mat_factor_type, NULL));
2135   PetscCall(PetscOptionsBool("-sub_schurs_symmetric", "Symmetric problem", NULL, sub_schurs->is_symmetric, &sub_schurs->is_symmetric, NULL));
2136   PetscCall(PetscOptionsBool("-sub_schurs_hermitian", "Hermitian problem", NULL, sub_schurs->is_hermitian, &sub_schurs->is_hermitian, NULL));
2137   PetscCall(PetscOptionsBool("-sub_schurs_posdef", "Positive definite problem", NULL, sub_schurs->is_posdef, &sub_schurs->is_posdef, NULL));
2138   PetscCall(PetscOptionsBool("-sub_schurs_restrictcomm", "Restrict communicator on active processes only", NULL, sub_schurs->restrict_comm, &sub_schurs->restrict_comm, NULL));
2139   PetscCall(PetscOptionsBool("-sub_schurs_debug", "Debug output", NULL, sub_schurs->debug, &sub_schurs->debug, NULL));
2140   PetscOptionsEnd();
2141   PetscCall(PetscStrcmp(sub_schurs->mat_solver_type, MATSOLVERMUMPS, &ismumps));
2142   PetscCall(PetscStrcmp(sub_schurs->mat_solver_type, MATSOLVERMKL_PARDISO, &ispardiso));
2143   sub_schurs->schur_explicit = (PetscBool)(ispardiso || ismumps);
2144 
2145   /* for reals, symmetric and Hermitian are synonyms */
2146 #if !defined(PETSC_USE_COMPLEX)
2147   sub_schurs->is_symmetric = (PetscBool)(sub_schurs->is_symmetric && sub_schurs->is_hermitian);
2148   sub_schurs->is_hermitian = sub_schurs->is_symmetric;
2149 #endif
2150 
2151   PetscCall(PetscObjectReference((PetscObject)is_I));
2152   sub_schurs->is_I = is_I;
2153   PetscCall(PetscObjectReference((PetscObject)is_B));
2154   sub_schurs->is_B = is_B;
2155   PetscCall(PetscObjectReference((PetscObject)graph->l2gmap));
2156   sub_schurs->l2gmap = graph->l2gmap;
2157   PetscCall(PetscObjectReference((PetscObject)BtoNmap));
2158   sub_schurs->BtoNmap            = BtoNmap;
2159   sub_schurs->n_subs             = n_all_cc;
2160   sub_schurs->is_subs            = all_cc;
2161   sub_schurs->S_Ej_all           = NULL;
2162   sub_schurs->sum_S_Ej_all       = NULL;
2163   sub_schurs->sum_S_Ej_inv_all   = NULL;
2164   sub_schurs->sum_S_Ej_tilda_all = NULL;
2165   sub_schurs->is_Ej_all          = NULL;
2166   PetscFunctionReturn(PETSC_SUCCESS);
2167 }
2168 
2169 PetscErrorCode PCBDDCSubSchursCreate(PCBDDCSubSchurs *sub_schurs)
2170 {
2171   PCBDDCSubSchurs schurs_ctx;
2172 
2173   PetscFunctionBegin;
2174   PetscCall(PetscNew(&schurs_ctx));
2175   schurs_ctx->n_subs = 0;
2176   *sub_schurs        = schurs_ctx;
2177   PetscFunctionReturn(PETSC_SUCCESS);
2178 }
2179 
2180 PetscErrorCode PCBDDCSubSchursReset(PCBDDCSubSchurs sub_schurs)
2181 {
2182   PetscFunctionBegin;
2183   if (!sub_schurs) PetscFunctionReturn(PETSC_SUCCESS);
2184   sub_schurs->graph = NULL;
2185   PetscCall(PetscFree(sub_schurs->prefix));
2186   PetscCall(MatDestroy(&sub_schurs->A));
2187   PetscCall(MatDestroy(&sub_schurs->S));
2188   PetscCall(ISDestroy(&sub_schurs->is_I));
2189   PetscCall(ISDestroy(&sub_schurs->is_B));
2190   PetscCall(ISLocalToGlobalMappingDestroy(&sub_schurs->l2gmap));
2191   PetscCall(ISLocalToGlobalMappingDestroy(&sub_schurs->BtoNmap));
2192   PetscCall(MatDestroy(&sub_schurs->S_Ej_all));
2193   PetscCall(MatDestroy(&sub_schurs->sum_S_Ej_all));
2194   PetscCall(MatDestroy(&sub_schurs->sum_S_Ej_inv_all));
2195   PetscCall(MatDestroy(&sub_schurs->sum_S_Ej_tilda_all));
2196   PetscCall(ISDestroy(&sub_schurs->is_Ej_all));
2197   PetscCall(ISDestroy(&sub_schurs->is_vertices));
2198   PetscCall(ISDestroy(&sub_schurs->is_dir));
2199   PetscCall(PetscBTDestroy(&sub_schurs->is_edge));
2200   for (PetscInt i = 0; i < sub_schurs->n_subs; i++) PetscCall(ISDestroy(&sub_schurs->is_subs[i]));
2201   if (sub_schurs->n_subs) PetscCall(PetscFree(sub_schurs->is_subs));
2202   if (sub_schurs->reuse_solver) PetscCall(PCBDDCReuseSolversReset(sub_schurs->reuse_solver));
2203   PetscCall(PetscFree(sub_schurs->reuse_solver));
2204   if (sub_schurs->change) {
2205     for (PetscInt i = 0; i < sub_schurs->n_subs; i++) {
2206       PetscCall(KSPDestroy(&sub_schurs->change[i]));
2207       PetscCall(ISDestroy(&sub_schurs->change_primal_sub[i]));
2208     }
2209   }
2210   PetscCall(PetscFree(sub_schurs->change));
2211   PetscCall(PetscFree(sub_schurs->change_primal_sub));
2212   sub_schurs->n_subs = 0;
2213   PetscFunctionReturn(PETSC_SUCCESS);
2214 }
2215 
2216 PetscErrorCode PCBDDCSubSchursDestroy(PCBDDCSubSchurs *sub_schurs)
2217 {
2218   PetscFunctionBegin;
2219   PetscCall(PCBDDCSubSchursReset(*sub_schurs));
2220   PetscCall(PetscFree(*sub_schurs));
2221   PetscFunctionReturn(PETSC_SUCCESS);
2222 }
2223 
2224 static inline PetscErrorCode PCBDDCAdjGetNextLayer_Private(PetscInt *queue_tip, PetscInt n_prev, PetscBT touched, PetscInt *xadj, PetscInt *adjncy, PetscInt *n_added)
2225 {
2226   PetscInt i, j, n;
2227 
2228   PetscFunctionBegin;
2229   n = 0;
2230   for (i = -n_prev; i < 0; i++) {
2231     PetscInt start_dof = queue_tip[i];
2232     for (j = xadj[start_dof]; j < xadj[start_dof + 1]; j++) {
2233       PetscInt dof = adjncy[j];
2234       if (!PetscBTLookup(touched, dof)) {
2235         PetscCall(PetscBTSet(touched, dof));
2236         queue_tip[n] = dof;
2237         n++;
2238       }
2239     }
2240   }
2241   *n_added = n;
2242   PetscFunctionReturn(PETSC_SUCCESS);
2243 }
2244