xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 7e1a0bbe36d2be40a00a95404ece00db4857f70d)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/sbaij/seq/sbaij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscis.h>
5 
6 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F, PetscInt *nneg, PetscInt *nzero, PetscInt *npos)
7 {
8   Mat_SeqSBAIJ *fact = (Mat_SeqSBAIJ *)F->data;
9   MatScalar    *dd   = fact->a;
10   PetscInt      mbs = fact->mbs, bs = F->rmap->bs, i, nneg_tmp, npos_tmp, *fi = fact->diag;
11 
12   PetscFunctionBegin;
13   PetscCheck(bs == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for bs: %" PetscInt_FMT " >1 yet", bs);
14 
15   nneg_tmp = 0;
16   npos_tmp = 0;
17   if (fi) {
18     for (i = 0; i < mbs; i++) {
19       if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
20       else if (PetscRealPart(dd[*fi]) < 0.0) nneg_tmp++;
21       fi++;
22     }
23   } else {
24     for (i = 0; i < mbs; i++) {
25       if (PetscRealPart(dd[fact->i[i]]) > 0.0) npos_tmp++;
26       else if (PetscRealPart(dd[fact->i[i]]) < 0.0) nneg_tmp++;
27     }
28   }
29   if (nneg) *nneg = nneg_tmp;
30   if (npos) *npos = npos_tmp;
31   if (nzero) *nzero = mbs - nneg_tmp - npos_tmp;
32   PetscFunctionReturn(PETSC_SUCCESS);
33 }
34 
35 /*
36   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
37   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
38 */
39 static PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F, Mat A, IS perm, const MatFactorInfo *info)
40 {
41   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
42   const PetscInt *rip, *ai, *aj;
43   PetscInt        i, mbs = a->mbs, *jutmp, bs = A->rmap->bs, bs2 = a->bs2;
44   PetscInt        m, reallocs = 0, prow;
45   PetscInt       *jl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
46   PetscReal       f = info->fill;
47   PetscBool       perm_identity;
48 
49   PetscFunctionBegin;
50   /* check whether perm is the identity mapping */
51   PetscCall(ISIdentity(perm, &perm_identity));
52   PetscCall(ISGetIndices(perm, &rip));
53 
54   if (perm_identity) { /* without permutation */
55     a->permute = PETSC_FALSE;
56 
57     ai = a->i;
58     aj = a->j;
59   } else { /* non-trivial permutation */
60     a->permute = PETSC_TRUE;
61 
62     PetscCall(MatReorderingSeqSBAIJ(A, perm));
63 
64     ai = a->inew;
65     aj = a->jnew;
66   }
67 
68   /* initialization */
69   PetscCall(PetscShmgetAllocateArray(mbs + 1, sizeof(PetscInt), (void **)&iu));
70   umax = (PetscInt)(f * ai[mbs] + 1);
71   umax += mbs + 1;
72   PetscCall(PetscShmgetAllocateArray(umax, sizeof(PetscInt), (void **)&ju));
73   iu[0] = mbs + 1;
74   juidx = mbs + 1; /* index for ju */
75   /* jl linked list for pivot row -- linked list for col index */
76   PetscCall(PetscMalloc2(mbs, &jl, mbs, &q));
77   for (i = 0; i < mbs; i++) {
78     jl[i] = mbs;
79     q[i]  = 0;
80   }
81 
82   /* for each row k */
83   for (k = 0; k < mbs; k++) {
84     for (i = 0; i < mbs; i++) q[i] = 0; /* to be removed! */
85     nzk  = 0;                           /* num. of nz blocks in k-th block row with diagonal block excluded */
86     q[k] = mbs;
87     /* initialize nonzero structure of k-th row to row rip[k] of A */
88     jmin = ai[rip[k]] + 1; /* exclude diag[k] */
89     jmax = ai[rip[k] + 1];
90     for (j = jmin; j < jmax; j++) {
91       vj = rip[aj[j]]; /* col. value */
92       if (vj > k) {
93         qm = k;
94         do {
95           m  = qm;
96           qm = q[m];
97         } while (qm < vj);
98         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
99         nzk++;
100         q[m]  = vj;
101         q[vj] = qm;
102       } /* if (vj > k) */
103     } /* for (j=jmin; j<jmax; j++) */
104 
105     /* modify nonzero structure of k-th row by computing fill-in
106        for each row i to be merged in */
107     prow = k;
108     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
109 
110     while (prow < k) {
111       /* merge row prow into k-th row */
112       jmin = iu[prow] + 1;
113       jmax = iu[prow + 1];
114       qm   = k;
115       for (j = jmin; j < jmax; j++) {
116         vj = ju[j];
117         do {
118           m  = qm;
119           qm = q[m];
120         } while (qm < vj);
121         if (qm != vj) {
122           nzk++;
123           q[m]  = vj;
124           q[vj] = qm;
125           qm    = vj;
126         }
127       }
128       prow = jl[prow]; /* next pivot row */
129     }
130 
131     /* add k to row list for first nonzero element in k-th row */
132     if (nzk > 0) {
133       i     = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
134       jl[k] = jl[i];
135       jl[i] = k;
136     }
137     iu[k + 1] = iu[k] + nzk;
138 
139     /* allocate more space to ju if needed */
140     if (iu[k + 1] > umax) {
141       /* estimate how much additional space we will need */
142       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
143       /* just double the memory each time */
144       maxadd = umax;
145       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
146       umax += maxadd;
147 
148       /* allocate a longer ju */
149       PetscCall(PetscShmgetAllocateArray(umax, sizeof(PetscInt), (void **)&jutmp));
150       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
151       PetscCall(PetscShmgetDeallocateArray((void **)&ju));
152       ju = jutmp;
153       reallocs++; /* count how many times we realloc */
154     }
155 
156     /* save nonzero structure of k-th row in ju */
157     i = k;
158     while (nzk--) {
159       i           = q[i];
160       ju[juidx++] = i;
161     }
162   }
163 
164 #if defined(PETSC_USE_INFO)
165   if (ai[mbs] != 0) {
166     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
167     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
168     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
169     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
170     PetscCall(PetscInfo(A, "for best performance.\n"));
171   } else {
172     PetscCall(PetscInfo(A, "Empty matrix\n"));
173   }
174 #endif
175 
176   PetscCall(ISRestoreIndices(perm, &rip));
177   PetscCall(PetscFree2(jl, q));
178 
179   /* put together the new matrix */
180   PetscCall(MatSeqSBAIJSetPreallocation(F, bs, MAT_SKIP_ALLOCATION, NULL));
181 
182   b          = (Mat_SeqSBAIJ *)F->data;
183   b->free_ij = PETSC_TRUE;
184   PetscCall(PetscShmgetAllocateArray((iu[mbs] + 1) * bs2, sizeof(PetscScalar), (void **)&b->a));
185   b->free_a = PETSC_TRUE;
186   b->j      = ju;
187   b->i      = iu;
188   b->diag   = NULL;
189   b->ilen   = NULL;
190   b->imax   = NULL;
191   b->row    = perm;
192 
193   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
194 
195   PetscCall(PetscObjectReference((PetscObject)perm));
196 
197   b->icol = perm;
198   PetscCall(PetscObjectReference((PetscObject)perm));
199   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
200   /* In b structure:  Free imax, ilen, old a, old j.
201      Allocate idnew, solve_work, new a, new j */
202   b->maxnz = b->nz = iu[mbs];
203 
204   F->info.factor_mallocs   = reallocs;
205   F->info.fill_ratio_given = f;
206   if (ai[mbs] != 0) {
207     F->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
208   } else {
209     F->info.fill_ratio_needed = 0.0;
210   }
211   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(F, perm_identity));
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 /*
215     Symbolic U^T*D*U factorization for SBAIJ format.
216     See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
217 */
218 #include <petscbt.h>
219 #include <../src/mat/utils/freespace.h>
220 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
221 {
222   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
223   Mat_SeqSBAIJ      *b;
224   PetscBool          perm_identity, missing;
225   PetscReal          fill = info->fill;
226   const PetscInt    *rip, *ai = a->i, *aj = a->j;
227   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow;
228   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
229   PetscInt           nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr, *udiag;
230   PetscFreeSpaceList free_space = NULL, current_space = NULL;
231   PetscBT            lnkbt;
232 
233   PetscFunctionBegin;
234   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
235   PetscCall(MatMissingDiagonal(A, &missing, &i));
236   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
237   if (bs > 1) {
238     PetscCall(MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
239     PetscFunctionReturn(PETSC_SUCCESS);
240   }
241 
242   /* check whether perm is the identity mapping */
243   PetscCall(ISIdentity(perm, &perm_identity));
244   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
245   a->permute = PETSC_FALSE;
246   PetscCall(ISGetIndices(perm, &rip));
247 
248   /* initialization */
249   PetscCall(PetscShmgetAllocateArray(mbs + 1, sizeof(PetscInt), (void **)&ui));
250   PetscCall(PetscMalloc1(mbs + 1, &udiag));
251   ui[0] = 0;
252 
253   /* jl: linked list for storing indices of the pivot rows
254      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
255   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
256   for (i = 0; i < mbs; i++) {
257     jl[i] = mbs;
258     il[i] = 0;
259   }
260 
261   /* create and initialize a linked list for storing column indices of the active row k */
262   nlnk = mbs + 1;
263   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
264 
265   /* initial FreeSpace size is fill*(ai[mbs]+1) */
266   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space));
267   current_space = free_space;
268 
269   for (k = 0; k < mbs; k++) { /* for each active row k */
270     /* initialize lnk by the column indices of row rip[k] of A */
271     nzk   = 0;
272     ncols = ai[k + 1] - ai[k];
273     PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
274     for (j = 0; j < ncols; j++) {
275       i       = *(aj + ai[k] + j);
276       cols[j] = i;
277     }
278     PetscCall(PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt));
279     nzk += nlnk;
280 
281     /* update lnk by computing fill-in for each pivot row to be merged in */
282     prow = jl[k]; /* 1st pivot row */
283 
284     while (prow < k) {
285       nextprow = jl[prow];
286       /* merge prow into k-th row */
287       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
288       jmax   = ui[prow + 1];
289       ncols  = jmax - jmin;
290       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
291       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
292       nzk += nlnk;
293 
294       /* update il and jl for prow */
295       if (jmin < jmax) {
296         il[prow] = jmin;
297         j        = *uj_ptr;
298         jl[prow] = jl[j];
299         jl[j]    = prow;
300       }
301       prow = nextprow;
302     }
303 
304     /* if free space is not available, make more free space */
305     if (current_space->local_remaining < nzk) {
306       i = mbs - k + 1;                                   /* num of unfactored rows */
307       i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
308       PetscCall(PetscFreeSpaceGet(i, &current_space));
309       reallocs++;
310     }
311 
312     /* copy data into free space, then initialize lnk */
313     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
314 
315     /* add the k-th row into il and jl */
316     if (nzk > 1) {
317       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
318       jl[k] = jl[i];
319       jl[i] = k;
320       il[k] = ui[k] + 1;
321     }
322     ui_ptr[k] = current_space->array;
323 
324     current_space->array += nzk;
325     current_space->local_used += nzk;
326     current_space->local_remaining -= nzk;
327 
328     ui[k + 1] = ui[k] + nzk;
329   }
330 
331   PetscCall(ISRestoreIndices(perm, &rip));
332   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
333 
334   /* destroy list of free space and other temporary array(s) */
335   PetscCall(PetscShmgetAllocateArray(ui[mbs], sizeof(PetscInt), (void **)&uj));
336   PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, mbs, ui, udiag)); /* store matrix factor */
337   PetscCall(PetscLLDestroy(lnk, lnkbt));
338 
339   /* put together the new matrix in MATSEQSBAIJ format */
340   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
341 
342   b          = (Mat_SeqSBAIJ *)fact->data;
343   b->free_ij = PETSC_TRUE;
344   PetscCall(PetscShmgetAllocateArray(ui[mbs], sizeof(PetscScalar), (void **)&b->a));
345   b->free_a    = PETSC_TRUE;
346   b->j         = uj;
347   b->i         = ui;
348   b->diag      = udiag;
349   b->free_diag = PETSC_TRUE;
350   b->ilen      = NULL;
351   b->imax      = NULL;
352   b->row       = perm;
353   b->icol      = perm;
354 
355   PetscCall(PetscObjectReference((PetscObject)perm));
356   PetscCall(PetscObjectReference((PetscObject)perm));
357 
358   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
359 
360   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
361 
362   b->maxnz = b->nz = ui[mbs];
363 
364   fact->info.factor_mallocs   = reallocs;
365   fact->info.fill_ratio_given = fill;
366   if (ai[mbs] != 0) {
367     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
368   } else {
369     fact->info.fill_ratio_needed = 0.0;
370   }
371 #if defined(PETSC_USE_INFO)
372   if (ai[mbs] != 0) {
373     PetscReal af = fact->info.fill_ratio_needed;
374     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
375     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
376     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
377   } else {
378     PetscCall(PetscInfo(A, "Empty matrix\n"));
379   }
380 #endif
381   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
382   PetscFunctionReturn(PETSC_SUCCESS);
383 }
384 
385 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
386 {
387   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
388   Mat_SeqSBAIJ      *b;
389   PetscBool          perm_identity, missing;
390   PetscReal          fill = info->fill;
391   const PetscInt    *rip, *ai, *aj;
392   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow, d;
393   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
394   PetscInt           nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr;
395   PetscFreeSpaceList free_space = NULL, current_space = NULL;
396   PetscBT            lnkbt;
397 
398   PetscFunctionBegin;
399   PetscCall(MatMissingDiagonal(A, &missing, &d));
400   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
401 
402   /*
403    This code originally uses Modified Sparse Row (MSR) storage
404    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
405    Then it is rewritten so the factor B takes seqsbaij format. However the associated
406    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
407    thus the original code in MSR format is still used for these cases.
408    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
409    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
410   */
411   if (bs > 1) {
412     PetscCall(MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
413     PetscFunctionReturn(PETSC_SUCCESS);
414   }
415 
416   /* check whether perm is the identity mapping */
417   PetscCall(ISIdentity(perm, &perm_identity));
418   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
419   a->permute = PETSC_FALSE;
420   ai         = a->i;
421   aj         = a->j;
422   PetscCall(ISGetIndices(perm, &rip));
423 
424   /* initialization */
425   PetscCall(PetscShmgetAllocateArray(mbs + 1, sizeof(PetscInt), (void **)&ui));
426   ui[0] = 0;
427 
428   /* jl: linked list for storing indices of the pivot rows
429      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
430   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
431   for (i = 0; i < mbs; i++) {
432     jl[i] = mbs;
433     il[i] = 0;
434   }
435 
436   /* create and initialize a linked list for storing column indices of the active row k */
437   nlnk = mbs + 1;
438   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
439 
440   /* initial FreeSpace size is fill*(ai[mbs]+1) */
441   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space));
442   current_space = free_space;
443 
444   for (k = 0; k < mbs; k++) { /* for each active row k */
445     /* initialize lnk by the column indices of row rip[k] of A */
446     nzk   = 0;
447     ncols = ai[rip[k] + 1] - ai[rip[k]];
448     for (j = 0; j < ncols; j++) {
449       i       = *(aj + ai[rip[k]] + j);
450       cols[j] = rip[i];
451     }
452     PetscCall(PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt));
453     nzk += nlnk;
454 
455     /* update lnk by computing fill-in for each pivot row to be merged in */
456     prow = jl[k]; /* 1st pivot row */
457 
458     while (prow < k) {
459       nextprow = jl[prow];
460       /* merge prow into k-th row */
461       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
462       jmax   = ui[prow + 1];
463       ncols  = jmax - jmin;
464       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
465       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
466       nzk += nlnk;
467 
468       /* update il and jl for prow */
469       if (jmin < jmax) {
470         il[prow] = jmin;
471 
472         j        = *uj_ptr;
473         jl[prow] = jl[j];
474         jl[j]    = prow;
475       }
476       prow = nextprow;
477     }
478 
479     /* if free space is not available, make more free space */
480     if (current_space->local_remaining < nzk) {
481       i = mbs - k + 1;                                                            /* num of unfactored rows */
482       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
483       PetscCall(PetscFreeSpaceGet(i, &current_space));
484       reallocs++;
485     }
486 
487     /* copy data into free space, then initialize lnk */
488     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
489 
490     /* add the k-th row into il and jl */
491     if (nzk - 1 > 0) {
492       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
493       jl[k] = jl[i];
494       jl[i] = k;
495       il[k] = ui[k] + 1;
496     }
497     ui_ptr[k] = current_space->array;
498 
499     current_space->array += nzk;
500     current_space->local_used += nzk;
501     current_space->local_remaining -= nzk;
502 
503     ui[k + 1] = ui[k] + nzk;
504   }
505 
506   PetscCall(ISRestoreIndices(perm, &rip));
507   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
508 
509   /* destroy list of free space and other temporary array(s) */
510   PetscCall(PetscShmgetAllocateArray(ui[mbs] + 1, sizeof(PetscInt), (void **)&uj));
511   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
512   PetscCall(PetscLLDestroy(lnk, lnkbt));
513 
514   /* put together the new matrix in MATSEQSBAIJ format */
515   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
516 
517   b = (Mat_SeqSBAIJ *)fact->data;
518   PetscCall(PetscShmgetAllocateArray(ui[mbs] + 1, sizeof(PetscScalar), (void **)&b->a));
519   b->free_a  = PETSC_TRUE;
520   b->free_ij = PETSC_TRUE;
521   b->j       = uj;
522   b->i       = ui;
523   b->diag    = NULL;
524   b->ilen    = NULL;
525   b->imax    = NULL;
526   b->row     = perm;
527 
528   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
529 
530   PetscCall(PetscObjectReference((PetscObject)perm));
531   b->icol = perm;
532   PetscCall(PetscObjectReference((PetscObject)perm));
533   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
534   b->maxnz = b->nz = ui[mbs];
535 
536   fact->info.factor_mallocs   = reallocs;
537   fact->info.fill_ratio_given = fill;
538   if (ai[mbs] != 0) {
539     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
540   } else {
541     fact->info.fill_ratio_needed = 0.0;
542   }
543 #if defined(PETSC_USE_INFO)
544   if (ai[mbs] != 0) {
545     PetscReal af = fact->info.fill_ratio_needed;
546     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
547     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
548     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
549   } else {
550     PetscCall(PetscInfo(A, "Empty matrix\n"));
551   }
552 #endif
553   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(fact, perm_identity));
554   PetscFunctionReturn(PETSC_SUCCESS);
555 }
556 
557 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
558 {
559   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
560   IS              perm = b->row;
561   const PetscInt *ai, *aj, *perm_ptr, mbs = a->mbs, *bi = b->i, *bj = b->j;
562   PetscInt        i, j;
563   PetscInt       *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
564   PetscInt        bs = A->rmap->bs, bs2 = a->bs2;
565   MatScalar      *ba = b->a, *aa, *ap, *dk, *uik;
566   MatScalar      *u, *diag, *rtmp, *rtmp_ptr;
567   MatScalar      *work;
568   PetscInt       *pivots;
569   PetscBool       allowzeropivot, zeropivotdetected;
570 
571   PetscFunctionBegin;
572   /* initialization */
573   PetscCall(PetscCalloc1(bs2 * mbs, &rtmp));
574   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
575   allowzeropivot = PetscNot(A->erroriffailure);
576 
577   il[0] = 0;
578   for (i = 0; i < mbs; i++) jl[i] = mbs;
579 
580   PetscCall(PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work));
581   PetscCall(PetscMalloc1(bs, &pivots));
582 
583   PetscCall(ISGetIndices(perm, &perm_ptr));
584 
585   /* check permutation */
586   if (!a->permute) {
587     ai = a->i;
588     aj = a->j;
589     aa = a->a;
590   } else {
591     ai = a->inew;
592     aj = a->jnew;
593     PetscCall(PetscMalloc1(bs2 * ai[mbs], &aa));
594     PetscCall(PetscArraycpy(aa, a->a, bs2 * ai[mbs]));
595     PetscCall(PetscMalloc1(ai[mbs], &a2anew));
596     PetscCall(PetscArraycpy(a2anew, a->a2anew, ai[mbs]));
597 
598     for (i = 0; i < mbs; i++) {
599       jmin = ai[i];
600       jmax = ai[i + 1];
601       for (j = jmin; j < jmax; j++) {
602         while (a2anew[j] != j) {
603           k         = a2anew[j];
604           a2anew[j] = a2anew[k];
605           a2anew[k] = k;
606           for (k1 = 0; k1 < bs2; k1++) {
607             dk[k1]           = aa[k * bs2 + k1];
608             aa[k * bs2 + k1] = aa[j * bs2 + k1];
609             aa[j * bs2 + k1] = dk[k1];
610           }
611         }
612         /* transform column-oriented blocks that lie in the lower triangle to row-oriented blocks */
613         if (i > aj[j]) {
614           ap = aa + j * bs2;                       /* ptr to the beginning of j-th block of aa */
615           for (k = 0; k < bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
616           for (k = 0; k < bs; k++) {               /* j-th block of aa <- dk^T */
617             for (k1 = 0; k1 < bs; k1++) *ap++ = dk[k + bs * k1];
618           }
619         }
620       }
621     }
622     PetscCall(PetscFree(a2anew));
623   }
624 
625   /* for each row k */
626   for (k = 0; k < mbs; k++) {
627     /*initialize k-th row with elements nonzero in row perm(k) of A */
628     jmin = ai[perm_ptr[k]];
629     jmax = ai[perm_ptr[k] + 1];
630 
631     ap = aa + jmin * bs2;
632     for (j = jmin; j < jmax; j++) {
633       vj       = perm_ptr[aj[j]]; /* block col. index */
634       rtmp_ptr = rtmp + vj * bs2;
635       for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
636     }
637 
638     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
639     PetscCall(PetscArraycpy(dk, rtmp + k * bs2, bs2));
640     i = jl[k]; /* first row to be added to k_th row  */
641 
642     while (i < k) {
643       nexti = jl[i]; /* next row to be added to k_th row */
644 
645       /* compute multiplier */
646       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
647 
648       /* uik = -inv(Di)*U_bar(i,k) */
649       diag = ba + i * bs2;
650       u    = ba + ili * bs2;
651       PetscCall(PetscArrayzero(uik, bs2));
652       PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);
653 
654       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
655       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
656       PetscCall(PetscLogFlops(4.0 * bs * bs2));
657 
658       /* update -U(i,k) */
659       PetscCall(PetscArraycpy(ba + ili * bs2, uik, bs2));
660 
661       /* add multiple of row i to k-th row ... */
662       jmin = ili + 1;
663       jmax = bi[i + 1];
664       if (jmin < jmax) {
665         for (j = jmin; j < jmax; j++) {
666           /* rtmp += -U(i,k)^T * U_bar(i,j) */
667           rtmp_ptr = rtmp + bj[j] * bs2;
668           u        = ba + j * bs2;
669           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
670         }
671         PetscCall(PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin)));
672 
673         /* ... add i to row list for next nonzero entry */
674         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
675         j     = bj[jmin];
676         jl[i] = jl[j];
677         jl[j] = i; /* update jl */
678       }
679       i = nexti;
680     }
681 
682     /* save nonzero entries in k-th row of U ... */
683 
684     /* invert diagonal block */
685     diag = ba + k * bs2;
686     PetscCall(PetscArraycpy(diag, dk, bs2));
687 
688     PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected));
689     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
690 
691     jmin = bi[k];
692     jmax = bi[k + 1];
693     if (jmin < jmax) {
694       for (j = jmin; j < jmax; j++) {
695         vj       = bj[j]; /* block col. index of U */
696         u        = ba + j * bs2;
697         rtmp_ptr = rtmp + vj * bs2;
698         for (k1 = 0; k1 < bs2; k1++) {
699           *u++        = *rtmp_ptr;
700           *rtmp_ptr++ = 0.0;
701         }
702       }
703 
704       /* ... add k to row list for first nonzero entry in k-th row */
705       il[k] = jmin;
706       i     = bj[jmin];
707       jl[k] = jl[i];
708       jl[i] = k;
709     }
710   }
711 
712   PetscCall(PetscFree(rtmp));
713   PetscCall(PetscFree2(il, jl));
714   PetscCall(PetscFree3(dk, uik, work));
715   PetscCall(PetscFree(pivots));
716   if (a->permute) PetscCall(PetscFree(aa));
717 
718   PetscCall(ISRestoreIndices(perm, &perm_ptr));
719 
720   C->ops->solve          = MatSolve_SeqSBAIJ_N_inplace;
721   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
722   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_inplace;
723   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_inplace;
724 
725   C->assembled    = PETSC_TRUE;
726   C->preallocated = PETSC_TRUE;
727 
728   PetscCall(PetscLogFlops(1.3333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
729   PetscFunctionReturn(PETSC_SUCCESS);
730 }
731 
732 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
733 {
734   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
735   PetscInt      i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
736   PetscInt     *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
737   PetscInt      bs = A->rmap->bs, bs2 = a->bs2;
738   MatScalar    *ba = b->a, *aa, *ap, *dk, *uik;
739   MatScalar    *u, *diag, *rtmp, *rtmp_ptr;
740   MatScalar    *work;
741   PetscInt     *pivots;
742   PetscBool     allowzeropivot, zeropivotdetected;
743 
744   PetscFunctionBegin;
745   PetscCall(PetscCalloc1(bs2 * mbs, &rtmp));
746   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
747   il[0] = 0;
748   for (i = 0; i < mbs; i++) jl[i] = mbs;
749 
750   PetscCall(PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work));
751   PetscCall(PetscMalloc1(bs, &pivots));
752   allowzeropivot = PetscNot(A->erroriffailure);
753 
754   ai = a->i;
755   aj = a->j;
756   aa = a->a;
757 
758   /* for each row k */
759   for (k = 0; k < mbs; k++) {
760     /*initialize k-th row with elements nonzero in row k of A */
761     jmin = ai[k];
762     jmax = ai[k + 1];
763     ap   = aa + jmin * bs2;
764     for (j = jmin; j < jmax; j++) {
765       vj       = aj[j]; /* block col. index */
766       rtmp_ptr = rtmp + vj * bs2;
767       for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
768     }
769 
770     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
771     PetscCall(PetscArraycpy(dk, rtmp + k * bs2, bs2));
772     i = jl[k]; /* first row to be added to k_th row  */
773 
774     while (i < k) {
775       nexti = jl[i]; /* next row to be added to k_th row */
776 
777       /* compute multiplier */
778       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
779 
780       /* uik = -inv(Di)*U_bar(i,k) */
781       diag = ba + i * bs2;
782       u    = ba + ili * bs2;
783       PetscCall(PetscArrayzero(uik, bs2));
784       PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);
785 
786       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
787       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
788       PetscCall(PetscLogFlops(2.0 * bs * bs2));
789 
790       /* update -U(i,k) */
791       PetscCall(PetscArraycpy(ba + ili * bs2, uik, bs2));
792 
793       /* add multiple of row i to k-th row ... */
794       jmin = ili + 1;
795       jmax = bi[i + 1];
796       if (jmin < jmax) {
797         for (j = jmin; j < jmax; j++) {
798           /* rtmp += -U(i,k)^T * U_bar(i,j) */
799           rtmp_ptr = rtmp + bj[j] * bs2;
800           u        = ba + j * bs2;
801           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
802         }
803         PetscCall(PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin)));
804 
805         /* ... add i to row list for next nonzero entry */
806         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
807         j     = bj[jmin];
808         jl[i] = jl[j];
809         jl[j] = i; /* update jl */
810       }
811       i = nexti;
812     }
813 
814     /* save nonzero entries in k-th row of U ... */
815 
816     /* invert diagonal block */
817     diag = ba + k * bs2;
818     PetscCall(PetscArraycpy(diag, dk, bs2));
819 
820     PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected));
821     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
822 
823     jmin = bi[k];
824     jmax = bi[k + 1];
825     if (jmin < jmax) {
826       for (j = jmin; j < jmax; j++) {
827         vj       = bj[j]; /* block col. index of U */
828         u        = ba + j * bs2;
829         rtmp_ptr = rtmp + vj * bs2;
830         for (k1 = 0; k1 < bs2; k1++) {
831           *u++        = *rtmp_ptr;
832           *rtmp_ptr++ = 0.0;
833         }
834       }
835 
836       /* ... add k to row list for first nonzero entry in k-th row */
837       il[k] = jmin;
838       i     = bj[jmin];
839       jl[k] = jl[i];
840       jl[i] = k;
841     }
842   }
843 
844   PetscCall(PetscFree(rtmp));
845   PetscCall(PetscFree2(il, jl));
846   PetscCall(PetscFree3(dk, uik, work));
847   PetscCall(PetscFree(pivots));
848 
849   C->ops->solve          = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
850   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
851   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
852   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
853   C->assembled           = PETSC_TRUE;
854   C->preallocated        = PETSC_TRUE;
855 
856   PetscCall(PetscLogFlops(1.3333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
857   PetscFunctionReturn(PETSC_SUCCESS);
858 }
859 
860 /*
861     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
862     Version for blocks 2 by 2.
863 */
864 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C, Mat A, const MatFactorInfo *info)
865 {
866   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
867   IS              perm = b->row;
868   const PetscInt *ai, *aj, *perm_ptr;
869   PetscInt        i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
870   PetscInt       *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
871   MatScalar      *ba = b->a, *aa, *ap;
872   MatScalar      *u, *diag, *rtmp, *rtmp_ptr, dk[4], uik[4];
873   PetscReal       shift = info->shiftamount;
874   PetscBool       allowzeropivot, zeropivotdetected;
875 
876   PetscFunctionBegin;
877   allowzeropivot = PetscNot(A->erroriffailure);
878 
879   /* initialization */
880   /* il and jl record the first nonzero element in each row of the accessing
881      window U(0:k, k:mbs-1).
882      jl:    list of rows to be added to uneliminated rows
883             i>= k: jl(i) is the first row to be added to row i
884             i<  k: jl(i) is the row following row i in some list of rows
885             jl(i) = mbs indicates the end of a list
886      il(i): points to the first nonzero element in columns k,...,mbs-1 of
887             row i of U */
888   PetscCall(PetscCalloc1(4 * mbs, &rtmp));
889   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
890   il[0] = 0;
891   for (i = 0; i < mbs; i++) jl[i] = mbs;
892 
893   PetscCall(ISGetIndices(perm, &perm_ptr));
894 
895   /* check permutation */
896   if (!a->permute) {
897     ai = a->i;
898     aj = a->j;
899     aa = a->a;
900   } else {
901     ai = a->inew;
902     aj = a->jnew;
903     PetscCall(PetscMalloc1(4 * ai[mbs], &aa));
904     PetscCall(PetscArraycpy(aa, a->a, 4 * ai[mbs]));
905     PetscCall(PetscMalloc1(ai[mbs], &a2anew));
906     PetscCall(PetscArraycpy(a2anew, a->a2anew, ai[mbs]));
907 
908     for (i = 0; i < mbs; i++) {
909       jmin = ai[i];
910       jmax = ai[i + 1];
911       for (j = jmin; j < jmax; j++) {
912         while (a2anew[j] != j) {
913           k         = a2anew[j];
914           a2anew[j] = a2anew[k];
915           a2anew[k] = k;
916           for (k1 = 0; k1 < 4; k1++) {
917             dk[k1]         = aa[k * 4 + k1];
918             aa[k * 4 + k1] = aa[j * 4 + k1];
919             aa[j * 4 + k1] = dk[k1];
920           }
921         }
922         /* transform column-oriented blocks that lie in the lower triangle to row-oriented blocks */
923         if (i > aj[j]) {
924           ap    = aa + j * 4; /* ptr to the beginning of the block */
925           dk[1] = ap[1];      /* swap ap[1] and ap[2] */
926           ap[1] = ap[2];
927           ap[2] = dk[1];
928         }
929       }
930     }
931     PetscCall(PetscFree(a2anew));
932   }
933 
934   /* for each row k */
935   for (k = 0; k < mbs; k++) {
936     /*initialize k-th row with elements nonzero in row perm(k) of A */
937     jmin = ai[perm_ptr[k]];
938     jmax = ai[perm_ptr[k] + 1];
939     ap   = aa + jmin * 4;
940     for (j = jmin; j < jmax; j++) {
941       vj       = perm_ptr[aj[j]]; /* block col. index */
942       rtmp_ptr = rtmp + vj * 4;
943       for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
944     }
945 
946     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
947     PetscCall(PetscArraycpy(dk, rtmp + k * 4, 4));
948     i = jl[k]; /* first row to be added to k_th row  */
949 
950     while (i < k) {
951       nexti = jl[i]; /* next row to be added to k_th row */
952 
953       /* compute multiplier */
954       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
955 
956       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
957       diag   = ba + i * 4;
958       u      = ba + ili * 4;
959       uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
960       uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
961       uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
962       uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);
963 
964       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
965       dk[0] += uik[0] * u[0] + uik[1] * u[1];
966       dk[1] += uik[2] * u[0] + uik[3] * u[1];
967       dk[2] += uik[0] * u[2] + uik[1] * u[3];
968       dk[3] += uik[2] * u[2] + uik[3] * u[3];
969 
970       PetscCall(PetscLogFlops(16.0 * 2.0));
971 
972       /* update -U(i,k): ba[ili] = uik */
973       PetscCall(PetscArraycpy(ba + ili * 4, uik, 4));
974 
975       /* add multiple of row i to k-th row ... */
976       jmin = ili + 1;
977       jmax = bi[i + 1];
978       if (jmin < jmax) {
979         for (j = jmin; j < jmax; j++) {
980           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
981           rtmp_ptr = rtmp + bj[j] * 4;
982           u        = ba + j * 4;
983           rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
984           rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
985           rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
986           rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
987         }
988         PetscCall(PetscLogFlops(16.0 * (jmax - jmin)));
989 
990         /* ... add i to row list for next nonzero entry */
991         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
992         j     = bj[jmin];
993         jl[i] = jl[j];
994         jl[j] = i; /* update jl */
995       }
996       i = nexti;
997     }
998 
999     /* save nonzero entries in k-th row of U ... */
1000 
1001     /* invert diagonal block */
1002     diag = ba + k * 4;
1003     PetscCall(PetscArraycpy(diag, dk, 4));
1004     PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
1005     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1006 
1007     jmin = bi[k];
1008     jmax = bi[k + 1];
1009     if (jmin < jmax) {
1010       for (j = jmin; j < jmax; j++) {
1011         vj       = bj[j]; /* block col. index of U */
1012         u        = ba + j * 4;
1013         rtmp_ptr = rtmp + vj * 4;
1014         for (k1 = 0; k1 < 4; k1++) {
1015           *u++        = *rtmp_ptr;
1016           *rtmp_ptr++ = 0.0;
1017         }
1018       }
1019 
1020       /* ... add k to row list for first nonzero entry in k-th row */
1021       il[k] = jmin;
1022       i     = bj[jmin];
1023       jl[k] = jl[i];
1024       jl[i] = k;
1025     }
1026   }
1027 
1028   PetscCall(PetscFree(rtmp));
1029   PetscCall(PetscFree2(il, jl));
1030   if (a->permute) PetscCall(PetscFree(aa));
1031   PetscCall(ISRestoreIndices(perm, &perm_ptr));
1032 
1033   C->ops->solve          = MatSolve_SeqSBAIJ_2_inplace;
1034   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1035   C->assembled           = PETSC_TRUE;
1036   C->preallocated        = PETSC_TRUE;
1037 
1038   PetscCall(PetscLogFlops(1.3333 * 8 * b->mbs)); /* from inverting diagonal blocks */
1039   PetscFunctionReturn(PETSC_SUCCESS);
1040 }
1041 
1042 /*
1043       Version for when blocks are 2 by 2 Using natural ordering
1044 */
1045 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
1046 {
1047   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1048   PetscInt      i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
1049   PetscInt     *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
1050   MatScalar    *ba = b->a, *aa, *ap, dk[8], uik[8];
1051   MatScalar    *u, *diag, *rtmp, *rtmp_ptr;
1052   PetscReal     shift = info->shiftamount;
1053   PetscBool     allowzeropivot, zeropivotdetected;
1054 
1055   PetscFunctionBegin;
1056   allowzeropivot = PetscNot(A->erroriffailure);
1057 
1058   /* initialization */
1059   /* il and jl record the first nonzero element in each row of the accessing
1060      window U(0:k, k:mbs-1).
1061      jl:    list of rows to be added to uneliminated rows
1062             i>= k: jl(i) is the first row to be added to row i
1063             i<  k: jl(i) is the row following row i in some list of rows
1064             jl(i) = mbs indicates the end of a list
1065      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1066             row i of U */
1067   PetscCall(PetscCalloc1(4 * mbs, &rtmp));
1068   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
1069   il[0] = 0;
1070   for (i = 0; i < mbs; i++) jl[i] = mbs;
1071 
1072   ai = a->i;
1073   aj = a->j;
1074   aa = a->a;
1075 
1076   /* for each row k */
1077   for (k = 0; k < mbs; k++) {
1078     /*initialize k-th row with elements nonzero in row k of A */
1079     jmin = ai[k];
1080     jmax = ai[k + 1];
1081     ap   = aa + jmin * 4;
1082     for (j = jmin; j < jmax; j++) {
1083       vj       = aj[j]; /* block col. index */
1084       rtmp_ptr = rtmp + vj * 4;
1085       for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
1086     }
1087 
1088     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1089     PetscCall(PetscArraycpy(dk, rtmp + k * 4, 4));
1090     i = jl[k]; /* first row to be added to k_th row  */
1091 
1092     while (i < k) {
1093       nexti = jl[i]; /* next row to be added to k_th row */
1094 
1095       /* compute multiplier */
1096       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1097 
1098       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1099       diag   = ba + i * 4;
1100       u      = ba + ili * 4;
1101       uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
1102       uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
1103       uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
1104       uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);
1105 
1106       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1107       dk[0] += uik[0] * u[0] + uik[1] * u[1];
1108       dk[1] += uik[2] * u[0] + uik[3] * u[1];
1109       dk[2] += uik[0] * u[2] + uik[1] * u[3];
1110       dk[3] += uik[2] * u[2] + uik[3] * u[3];
1111 
1112       PetscCall(PetscLogFlops(16.0 * 2.0));
1113 
1114       /* update -U(i,k): ba[ili] = uik */
1115       PetscCall(PetscArraycpy(ba + ili * 4, uik, 4));
1116 
1117       /* add multiple of row i to k-th row ... */
1118       jmin = ili + 1;
1119       jmax = bi[i + 1];
1120       if (jmin < jmax) {
1121         for (j = jmin; j < jmax; j++) {
1122           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1123           rtmp_ptr = rtmp + bj[j] * 4;
1124           u        = ba + j * 4;
1125           rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
1126           rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
1127           rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
1128           rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
1129         }
1130         PetscCall(PetscLogFlops(16.0 * (jmax - jmin)));
1131 
1132         /* ... add i to row list for next nonzero entry */
1133         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1134         j     = bj[jmin];
1135         jl[i] = jl[j];
1136         jl[j] = i; /* update jl */
1137       }
1138       i = nexti;
1139     }
1140 
1141     /* save nonzero entries in k-th row of U ... */
1142 
1143     /* invert diagonal block */
1144     diag = ba + k * 4;
1145     PetscCall(PetscArraycpy(diag, dk, 4));
1146     PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
1147     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1148 
1149     jmin = bi[k];
1150     jmax = bi[k + 1];
1151     if (jmin < jmax) {
1152       for (j = jmin; j < jmax; j++) {
1153         vj       = bj[j]; /* block col. index of U */
1154         u        = ba + j * 4;
1155         rtmp_ptr = rtmp + vj * 4;
1156         for (k1 = 0; k1 < 4; k1++) {
1157           *u++        = *rtmp_ptr;
1158           *rtmp_ptr++ = 0.0;
1159         }
1160       }
1161 
1162       /* ... add k to row list for first nonzero entry in k-th row */
1163       il[k] = jmin;
1164       i     = bj[jmin];
1165       jl[k] = jl[i];
1166       jl[i] = k;
1167     }
1168   }
1169 
1170   PetscCall(PetscFree(rtmp));
1171   PetscCall(PetscFree2(il, jl));
1172 
1173   C->ops->solve          = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1174   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1175   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1176   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1177   C->assembled           = PETSC_TRUE;
1178   C->preallocated        = PETSC_TRUE;
1179 
1180   PetscCall(PetscLogFlops(1.3333 * 8 * b->mbs)); /* from inverting diagonal blocks */
1181   PetscFunctionReturn(PETSC_SUCCESS);
1182 }
1183 
1184 /*
1185     Numeric U^T*D*U factorization for SBAIJ format.
1186     Version for blocks are 1 by 1.
1187 */
1188 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
1189 {
1190   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1191   IS              ip = b->row;
1192   const PetscInt *ai, *aj, *rip;
1193   PetscInt       *a2anew, i, j, mbs = a->mbs, *bi = b->i, *bj = b->j, *bcol;
1194   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
1195   MatScalar      *rtmp, *ba = b->a, *bval, *aa, dk, uikdi;
1196   PetscReal       rs;
1197   FactorShiftCtx  sctx;
1198 
1199   PetscFunctionBegin;
1200   /* MatPivotSetUp(): initialize shift context sctx */
1201   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1202 
1203   PetscCall(ISGetIndices(ip, &rip));
1204   if (!a->permute) {
1205     ai = a->i;
1206     aj = a->j;
1207     aa = a->a;
1208   } else {
1209     ai = a->inew;
1210     aj = a->jnew;
1211     nz = ai[mbs];
1212     PetscCall(PetscMalloc1(nz, &aa));
1213     a2anew = a->a2anew;
1214     bval   = a->a;
1215     for (j = 0; j < nz; j++) aa[a2anew[j]] = *(bval++);
1216   }
1217 
1218   /* initialization */
1219   /* il and jl record the first nonzero element in each row of the accessing
1220      window U(0:k, k:mbs-1).
1221      jl:    list of rows to be added to uneliminated rows
1222             i>= k: jl(i) is the first row to be added to row i
1223             i<  k: jl(i) is the row following row i in some list of rows
1224             jl(i) = mbs indicates the end of a list
1225      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1226             row i of U */
1227   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
1228 
1229   do {
1230     sctx.newshift = PETSC_FALSE;
1231     il[0]         = 0;
1232     for (i = 0; i < mbs; i++) {
1233       rtmp[i] = 0.0;
1234       jl[i]   = mbs;
1235     }
1236 
1237     for (k = 0; k < mbs; k++) {
1238       /*initialize k-th row by the perm[k]-th row of A */
1239       jmin = ai[rip[k]];
1240       jmax = ai[rip[k] + 1];
1241       bval = ba + bi[k];
1242       for (j = jmin; j < jmax; j++) {
1243         col       = rip[aj[j]];
1244         rtmp[col] = aa[j];
1245         *bval++   = 0.0; /* for in-place factorization */
1246       }
1247 
1248       /* shift the diagonal of the matrix */
1249       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1250 
1251       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1252       dk = rtmp[k];
1253       i  = jl[k]; /* first row to be added to k_th row  */
1254 
1255       while (i < k) {
1256         nexti = jl[i]; /* next row to be added to k_th row */
1257 
1258         /* compute multiplier, update diag(k) and U(i,k) */
1259         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
1260         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
1261         dk += uikdi * ba[ili];
1262         ba[ili] = uikdi; /* -U(i,k) */
1263 
1264         /* add multiple of row i to k-th row */
1265         jmin = ili + 1;
1266         jmax = bi[i + 1];
1267         if (jmin < jmax) {
1268           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1269           PetscCall(PetscLogFlops(2.0 * (jmax - jmin)));
1270 
1271           /* update il and jl for row i */
1272           il[i] = jmin;
1273           j     = bj[jmin];
1274           jl[i] = jl[j];
1275           jl[j] = i;
1276         }
1277         i = nexti;
1278       }
1279 
1280       /* shift the diagonals when zero pivot is detected */
1281       /* compute rs=sum of abs(off-diagonal) */
1282       rs   = 0.0;
1283       jmin = bi[k] + 1;
1284       nz   = bi[k + 1] - jmin;
1285       if (nz) {
1286         bcol = bj + jmin;
1287         while (nz--) {
1288           rs += PetscAbsScalar(rtmp[*bcol]);
1289           bcol++;
1290         }
1291       }
1292 
1293       sctx.rs = rs;
1294       sctx.pv = dk;
1295       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
1296       if (sctx.newshift) break; /* sctx.shift_amount is updated */
1297       dk = sctx.pv;
1298 
1299       /* copy data into U(k,:) */
1300       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
1301       jmin      = bi[k] + 1;
1302       jmax      = bi[k + 1];
1303       if (jmin < jmax) {
1304         for (j = jmin; j < jmax; j++) {
1305           col       = bj[j];
1306           ba[j]     = rtmp[col];
1307           rtmp[col] = 0.0;
1308         }
1309         /* add the k-th row into il and jl */
1310         il[k] = jmin;
1311         i     = bj[jmin];
1312         jl[k] = jl[i];
1313         jl[i] = k;
1314       }
1315     }
1316   } while (sctx.newshift);
1317   PetscCall(PetscFree3(rtmp, il, jl));
1318   if (a->permute) PetscCall(PetscFree(aa));
1319 
1320   PetscCall(ISRestoreIndices(ip, &rip));
1321 
1322   C->ops->solve          = MatSolve_SeqSBAIJ_1_inplace;
1323   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1324   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1325   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_inplace;
1326   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_inplace;
1327   C->assembled           = PETSC_TRUE;
1328   C->preallocated        = PETSC_TRUE;
1329 
1330   PetscCall(PetscLogFlops(C->rmap->N));
1331   if (sctx.nshift) {
1332     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1333       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1334     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1335       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1336     }
1337   }
1338   PetscFunctionReturn(PETSC_SUCCESS);
1339 }
1340 
1341 /*
1342   Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1343   Modified from MatCholeskyFactorNumeric_SeqAIJ()
1344 */
1345 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
1346 {
1347   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ *)A->data;
1348   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)B->data;
1349   PetscInt       i, j, mbs = A->rmap->n, *bi = b->i, *bj = b->j, *bdiag = b->diag, *bjtmp;
1350   PetscInt      *ai = a->i, *aj = a->j, *ajtmp;
1351   PetscInt       k, jmin, jmax, *c2r, *il, col, nexti, ili, nz;
1352   MatScalar     *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
1353   FactorShiftCtx sctx;
1354   PetscReal      rs;
1355   MatScalar      d, *v;
1356 
1357   PetscFunctionBegin;
1358   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &c2r));
1359 
1360   /* MatPivotSetUp(): initialize shift context sctx */
1361   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1362 
1363   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1364     sctx.shift_top = info->zeropivot;
1365 
1366     PetscCall(PetscArrayzero(rtmp, mbs));
1367 
1368     for (i = 0; i < mbs; i++) {
1369       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1370       d = (aa)[a->diag[i]];
1371       rtmp[i] += -PetscRealPart(d); /* diagonal entry */
1372       ajtmp = aj + ai[i] + 1;       /* exclude diagonal */
1373       v     = aa + ai[i] + 1;
1374       nz    = ai[i + 1] - ai[i] - 1;
1375       for (j = 0; j < nz; j++) {
1376         rtmp[i] += PetscAbsScalar(v[j]);
1377         rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1378       }
1379       if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1380     }
1381     sctx.shift_top *= 1.1;
1382     sctx.nshift_max = 5;
1383     sctx.shift_lo   = 0.;
1384     sctx.shift_hi   = 1.;
1385   }
1386 
1387   /* allocate working arrays
1388      c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1389      il:  for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1390   */
1391   do {
1392     sctx.newshift = PETSC_FALSE;
1393 
1394     for (i = 0; i < mbs; i++) c2r[i] = mbs;
1395     if (mbs) il[0] = 0;
1396 
1397     for (k = 0; k < mbs; k++) {
1398       /* zero rtmp */
1399       nz    = bi[k + 1] - bi[k];
1400       bjtmp = bj + bi[k];
1401       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
1402 
1403       /* load in initial unfactored row */
1404       bval = ba + bi[k];
1405       jmin = ai[k];
1406       jmax = ai[k + 1];
1407       for (j = jmin; j < jmax; j++) {
1408         col       = aj[j];
1409         rtmp[col] = aa[j];
1410         *bval++   = 0.0; /* for in-place factorization */
1411       }
1412       /* shift the diagonal of the matrix: ZeropivotApply() */
1413       rtmp[k] += sctx.shift_amount; /* shift the diagonal of the matrix */
1414 
1415       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1416       dk = rtmp[k];
1417       i  = c2r[k]; /* first row to be added to k_th row  */
1418 
1419       while (i < k) {
1420         nexti = c2r[i]; /* next row to be added to k_th row */
1421 
1422         /* compute multiplier, update diag(k) and U(i,k) */
1423         ili   = il[i];                   /* index of first nonzero element in U(i,k:bms-1) */
1424         uikdi = -ba[ili] * ba[bdiag[i]]; /* diagonal(k) */
1425         dk += uikdi * ba[ili];           /* update diag[k] */
1426         ba[ili] = uikdi;                 /* -U(i,k) */
1427 
1428         /* add multiple of row i to k-th row */
1429         jmin = ili + 1;
1430         jmax = bi[i + 1];
1431         if (jmin < jmax) {
1432           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1433           /* update il and c2r for row i */
1434           il[i]  = jmin;
1435           j      = bj[jmin];
1436           c2r[i] = c2r[j];
1437           c2r[j] = i;
1438         }
1439         i = nexti;
1440       }
1441 
1442       /* copy data into U(k,:) */
1443       rs   = 0.0;
1444       jmin = bi[k];
1445       jmax = bi[k + 1] - 1;
1446       if (jmin < jmax) {
1447         for (j = jmin; j < jmax; j++) {
1448           col   = bj[j];
1449           ba[j] = rtmp[col];
1450           rs += PetscAbsScalar(ba[j]);
1451         }
1452         /* add the k-th row into il and c2r */
1453         il[k]  = jmin;
1454         i      = bj[jmin];
1455         c2r[k] = c2r[i];
1456         c2r[i] = k;
1457       }
1458 
1459       sctx.rs = rs;
1460       sctx.pv = dk;
1461       PetscCall(MatPivotCheck(B, A, info, &sctx, k));
1462       if (sctx.newshift) break;
1463       dk = sctx.pv;
1464 
1465       ba[bdiag[k]] = 1.0 / dk; /* U(k,k) */
1466     }
1467   } while (sctx.newshift);
1468 
1469   PetscCall(PetscFree3(rtmp, il, c2r));
1470 
1471   B->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1472   B->ops->solves         = MatSolves_SeqSBAIJ_1;
1473   B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1474   B->ops->matsolve       = MatMatSolve_SeqSBAIJ_1_NaturalOrdering;
1475   B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1476   B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1477 
1478   B->assembled    = PETSC_TRUE;
1479   B->preallocated = PETSC_TRUE;
1480 
1481   PetscCall(PetscLogFlops(B->rmap->n));
1482 
1483   /* MatPivotView() */
1484   if (sctx.nshift) {
1485     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1486       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1487     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1488       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1489     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1490       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1491     }
1492   }
1493   PetscFunctionReturn(PETSC_SUCCESS);
1494 }
1495 
1496 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
1497 {
1498   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1499   PetscInt       i, j, mbs = a->mbs;
1500   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
1501   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
1502   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
1503   PetscReal      rs;
1504   FactorShiftCtx sctx;
1505 
1506   PetscFunctionBegin;
1507   /* MatPivotSetUp(): initialize shift context sctx */
1508   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1509 
1510   /* initialization */
1511   /* il and jl record the first nonzero element in each row of the accessing
1512      window U(0:k, k:mbs-1).
1513      jl:    list of rows to be added to uneliminated rows
1514             i>= k: jl(i) is the first row to be added to row i
1515             i<  k: jl(i) is the row following row i in some list of rows
1516             jl(i) = mbs indicates the end of a list
1517      il(i): points to the first nonzero element in U(i,k:mbs-1)
1518   */
1519   PetscCall(PetscMalloc1(mbs, &rtmp));
1520   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
1521 
1522   do {
1523     sctx.newshift = PETSC_FALSE;
1524     il[0]         = 0;
1525     for (i = 0; i < mbs; i++) {
1526       rtmp[i] = 0.0;
1527       jl[i]   = mbs;
1528     }
1529 
1530     for (k = 0; k < mbs; k++) {
1531       /*initialize k-th row with elements nonzero in row perm(k) of A */
1532       nz   = ai[k + 1] - ai[k];
1533       acol = aj + ai[k];
1534       aval = aa + ai[k];
1535       bval = ba + bi[k];
1536       while (nz--) {
1537         rtmp[*acol++] = *aval++;
1538         *bval++       = 0.0; /* for in-place factorization */
1539       }
1540 
1541       /* shift the diagonal of the matrix */
1542       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1543 
1544       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1545       dk = rtmp[k];
1546       i  = jl[k]; /* first row to be added to k_th row  */
1547 
1548       while (i < k) {
1549         nexti = jl[i]; /* next row to be added to k_th row */
1550         /* compute multiplier, update D(k) and U(i,k) */
1551         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1552         uikdi = -ba[ili] * ba[bi[i]];
1553         dk += uikdi * ba[ili];
1554         ba[ili] = uikdi; /* -U(i,k) */
1555 
1556         /* add multiple of row i to k-th row ... */
1557         jmin = ili + 1;
1558         nz   = bi[i + 1] - jmin;
1559         if (nz > 0) {
1560           bcol = bj + jmin;
1561           bval = ba + jmin;
1562           PetscCall(PetscLogFlops(2.0 * nz));
1563           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
1564 
1565           /* update il and jl for i-th row */
1566           il[i] = jmin;
1567           j     = bj[jmin];
1568           jl[i] = jl[j];
1569           jl[j] = i;
1570         }
1571         i = nexti;
1572       }
1573 
1574       /* shift the diagonals when zero pivot is detected */
1575       /* compute rs=sum of abs(off-diagonal) */
1576       rs   = 0.0;
1577       jmin = bi[k] + 1;
1578       nz   = bi[k + 1] - jmin;
1579       if (nz) {
1580         bcol = bj + jmin;
1581         while (nz--) {
1582           rs += PetscAbsScalar(rtmp[*bcol]);
1583           bcol++;
1584         }
1585       }
1586 
1587       sctx.rs = rs;
1588       sctx.pv = dk;
1589       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
1590       if (sctx.newshift) break; /* sctx.shift_amount is updated */
1591       dk = sctx.pv;
1592 
1593       /* copy data into U(k,:) */
1594       ba[bi[k]] = 1.0 / dk;
1595       jmin      = bi[k] + 1;
1596       nz        = bi[k + 1] - jmin;
1597       if (nz) {
1598         bcol = bj + jmin;
1599         bval = ba + jmin;
1600         while (nz--) {
1601           *bval++       = rtmp[*bcol];
1602           rtmp[*bcol++] = 0.0;
1603         }
1604         /* add k-th row into il and jl */
1605         il[k] = jmin;
1606         i     = bj[jmin];
1607         jl[k] = jl[i];
1608         jl[i] = k;
1609       }
1610     } /* end of for (k = 0; k<mbs; k++) */
1611   } while (sctx.newshift);
1612   PetscCall(PetscFree(rtmp));
1613   PetscCall(PetscFree2(il, jl));
1614 
1615   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1616   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1617   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1618   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1619   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1620 
1621   C->assembled    = PETSC_TRUE;
1622   C->preallocated = PETSC_TRUE;
1623 
1624   PetscCall(PetscLogFlops(C->rmap->N));
1625   if (sctx.nshift) {
1626     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1627       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1628     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1629       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1630     }
1631   }
1632   PetscFunctionReturn(PETSC_SUCCESS);
1633 }
1634 
1635 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A, IS perm, const MatFactorInfo *info)
1636 {
1637   Mat C;
1638 
1639   PetscFunctionBegin;
1640   PetscCall(MatGetFactor(A, "petsc", MAT_FACTOR_CHOLESKY, &C));
1641   PetscCall(MatCholeskyFactorSymbolic(C, A, perm, info));
1642   PetscCall(MatCholeskyFactorNumeric(C, A, info));
1643 
1644   A->ops->solve          = C->ops->solve;
1645   A->ops->solvetranspose = C->ops->solvetranspose;
1646 
1647   PetscCall(MatHeaderMerge(A, &C));
1648   PetscFunctionReturn(PETSC_SUCCESS);
1649 }
1650