xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision f13dfd9ea68e0ddeee984e65c377a1819eab8a8a)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/sbaij/seq/sbaij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscis.h>
5 
6 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F, PetscInt *nneg, PetscInt *nzero, PetscInt *npos)
7 {
8   Mat_SeqSBAIJ *fact = (Mat_SeqSBAIJ *)F->data;
9   MatScalar    *dd   = fact->a;
10   PetscInt      mbs = fact->mbs, bs = F->rmap->bs, i, nneg_tmp, npos_tmp, *fi = fact->diag;
11 
12   PetscFunctionBegin;
13   PetscCheck(bs == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for bs: %" PetscInt_FMT " >1 yet", bs);
14 
15   nneg_tmp = 0;
16   npos_tmp = 0;
17   if (fi) {
18     for (i = 0; i < mbs; i++) {
19       if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
20       else if (PetscRealPart(dd[*fi]) < 0.0) nneg_tmp++;
21       fi++;
22     }
23   } else {
24     for (i = 0; i < mbs; i++) {
25       if (PetscRealPart(dd[fact->i[i]]) > 0.0) npos_tmp++;
26       else if (PetscRealPart(dd[fact->i[i]]) < 0.0) nneg_tmp++;
27     }
28   }
29   if (nneg) *nneg = nneg_tmp;
30   if (npos) *npos = npos_tmp;
31   if (nzero) *nzero = mbs - nneg_tmp - npos_tmp;
32   PetscFunctionReturn(PETSC_SUCCESS);
33 }
34 
35 /*
36   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
37   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
38 */
39 static PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F, Mat A, IS perm, const MatFactorInfo *info)
40 {
41   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
42   const PetscInt *rip, *ai, *aj;
43   PetscInt        i, mbs = a->mbs, *jutmp, bs = A->rmap->bs, bs2 = a->bs2;
44   PetscInt        m, reallocs = 0, prow;
45   PetscInt       *jl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
46   PetscReal       f = info->fill;
47   PetscBool       perm_identity;
48 
49   PetscFunctionBegin;
50   /* check whether perm is the identity mapping */
51   PetscCall(ISIdentity(perm, &perm_identity));
52   PetscCall(ISGetIndices(perm, &rip));
53 
54   if (perm_identity) { /* without permutation */
55     a->permute = PETSC_FALSE;
56 
57     ai = a->i;
58     aj = a->j;
59   } else { /* non-trivial permutation */
60     a->permute = PETSC_TRUE;
61 
62     PetscCall(MatReorderingSeqSBAIJ(A, perm));
63 
64     ai = a->inew;
65     aj = a->jnew;
66   }
67 
68   /* initialization */
69   PetscCall(PetscMalloc1(mbs + 1, &iu));
70   umax = (PetscInt)(f * ai[mbs] + 1);
71   umax += mbs + 1;
72   PetscCall(PetscMalloc1(umax, &ju));
73   iu[0] = mbs + 1;
74   juidx = mbs + 1; /* index for ju */
75   /* jl linked list for pivot row -- linked list for col index */
76   PetscCall(PetscMalloc2(mbs, &jl, mbs, &q));
77   for (i = 0; i < mbs; i++) {
78     jl[i] = mbs;
79     q[i]  = 0;
80   }
81 
82   /* for each row k */
83   for (k = 0; k < mbs; k++) {
84     for (i = 0; i < mbs; i++) q[i] = 0; /* to be removed! */
85     nzk  = 0;                           /* num. of nz blocks in k-th block row with diagonal block excluded */
86     q[k] = mbs;
87     /* initialize nonzero structure of k-th row to row rip[k] of A */
88     jmin = ai[rip[k]] + 1; /* exclude diag[k] */
89     jmax = ai[rip[k] + 1];
90     for (j = jmin; j < jmax; j++) {
91       vj = rip[aj[j]]; /* col. value */
92       if (vj > k) {
93         qm = k;
94         do {
95           m  = qm;
96           qm = q[m];
97         } while (qm < vj);
98         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
99         nzk++;
100         q[m]  = vj;
101         q[vj] = qm;
102       } /* if (vj > k) */
103     } /* for (j=jmin; j<jmax; j++) */
104 
105     /* modify nonzero structure of k-th row by computing fill-in
106        for each row i to be merged in */
107     prow = k;
108     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
109 
110     while (prow < k) {
111       /* merge row prow into k-th row */
112       jmin = iu[prow] + 1;
113       jmax = iu[prow + 1];
114       qm   = k;
115       for (j = jmin; j < jmax; j++) {
116         vj = ju[j];
117         do {
118           m  = qm;
119           qm = q[m];
120         } while (qm < vj);
121         if (qm != vj) {
122           nzk++;
123           q[m]  = vj;
124           q[vj] = qm;
125           qm    = vj;
126         }
127       }
128       prow = jl[prow]; /* next pivot row */
129     }
130 
131     /* add k to row list for first nonzero element in k-th row */
132     if (nzk > 0) {
133       i     = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
134       jl[k] = jl[i];
135       jl[i] = k;
136     }
137     iu[k + 1] = iu[k] + nzk;
138 
139     /* allocate more space to ju if needed */
140     if (iu[k + 1] > umax) {
141       /* estimate how much additional space we will need */
142       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
143       /* just double the memory each time */
144       maxadd = umax;
145       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
146       umax += maxadd;
147 
148       /* allocate a longer ju */
149       PetscCall(PetscMalloc1(umax, &jutmp));
150       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
151       PetscCall(PetscFree(ju));
152       ju = jutmp;
153       reallocs++; /* count how many times we realloc */
154     }
155 
156     /* save nonzero structure of k-th row in ju */
157     i = k;
158     while (nzk--) {
159       i           = q[i];
160       ju[juidx++] = i;
161     }
162   }
163 
164 #if defined(PETSC_USE_INFO)
165   if (ai[mbs] != 0) {
166     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
167     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
168     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
169     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
170     PetscCall(PetscInfo(A, "for best performance.\n"));
171   } else {
172     PetscCall(PetscInfo(A, "Empty matrix\n"));
173   }
174 #endif
175 
176   PetscCall(ISRestoreIndices(perm, &rip));
177   PetscCall(PetscFree2(jl, q));
178 
179   /* put together the new matrix */
180   PetscCall(MatSeqSBAIJSetPreallocation(F, bs, MAT_SKIP_ALLOCATION, NULL));
181 
182   b               = (Mat_SeqSBAIJ *)(F)->data;
183   b->singlemalloc = PETSC_FALSE;
184   b->free_a       = PETSC_TRUE;
185   b->free_ij      = PETSC_TRUE;
186 
187   PetscCall(PetscMalloc1((iu[mbs] + 1) * bs2, &b->a));
188   b->j    = ju;
189   b->i    = iu;
190   b->diag = NULL;
191   b->ilen = NULL;
192   b->imax = NULL;
193   b->row  = perm;
194 
195   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
196 
197   PetscCall(PetscObjectReference((PetscObject)perm));
198 
199   b->icol = perm;
200   PetscCall(PetscObjectReference((PetscObject)perm));
201   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
202   /* In b structure:  Free imax, ilen, old a, old j.
203      Allocate idnew, solve_work, new a, new j */
204   b->maxnz = b->nz = iu[mbs];
205 
206   (F)->info.factor_mallocs   = reallocs;
207   (F)->info.fill_ratio_given = f;
208   if (ai[mbs] != 0) {
209     (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
210   } else {
211     (F)->info.fill_ratio_needed = 0.0;
212   }
213   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(F, perm_identity));
214   PetscFunctionReturn(PETSC_SUCCESS);
215 }
216 /*
217     Symbolic U^T*D*U factorization for SBAIJ format.
218     See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
219 */
220 #include <petscbt.h>
221 #include <../src/mat/utils/freespace.h>
222 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
223 {
224   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
225   Mat_SeqSBAIJ      *b;
226   PetscBool          perm_identity, missing;
227   PetscReal          fill = info->fill;
228   const PetscInt    *rip, *ai = a->i, *aj = a->j;
229   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow;
230   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
231   PetscInt           nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr, *udiag;
232   PetscFreeSpaceList free_space = NULL, current_space = NULL;
233   PetscBT            lnkbt;
234 
235   PetscFunctionBegin;
236   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
237   PetscCall(MatMissingDiagonal(A, &missing, &i));
238   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
239   if (bs > 1) {
240     PetscCall(MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
241     PetscFunctionReturn(PETSC_SUCCESS);
242   }
243 
244   /* check whether perm is the identity mapping */
245   PetscCall(ISIdentity(perm, &perm_identity));
246   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
247   a->permute = PETSC_FALSE;
248   PetscCall(ISGetIndices(perm, &rip));
249 
250   /* initialization */
251   PetscCall(PetscMalloc1(mbs + 1, &ui));
252   PetscCall(PetscMalloc1(mbs + 1, &udiag));
253   ui[0] = 0;
254 
255   /* jl: linked list for storing indices of the pivot rows
256      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
257   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
258   for (i = 0; i < mbs; i++) {
259     jl[i] = mbs;
260     il[i] = 0;
261   }
262 
263   /* create and initialize a linked list for storing column indices of the active row k */
264   nlnk = mbs + 1;
265   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
266 
267   /* initial FreeSpace size is fill*(ai[mbs]+1) */
268   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space));
269   current_space = free_space;
270 
271   for (k = 0; k < mbs; k++) { /* for each active row k */
272     /* initialize lnk by the column indices of row rip[k] of A */
273     nzk   = 0;
274     ncols = ai[k + 1] - ai[k];
275     PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
276     for (j = 0; j < ncols; j++) {
277       i       = *(aj + ai[k] + j);
278       cols[j] = i;
279     }
280     PetscCall(PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt));
281     nzk += nlnk;
282 
283     /* update lnk by computing fill-in for each pivot row to be merged in */
284     prow = jl[k]; /* 1st pivot row */
285 
286     while (prow < k) {
287       nextprow = jl[prow];
288       /* merge prow into k-th row */
289       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
290       jmax   = ui[prow + 1];
291       ncols  = jmax - jmin;
292       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
293       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
294       nzk += nlnk;
295 
296       /* update il and jl for prow */
297       if (jmin < jmax) {
298         il[prow] = jmin;
299         j        = *uj_ptr;
300         jl[prow] = jl[j];
301         jl[j]    = prow;
302       }
303       prow = nextprow;
304     }
305 
306     /* if free space is not available, make more free space */
307     if (current_space->local_remaining < nzk) {
308       i = mbs - k + 1;                                   /* num of unfactored rows */
309       i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
310       PetscCall(PetscFreeSpaceGet(i, &current_space));
311       reallocs++;
312     }
313 
314     /* copy data into free space, then initialize lnk */
315     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
316 
317     /* add the k-th row into il and jl */
318     if (nzk > 1) {
319       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
320       jl[k] = jl[i];
321       jl[i] = k;
322       il[k] = ui[k] + 1;
323     }
324     ui_ptr[k] = current_space->array;
325 
326     current_space->array += nzk;
327     current_space->local_used += nzk;
328     current_space->local_remaining -= nzk;
329 
330     ui[k + 1] = ui[k] + nzk;
331   }
332 
333   PetscCall(ISRestoreIndices(perm, &rip));
334   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
335 
336   /* destroy list of free space and other temporary array(s) */
337   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
338   PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, mbs, ui, udiag)); /* store matrix factor */
339   PetscCall(PetscLLDestroy(lnk, lnkbt));
340 
341   /* put together the new matrix in MATSEQSBAIJ format */
342   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
343 
344   b               = (Mat_SeqSBAIJ *)fact->data;
345   b->singlemalloc = PETSC_FALSE;
346   b->free_a       = PETSC_TRUE;
347   b->free_ij      = PETSC_TRUE;
348 
349   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
350 
351   b->j         = uj;
352   b->i         = ui;
353   b->diag      = udiag;
354   b->free_diag = PETSC_TRUE;
355   b->ilen      = NULL;
356   b->imax      = NULL;
357   b->row       = perm;
358   b->icol      = perm;
359 
360   PetscCall(PetscObjectReference((PetscObject)perm));
361   PetscCall(PetscObjectReference((PetscObject)perm));
362 
363   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
364 
365   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
366 
367   b->maxnz = b->nz = ui[mbs];
368 
369   fact->info.factor_mallocs   = reallocs;
370   fact->info.fill_ratio_given = fill;
371   if (ai[mbs] != 0) {
372     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
373   } else {
374     fact->info.fill_ratio_needed = 0.0;
375   }
376 #if defined(PETSC_USE_INFO)
377   if (ai[mbs] != 0) {
378     PetscReal af = fact->info.fill_ratio_needed;
379     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
380     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
381     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
382   } else {
383     PetscCall(PetscInfo(A, "Empty matrix\n"));
384   }
385 #endif
386   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
387   PetscFunctionReturn(PETSC_SUCCESS);
388 }
389 
390 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
391 {
392   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
393   Mat_SeqSBAIJ      *b;
394   PetscBool          perm_identity, missing;
395   PetscReal          fill = info->fill;
396   const PetscInt    *rip, *ai, *aj;
397   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow, d;
398   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
399   PetscInt           nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr;
400   PetscFreeSpaceList free_space = NULL, current_space = NULL;
401   PetscBT            lnkbt;
402 
403   PetscFunctionBegin;
404   PetscCall(MatMissingDiagonal(A, &missing, &d));
405   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
406 
407   /*
408    This code originally uses Modified Sparse Row (MSR) storage
409    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
410    Then it is rewritten so the factor B takes seqsbaij format. However the associated
411    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
412    thus the original code in MSR format is still used for these cases.
413    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
414    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
415   */
416   if (bs > 1) {
417     PetscCall(MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
418     PetscFunctionReturn(PETSC_SUCCESS);
419   }
420 
421   /* check whether perm is the identity mapping */
422   PetscCall(ISIdentity(perm, &perm_identity));
423   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
424   a->permute = PETSC_FALSE;
425   ai         = a->i;
426   aj         = a->j;
427   PetscCall(ISGetIndices(perm, &rip));
428 
429   /* initialization */
430   PetscCall(PetscMalloc1(mbs + 1, &ui));
431   ui[0] = 0;
432 
433   /* jl: linked list for storing indices of the pivot rows
434      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
435   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
436   for (i = 0; i < mbs; i++) {
437     jl[i] = mbs;
438     il[i] = 0;
439   }
440 
441   /* create and initialize a linked list for storing column indices of the active row k */
442   nlnk = mbs + 1;
443   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
444 
445   /* initial FreeSpace size is fill*(ai[mbs]+1) */
446   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space));
447   current_space = free_space;
448 
449   for (k = 0; k < mbs; k++) { /* for each active row k */
450     /* initialize lnk by the column indices of row rip[k] of A */
451     nzk   = 0;
452     ncols = ai[rip[k] + 1] - ai[rip[k]];
453     for (j = 0; j < ncols; j++) {
454       i       = *(aj + ai[rip[k]] + j);
455       cols[j] = rip[i];
456     }
457     PetscCall(PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt));
458     nzk += nlnk;
459 
460     /* update lnk by computing fill-in for each pivot row to be merged in */
461     prow = jl[k]; /* 1st pivot row */
462 
463     while (prow < k) {
464       nextprow = jl[prow];
465       /* merge prow into k-th row */
466       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
467       jmax   = ui[prow + 1];
468       ncols  = jmax - jmin;
469       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
470       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
471       nzk += nlnk;
472 
473       /* update il and jl for prow */
474       if (jmin < jmax) {
475         il[prow] = jmin;
476 
477         j        = *uj_ptr;
478         jl[prow] = jl[j];
479         jl[j]    = prow;
480       }
481       prow = nextprow;
482     }
483 
484     /* if free space is not available, make more free space */
485     if (current_space->local_remaining < nzk) {
486       i = mbs - k + 1;                                                            /* num of unfactored rows */
487       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
488       PetscCall(PetscFreeSpaceGet(i, &current_space));
489       reallocs++;
490     }
491 
492     /* copy data into free space, then initialize lnk */
493     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
494 
495     /* add the k-th row into il and jl */
496     if (nzk - 1 > 0) {
497       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
498       jl[k] = jl[i];
499       jl[i] = k;
500       il[k] = ui[k] + 1;
501     }
502     ui_ptr[k] = current_space->array;
503 
504     current_space->array += nzk;
505     current_space->local_used += nzk;
506     current_space->local_remaining -= nzk;
507 
508     ui[k + 1] = ui[k] + nzk;
509   }
510 
511   PetscCall(ISRestoreIndices(perm, &rip));
512   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
513 
514   /* destroy list of free space and other temporary array(s) */
515   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
516   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
517   PetscCall(PetscLLDestroy(lnk, lnkbt));
518 
519   /* put together the new matrix in MATSEQSBAIJ format */
520   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
521 
522   b               = (Mat_SeqSBAIJ *)fact->data;
523   b->singlemalloc = PETSC_FALSE;
524   b->free_a       = PETSC_TRUE;
525   b->free_ij      = PETSC_TRUE;
526 
527   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
528 
529   b->j    = uj;
530   b->i    = ui;
531   b->diag = NULL;
532   b->ilen = NULL;
533   b->imax = NULL;
534   b->row  = perm;
535 
536   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
537 
538   PetscCall(PetscObjectReference((PetscObject)perm));
539   b->icol = perm;
540   PetscCall(PetscObjectReference((PetscObject)perm));
541   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
542   b->maxnz = b->nz = ui[mbs];
543 
544   fact->info.factor_mallocs   = reallocs;
545   fact->info.fill_ratio_given = fill;
546   if (ai[mbs] != 0) {
547     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
548   } else {
549     fact->info.fill_ratio_needed = 0.0;
550   }
551 #if defined(PETSC_USE_INFO)
552   if (ai[mbs] != 0) {
553     PetscReal af = fact->info.fill_ratio_needed;
554     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
555     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
556     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
557   } else {
558     PetscCall(PetscInfo(A, "Empty matrix\n"));
559   }
560 #endif
561   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(fact, perm_identity));
562   PetscFunctionReturn(PETSC_SUCCESS);
563 }
564 
565 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
566 {
567   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
568   IS              perm = b->row;
569   const PetscInt *ai, *aj, *perm_ptr, mbs = a->mbs, *bi = b->i, *bj = b->j;
570   PetscInt        i, j;
571   PetscInt       *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
572   PetscInt        bs = A->rmap->bs, bs2 = a->bs2;
573   MatScalar      *ba = b->a, *aa, *ap, *dk, *uik;
574   MatScalar      *u, *diag, *rtmp, *rtmp_ptr;
575   MatScalar      *work;
576   PetscInt       *pivots;
577   PetscBool       allowzeropivot, zeropivotdetected;
578 
579   PetscFunctionBegin;
580   /* initialization */
581   PetscCall(PetscCalloc1(bs2 * mbs, &rtmp));
582   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
583   allowzeropivot = PetscNot(A->erroriffailure);
584 
585   il[0] = 0;
586   for (i = 0; i < mbs; i++) jl[i] = mbs;
587 
588   PetscCall(PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work));
589   PetscCall(PetscMalloc1(bs, &pivots));
590 
591   PetscCall(ISGetIndices(perm, &perm_ptr));
592 
593   /* check permutation */
594   if (!a->permute) {
595     ai = a->i;
596     aj = a->j;
597     aa = a->a;
598   } else {
599     ai = a->inew;
600     aj = a->jnew;
601     PetscCall(PetscMalloc1(bs2 * ai[mbs], &aa));
602     PetscCall(PetscArraycpy(aa, a->a, bs2 * ai[mbs]));
603     PetscCall(PetscMalloc1(ai[mbs], &a2anew));
604     PetscCall(PetscArraycpy(a2anew, a->a2anew, ai[mbs]));
605 
606     for (i = 0; i < mbs; i++) {
607       jmin = ai[i];
608       jmax = ai[i + 1];
609       for (j = jmin; j < jmax; j++) {
610         while (a2anew[j] != j) {
611           k         = a2anew[j];
612           a2anew[j] = a2anew[k];
613           a2anew[k] = k;
614           for (k1 = 0; k1 < bs2; k1++) {
615             dk[k1]           = aa[k * bs2 + k1];
616             aa[k * bs2 + k1] = aa[j * bs2 + k1];
617             aa[j * bs2 + k1] = dk[k1];
618           }
619         }
620         /* transform column-oriented blocks that lie in the lower triangle to row-oriented blocks */
621         if (i > aj[j]) {
622           ap = aa + j * bs2;                       /* ptr to the beginning of j-th block of aa */
623           for (k = 0; k < bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
624           for (k = 0; k < bs; k++) {               /* j-th block of aa <- dk^T */
625             for (k1 = 0; k1 < bs; k1++) *ap++ = dk[k + bs * k1];
626           }
627         }
628       }
629     }
630     PetscCall(PetscFree(a2anew));
631   }
632 
633   /* for each row k */
634   for (k = 0; k < mbs; k++) {
635     /*initialize k-th row with elements nonzero in row perm(k) of A */
636     jmin = ai[perm_ptr[k]];
637     jmax = ai[perm_ptr[k] + 1];
638 
639     ap = aa + jmin * bs2;
640     for (j = jmin; j < jmax; j++) {
641       vj       = perm_ptr[aj[j]]; /* block col. index */
642       rtmp_ptr = rtmp + vj * bs2;
643       for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
644     }
645 
646     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
647     PetscCall(PetscArraycpy(dk, rtmp + k * bs2, bs2));
648     i = jl[k]; /* first row to be added to k_th row  */
649 
650     while (i < k) {
651       nexti = jl[i]; /* next row to be added to k_th row */
652 
653       /* compute multiplier */
654       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
655 
656       /* uik = -inv(Di)*U_bar(i,k) */
657       diag = ba + i * bs2;
658       u    = ba + ili * bs2;
659       PetscCall(PetscArrayzero(uik, bs2));
660       PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);
661 
662       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
663       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
664       PetscCall(PetscLogFlops(4.0 * bs * bs2));
665 
666       /* update -U(i,k) */
667       PetscCall(PetscArraycpy(ba + ili * bs2, uik, bs2));
668 
669       /* add multiple of row i to k-th row ... */
670       jmin = ili + 1;
671       jmax = bi[i + 1];
672       if (jmin < jmax) {
673         for (j = jmin; j < jmax; j++) {
674           /* rtmp += -U(i,k)^T * U_bar(i,j) */
675           rtmp_ptr = rtmp + bj[j] * bs2;
676           u        = ba + j * bs2;
677           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
678         }
679         PetscCall(PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin)));
680 
681         /* ... add i to row list for next nonzero entry */
682         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
683         j     = bj[jmin];
684         jl[i] = jl[j];
685         jl[j] = i; /* update jl */
686       }
687       i = nexti;
688     }
689 
690     /* save nonzero entries in k-th row of U ... */
691 
692     /* invert diagonal block */
693     diag = ba + k * bs2;
694     PetscCall(PetscArraycpy(diag, dk, bs2));
695 
696     PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected));
697     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
698 
699     jmin = bi[k];
700     jmax = bi[k + 1];
701     if (jmin < jmax) {
702       for (j = jmin; j < jmax; j++) {
703         vj       = bj[j]; /* block col. index of U */
704         u        = ba + j * bs2;
705         rtmp_ptr = rtmp + vj * bs2;
706         for (k1 = 0; k1 < bs2; k1++) {
707           *u++        = *rtmp_ptr;
708           *rtmp_ptr++ = 0.0;
709         }
710       }
711 
712       /* ... add k to row list for first nonzero entry in k-th row */
713       il[k] = jmin;
714       i     = bj[jmin];
715       jl[k] = jl[i];
716       jl[i] = k;
717     }
718   }
719 
720   PetscCall(PetscFree(rtmp));
721   PetscCall(PetscFree2(il, jl));
722   PetscCall(PetscFree3(dk, uik, work));
723   PetscCall(PetscFree(pivots));
724   if (a->permute) PetscCall(PetscFree(aa));
725 
726   PetscCall(ISRestoreIndices(perm, &perm_ptr));
727 
728   C->ops->solve          = MatSolve_SeqSBAIJ_N_inplace;
729   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
730   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_inplace;
731   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_inplace;
732 
733   C->assembled    = PETSC_TRUE;
734   C->preallocated = PETSC_TRUE;
735 
736   PetscCall(PetscLogFlops(1.3333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
737   PetscFunctionReturn(PETSC_SUCCESS);
738 }
739 
740 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
741 {
742   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
743   PetscInt      i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
744   PetscInt     *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
745   PetscInt      bs = A->rmap->bs, bs2 = a->bs2;
746   MatScalar    *ba = b->a, *aa, *ap, *dk, *uik;
747   MatScalar    *u, *diag, *rtmp, *rtmp_ptr;
748   MatScalar    *work;
749   PetscInt     *pivots;
750   PetscBool     allowzeropivot, zeropivotdetected;
751 
752   PetscFunctionBegin;
753   PetscCall(PetscCalloc1(bs2 * mbs, &rtmp));
754   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
755   il[0] = 0;
756   for (i = 0; i < mbs; i++) jl[i] = mbs;
757 
758   PetscCall(PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work));
759   PetscCall(PetscMalloc1(bs, &pivots));
760   allowzeropivot = PetscNot(A->erroriffailure);
761 
762   ai = a->i;
763   aj = a->j;
764   aa = a->a;
765 
766   /* for each row k */
767   for (k = 0; k < mbs; k++) {
768     /*initialize k-th row with elements nonzero in row k of A */
769     jmin = ai[k];
770     jmax = ai[k + 1];
771     ap   = aa + jmin * bs2;
772     for (j = jmin; j < jmax; j++) {
773       vj       = aj[j]; /* block col. index */
774       rtmp_ptr = rtmp + vj * bs2;
775       for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
776     }
777 
778     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
779     PetscCall(PetscArraycpy(dk, rtmp + k * bs2, bs2));
780     i = jl[k]; /* first row to be added to k_th row  */
781 
782     while (i < k) {
783       nexti = jl[i]; /* next row to be added to k_th row */
784 
785       /* compute multiplier */
786       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
787 
788       /* uik = -inv(Di)*U_bar(i,k) */
789       diag = ba + i * bs2;
790       u    = ba + ili * bs2;
791       PetscCall(PetscArrayzero(uik, bs2));
792       PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);
793 
794       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
795       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
796       PetscCall(PetscLogFlops(2.0 * bs * bs2));
797 
798       /* update -U(i,k) */
799       PetscCall(PetscArraycpy(ba + ili * bs2, uik, bs2));
800 
801       /* add multiple of row i to k-th row ... */
802       jmin = ili + 1;
803       jmax = bi[i + 1];
804       if (jmin < jmax) {
805         for (j = jmin; j < jmax; j++) {
806           /* rtmp += -U(i,k)^T * U_bar(i,j) */
807           rtmp_ptr = rtmp + bj[j] * bs2;
808           u        = ba + j * bs2;
809           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
810         }
811         PetscCall(PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin)));
812 
813         /* ... add i to row list for next nonzero entry */
814         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
815         j     = bj[jmin];
816         jl[i] = jl[j];
817         jl[j] = i; /* update jl */
818       }
819       i = nexti;
820     }
821 
822     /* save nonzero entries in k-th row of U ... */
823 
824     /* invert diagonal block */
825     diag = ba + k * bs2;
826     PetscCall(PetscArraycpy(diag, dk, bs2));
827 
828     PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected));
829     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
830 
831     jmin = bi[k];
832     jmax = bi[k + 1];
833     if (jmin < jmax) {
834       for (j = jmin; j < jmax; j++) {
835         vj       = bj[j]; /* block col. index of U */
836         u        = ba + j * bs2;
837         rtmp_ptr = rtmp + vj * bs2;
838         for (k1 = 0; k1 < bs2; k1++) {
839           *u++        = *rtmp_ptr;
840           *rtmp_ptr++ = 0.0;
841         }
842       }
843 
844       /* ... add k to row list for first nonzero entry in k-th row */
845       il[k] = jmin;
846       i     = bj[jmin];
847       jl[k] = jl[i];
848       jl[i] = k;
849     }
850   }
851 
852   PetscCall(PetscFree(rtmp));
853   PetscCall(PetscFree2(il, jl));
854   PetscCall(PetscFree3(dk, uik, work));
855   PetscCall(PetscFree(pivots));
856 
857   C->ops->solve          = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
858   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
859   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
860   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
861   C->assembled           = PETSC_TRUE;
862   C->preallocated        = PETSC_TRUE;
863 
864   PetscCall(PetscLogFlops(1.3333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
865   PetscFunctionReturn(PETSC_SUCCESS);
866 }
867 
868 /*
869     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
870     Version for blocks 2 by 2.
871 */
872 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C, Mat A, const MatFactorInfo *info)
873 {
874   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
875   IS              perm = b->row;
876   const PetscInt *ai, *aj, *perm_ptr;
877   PetscInt        i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
878   PetscInt       *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
879   MatScalar      *ba = b->a, *aa, *ap;
880   MatScalar      *u, *diag, *rtmp, *rtmp_ptr, dk[4], uik[4];
881   PetscReal       shift = info->shiftamount;
882   PetscBool       allowzeropivot, zeropivotdetected;
883 
884   PetscFunctionBegin;
885   allowzeropivot = PetscNot(A->erroriffailure);
886 
887   /* initialization */
888   /* il and jl record the first nonzero element in each row of the accessing
889      window U(0:k, k:mbs-1).
890      jl:    list of rows to be added to uneliminated rows
891             i>= k: jl(i) is the first row to be added to row i
892             i<  k: jl(i) is the row following row i in some list of rows
893             jl(i) = mbs indicates the end of a list
894      il(i): points to the first nonzero element in columns k,...,mbs-1 of
895             row i of U */
896   PetscCall(PetscCalloc1(4 * mbs, &rtmp));
897   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
898   il[0] = 0;
899   for (i = 0; i < mbs; i++) jl[i] = mbs;
900 
901   PetscCall(ISGetIndices(perm, &perm_ptr));
902 
903   /* check permutation */
904   if (!a->permute) {
905     ai = a->i;
906     aj = a->j;
907     aa = a->a;
908   } else {
909     ai = a->inew;
910     aj = a->jnew;
911     PetscCall(PetscMalloc1(4 * ai[mbs], &aa));
912     PetscCall(PetscArraycpy(aa, a->a, 4 * ai[mbs]));
913     PetscCall(PetscMalloc1(ai[mbs], &a2anew));
914     PetscCall(PetscArraycpy(a2anew, a->a2anew, ai[mbs]));
915 
916     for (i = 0; i < mbs; i++) {
917       jmin = ai[i];
918       jmax = ai[i + 1];
919       for (j = jmin; j < jmax; j++) {
920         while (a2anew[j] != j) {
921           k         = a2anew[j];
922           a2anew[j] = a2anew[k];
923           a2anew[k] = k;
924           for (k1 = 0; k1 < 4; k1++) {
925             dk[k1]         = aa[k * 4 + k1];
926             aa[k * 4 + k1] = aa[j * 4 + k1];
927             aa[j * 4 + k1] = dk[k1];
928           }
929         }
930         /* transform column-oriented blocks that lie in the lower triangle to row-oriented blocks */
931         if (i > aj[j]) {
932           ap    = aa + j * 4; /* ptr to the beginning of the block */
933           dk[1] = ap[1];      /* swap ap[1] and ap[2] */
934           ap[1] = ap[2];
935           ap[2] = dk[1];
936         }
937       }
938     }
939     PetscCall(PetscFree(a2anew));
940   }
941 
942   /* for each row k */
943   for (k = 0; k < mbs; k++) {
944     /*initialize k-th row with elements nonzero in row perm(k) of A */
945     jmin = ai[perm_ptr[k]];
946     jmax = ai[perm_ptr[k] + 1];
947     ap   = aa + jmin * 4;
948     for (j = jmin; j < jmax; j++) {
949       vj       = perm_ptr[aj[j]]; /* block col. index */
950       rtmp_ptr = rtmp + vj * 4;
951       for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
952     }
953 
954     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
955     PetscCall(PetscArraycpy(dk, rtmp + k * 4, 4));
956     i = jl[k]; /* first row to be added to k_th row  */
957 
958     while (i < k) {
959       nexti = jl[i]; /* next row to be added to k_th row */
960 
961       /* compute multiplier */
962       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
963 
964       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
965       diag   = ba + i * 4;
966       u      = ba + ili * 4;
967       uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
968       uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
969       uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
970       uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);
971 
972       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
973       dk[0] += uik[0] * u[0] + uik[1] * u[1];
974       dk[1] += uik[2] * u[0] + uik[3] * u[1];
975       dk[2] += uik[0] * u[2] + uik[1] * u[3];
976       dk[3] += uik[2] * u[2] + uik[3] * u[3];
977 
978       PetscCall(PetscLogFlops(16.0 * 2.0));
979 
980       /* update -U(i,k): ba[ili] = uik */
981       PetscCall(PetscArraycpy(ba + ili * 4, uik, 4));
982 
983       /* add multiple of row i to k-th row ... */
984       jmin = ili + 1;
985       jmax = bi[i + 1];
986       if (jmin < jmax) {
987         for (j = jmin; j < jmax; j++) {
988           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
989           rtmp_ptr = rtmp + bj[j] * 4;
990           u        = ba + j * 4;
991           rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
992           rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
993           rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
994           rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
995         }
996         PetscCall(PetscLogFlops(16.0 * (jmax - jmin)));
997 
998         /* ... add i to row list for next nonzero entry */
999         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1000         j     = bj[jmin];
1001         jl[i] = jl[j];
1002         jl[j] = i; /* update jl */
1003       }
1004       i = nexti;
1005     }
1006 
1007     /* save nonzero entries in k-th row of U ... */
1008 
1009     /* invert diagonal block */
1010     diag = ba + k * 4;
1011     PetscCall(PetscArraycpy(diag, dk, 4));
1012     PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
1013     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1014 
1015     jmin = bi[k];
1016     jmax = bi[k + 1];
1017     if (jmin < jmax) {
1018       for (j = jmin; j < jmax; j++) {
1019         vj       = bj[j]; /* block col. index of U */
1020         u        = ba + j * 4;
1021         rtmp_ptr = rtmp + vj * 4;
1022         for (k1 = 0; k1 < 4; k1++) {
1023           *u++        = *rtmp_ptr;
1024           *rtmp_ptr++ = 0.0;
1025         }
1026       }
1027 
1028       /* ... add k to row list for first nonzero entry in k-th row */
1029       il[k] = jmin;
1030       i     = bj[jmin];
1031       jl[k] = jl[i];
1032       jl[i] = k;
1033     }
1034   }
1035 
1036   PetscCall(PetscFree(rtmp));
1037   PetscCall(PetscFree2(il, jl));
1038   if (a->permute) PetscCall(PetscFree(aa));
1039   PetscCall(ISRestoreIndices(perm, &perm_ptr));
1040 
1041   C->ops->solve          = MatSolve_SeqSBAIJ_2_inplace;
1042   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1043   C->assembled           = PETSC_TRUE;
1044   C->preallocated        = PETSC_TRUE;
1045 
1046   PetscCall(PetscLogFlops(1.3333 * 8 * b->mbs)); /* from inverting diagonal blocks */
1047   PetscFunctionReturn(PETSC_SUCCESS);
1048 }
1049 
1050 /*
1051       Version for when blocks are 2 by 2 Using natural ordering
1052 */
1053 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
1054 {
1055   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1056   PetscInt      i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
1057   PetscInt     *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
1058   MatScalar    *ba = b->a, *aa, *ap, dk[8], uik[8];
1059   MatScalar    *u, *diag, *rtmp, *rtmp_ptr;
1060   PetscReal     shift = info->shiftamount;
1061   PetscBool     allowzeropivot, zeropivotdetected;
1062 
1063   PetscFunctionBegin;
1064   allowzeropivot = PetscNot(A->erroriffailure);
1065 
1066   /* initialization */
1067   /* il and jl record the first nonzero element in each row of the accessing
1068      window U(0:k, k:mbs-1).
1069      jl:    list of rows to be added to uneliminated rows
1070             i>= k: jl(i) is the first row to be added to row i
1071             i<  k: jl(i) is the row following row i in some list of rows
1072             jl(i) = mbs indicates the end of a list
1073      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1074             row i of U */
1075   PetscCall(PetscCalloc1(4 * mbs, &rtmp));
1076   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
1077   il[0] = 0;
1078   for (i = 0; i < mbs; i++) jl[i] = mbs;
1079 
1080   ai = a->i;
1081   aj = a->j;
1082   aa = a->a;
1083 
1084   /* for each row k */
1085   for (k = 0; k < mbs; k++) {
1086     /*initialize k-th row with elements nonzero in row k of A */
1087     jmin = ai[k];
1088     jmax = ai[k + 1];
1089     ap   = aa + jmin * 4;
1090     for (j = jmin; j < jmax; j++) {
1091       vj       = aj[j]; /* block col. index */
1092       rtmp_ptr = rtmp + vj * 4;
1093       for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
1094     }
1095 
1096     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1097     PetscCall(PetscArraycpy(dk, rtmp + k * 4, 4));
1098     i = jl[k]; /* first row to be added to k_th row  */
1099 
1100     while (i < k) {
1101       nexti = jl[i]; /* next row to be added to k_th row */
1102 
1103       /* compute multiplier */
1104       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1105 
1106       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1107       diag   = ba + i * 4;
1108       u      = ba + ili * 4;
1109       uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
1110       uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
1111       uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
1112       uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);
1113 
1114       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1115       dk[0] += uik[0] * u[0] + uik[1] * u[1];
1116       dk[1] += uik[2] * u[0] + uik[3] * u[1];
1117       dk[2] += uik[0] * u[2] + uik[1] * u[3];
1118       dk[3] += uik[2] * u[2] + uik[3] * u[3];
1119 
1120       PetscCall(PetscLogFlops(16.0 * 2.0));
1121 
1122       /* update -U(i,k): ba[ili] = uik */
1123       PetscCall(PetscArraycpy(ba + ili * 4, uik, 4));
1124 
1125       /* add multiple of row i to k-th row ... */
1126       jmin = ili + 1;
1127       jmax = bi[i + 1];
1128       if (jmin < jmax) {
1129         for (j = jmin; j < jmax; j++) {
1130           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1131           rtmp_ptr = rtmp + bj[j] * 4;
1132           u        = ba + j * 4;
1133           rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
1134           rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
1135           rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
1136           rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
1137         }
1138         PetscCall(PetscLogFlops(16.0 * (jmax - jmin)));
1139 
1140         /* ... add i to row list for next nonzero entry */
1141         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1142         j     = bj[jmin];
1143         jl[i] = jl[j];
1144         jl[j] = i; /* update jl */
1145       }
1146       i = nexti;
1147     }
1148 
1149     /* save nonzero entries in k-th row of U ... */
1150 
1151     /* invert diagonal block */
1152     diag = ba + k * 4;
1153     PetscCall(PetscArraycpy(diag, dk, 4));
1154     PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
1155     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1156 
1157     jmin = bi[k];
1158     jmax = bi[k + 1];
1159     if (jmin < jmax) {
1160       for (j = jmin; j < jmax; j++) {
1161         vj       = bj[j]; /* block col. index of U */
1162         u        = ba + j * 4;
1163         rtmp_ptr = rtmp + vj * 4;
1164         for (k1 = 0; k1 < 4; k1++) {
1165           *u++        = *rtmp_ptr;
1166           *rtmp_ptr++ = 0.0;
1167         }
1168       }
1169 
1170       /* ... add k to row list for first nonzero entry in k-th row */
1171       il[k] = jmin;
1172       i     = bj[jmin];
1173       jl[k] = jl[i];
1174       jl[i] = k;
1175     }
1176   }
1177 
1178   PetscCall(PetscFree(rtmp));
1179   PetscCall(PetscFree2(il, jl));
1180 
1181   C->ops->solve          = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1182   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1183   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1184   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1185   C->assembled           = PETSC_TRUE;
1186   C->preallocated        = PETSC_TRUE;
1187 
1188   PetscCall(PetscLogFlops(1.3333 * 8 * b->mbs)); /* from inverting diagonal blocks */
1189   PetscFunctionReturn(PETSC_SUCCESS);
1190 }
1191 
1192 /*
1193     Numeric U^T*D*U factorization for SBAIJ format.
1194     Version for blocks are 1 by 1.
1195 */
1196 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
1197 {
1198   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1199   IS              ip = b->row;
1200   const PetscInt *ai, *aj, *rip;
1201   PetscInt       *a2anew, i, j, mbs = a->mbs, *bi = b->i, *bj = b->j, *bcol;
1202   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
1203   MatScalar      *rtmp, *ba = b->a, *bval, *aa, dk, uikdi;
1204   PetscReal       rs;
1205   FactorShiftCtx  sctx;
1206 
1207   PetscFunctionBegin;
1208   /* MatPivotSetUp(): initialize shift context sctx */
1209   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1210 
1211   PetscCall(ISGetIndices(ip, &rip));
1212   if (!a->permute) {
1213     ai = a->i;
1214     aj = a->j;
1215     aa = a->a;
1216   } else {
1217     ai = a->inew;
1218     aj = a->jnew;
1219     nz = ai[mbs];
1220     PetscCall(PetscMalloc1(nz, &aa));
1221     a2anew = a->a2anew;
1222     bval   = a->a;
1223     for (j = 0; j < nz; j++) aa[a2anew[j]] = *(bval++);
1224   }
1225 
1226   /* initialization */
1227   /* il and jl record the first nonzero element in each row of the accessing
1228      window U(0:k, k:mbs-1).
1229      jl:    list of rows to be added to uneliminated rows
1230             i>= k: jl(i) is the first row to be added to row i
1231             i<  k: jl(i) is the row following row i in some list of rows
1232             jl(i) = mbs indicates the end of a list
1233      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1234             row i of U */
1235   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
1236 
1237   do {
1238     sctx.newshift = PETSC_FALSE;
1239     il[0]         = 0;
1240     for (i = 0; i < mbs; i++) {
1241       rtmp[i] = 0.0;
1242       jl[i]   = mbs;
1243     }
1244 
1245     for (k = 0; k < mbs; k++) {
1246       /*initialize k-th row by the perm[k]-th row of A */
1247       jmin = ai[rip[k]];
1248       jmax = ai[rip[k] + 1];
1249       bval = ba + bi[k];
1250       for (j = jmin; j < jmax; j++) {
1251         col       = rip[aj[j]];
1252         rtmp[col] = aa[j];
1253         *bval++   = 0.0; /* for in-place factorization */
1254       }
1255 
1256       /* shift the diagonal of the matrix */
1257       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1258 
1259       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1260       dk = rtmp[k];
1261       i  = jl[k]; /* first row to be added to k_th row  */
1262 
1263       while (i < k) {
1264         nexti = jl[i]; /* next row to be added to k_th row */
1265 
1266         /* compute multiplier, update diag(k) and U(i,k) */
1267         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
1268         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
1269         dk += uikdi * ba[ili];
1270         ba[ili] = uikdi; /* -U(i,k) */
1271 
1272         /* add multiple of row i to k-th row */
1273         jmin = ili + 1;
1274         jmax = bi[i + 1];
1275         if (jmin < jmax) {
1276           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1277           PetscCall(PetscLogFlops(2.0 * (jmax - jmin)));
1278 
1279           /* update il and jl for row i */
1280           il[i] = jmin;
1281           j     = bj[jmin];
1282           jl[i] = jl[j];
1283           jl[j] = i;
1284         }
1285         i = nexti;
1286       }
1287 
1288       /* shift the diagonals when zero pivot is detected */
1289       /* compute rs=sum of abs(off-diagonal) */
1290       rs   = 0.0;
1291       jmin = bi[k] + 1;
1292       nz   = bi[k + 1] - jmin;
1293       if (nz) {
1294         bcol = bj + jmin;
1295         while (nz--) {
1296           rs += PetscAbsScalar(rtmp[*bcol]);
1297           bcol++;
1298         }
1299       }
1300 
1301       sctx.rs = rs;
1302       sctx.pv = dk;
1303       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
1304       if (sctx.newshift) break; /* sctx.shift_amount is updated */
1305       dk = sctx.pv;
1306 
1307       /* copy data into U(k,:) */
1308       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
1309       jmin      = bi[k] + 1;
1310       jmax      = bi[k + 1];
1311       if (jmin < jmax) {
1312         for (j = jmin; j < jmax; j++) {
1313           col       = bj[j];
1314           ba[j]     = rtmp[col];
1315           rtmp[col] = 0.0;
1316         }
1317         /* add the k-th row into il and jl */
1318         il[k] = jmin;
1319         i     = bj[jmin];
1320         jl[k] = jl[i];
1321         jl[i] = k;
1322       }
1323     }
1324   } while (sctx.newshift);
1325   PetscCall(PetscFree3(rtmp, il, jl));
1326   if (a->permute) PetscCall(PetscFree(aa));
1327 
1328   PetscCall(ISRestoreIndices(ip, &rip));
1329 
1330   C->ops->solve          = MatSolve_SeqSBAIJ_1_inplace;
1331   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1332   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1333   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_inplace;
1334   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_inplace;
1335   C->assembled           = PETSC_TRUE;
1336   C->preallocated        = PETSC_TRUE;
1337 
1338   PetscCall(PetscLogFlops(C->rmap->N));
1339   if (sctx.nshift) {
1340     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1341       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1342     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1343       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1344     }
1345   }
1346   PetscFunctionReturn(PETSC_SUCCESS);
1347 }
1348 
1349 /*
1350   Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1351   Modified from MatCholeskyFactorNumeric_SeqAIJ()
1352 */
1353 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
1354 {
1355   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ *)A->data;
1356   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)B->data;
1357   PetscInt       i, j, mbs = A->rmap->n, *bi = b->i, *bj = b->j, *bdiag = b->diag, *bjtmp;
1358   PetscInt      *ai = a->i, *aj = a->j, *ajtmp;
1359   PetscInt       k, jmin, jmax, *c2r, *il, col, nexti, ili, nz;
1360   MatScalar     *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
1361   FactorShiftCtx sctx;
1362   PetscReal      rs;
1363   MatScalar      d, *v;
1364 
1365   PetscFunctionBegin;
1366   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &c2r));
1367 
1368   /* MatPivotSetUp(): initialize shift context sctx */
1369   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1370 
1371   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1372     sctx.shift_top = info->zeropivot;
1373 
1374     PetscCall(PetscArrayzero(rtmp, mbs));
1375 
1376     for (i = 0; i < mbs; i++) {
1377       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1378       d = (aa)[a->diag[i]];
1379       rtmp[i] += -PetscRealPart(d); /* diagonal entry */
1380       ajtmp = aj + ai[i] + 1;       /* exclude diagonal */
1381       v     = aa + ai[i] + 1;
1382       nz    = ai[i + 1] - ai[i] - 1;
1383       for (j = 0; j < nz; j++) {
1384         rtmp[i] += PetscAbsScalar(v[j]);
1385         rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1386       }
1387       if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1388     }
1389     sctx.shift_top *= 1.1;
1390     sctx.nshift_max = 5;
1391     sctx.shift_lo   = 0.;
1392     sctx.shift_hi   = 1.;
1393   }
1394 
1395   /* allocate working arrays
1396      c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1397      il:  for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1398   */
1399   do {
1400     sctx.newshift = PETSC_FALSE;
1401 
1402     for (i = 0; i < mbs; i++) c2r[i] = mbs;
1403     if (mbs) il[0] = 0;
1404 
1405     for (k = 0; k < mbs; k++) {
1406       /* zero rtmp */
1407       nz    = bi[k + 1] - bi[k];
1408       bjtmp = bj + bi[k];
1409       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
1410 
1411       /* load in initial unfactored row */
1412       bval = ba + bi[k];
1413       jmin = ai[k];
1414       jmax = ai[k + 1];
1415       for (j = jmin; j < jmax; j++) {
1416         col       = aj[j];
1417         rtmp[col] = aa[j];
1418         *bval++   = 0.0; /* for in-place factorization */
1419       }
1420       /* shift the diagonal of the matrix: ZeropivotApply() */
1421       rtmp[k] += sctx.shift_amount; /* shift the diagonal of the matrix */
1422 
1423       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1424       dk = rtmp[k];
1425       i  = c2r[k]; /* first row to be added to k_th row  */
1426 
1427       while (i < k) {
1428         nexti = c2r[i]; /* next row to be added to k_th row */
1429 
1430         /* compute multiplier, update diag(k) and U(i,k) */
1431         ili   = il[i];                   /* index of first nonzero element in U(i,k:bms-1) */
1432         uikdi = -ba[ili] * ba[bdiag[i]]; /* diagonal(k) */
1433         dk += uikdi * ba[ili];           /* update diag[k] */
1434         ba[ili] = uikdi;                 /* -U(i,k) */
1435 
1436         /* add multiple of row i to k-th row */
1437         jmin = ili + 1;
1438         jmax = bi[i + 1];
1439         if (jmin < jmax) {
1440           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1441           /* update il and c2r for row i */
1442           il[i]  = jmin;
1443           j      = bj[jmin];
1444           c2r[i] = c2r[j];
1445           c2r[j] = i;
1446         }
1447         i = nexti;
1448       }
1449 
1450       /* copy data into U(k,:) */
1451       rs   = 0.0;
1452       jmin = bi[k];
1453       jmax = bi[k + 1] - 1;
1454       if (jmin < jmax) {
1455         for (j = jmin; j < jmax; j++) {
1456           col   = bj[j];
1457           ba[j] = rtmp[col];
1458           rs += PetscAbsScalar(ba[j]);
1459         }
1460         /* add the k-th row into il and c2r */
1461         il[k]  = jmin;
1462         i      = bj[jmin];
1463         c2r[k] = c2r[i];
1464         c2r[i] = k;
1465       }
1466 
1467       sctx.rs = rs;
1468       sctx.pv = dk;
1469       PetscCall(MatPivotCheck(B, A, info, &sctx, k));
1470       if (sctx.newshift) break;
1471       dk = sctx.pv;
1472 
1473       ba[bdiag[k]] = 1.0 / dk; /* U(k,k) */
1474     }
1475   } while (sctx.newshift);
1476 
1477   PetscCall(PetscFree3(rtmp, il, c2r));
1478 
1479   B->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1480   B->ops->solves         = MatSolves_SeqSBAIJ_1;
1481   B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1482   B->ops->matsolve       = MatMatSolve_SeqSBAIJ_1_NaturalOrdering;
1483   B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1484   B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1485 
1486   B->assembled    = PETSC_TRUE;
1487   B->preallocated = PETSC_TRUE;
1488 
1489   PetscCall(PetscLogFlops(B->rmap->n));
1490 
1491   /* MatPivotView() */
1492   if (sctx.nshift) {
1493     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1494       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1495     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1496       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1497     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1498       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1499     }
1500   }
1501   PetscFunctionReturn(PETSC_SUCCESS);
1502 }
1503 
1504 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
1505 {
1506   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1507   PetscInt       i, j, mbs = a->mbs;
1508   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
1509   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
1510   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
1511   PetscReal      rs;
1512   FactorShiftCtx sctx;
1513 
1514   PetscFunctionBegin;
1515   /* MatPivotSetUp(): initialize shift context sctx */
1516   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1517 
1518   /* initialization */
1519   /* il and jl record the first nonzero element in each row of the accessing
1520      window U(0:k, k:mbs-1).
1521      jl:    list of rows to be added to uneliminated rows
1522             i>= k: jl(i) is the first row to be added to row i
1523             i<  k: jl(i) is the row following row i in some list of rows
1524             jl(i) = mbs indicates the end of a list
1525      il(i): points to the first nonzero element in U(i,k:mbs-1)
1526   */
1527   PetscCall(PetscMalloc1(mbs, &rtmp));
1528   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
1529 
1530   do {
1531     sctx.newshift = PETSC_FALSE;
1532     il[0]         = 0;
1533     for (i = 0; i < mbs; i++) {
1534       rtmp[i] = 0.0;
1535       jl[i]   = mbs;
1536     }
1537 
1538     for (k = 0; k < mbs; k++) {
1539       /*initialize k-th row with elements nonzero in row perm(k) of A */
1540       nz   = ai[k + 1] - ai[k];
1541       acol = aj + ai[k];
1542       aval = aa + ai[k];
1543       bval = ba + bi[k];
1544       while (nz--) {
1545         rtmp[*acol++] = *aval++;
1546         *bval++       = 0.0; /* for in-place factorization */
1547       }
1548 
1549       /* shift the diagonal of the matrix */
1550       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1551 
1552       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1553       dk = rtmp[k];
1554       i  = jl[k]; /* first row to be added to k_th row  */
1555 
1556       while (i < k) {
1557         nexti = jl[i]; /* next row to be added to k_th row */
1558         /* compute multiplier, update D(k) and U(i,k) */
1559         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1560         uikdi = -ba[ili] * ba[bi[i]];
1561         dk += uikdi * ba[ili];
1562         ba[ili] = uikdi; /* -U(i,k) */
1563 
1564         /* add multiple of row i to k-th row ... */
1565         jmin = ili + 1;
1566         nz   = bi[i + 1] - jmin;
1567         if (nz > 0) {
1568           bcol = bj + jmin;
1569           bval = ba + jmin;
1570           PetscCall(PetscLogFlops(2.0 * nz));
1571           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
1572 
1573           /* update il and jl for i-th row */
1574           il[i] = jmin;
1575           j     = bj[jmin];
1576           jl[i] = jl[j];
1577           jl[j] = i;
1578         }
1579         i = nexti;
1580       }
1581 
1582       /* shift the diagonals when zero pivot is detected */
1583       /* compute rs=sum of abs(off-diagonal) */
1584       rs   = 0.0;
1585       jmin = bi[k] + 1;
1586       nz   = bi[k + 1] - jmin;
1587       if (nz) {
1588         bcol = bj + jmin;
1589         while (nz--) {
1590           rs += PetscAbsScalar(rtmp[*bcol]);
1591           bcol++;
1592         }
1593       }
1594 
1595       sctx.rs = rs;
1596       sctx.pv = dk;
1597       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
1598       if (sctx.newshift) break; /* sctx.shift_amount is updated */
1599       dk = sctx.pv;
1600 
1601       /* copy data into U(k,:) */
1602       ba[bi[k]] = 1.0 / dk;
1603       jmin      = bi[k] + 1;
1604       nz        = bi[k + 1] - jmin;
1605       if (nz) {
1606         bcol = bj + jmin;
1607         bval = ba + jmin;
1608         while (nz--) {
1609           *bval++       = rtmp[*bcol];
1610           rtmp[*bcol++] = 0.0;
1611         }
1612         /* add k-th row into il and jl */
1613         il[k] = jmin;
1614         i     = bj[jmin];
1615         jl[k] = jl[i];
1616         jl[i] = k;
1617       }
1618     } /* end of for (k = 0; k<mbs; k++) */
1619   } while (sctx.newshift);
1620   PetscCall(PetscFree(rtmp));
1621   PetscCall(PetscFree2(il, jl));
1622 
1623   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1624   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1625   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1626   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1627   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1628 
1629   C->assembled    = PETSC_TRUE;
1630   C->preallocated = PETSC_TRUE;
1631 
1632   PetscCall(PetscLogFlops(C->rmap->N));
1633   if (sctx.nshift) {
1634     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1635       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1636     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1637       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1638     }
1639   }
1640   PetscFunctionReturn(PETSC_SUCCESS);
1641 }
1642 
1643 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A, IS perm, const MatFactorInfo *info)
1644 {
1645   Mat C;
1646 
1647   PetscFunctionBegin;
1648   PetscCall(MatGetFactor(A, "petsc", MAT_FACTOR_CHOLESKY, &C));
1649   PetscCall(MatCholeskyFactorSymbolic(C, A, perm, info));
1650   PetscCall(MatCholeskyFactorNumeric(C, A, info));
1651 
1652   A->ops->solve          = C->ops->solve;
1653   A->ops->solvetranspose = C->ops->solvetranspose;
1654 
1655   PetscCall(MatHeaderMerge(A, &C));
1656   PetscFunctionReturn(PETSC_SUCCESS);
1657 }
1658