xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision e8c0849ab8fe171bed529bea27238c9b402db591)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/sbaij/seq/sbaij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscis.h>
5 
MatGetInertia_SeqSBAIJ(Mat F,PetscInt * nneg,PetscInt * nzero,PetscInt * npos)6 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F, PetscInt *nneg, PetscInt *nzero, PetscInt *npos)
7 {
8   Mat_SeqSBAIJ *fact = (Mat_SeqSBAIJ *)F->data;
9   MatScalar    *dd   = fact->a;
10   PetscInt      mbs = fact->mbs, bs = F->rmap->bs, i, nneg_tmp, npos_tmp, *fi = fact->diag;
11 
12   PetscFunctionBegin;
13   PetscCheck(bs == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for bs: %" PetscInt_FMT " >1 yet", bs);
14 
15   nneg_tmp = 0;
16   npos_tmp = 0;
17   if (fi) {
18     for (i = 0; i < mbs; i++) {
19       if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
20       else if (PetscRealPart(dd[*fi]) < 0.0) nneg_tmp++;
21       fi++;
22     }
23   } else {
24     for (i = 0; i < mbs; i++) {
25       if (PetscRealPart(dd[fact->i[i]]) > 0.0) npos_tmp++;
26       else if (PetscRealPart(dd[fact->i[i]]) < 0.0) nneg_tmp++;
27     }
28   }
29   if (nneg) *nneg = nneg_tmp;
30   if (npos) *npos = npos_tmp;
31   if (nzero) *nzero = mbs - nneg_tmp - npos_tmp;
32   PetscFunctionReturn(PETSC_SUCCESS);
33 }
34 
35 /*
36   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
37   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
38 */
MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F,Mat A,IS perm,const MatFactorInfo * info)39 static PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F, Mat A, IS perm, const MatFactorInfo *info)
40 {
41   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
42   const PetscInt *rip, *ai, *aj;
43   PetscInt        i, mbs = a->mbs, *jutmp, bs = A->rmap->bs, bs2 = a->bs2;
44   PetscInt        m, reallocs = 0, prow;
45   PetscInt       *jl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
46   PetscReal       f = info->fill;
47   PetscBool       perm_identity;
48 
49   PetscFunctionBegin;
50   /* check whether perm is the identity mapping */
51   PetscCall(ISIdentity(perm, &perm_identity));
52   PetscCall(ISGetIndices(perm, &rip));
53 
54   if (perm_identity) { /* without permutation */
55     a->permute = PETSC_FALSE;
56 
57     ai = a->i;
58     aj = a->j;
59   } else { /* non-trivial permutation */
60     a->permute = PETSC_TRUE;
61 
62     PetscCall(MatReorderingSeqSBAIJ(A, perm));
63 
64     ai = a->inew;
65     aj = a->jnew;
66   }
67 
68   /* initialization */
69   PetscCall(PetscShmgetAllocateArray(mbs + 1, sizeof(PetscInt), (void **)&iu));
70   umax = (PetscInt)(f * ai[mbs] + 1);
71   umax += mbs + 1;
72   PetscCall(PetscShmgetAllocateArray(umax, sizeof(PetscInt), (void **)&ju));
73   iu[0] = mbs + 1;
74   juidx = mbs + 1; /* index for ju */
75   /* jl linked list for pivot row -- linked list for col index */
76   PetscCall(PetscMalloc2(mbs, &jl, mbs, &q));
77   for (i = 0; i < mbs; i++) {
78     jl[i] = mbs;
79     q[i]  = 0;
80   }
81 
82   /* for each row k */
83   for (k = 0; k < mbs; k++) {
84     for (i = 0; i < mbs; i++) q[i] = 0; /* to be removed! */
85     nzk  = 0;                           /* num. of nz blocks in k-th block row with diagonal block excluded */
86     q[k] = mbs;
87     /* initialize nonzero structure of k-th row to row rip[k] of A */
88     jmin = ai[rip[k]] + 1; /* exclude diag[k] */
89     jmax = ai[rip[k] + 1];
90     for (j = jmin; j < jmax; j++) {
91       vj = rip[aj[j]]; /* col. value */
92       if (vj > k) {
93         qm = k;
94         do {
95           m  = qm;
96           qm = q[m];
97         } while (qm < vj);
98         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
99         nzk++;
100         q[m]  = vj;
101         q[vj] = qm;
102       } /* if (vj > k) */
103     } /* for (j=jmin; j<jmax; j++) */
104 
105     /* modify nonzero structure of k-th row by computing fill-in
106        for each row i to be merged in */
107     prow = k;
108     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
109 
110     while (prow < k) {
111       /* merge row prow into k-th row */
112       jmin = iu[prow] + 1;
113       jmax = iu[prow + 1];
114       qm   = k;
115       for (j = jmin; j < jmax; j++) {
116         vj = ju[j];
117         do {
118           m  = qm;
119           qm = q[m];
120         } while (qm < vj);
121         if (qm != vj) {
122           nzk++;
123           q[m]  = vj;
124           q[vj] = qm;
125           qm    = vj;
126         }
127       }
128       prow = jl[prow]; /* next pivot row */
129     }
130 
131     /* add k to row list for first nonzero element in k-th row */
132     if (nzk > 0) {
133       i     = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
134       jl[k] = jl[i];
135       jl[i] = k;
136     }
137     iu[k + 1] = iu[k] + nzk;
138 
139     /* allocate more space to ju if needed */
140     if (iu[k + 1] > umax) {
141       /* estimate how much additional space we will need */
142       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
143       /* just double the memory each time */
144       maxadd = umax;
145       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
146       umax += maxadd;
147 
148       /* allocate a longer ju */
149       PetscCall(PetscShmgetAllocateArray(umax, sizeof(PetscInt), (void **)&jutmp));
150       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
151       PetscCall(PetscShmgetDeallocateArray((void **)&ju));
152       ju = jutmp;
153       reallocs++; /* count how many times we realloc */
154     }
155 
156     /* save nonzero structure of k-th row in ju */
157     i = k;
158     while (nzk--) {
159       i           = q[i];
160       ju[juidx++] = i;
161     }
162   }
163 
164 #if defined(PETSC_USE_INFO)
165   if (ai[mbs] != 0) {
166     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
167     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
168     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
169     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
170     PetscCall(PetscInfo(A, "for best performance.\n"));
171   } else {
172     PetscCall(PetscInfo(A, "Empty matrix\n"));
173   }
174 #endif
175 
176   PetscCall(ISRestoreIndices(perm, &rip));
177   PetscCall(PetscFree2(jl, q));
178 
179   /* put together the new matrix */
180   PetscCall(MatSeqSBAIJSetPreallocation(F, bs, MAT_SKIP_ALLOCATION, NULL));
181 
182   b          = (Mat_SeqSBAIJ *)F->data;
183   b->free_ij = PETSC_TRUE;
184   PetscCall(PetscShmgetAllocateArray((iu[mbs] + 1) * bs2, sizeof(PetscScalar), (void **)&b->a));
185   b->free_a = PETSC_TRUE;
186   b->j      = ju;
187   b->i      = iu;
188   b->diag   = NULL;
189   b->ilen   = NULL;
190   b->imax   = NULL;
191   b->row    = perm;
192 
193   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
194 
195   PetscCall(PetscObjectReference((PetscObject)perm));
196 
197   b->icol = perm;
198   PetscCall(PetscObjectReference((PetscObject)perm));
199   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
200   /* In b structure:  Free imax, ilen, old a, old j.
201      Allocate idnew, solve_work, new a, new j */
202   b->maxnz = b->nz = iu[mbs];
203 
204   F->info.factor_mallocs   = reallocs;
205   F->info.fill_ratio_given = f;
206   if (ai[mbs] != 0) {
207     F->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
208   } else {
209     F->info.fill_ratio_needed = 0.0;
210   }
211   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(F, perm_identity));
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 /*
215     Symbolic U^T*D*U factorization for SBAIJ format.
216     See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
217 */
218 #include <petscbt.h>
219 #include <../src/mat/utils/freespace.h>
MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo * info)220 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
221 {
222   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
223   Mat_SeqSBAIJ      *b;
224   PetscBool          perm_identity;
225   PetscReal          fill = info->fill;
226   const PetscInt    *rip, *ai = a->i, *aj = a->j;
227   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow;
228   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
229   PetscInt           nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr, *udiag;
230   PetscFreeSpaceList free_space = NULL, current_space = NULL;
231   PetscBT            lnkbt;
232   PetscBool          diagDense;
233 
234   PetscFunctionBegin;
235   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
236   PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, NULL, &diagDense));
237   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
238   if (bs > 1) {
239     PetscCall(MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
240     PetscFunctionReturn(PETSC_SUCCESS);
241   }
242 
243   /* check whether perm is the identity mapping */
244   PetscCall(ISIdentity(perm, &perm_identity));
245   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
246   a->permute = PETSC_FALSE;
247   PetscCall(ISGetIndices(perm, &rip));
248 
249   /* initialization */
250   PetscCall(PetscShmgetAllocateArray(mbs + 1, sizeof(PetscInt), (void **)&ui));
251   PetscCall(PetscMalloc1(mbs + 1, &udiag));
252   ui[0] = 0;
253 
254   /* jl: linked list for storing indices of the pivot rows
255      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
256   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
257   for (i = 0; i < mbs; i++) {
258     jl[i] = mbs;
259     il[i] = 0;
260   }
261 
262   /* create and initialize a linked list for storing column indices of the active row k */
263   nlnk = mbs + 1;
264   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
265 
266   /* initial FreeSpace size is fill*(ai[mbs]+1) */
267   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space));
268   current_space = free_space;
269 
270   for (k = 0; k < mbs; k++) { /* for each active row k */
271     /* initialize lnk by the column indices of row rip[k] of A */
272     nzk   = 0;
273     ncols = ai[k + 1] - ai[k];
274     PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
275     for (j = 0; j < ncols; j++) {
276       i       = *(aj + ai[k] + j);
277       cols[j] = i;
278     }
279     PetscCall(PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt));
280     nzk += nlnk;
281 
282     /* update lnk by computing fill-in for each pivot row to be merged in */
283     prow = jl[k]; /* 1st pivot row */
284 
285     while (prow < k) {
286       nextprow = jl[prow];
287       /* merge prow into k-th row */
288       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
289       jmax   = ui[prow + 1];
290       ncols  = jmax - jmin;
291       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
292       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
293       nzk += nlnk;
294 
295       /* update il and jl for prow */
296       if (jmin < jmax) {
297         il[prow] = jmin;
298         j        = *uj_ptr;
299         jl[prow] = jl[j];
300         jl[j]    = prow;
301       }
302       prow = nextprow;
303     }
304 
305     /* if free space is not available, make more free space */
306     if (current_space->local_remaining < nzk) {
307       i = mbs - k + 1;                                   /* num of unfactored rows */
308       i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
309       PetscCall(PetscFreeSpaceGet(i, &current_space));
310       reallocs++;
311     }
312 
313     /* copy data into free space, then initialize lnk */
314     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
315 
316     /* add the k-th row into il and jl */
317     if (nzk > 1) {
318       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
319       jl[k] = jl[i];
320       jl[i] = k;
321       il[k] = ui[k] + 1;
322     }
323     ui_ptr[k] = current_space->array;
324 
325     current_space->array += nzk;
326     current_space->local_used += nzk;
327     current_space->local_remaining -= nzk;
328 
329     ui[k + 1] = ui[k] + nzk;
330   }
331 
332   PetscCall(ISRestoreIndices(perm, &rip));
333   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
334 
335   /* destroy list of free space and other temporary array(s) */
336   PetscCall(PetscShmgetAllocateArray(ui[mbs], sizeof(PetscInt), (void **)&uj));
337   PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, mbs, ui, udiag)); /* store matrix factor */
338   PetscCall(PetscLLDestroy(lnk, lnkbt));
339 
340   /* put together the new matrix in MATSEQSBAIJ format */
341   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
342 
343   b          = (Mat_SeqSBAIJ *)fact->data;
344   b->free_ij = PETSC_TRUE;
345   PetscCall(PetscShmgetAllocateArray(ui[mbs], sizeof(PetscScalar), (void **)&b->a));
346   b->free_a = PETSC_TRUE;
347   b->j      = uj;
348   b->i      = ui;
349   b->diag   = udiag;
350   b->ilen   = NULL;
351   b->imax   = NULL;
352   b->row    = perm;
353   b->icol   = perm;
354 
355   PetscCall(PetscObjectReference((PetscObject)perm));
356   PetscCall(PetscObjectReference((PetscObject)perm));
357 
358   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
359 
360   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
361 
362   b->maxnz = b->nz = ui[mbs];
363 
364   fact->info.factor_mallocs   = reallocs;
365   fact->info.fill_ratio_given = fill;
366   if (ai[mbs] != 0) {
367     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
368   } else {
369     fact->info.fill_ratio_needed = 0.0;
370   }
371 #if defined(PETSC_USE_INFO)
372   if (ai[mbs] != 0) {
373     PetscReal af = fact->info.fill_ratio_needed;
374     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
375     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
376     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
377   } else {
378     PetscCall(PetscInfo(A, "Empty matrix\n"));
379   }
380 #endif
381   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
382   PetscFunctionReturn(PETSC_SUCCESS);
383 }
384 
MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo * info)385 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
386 {
387   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
388   Mat_SeqSBAIJ      *b;
389   PetscBool          perm_identity;
390   PetscReal          fill = info->fill;
391   const PetscInt    *rip, *ai, *aj;
392   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, reallocs = 0, prow;
393   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
394   PetscInt           nlnk, *lnk, ncols, *cols, *uj, **ui_ptr, *uj_ptr;
395   PetscFreeSpaceList free_space = NULL, current_space = NULL;
396   PetscBT            lnkbt;
397   PetscBool          diagDense;
398 
399   PetscFunctionBegin;
400   PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, NULL, &diagDense));
401   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
402 
403   /*
404    This code originally uses Modified Sparse Row (MSR) storage
405    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
406    Then it is rewritten so the factor B takes seqsbaij format. However the associated
407    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
408    thus the original code in MSR format is still used for these cases.
409    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
410    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
411   */
412   if (bs > 1) {
413     PetscCall(MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
414     PetscFunctionReturn(PETSC_SUCCESS);
415   }
416 
417   /* check whether perm is the identity mapping */
418   PetscCall(ISIdentity(perm, &perm_identity));
419   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
420   a->permute = PETSC_FALSE;
421   ai         = a->i;
422   aj         = a->j;
423   PetscCall(ISGetIndices(perm, &rip));
424 
425   /* initialization */
426   PetscCall(PetscShmgetAllocateArray(mbs + 1, sizeof(PetscInt), (void **)&ui));
427   ui[0] = 0;
428 
429   /* jl: linked list for storing indices of the pivot rows
430      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
431   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
432   for (i = 0; i < mbs; i++) {
433     jl[i] = mbs;
434     il[i] = 0;
435   }
436 
437   /* create and initialize a linked list for storing column indices of the active row k */
438   nlnk = mbs + 1;
439   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
440 
441   /* initial FreeSpace size is fill*(ai[mbs]+1) */
442   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[mbs] + 1), &free_space));
443   current_space = free_space;
444 
445   for (k = 0; k < mbs; k++) { /* for each active row k */
446     /* initialize lnk by the column indices of row rip[k] of A */
447     nzk   = 0;
448     ncols = ai[rip[k] + 1] - ai[rip[k]];
449     for (j = 0; j < ncols; j++) {
450       i       = *(aj + ai[rip[k]] + j);
451       cols[j] = rip[i];
452     }
453     PetscCall(PetscLLAdd(ncols, cols, mbs, &nlnk, lnk, lnkbt));
454     nzk += nlnk;
455 
456     /* update lnk by computing fill-in for each pivot row to be merged in */
457     prow = jl[k]; /* 1st pivot row */
458 
459     while (prow < k) {
460       nextprow = jl[prow];
461       /* merge prow into k-th row */
462       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
463       jmax   = ui[prow + 1];
464       ncols  = jmax - jmin;
465       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
466       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
467       nzk += nlnk;
468 
469       /* update il and jl for prow */
470       if (jmin < jmax) {
471         il[prow] = jmin;
472 
473         j        = *uj_ptr;
474         jl[prow] = jl[j];
475         jl[j]    = prow;
476       }
477       prow = nextprow;
478     }
479 
480     /* if free space is not available, make more free space */
481     if (current_space->local_remaining < nzk) {
482       i = mbs - k + 1;                                                            /* num of unfactored rows */
483       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
484       PetscCall(PetscFreeSpaceGet(i, &current_space));
485       reallocs++;
486     }
487 
488     /* copy data into free space, then initialize lnk */
489     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
490 
491     /* add the k-th row into il and jl */
492     if (nzk - 1 > 0) {
493       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
494       jl[k] = jl[i];
495       jl[i] = k;
496       il[k] = ui[k] + 1;
497     }
498     ui_ptr[k] = current_space->array;
499 
500     current_space->array += nzk;
501     current_space->local_used += nzk;
502     current_space->local_remaining -= nzk;
503 
504     ui[k + 1] = ui[k] + nzk;
505   }
506 
507   PetscCall(ISRestoreIndices(perm, &rip));
508   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
509 
510   /* destroy list of free space and other temporary array(s) */
511   PetscCall(PetscShmgetAllocateArray(ui[mbs] + 1, sizeof(PetscInt), (void **)&uj));
512   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
513   PetscCall(PetscLLDestroy(lnk, lnkbt));
514 
515   /* put together the new matrix in MATSEQSBAIJ format */
516   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
517 
518   b = (Mat_SeqSBAIJ *)fact->data;
519   PetscCall(PetscShmgetAllocateArray(ui[mbs] + 1, sizeof(PetscScalar), (void **)&b->a));
520   b->free_a  = PETSC_TRUE;
521   b->free_ij = PETSC_TRUE;
522   b->j       = uj;
523   b->i       = ui;
524   b->diag    = NULL;
525   b->ilen    = NULL;
526   b->imax    = NULL;
527   b->row     = perm;
528 
529   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
530 
531   PetscCall(PetscObjectReference((PetscObject)perm));
532   b->icol = perm;
533   PetscCall(PetscObjectReference((PetscObject)perm));
534   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
535   b->maxnz = b->nz = ui[mbs];
536 
537   fact->info.factor_mallocs   = reallocs;
538   fact->info.fill_ratio_given = fill;
539   if (ai[mbs] != 0) {
540     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs]) / ai[mbs];
541   } else {
542     fact->info.fill_ratio_needed = 0.0;
543   }
544 #if defined(PETSC_USE_INFO)
545   if (ai[mbs] != 0) {
546     PetscReal af = fact->info.fill_ratio_needed;
547     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
548     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
549     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
550   } else {
551     PetscCall(PetscInfo(A, "Empty matrix\n"));
552   }
553 #endif
554   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(fact, perm_identity));
555   PetscFunctionReturn(PETSC_SUCCESS);
556 }
557 
MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C,Mat A,const MatFactorInfo * info)558 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
559 {
560   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
561   IS              perm = b->row;
562   const PetscInt *ai, *aj, *perm_ptr, mbs = a->mbs, *bi = b->i, *bj = b->j;
563   PetscInt        i, j;
564   PetscInt       *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
565   PetscInt        bs = A->rmap->bs, bs2 = a->bs2;
566   MatScalar      *ba = b->a, *aa, *ap, *dk, *uik;
567   MatScalar      *u, *diag, *rtmp, *rtmp_ptr;
568   MatScalar      *work;
569   PetscInt       *pivots;
570   PetscBool       allowzeropivot, zeropivotdetected;
571 
572   PetscFunctionBegin;
573   /* initialization */
574   PetscCall(PetscCalloc1(bs2 * mbs, &rtmp));
575   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
576   allowzeropivot = PetscNot(A->erroriffailure);
577 
578   il[0] = 0;
579   for (i = 0; i < mbs; i++) jl[i] = mbs;
580 
581   PetscCall(PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work));
582   PetscCall(PetscMalloc1(bs, &pivots));
583 
584   PetscCall(ISGetIndices(perm, &perm_ptr));
585 
586   /* check permutation */
587   if (!a->permute) {
588     ai = a->i;
589     aj = a->j;
590     aa = a->a;
591   } else {
592     ai = a->inew;
593     aj = a->jnew;
594     PetscCall(PetscMalloc1(bs2 * ai[mbs], &aa));
595     PetscCall(PetscArraycpy(aa, a->a, bs2 * ai[mbs]));
596     PetscCall(PetscMalloc1(ai[mbs], &a2anew));
597     PetscCall(PetscArraycpy(a2anew, a->a2anew, ai[mbs]));
598 
599     for (i = 0; i < mbs; i++) {
600       jmin = ai[i];
601       jmax = ai[i + 1];
602       for (j = jmin; j < jmax; j++) {
603         while (a2anew[j] != j) {
604           k         = a2anew[j];
605           a2anew[j] = a2anew[k];
606           a2anew[k] = k;
607           for (k1 = 0; k1 < bs2; k1++) {
608             dk[k1]           = aa[k * bs2 + k1];
609             aa[k * bs2 + k1] = aa[j * bs2 + k1];
610             aa[j * bs2 + k1] = dk[k1];
611           }
612         }
613         /* transform column-oriented blocks that lie in the lower triangle to row-oriented blocks */
614         if (i > aj[j]) {
615           ap = aa + j * bs2;                       /* ptr to the beginning of j-th block of aa */
616           for (k = 0; k < bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
617           for (k = 0; k < bs; k++) {               /* j-th block of aa <- dk^T */
618             for (k1 = 0; k1 < bs; k1++) *ap++ = dk[k + bs * k1];
619           }
620         }
621       }
622     }
623     PetscCall(PetscFree(a2anew));
624   }
625 
626   /* for each row k */
627   for (k = 0; k < mbs; k++) {
628     /*initialize k-th row with elements nonzero in row perm(k) of A */
629     jmin = ai[perm_ptr[k]];
630     jmax = ai[perm_ptr[k] + 1];
631 
632     ap = aa + jmin * bs2;
633     for (j = jmin; j < jmax; j++) {
634       vj       = perm_ptr[aj[j]]; /* block col. index */
635       rtmp_ptr = rtmp + vj * bs2;
636       for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
637     }
638 
639     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
640     PetscCall(PetscArraycpy(dk, rtmp + k * bs2, bs2));
641     i = jl[k]; /* first row to be added to k_th row  */
642 
643     while (i < k) {
644       nexti = jl[i]; /* next row to be added to k_th row */
645 
646       /* compute multiplier */
647       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
648 
649       /* uik = -inv(Di)*U_bar(i,k) */
650       diag = ba + i * bs2;
651       u    = ba + ili * bs2;
652       PetscCall(PetscArrayzero(uik, bs2));
653       PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);
654 
655       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
656       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
657       PetscCall(PetscLogFlops(4.0 * bs * bs2));
658 
659       /* update -U(i,k) */
660       PetscCall(PetscArraycpy(ba + ili * bs2, uik, bs2));
661 
662       /* add multiple of row i to k-th row ... */
663       jmin = ili + 1;
664       jmax = bi[i + 1];
665       if (jmin < jmax) {
666         for (j = jmin; j < jmax; j++) {
667           /* rtmp += -U(i,k)^T * U_bar(i,j) */
668           rtmp_ptr = rtmp + bj[j] * bs2;
669           u        = ba + j * bs2;
670           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
671         }
672         PetscCall(PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin)));
673 
674         /* ... add i to row list for next nonzero entry */
675         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
676         j     = bj[jmin];
677         jl[i] = jl[j];
678         jl[j] = i; /* update jl */
679       }
680       i = nexti;
681     }
682 
683     /* save nonzero entries in k-th row of U ... */
684 
685     /* invert diagonal block */
686     diag = ba + k * bs2;
687     PetscCall(PetscArraycpy(diag, dk, bs2));
688 
689     PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected));
690     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
691 
692     jmin = bi[k];
693     jmax = bi[k + 1];
694     if (jmin < jmax) {
695       for (j = jmin; j < jmax; j++) {
696         vj       = bj[j]; /* block col. index of U */
697         u        = ba + j * bs2;
698         rtmp_ptr = rtmp + vj * bs2;
699         for (k1 = 0; k1 < bs2; k1++) {
700           *u++        = *rtmp_ptr;
701           *rtmp_ptr++ = 0.0;
702         }
703       }
704 
705       /* ... add k to row list for first nonzero entry in k-th row */
706       il[k] = jmin;
707       i     = bj[jmin];
708       jl[k] = jl[i];
709       jl[i] = k;
710     }
711   }
712 
713   PetscCall(PetscFree(rtmp));
714   PetscCall(PetscFree2(il, jl));
715   PetscCall(PetscFree3(dk, uik, work));
716   PetscCall(PetscFree(pivots));
717   if (a->permute) PetscCall(PetscFree(aa));
718 
719   PetscCall(ISRestoreIndices(perm, &perm_ptr));
720 
721   C->ops->solve          = MatSolve_SeqSBAIJ_N_inplace;
722   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
723   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_inplace;
724   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_inplace;
725 
726   C->assembled    = PETSC_TRUE;
727   C->preallocated = PETSC_TRUE;
728 
729   PetscCall(PetscLogFlops(1.3333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
730   PetscFunctionReturn(PETSC_SUCCESS);
731 }
732 
MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo * info)733 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
734 {
735   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
736   PetscInt      i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
737   PetscInt     *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
738   PetscInt      bs = A->rmap->bs, bs2 = a->bs2;
739   MatScalar    *ba = b->a, *aa, *ap, *dk, *uik;
740   MatScalar    *u, *diag, *rtmp, *rtmp_ptr;
741   MatScalar    *work;
742   PetscInt     *pivots;
743   PetscBool     allowzeropivot, zeropivotdetected;
744 
745   PetscFunctionBegin;
746   PetscCall(PetscCalloc1(bs2 * mbs, &rtmp));
747   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
748   il[0] = 0;
749   for (i = 0; i < mbs; i++) jl[i] = mbs;
750 
751   PetscCall(PetscMalloc3(bs2, &dk, bs2, &uik, bs, &work));
752   PetscCall(PetscMalloc1(bs, &pivots));
753   allowzeropivot = PetscNot(A->erroriffailure);
754 
755   ai = a->i;
756   aj = a->j;
757   aa = a->a;
758 
759   /* for each row k */
760   for (k = 0; k < mbs; k++) {
761     /*initialize k-th row with elements nonzero in row k of A */
762     jmin = ai[k];
763     jmax = ai[k + 1];
764     ap   = aa + jmin * bs2;
765     for (j = jmin; j < jmax; j++) {
766       vj       = aj[j]; /* block col. index */
767       rtmp_ptr = rtmp + vj * bs2;
768       for (i = 0; i < bs2; i++) *rtmp_ptr++ = *ap++;
769     }
770 
771     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
772     PetscCall(PetscArraycpy(dk, rtmp + k * bs2, bs2));
773     i = jl[k]; /* first row to be added to k_th row  */
774 
775     while (i < k) {
776       nexti = jl[i]; /* next row to be added to k_th row */
777 
778       /* compute multiplier */
779       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
780 
781       /* uik = -inv(Di)*U_bar(i,k) */
782       diag = ba + i * bs2;
783       u    = ba + ili * bs2;
784       PetscCall(PetscArrayzero(uik, bs2));
785       PetscKernel_A_gets_A_minus_B_times_C(bs, uik, diag, u);
786 
787       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
788       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, dk, uik, u);
789       PetscCall(PetscLogFlops(2.0 * bs * bs2));
790 
791       /* update -U(i,k) */
792       PetscCall(PetscArraycpy(ba + ili * bs2, uik, bs2));
793 
794       /* add multiple of row i to k-th row ... */
795       jmin = ili + 1;
796       jmax = bi[i + 1];
797       if (jmin < jmax) {
798         for (j = jmin; j < jmax; j++) {
799           /* rtmp += -U(i,k)^T * U_bar(i,j) */
800           rtmp_ptr = rtmp + bj[j] * bs2;
801           u        = ba + j * bs2;
802           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, rtmp_ptr, uik, u);
803         }
804         PetscCall(PetscLogFlops(2.0 * bs * bs2 * (jmax - jmin)));
805 
806         /* ... add i to row list for next nonzero entry */
807         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
808         j     = bj[jmin];
809         jl[i] = jl[j];
810         jl[j] = i; /* update jl */
811       }
812       i = nexti;
813     }
814 
815     /* save nonzero entries in k-th row of U ... */
816 
817     /* invert diagonal block */
818     diag = ba + k * bs2;
819     PetscCall(PetscArraycpy(diag, dk, bs2));
820 
821     PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, pivots, work, allowzeropivot, &zeropivotdetected));
822     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
823 
824     jmin = bi[k];
825     jmax = bi[k + 1];
826     if (jmin < jmax) {
827       for (j = jmin; j < jmax; j++) {
828         vj       = bj[j]; /* block col. index of U */
829         u        = ba + j * bs2;
830         rtmp_ptr = rtmp + vj * bs2;
831         for (k1 = 0; k1 < bs2; k1++) {
832           *u++        = *rtmp_ptr;
833           *rtmp_ptr++ = 0.0;
834         }
835       }
836 
837       /* ... add k to row list for first nonzero entry in k-th row */
838       il[k] = jmin;
839       i     = bj[jmin];
840       jl[k] = jl[i];
841       jl[i] = k;
842     }
843   }
844 
845   PetscCall(PetscFree(rtmp));
846   PetscCall(PetscFree2(il, jl));
847   PetscCall(PetscFree3(dk, uik, work));
848   PetscCall(PetscFree(pivots));
849 
850   C->ops->solve          = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
851   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
852   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
853   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
854   C->assembled           = PETSC_TRUE;
855   C->preallocated        = PETSC_TRUE;
856 
857   PetscCall(PetscLogFlops(1.3333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
858   PetscFunctionReturn(PETSC_SUCCESS);
859 }
860 
861 /*
862     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
863     Version for blocks 2 by 2.
864 */
MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C,Mat A,const MatFactorInfo * info)865 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C, Mat A, const MatFactorInfo *info)
866 {
867   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
868   IS              perm = b->row;
869   const PetscInt *ai, *aj, *perm_ptr;
870   PetscInt        i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
871   PetscInt       *a2anew, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
872   MatScalar      *ba = b->a, *aa, *ap;
873   MatScalar      *u, *diag, *rtmp, *rtmp_ptr, dk[4], uik[4];
874   PetscReal       shift = info->shiftamount;
875   PetscBool       allowzeropivot, zeropivotdetected;
876 
877   PetscFunctionBegin;
878   allowzeropivot = PetscNot(A->erroriffailure);
879 
880   /* initialization */
881   /* il and jl record the first nonzero element in each row of the accessing
882      window U(0:k, k:mbs-1).
883      jl:    list of rows to be added to uneliminated rows
884             i>= k: jl(i) is the first row to be added to row i
885             i<  k: jl(i) is the row following row i in some list of rows
886             jl(i) = mbs indicates the end of a list
887      il(i): points to the first nonzero element in columns k,...,mbs-1 of
888             row i of U */
889   PetscCall(PetscCalloc1(4 * mbs, &rtmp));
890   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
891   il[0] = 0;
892   for (i = 0; i < mbs; i++) jl[i] = mbs;
893 
894   PetscCall(ISGetIndices(perm, &perm_ptr));
895 
896   /* check permutation */
897   if (!a->permute) {
898     ai = a->i;
899     aj = a->j;
900     aa = a->a;
901   } else {
902     ai = a->inew;
903     aj = a->jnew;
904     PetscCall(PetscMalloc1(4 * ai[mbs], &aa));
905     PetscCall(PetscArraycpy(aa, a->a, 4 * ai[mbs]));
906     PetscCall(PetscMalloc1(ai[mbs], &a2anew));
907     PetscCall(PetscArraycpy(a2anew, a->a2anew, ai[mbs]));
908 
909     for (i = 0; i < mbs; i++) {
910       jmin = ai[i];
911       jmax = ai[i + 1];
912       for (j = jmin; j < jmax; j++) {
913         while (a2anew[j] != j) {
914           k         = a2anew[j];
915           a2anew[j] = a2anew[k];
916           a2anew[k] = k;
917           for (k1 = 0; k1 < 4; k1++) {
918             dk[k1]         = aa[k * 4 + k1];
919             aa[k * 4 + k1] = aa[j * 4 + k1];
920             aa[j * 4 + k1] = dk[k1];
921           }
922         }
923         /* transform column-oriented blocks that lie in the lower triangle to row-oriented blocks */
924         if (i > aj[j]) {
925           ap    = aa + j * 4; /* ptr to the beginning of the block */
926           dk[1] = ap[1];      /* swap ap[1] and ap[2] */
927           ap[1] = ap[2];
928           ap[2] = dk[1];
929         }
930       }
931     }
932     PetscCall(PetscFree(a2anew));
933   }
934 
935   /* for each row k */
936   for (k = 0; k < mbs; k++) {
937     /*initialize k-th row with elements nonzero in row perm(k) of A */
938     jmin = ai[perm_ptr[k]];
939     jmax = ai[perm_ptr[k] + 1];
940     ap   = aa + jmin * 4;
941     for (j = jmin; j < jmax; j++) {
942       vj       = perm_ptr[aj[j]]; /* block col. index */
943       rtmp_ptr = rtmp + vj * 4;
944       for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
945     }
946 
947     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
948     PetscCall(PetscArraycpy(dk, rtmp + k * 4, 4));
949     i = jl[k]; /* first row to be added to k_th row  */
950 
951     while (i < k) {
952       nexti = jl[i]; /* next row to be added to k_th row */
953 
954       /* compute multiplier */
955       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
956 
957       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
958       diag   = ba + i * 4;
959       u      = ba + ili * 4;
960       uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
961       uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
962       uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
963       uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);
964 
965       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
966       dk[0] += uik[0] * u[0] + uik[1] * u[1];
967       dk[1] += uik[2] * u[0] + uik[3] * u[1];
968       dk[2] += uik[0] * u[2] + uik[1] * u[3];
969       dk[3] += uik[2] * u[2] + uik[3] * u[3];
970 
971       PetscCall(PetscLogFlops(16.0 * 2.0));
972 
973       /* update -U(i,k): ba[ili] = uik */
974       PetscCall(PetscArraycpy(ba + ili * 4, uik, 4));
975 
976       /* add multiple of row i to k-th row ... */
977       jmin = ili + 1;
978       jmax = bi[i + 1];
979       if (jmin < jmax) {
980         for (j = jmin; j < jmax; j++) {
981           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
982           rtmp_ptr = rtmp + bj[j] * 4;
983           u        = ba + j * 4;
984           rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
985           rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
986           rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
987           rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
988         }
989         PetscCall(PetscLogFlops(16.0 * (jmax - jmin)));
990 
991         /* ... add i to row list for next nonzero entry */
992         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
993         j     = bj[jmin];
994         jl[i] = jl[j];
995         jl[j] = i; /* update jl */
996       }
997       i = nexti;
998     }
999 
1000     /* save nonzero entries in k-th row of U ... */
1001 
1002     /* invert diagonal block */
1003     diag = ba + k * 4;
1004     PetscCall(PetscArraycpy(diag, dk, 4));
1005     PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
1006     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1007 
1008     jmin = bi[k];
1009     jmax = bi[k + 1];
1010     if (jmin < jmax) {
1011       for (j = jmin; j < jmax; j++) {
1012         vj       = bj[j]; /* block col. index of U */
1013         u        = ba + j * 4;
1014         rtmp_ptr = rtmp + vj * 4;
1015         for (k1 = 0; k1 < 4; k1++) {
1016           *u++        = *rtmp_ptr;
1017           *rtmp_ptr++ = 0.0;
1018         }
1019       }
1020 
1021       /* ... add k to row list for first nonzero entry in k-th row */
1022       il[k] = jmin;
1023       i     = bj[jmin];
1024       jl[k] = jl[i];
1025       jl[i] = k;
1026     }
1027   }
1028 
1029   PetscCall(PetscFree(rtmp));
1030   PetscCall(PetscFree2(il, jl));
1031   if (a->permute) PetscCall(PetscFree(aa));
1032   PetscCall(ISRestoreIndices(perm, &perm_ptr));
1033 
1034   C->ops->solve          = MatSolve_SeqSBAIJ_2_inplace;
1035   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1036   C->assembled           = PETSC_TRUE;
1037   C->preallocated        = PETSC_TRUE;
1038 
1039   PetscCall(PetscLogFlops(1.3333 * 8 * b->mbs)); /* from inverting diagonal blocks */
1040   PetscFunctionReturn(PETSC_SUCCESS);
1041 }
1042 
1043 /*
1044       Version for when blocks are 2 by 2 Using natural ordering
1045 */
MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo * info)1046 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
1047 {
1048   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1049   PetscInt      i, j, mbs = a->mbs, *bi = b->i, *bj = b->j;
1050   PetscInt     *ai, *aj, k, k1, jmin, jmax, *jl, *il, vj, nexti, ili;
1051   MatScalar    *ba = b->a, *aa, *ap, dk[8], uik[8];
1052   MatScalar    *u, *diag, *rtmp, *rtmp_ptr;
1053   PetscReal     shift = info->shiftamount;
1054   PetscBool     allowzeropivot, zeropivotdetected;
1055 
1056   PetscFunctionBegin;
1057   allowzeropivot = PetscNot(A->erroriffailure);
1058 
1059   /* initialization */
1060   /* il and jl record the first nonzero element in each row of the accessing
1061      window U(0:k, k:mbs-1).
1062      jl:    list of rows to be added to uneliminated rows
1063             i>= k: jl(i) is the first row to be added to row i
1064             i<  k: jl(i) is the row following row i in some list of rows
1065             jl(i) = mbs indicates the end of a list
1066      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1067             row i of U */
1068   PetscCall(PetscCalloc1(4 * mbs, &rtmp));
1069   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
1070   il[0] = 0;
1071   for (i = 0; i < mbs; i++) jl[i] = mbs;
1072 
1073   ai = a->i;
1074   aj = a->j;
1075   aa = a->a;
1076 
1077   /* for each row k */
1078   for (k = 0; k < mbs; k++) {
1079     /*initialize k-th row with elements nonzero in row k of A */
1080     jmin = ai[k];
1081     jmax = ai[k + 1];
1082     ap   = aa + jmin * 4;
1083     for (j = jmin; j < jmax; j++) {
1084       vj       = aj[j]; /* block col. index */
1085       rtmp_ptr = rtmp + vj * 4;
1086       for (i = 0; i < 4; i++) *rtmp_ptr++ = *ap++;
1087     }
1088 
1089     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1090     PetscCall(PetscArraycpy(dk, rtmp + k * 4, 4));
1091     i = jl[k]; /* first row to be added to k_th row  */
1092 
1093     while (i < k) {
1094       nexti = jl[i]; /* next row to be added to k_th row */
1095 
1096       /* compute multiplier */
1097       ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1098 
1099       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1100       diag   = ba + i * 4;
1101       u      = ba + ili * 4;
1102       uik[0] = -(diag[0] * u[0] + diag[2] * u[1]);
1103       uik[1] = -(diag[1] * u[0] + diag[3] * u[1]);
1104       uik[2] = -(diag[0] * u[2] + diag[2] * u[3]);
1105       uik[3] = -(diag[1] * u[2] + diag[3] * u[3]);
1106 
1107       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1108       dk[0] += uik[0] * u[0] + uik[1] * u[1];
1109       dk[1] += uik[2] * u[0] + uik[3] * u[1];
1110       dk[2] += uik[0] * u[2] + uik[1] * u[3];
1111       dk[3] += uik[2] * u[2] + uik[3] * u[3];
1112 
1113       PetscCall(PetscLogFlops(16.0 * 2.0));
1114 
1115       /* update -U(i,k): ba[ili] = uik */
1116       PetscCall(PetscArraycpy(ba + ili * 4, uik, 4));
1117 
1118       /* add multiple of row i to k-th row ... */
1119       jmin = ili + 1;
1120       jmax = bi[i + 1];
1121       if (jmin < jmax) {
1122         for (j = jmin; j < jmax; j++) {
1123           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1124           rtmp_ptr = rtmp + bj[j] * 4;
1125           u        = ba + j * 4;
1126           rtmp_ptr[0] += uik[0] * u[0] + uik[1] * u[1];
1127           rtmp_ptr[1] += uik[2] * u[0] + uik[3] * u[1];
1128           rtmp_ptr[2] += uik[0] * u[2] + uik[1] * u[3];
1129           rtmp_ptr[3] += uik[2] * u[2] + uik[3] * u[3];
1130         }
1131         PetscCall(PetscLogFlops(16.0 * (jmax - jmin)));
1132 
1133         /* ... add i to row list for next nonzero entry */
1134         il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1135         j     = bj[jmin];
1136         jl[i] = jl[j];
1137         jl[j] = i; /* update jl */
1138       }
1139       i = nexti;
1140     }
1141 
1142     /* save nonzero entries in k-th row of U ... */
1143 
1144     /* invert diagonal block */
1145     diag = ba + k * 4;
1146     PetscCall(PetscArraycpy(diag, dk, 4));
1147     PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
1148     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1149 
1150     jmin = bi[k];
1151     jmax = bi[k + 1];
1152     if (jmin < jmax) {
1153       for (j = jmin; j < jmax; j++) {
1154         vj       = bj[j]; /* block col. index of U */
1155         u        = ba + j * 4;
1156         rtmp_ptr = rtmp + vj * 4;
1157         for (k1 = 0; k1 < 4; k1++) {
1158           *u++        = *rtmp_ptr;
1159           *rtmp_ptr++ = 0.0;
1160         }
1161       }
1162 
1163       /* ... add k to row list for first nonzero entry in k-th row */
1164       il[k] = jmin;
1165       i     = bj[jmin];
1166       jl[k] = jl[i];
1167       jl[i] = k;
1168     }
1169   }
1170 
1171   PetscCall(PetscFree(rtmp));
1172   PetscCall(PetscFree2(il, jl));
1173 
1174   C->ops->solve          = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1175   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1176   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1177   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1178   C->assembled           = PETSC_TRUE;
1179   C->preallocated        = PETSC_TRUE;
1180 
1181   PetscCall(PetscLogFlops(1.3333 * 8 * b->mbs)); /* from inverting diagonal blocks */
1182   PetscFunctionReturn(PETSC_SUCCESS);
1183 }
1184 
1185 /*
1186     Numeric U^T*D*U factorization for SBAIJ format.
1187     Version for blocks are 1 by 1.
1188 */
MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo * info)1189 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
1190 {
1191   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1192   IS              ip = b->row;
1193   const PetscInt *ai, *aj, *rip;
1194   PetscInt       *a2anew, i, j, mbs = a->mbs, *bi = b->i, *bj = b->j, *bcol;
1195   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
1196   MatScalar      *rtmp, *ba = b->a, *bval, *aa, dk, uikdi;
1197   PetscReal       rs;
1198   FactorShiftCtx  sctx;
1199 
1200   PetscFunctionBegin;
1201   /* MatPivotSetUp(): initialize shift context sctx */
1202   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1203 
1204   PetscCall(ISGetIndices(ip, &rip));
1205   if (!a->permute) {
1206     ai = a->i;
1207     aj = a->j;
1208     aa = a->a;
1209   } else {
1210     ai = a->inew;
1211     aj = a->jnew;
1212     nz = ai[mbs];
1213     PetscCall(PetscMalloc1(nz, &aa));
1214     a2anew = a->a2anew;
1215     bval   = a->a;
1216     for (j = 0; j < nz; j++) aa[a2anew[j]] = *(bval++);
1217   }
1218 
1219   /* initialization */
1220   /* il and jl record the first nonzero element in each row of the accessing
1221      window U(0:k, k:mbs-1).
1222      jl:    list of rows to be added to uneliminated rows
1223             i>= k: jl(i) is the first row to be added to row i
1224             i<  k: jl(i) is the row following row i in some list of rows
1225             jl(i) = mbs indicates the end of a list
1226      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1227             row i of U */
1228   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
1229 
1230   do {
1231     sctx.newshift = PETSC_FALSE;
1232     il[0]         = 0;
1233     for (i = 0; i < mbs; i++) {
1234       rtmp[i] = 0.0;
1235       jl[i]   = mbs;
1236     }
1237 
1238     for (k = 0; k < mbs; k++) {
1239       /*initialize k-th row by the perm[k]-th row of A */
1240       jmin = ai[rip[k]];
1241       jmax = ai[rip[k] + 1];
1242       bval = ba + bi[k];
1243       for (j = jmin; j < jmax; j++) {
1244         col       = rip[aj[j]];
1245         rtmp[col] = aa[j];
1246         *bval++   = 0.0; /* for in-place factorization */
1247       }
1248 
1249       /* shift the diagonal of the matrix */
1250       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1251 
1252       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1253       dk = rtmp[k];
1254       i  = jl[k]; /* first row to be added to k_th row  */
1255 
1256       while (i < k) {
1257         nexti = jl[i]; /* next row to be added to k_th row */
1258 
1259         /* compute multiplier, update diag(k) and U(i,k) */
1260         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
1261         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
1262         dk += uikdi * ba[ili];
1263         ba[ili] = uikdi; /* -U(i,k) */
1264 
1265         /* add multiple of row i to k-th row */
1266         jmin = ili + 1;
1267         jmax = bi[i + 1];
1268         if (jmin < jmax) {
1269           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1270           PetscCall(PetscLogFlops(2.0 * (jmax - jmin)));
1271 
1272           /* update il and jl for row i */
1273           il[i] = jmin;
1274           j     = bj[jmin];
1275           jl[i] = jl[j];
1276           jl[j] = i;
1277         }
1278         i = nexti;
1279       }
1280 
1281       /* shift the diagonals when zero pivot is detected */
1282       /* compute rs=sum of abs(off-diagonal) */
1283       rs   = 0.0;
1284       jmin = bi[k] + 1;
1285       nz   = bi[k + 1] - jmin;
1286       if (nz) {
1287         bcol = bj + jmin;
1288         while (nz--) {
1289           rs += PetscAbsScalar(rtmp[*bcol]);
1290           bcol++;
1291         }
1292       }
1293 
1294       sctx.rs = rs;
1295       sctx.pv = dk;
1296       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
1297       if (sctx.newshift) break; /* sctx.shift_amount is updated */
1298       dk = sctx.pv;
1299 
1300       /* copy data into U(k,:) */
1301       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
1302       jmin      = bi[k] + 1;
1303       jmax      = bi[k + 1];
1304       if (jmin < jmax) {
1305         for (j = jmin; j < jmax; j++) {
1306           col       = bj[j];
1307           ba[j]     = rtmp[col];
1308           rtmp[col] = 0.0;
1309         }
1310         /* add the k-th row into il and jl */
1311         il[k] = jmin;
1312         i     = bj[jmin];
1313         jl[k] = jl[i];
1314         jl[i] = k;
1315       }
1316     }
1317   } while (sctx.newshift);
1318   PetscCall(PetscFree3(rtmp, il, jl));
1319   if (a->permute) PetscCall(PetscFree(aa));
1320 
1321   PetscCall(ISRestoreIndices(ip, &rip));
1322 
1323   C->ops->solve          = MatSolve_SeqSBAIJ_1_inplace;
1324   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1325   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1326   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_inplace;
1327   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_inplace;
1328   C->assembled           = PETSC_TRUE;
1329   C->preallocated        = PETSC_TRUE;
1330 
1331   PetscCall(PetscLogFlops(C->rmap->N));
1332   if (sctx.nshift) {
1333     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1334       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1335     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1336       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1337     }
1338   }
1339   PetscFunctionReturn(PETSC_SUCCESS);
1340 }
1341 
1342 /*
1343   Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1344   Modified from MatCholeskyFactorNumeric_SeqAIJ()
1345 */
MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B,Mat A,const MatFactorInfo * info)1346 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
1347 {
1348   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1349   Mat_SeqSBAIJ   *b = (Mat_SeqSBAIJ *)B->data;
1350   PetscInt        i, j, mbs = A->rmap->n, *bi = b->i, *bj = b->j, *bdiag = b->diag, *bjtmp;
1351   PetscInt       *ai = a->i, *aj = a->j, *ajtmp;
1352   PetscInt        k, jmin, jmax, *c2r, *il, col, nexti, ili, nz;
1353   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
1354   FactorShiftCtx  sctx;
1355   PetscReal       rs;
1356   MatScalar       d, *v;
1357   const PetscInt *adiag;
1358 
1359   PetscFunctionBegin;
1360   PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, &adiag, NULL));
1361   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &c2r));
1362 
1363   /* MatPivotSetUp(): initialize shift context sctx */
1364   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1365 
1366   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1367     sctx.shift_top = info->zeropivot;
1368 
1369     PetscCall(PetscArrayzero(rtmp, mbs));
1370 
1371     for (i = 0; i < mbs; i++) {
1372       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1373       d = aa[adiag[i]];
1374       rtmp[i] += -PetscRealPart(d); /* diagonal entry */
1375       ajtmp = aj + ai[i] + 1;       /* exclude diagonal */
1376       v     = aa + ai[i] + 1;
1377       nz    = ai[i + 1] - ai[i] - 1;
1378       for (j = 0; j < nz; j++) {
1379         rtmp[i] += PetscAbsScalar(v[j]);
1380         rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1381       }
1382       if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1383     }
1384     sctx.shift_top *= 1.1;
1385     sctx.nshift_max = 5;
1386     sctx.shift_lo   = 0.;
1387     sctx.shift_hi   = 1.;
1388   }
1389 
1390   /* allocate working arrays
1391      c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1392      il:  for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1393   */
1394   do {
1395     sctx.newshift = PETSC_FALSE;
1396 
1397     for (i = 0; i < mbs; i++) c2r[i] = mbs;
1398     if (mbs) il[0] = 0;
1399 
1400     for (k = 0; k < mbs; k++) {
1401       /* zero rtmp */
1402       nz    = bi[k + 1] - bi[k];
1403       bjtmp = bj + bi[k];
1404       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
1405 
1406       /* load in initial unfactored row */
1407       bval = ba + bi[k];
1408       jmin = ai[k];
1409       jmax = ai[k + 1];
1410       for (j = jmin; j < jmax; j++) {
1411         col       = aj[j];
1412         rtmp[col] = aa[j];
1413         *bval++   = 0.0; /* for in-place factorization */
1414       }
1415       /* shift the diagonal of the matrix: ZeropivotApply() */
1416       rtmp[k] += sctx.shift_amount; /* shift the diagonal of the matrix */
1417 
1418       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1419       dk = rtmp[k];
1420       i  = c2r[k]; /* first row to be added to k_th row  */
1421 
1422       while (i < k) {
1423         nexti = c2r[i]; /* next row to be added to k_th row */
1424 
1425         /* compute multiplier, update diag(k) and U(i,k) */
1426         ili   = il[i];                   /* index of first nonzero element in U(i,k:bms-1) */
1427         uikdi = -ba[ili] * ba[bdiag[i]]; /* diagonal(k) */
1428         dk += uikdi * ba[ili];           /* update diag[k] */
1429         ba[ili] = uikdi;                 /* -U(i,k) */
1430 
1431         /* add multiple of row i to k-th row */
1432         jmin = ili + 1;
1433         jmax = bi[i + 1];
1434         if (jmin < jmax) {
1435           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
1436           /* update il and c2r for row i */
1437           il[i]  = jmin;
1438           j      = bj[jmin];
1439           c2r[i] = c2r[j];
1440           c2r[j] = i;
1441         }
1442         i = nexti;
1443       }
1444 
1445       /* copy data into U(k,:) */
1446       rs   = 0.0;
1447       jmin = bi[k];
1448       jmax = bi[k + 1] - 1;
1449       if (jmin < jmax) {
1450         for (j = jmin; j < jmax; j++) {
1451           col   = bj[j];
1452           ba[j] = rtmp[col];
1453           rs += PetscAbsScalar(ba[j]);
1454         }
1455         /* add the k-th row into il and c2r */
1456         il[k]  = jmin;
1457         i      = bj[jmin];
1458         c2r[k] = c2r[i];
1459         c2r[i] = k;
1460       }
1461 
1462       sctx.rs = rs;
1463       sctx.pv = dk;
1464       PetscCall(MatPivotCheck(B, A, info, &sctx, k));
1465       if (sctx.newshift) break;
1466       dk = sctx.pv;
1467 
1468       ba[bdiag[k]] = 1.0 / dk; /* U(k,k) */
1469     }
1470   } while (sctx.newshift);
1471 
1472   PetscCall(PetscFree3(rtmp, il, c2r));
1473 
1474   B->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1475   B->ops->solves         = MatSolves_SeqSBAIJ_1;
1476   B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1477   B->ops->matsolve       = MatMatSolve_SeqSBAIJ_1_NaturalOrdering;
1478   B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1479   B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1480 
1481   B->assembled    = PETSC_TRUE;
1482   B->preallocated = PETSC_TRUE;
1483 
1484   PetscCall(PetscLogFlops(B->rmap->n));
1485 
1486   /* MatPivotView() */
1487   if (sctx.nshift) {
1488     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1489       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1490     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1491       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1492     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1493       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1494     }
1495   }
1496   PetscFunctionReturn(PETSC_SUCCESS);
1497 }
1498 
MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo * info)1499 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
1500 {
1501   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)C->data;
1502   PetscInt       i, j, mbs = a->mbs;
1503   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
1504   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
1505   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
1506   PetscReal      rs;
1507   FactorShiftCtx sctx;
1508 
1509   PetscFunctionBegin;
1510   /* MatPivotSetUp(): initialize shift context sctx */
1511   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1512 
1513   /* initialization */
1514   /* il and jl record the first nonzero element in each row of the accessing
1515      window U(0:k, k:mbs-1).
1516      jl:    list of rows to be added to uneliminated rows
1517             i>= k: jl(i) is the first row to be added to row i
1518             i<  k: jl(i) is the row following row i in some list of rows
1519             jl(i) = mbs indicates the end of a list
1520      il(i): points to the first nonzero element in U(i,k:mbs-1)
1521   */
1522   PetscCall(PetscMalloc1(mbs, &rtmp));
1523   PetscCall(PetscMalloc2(mbs, &il, mbs, &jl));
1524 
1525   do {
1526     sctx.newshift = PETSC_FALSE;
1527     il[0]         = 0;
1528     for (i = 0; i < mbs; i++) {
1529       rtmp[i] = 0.0;
1530       jl[i]   = mbs;
1531     }
1532 
1533     for (k = 0; k < mbs; k++) {
1534       /*initialize k-th row with elements nonzero in row perm(k) of A */
1535       nz   = ai[k + 1] - ai[k];
1536       acol = aj + ai[k];
1537       aval = aa + ai[k];
1538       bval = ba + bi[k];
1539       while (nz--) {
1540         rtmp[*acol++] = *aval++;
1541         *bval++       = 0.0; /* for in-place factorization */
1542       }
1543 
1544       /* shift the diagonal of the matrix */
1545       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1546 
1547       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1548       dk = rtmp[k];
1549       i  = jl[k]; /* first row to be added to k_th row  */
1550 
1551       while (i < k) {
1552         nexti = jl[i]; /* next row to be added to k_th row */
1553         /* compute multiplier, update D(k) and U(i,k) */
1554         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1555         uikdi = -ba[ili] * ba[bi[i]];
1556         dk += uikdi * ba[ili];
1557         ba[ili] = uikdi; /* -U(i,k) */
1558 
1559         /* add multiple of row i to k-th row ... */
1560         jmin = ili + 1;
1561         nz   = bi[i + 1] - jmin;
1562         if (nz > 0) {
1563           bcol = bj + jmin;
1564           bval = ba + jmin;
1565           PetscCall(PetscLogFlops(2.0 * nz));
1566           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
1567 
1568           /* update il and jl for i-th row */
1569           il[i] = jmin;
1570           j     = bj[jmin];
1571           jl[i] = jl[j];
1572           jl[j] = i;
1573         }
1574         i = nexti;
1575       }
1576 
1577       /* shift the diagonals when zero pivot is detected */
1578       /* compute rs=sum of abs(off-diagonal) */
1579       rs   = 0.0;
1580       jmin = bi[k] + 1;
1581       nz   = bi[k + 1] - jmin;
1582       if (nz) {
1583         bcol = bj + jmin;
1584         while (nz--) {
1585           rs += PetscAbsScalar(rtmp[*bcol]);
1586           bcol++;
1587         }
1588       }
1589 
1590       sctx.rs = rs;
1591       sctx.pv = dk;
1592       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
1593       if (sctx.newshift) break; /* sctx.shift_amount is updated */
1594       dk = sctx.pv;
1595 
1596       /* copy data into U(k,:) */
1597       ba[bi[k]] = 1.0 / dk;
1598       jmin      = bi[k] + 1;
1599       nz        = bi[k + 1] - jmin;
1600       if (nz) {
1601         bcol = bj + jmin;
1602         bval = ba + jmin;
1603         while (nz--) {
1604           *bval++       = rtmp[*bcol];
1605           rtmp[*bcol++] = 0.0;
1606         }
1607         /* add k-th row into il and jl */
1608         il[k] = jmin;
1609         i     = bj[jmin];
1610         jl[k] = jl[i];
1611         jl[i] = k;
1612       }
1613     } /* end of for (k = 0; k<mbs; k++) */
1614   } while (sctx.newshift);
1615   PetscCall(PetscFree(rtmp));
1616   PetscCall(PetscFree2(il, jl));
1617 
1618   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1619   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1620   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1621   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1622   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1623 
1624   C->assembled    = PETSC_TRUE;
1625   C->preallocated = PETSC_TRUE;
1626 
1627   PetscCall(PetscLogFlops(C->rmap->N));
1628   if (sctx.nshift) {
1629     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1630       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1631     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1632       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1633     }
1634   }
1635   PetscFunctionReturn(PETSC_SUCCESS);
1636 }
1637 
MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,const MatFactorInfo * info)1638 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A, IS perm, const MatFactorInfo *info)
1639 {
1640   Mat C;
1641 
1642   PetscFunctionBegin;
1643   PetscCall(MatGetFactor(A, "petsc", MAT_FACTOR_CHOLESKY, &C));
1644   PetscCall(MatCholeskyFactorSymbolic(C, A, perm, info));
1645   PetscCall(MatCholeskyFactorNumeric(C, A, info));
1646 
1647   A->ops->solve          = C->ops->solve;
1648   A->ops->solvetranspose = C->ops->solvetranspose;
1649 
1650   PetscCall(MatHeaderMerge(A, &C));
1651   PetscFunctionReturn(PETSC_SUCCESS);
1652 }
1653