xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 2205254efee3a00a594e5e2a3a70f74dcb40bc03)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/sbaij/seq/sbaij.h>
4 #include <../src/mat/blockinvert.h>
5 #include <petscis.h>
6 
7 /*
8   input:
9    F -- numeric factor
10   output:
11    nneg, nzero, npos: matrix inertia
12 */
13 
14 #undef __FUNCT__
15 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
16 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
17 {
18   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
19   MatScalar    *dd = fact_ptr->a;
20   PetscInt     mbs=fact_ptr->mbs,bs=F->rmap->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->diag;
21 
22   PetscFunctionBegin;
23   if (bs != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
24   nneig_tmp = 0; npos_tmp = 0;
25   for (i=0; i<mbs; i++) {
26     if (PetscRealPart(dd[*fi]) > 0.0) {
27       npos_tmp++;
28     } else if (PetscRealPart(dd[*fi]) < 0.0) {
29       nneig_tmp++;
30     }
31     fi++;
32   }
33   if (nneig) *nneig = nneig_tmp;
34   if (npos)  *npos  = npos_tmp;
35   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
36   PetscFunctionReturn(0);
37 }
38 
39 /*
40   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
41   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
42 */
43 #undef __FUNCT__
44 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
45 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F,Mat A,IS perm,const MatFactorInfo *info)
46 {
47   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
48   PetscErrorCode ierr;
49   const PetscInt *rip,*ai,*aj;
50   PetscInt       i,mbs = a->mbs,*jutmp,bs = A->rmap->bs,bs2=a->bs2;
51   PetscInt       m,reallocs = 0,prow;
52   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
53   PetscReal      f = info->fill;
54   PetscBool      perm_identity;
55 
56   PetscFunctionBegin;
57   /* check whether perm is the identity mapping */
58   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
59   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
60 
61   if (perm_identity) { /* without permutation */
62     a->permute = PETSC_FALSE;
63     ai = a->i; aj = a->j;
64   } else {            /* non-trivial permutation */
65     a->permute = PETSC_TRUE;
66     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
67     ai = a->inew; aj = a->jnew;
68   }
69 
70   /* initialization */
71   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
72   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
73   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
74   iu[0] = mbs+1;
75   juidx = mbs + 1; /* index for ju */
76   /* jl linked list for pivot row -- linked list for col index */
77   ierr  = PetscMalloc2(mbs,PetscInt,&jl,mbs,PetscInt,&q);CHKERRQ(ierr);
78   for (i=0; i<mbs; i++) {
79     jl[i] = mbs;
80     q[i] = 0;
81   }
82 
83   /* for each row k */
84   for (k=0; k<mbs; k++) {
85     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
86     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
87     q[k] = mbs;
88     /* initialize nonzero structure of k-th row to row rip[k] of A */
89     jmin = ai[rip[k]] +1; /* exclude diag[k] */
90     jmax = ai[rip[k]+1];
91     for (j=jmin; j<jmax; j++) {
92       vj = rip[aj[j]]; /* col. value */
93       if (vj > k) {
94         qm = k;
95         do {
96           m  = qm; qm = q[m];
97         } while (qm < vj);
98         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
99         nzk++;
100         q[m]  = vj;
101         q[vj] = qm;
102       } /* if (vj > k) */
103     } /* for (j=jmin; j<jmax; j++) */
104 
105     /* modify nonzero structure of k-th row by computing fill-in
106        for each row i to be merged in */
107     prow = k;
108     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
109 
110     while (prow < k) {
111       /* merge row prow into k-th row */
112       jmin = iu[prow] + 1; jmax = iu[prow+1];
113       qm = k;
114       for (j=jmin; j<jmax; j++) {
115         vj = ju[j];
116         do {
117           m = qm; qm = q[m];
118         } while (qm < vj);
119         if (qm != vj) {
120          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
121         }
122       }
123       prow = jl[prow]; /* next pivot row */
124     }
125 
126     /* add k to row list for first nonzero element in k-th row */
127     if (nzk > 0) {
128       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
129       jl[k] = jl[i]; jl[i] = k;
130     }
131     iu[k+1] = iu[k] + nzk;
132 
133     /* allocate more space to ju if needed */
134     if (iu[k+1] > umax) {
135       /* estimate how much additional space we will need */
136       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
137       /* just double the memory each time */
138       maxadd = umax;
139       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
140       umax += maxadd;
141 
142       /* allocate a longer ju */
143       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
144       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
145       ierr = PetscFree(ju);CHKERRQ(ierr);
146       ju   = jutmp;
147       reallocs++; /* count how many times we realloc */
148     }
149 
150     /* save nonzero structure of k-th row in ju */
151     i=k;
152     while (nzk --) {
153       i           = q[i];
154       ju[juidx++] = i;
155     }
156   }
157 
158 #if defined(PETSC_USE_INFO)
159   if (ai[mbs] != 0) {
160     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
161     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
162     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
163     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
164     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
165   } else {
166     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
167   }
168 #endif
169 
170   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
171   ierr = PetscFree2(jl,q);CHKERRQ(ierr);
172 
173   /* put together the new matrix */
174   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(F,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
175 
176   /* ierr = PetscLogObjectParent(B,iperm);CHKERRQ(ierr); */
177   b = (Mat_SeqSBAIJ*)(F)->data;
178   b->singlemalloc = PETSC_FALSE;
179   b->free_a       = PETSC_TRUE;
180   b->free_ij       = PETSC_TRUE;
181   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
182   b->j    = ju;
183   b->i    = iu;
184   b->diag = 0;
185   b->ilen = 0;
186   b->imax = 0;
187   b->row  = perm;
188   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
189   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
190   b->icol = perm;
191   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
192   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
193   /* In b structure:  Free imax, ilen, old a, old j.
194      Allocate idnew, solve_work, new a, new j */
195   ierr = PetscLogObjectMemory(F,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
196   b->maxnz = b->nz = iu[mbs];
197 
198   (F)->info.factor_mallocs    = reallocs;
199   (F)->info.fill_ratio_given  = f;
200   if (ai[mbs] != 0) {
201     (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
202   } else {
203     (F)->info.fill_ratio_needed = 0.0;
204   }
205   ierr = MatSeqSBAIJSetNumericFactorization_inplace(F,perm_identity);CHKERRQ(ierr);
206   PetscFunctionReturn(0);
207 }
208 /*
209     Symbolic U^T*D*U factorization for SBAIJ format.
210     See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
211 */
212 #include <petscbt.h>
213 #include <../src/mat/utils/freespace.h>
214 #undef __FUNCT__
215 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
216 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
217 {
218   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
219   Mat_SeqSBAIJ       *b;
220   PetscErrorCode     ierr;
221   PetscBool          perm_identity,missing;
222   PetscReal          fill = info->fill;
223   const PetscInt     *rip,*ai=a->i,*aj=a->j;
224   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow,d;
225   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
226   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr,*udiag;
227   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
228   PetscBT            lnkbt;
229 
230   PetscFunctionBegin;
231   if (bs > 1) {
232     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
233     PetscFunctionReturn(0);
234   }
235   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
236   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
237   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
238 
239   /* check whether perm is the identity mapping */
240   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
241   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
242   a->permute = PETSC_FALSE;
243   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
244 
245   /* initialization */
246   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
247   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&udiag);CHKERRQ(ierr);
248   ui[0] = 0;
249 
250   /* jl: linked list for storing indices of the pivot rows
251      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
252   ierr = PetscMalloc4(mbs,PetscInt*,&ui_ptr,mbs,PetscInt,&il,mbs,PetscInt,&jl,mbs,PetscInt,&cols);CHKERRQ(ierr);
253   for (i=0; i<mbs; i++) {
254     jl[i] = mbs; il[i] = 0;
255   }
256 
257   /* create and initialize a linked list for storing column indices of the active row k */
258   nlnk = mbs + 1;
259   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
260 
261   /* initial FreeSpace size is fill*(ai[mbs]+1) */
262   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
263   current_space = free_space;
264 
265   for (k=0; k<mbs; k++) {  /* for each active row k */
266     /* initialize lnk by the column indices of row rip[k] of A */
267     nzk   = 0;
268     ncols = ai[k+1] - ai[k];
269     if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
270     for (j=0; j<ncols; j++) {
271       i = *(aj + ai[k] + j);
272       cols[j] = i;
273     }
274     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
275     nzk += nlnk;
276 
277     /* update lnk by computing fill-in for each pivot row to be merged in */
278     prow = jl[k]; /* 1st pivot row */
279 
280     while (prow < k) {
281       nextprow = jl[prow];
282       /* merge prow into k-th row */
283       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
284       jmax = ui[prow+1];
285       ncols = jmax-jmin;
286       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
287       ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
288       nzk += nlnk;
289 
290       /* update il and jl for prow */
291       if (jmin < jmax) {
292         il[prow] = jmin;
293         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
294       }
295       prow = nextprow;
296     }
297 
298     /* if free space is not available, make more free space */
299     if (current_space->local_remaining<nzk) {
300       i  = mbs - k + 1; /* num of unfactored rows */
301       i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
302       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
303       reallocs++;
304     }
305 
306     /* copy data into free space, then initialize lnk */
307     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
308 
309     /* add the k-th row into il and jl */
310     if (nzk > 1) {
311       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
312       jl[k] = jl[i]; jl[i] = k;
313       il[k] = ui[k] + 1;
314     }
315     ui_ptr[k] = current_space->array;
316     current_space->array           += nzk;
317     current_space->local_used      += nzk;
318     current_space->local_remaining -= nzk;
319 
320     ui[k+1] = ui[k] + nzk;
321   }
322 
323   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
324   ierr = PetscFree4(ui_ptr,il,jl,cols);CHKERRQ(ierr);
325 
326   /* destroy list of free space and other temporary array(s) */
327   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
328   ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,mbs,ui,udiag);CHKERRQ(ierr); /* store matrix factor */
329   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
330 
331   /* put together the new matrix in MATSEQSBAIJ format */
332   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
333 
334   b = (Mat_SeqSBAIJ*)fact->data;
335   b->singlemalloc = PETSC_FALSE;
336   b->free_a       = PETSC_TRUE;
337   b->free_ij      = PETSC_TRUE;
338   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
339   b->j    = uj;
340   b->i    = ui;
341   b->diag = udiag;
342   b->free_diag = PETSC_TRUE;
343   b->ilen = 0;
344   b->imax = 0;
345   b->row  = perm;
346   b->icol = perm;
347   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
348   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
349   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
350   ierr    = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
351   ierr    = PetscLogObjectMemory(fact,ui[mbs]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
352   b->maxnz = b->nz = ui[mbs];
353 
354   fact->info.factor_mallocs    = reallocs;
355   fact->info.fill_ratio_given  = fill;
356   if (ai[mbs] != 0) {
357     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
358   } else {
359     fact->info.fill_ratio_needed = 0.0;
360   }
361 #if defined(PETSC_USE_INFO)
362   if (ai[mbs] != 0) {
363     PetscReal af = fact->info.fill_ratio_needed;
364     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
365     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
366     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
367   } else {
368     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
369   }
370 #endif
371   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
372   PetscFunctionReturn(0);
373 }
374 
375 #undef __FUNCT__
376 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_inplace"
377 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
378 {
379   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
380   Mat_SeqSBAIJ       *b;
381   PetscErrorCode     ierr;
382   PetscBool          perm_identity,missing;
383   PetscReal          fill = info->fill;
384   const PetscInt     *rip,*ai,*aj;
385   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow,d;
386   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
387   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
388   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
389   PetscBT            lnkbt;
390 
391   PetscFunctionBegin;
392   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
393   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
394 
395   /*
396    This code originally uses Modified Sparse Row (MSR) storage
397    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
398    Then it is rewritten so the factor B takes seqsbaij format. However the associated
399    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
400    thus the original code in MSR format is still used for these cases.
401    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
402    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
403   */
404   if (bs > 1) {
405     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
406     PetscFunctionReturn(0);
407   }
408 
409   /* check whether perm is the identity mapping */
410   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
411 
412   if (perm_identity) {
413     a->permute = PETSC_FALSE;
414     ai = a->i; aj = a->j;
415   } else {
416     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
417     /* There are bugs for reordeing. Needs further work.
418        MatReordering for sbaij cannot be efficient. User should use aij formt! */
419     a->permute = PETSC_TRUE;
420     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
421     ai = a->inew; aj = a->jnew;
422   }
423   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
424 
425   /* initialization */
426   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
427   ui[0] = 0;
428 
429   /* jl: linked list for storing indices of the pivot rows
430      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
431   ierr = PetscMalloc4(mbs,PetscInt*,&ui_ptr,mbs,PetscInt,&il,mbs,PetscInt,&jl,mbs,PetscInt,&cols);CHKERRQ(ierr);
432   for (i=0; i<mbs; i++) {
433     jl[i] = mbs; il[i] = 0;
434   }
435 
436   /* create and initialize a linked list for storing column indices of the active row k */
437   nlnk = mbs + 1;
438   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
439 
440   /* initial FreeSpace size is fill*(ai[mbs]+1) */
441   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
442   current_space = free_space;
443 
444   for (k=0; k<mbs; k++) {  /* for each active row k */
445     /* initialize lnk by the column indices of row rip[k] of A */
446     nzk   = 0;
447     ncols = ai[rip[k]+1] - ai[rip[k]];
448     for (j=0; j<ncols; j++) {
449       i = *(aj + ai[rip[k]] + j);
450       cols[j] = rip[i];
451     }
452     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
453     nzk += nlnk;
454 
455     /* update lnk by computing fill-in for each pivot row to be merged in */
456     prow = jl[k]; /* 1st pivot row */
457 
458     while (prow < k) {
459       nextprow = jl[prow];
460       /* merge prow into k-th row */
461       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
462       jmax = ui[prow+1];
463       ncols = jmax-jmin;
464       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
465       ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
466       nzk += nlnk;
467 
468       /* update il and jl for prow */
469       if (jmin < jmax) {
470         il[prow] = jmin;
471         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
472       }
473       prow = nextprow;
474     }
475 
476     /* if free space is not available, make more free space */
477     if (current_space->local_remaining<nzk) {
478       i = mbs - k + 1; /* num of unfactored rows */
479       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
480       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
481       reallocs++;
482     }
483 
484     /* copy data into free space, then initialize lnk */
485     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
486 
487     /* add the k-th row into il and jl */
488     if (nzk-1 > 0) {
489       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
490       jl[k] = jl[i]; jl[i] = k;
491       il[k] = ui[k] + 1;
492     }
493     ui_ptr[k] = current_space->array;
494     current_space->array           += nzk;
495     current_space->local_used      += nzk;
496     current_space->local_remaining -= nzk;
497 
498     ui[k+1] = ui[k] + nzk;
499   }
500 
501   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
502   ierr = PetscFree4(ui_ptr,il,jl,cols);CHKERRQ(ierr);
503 
504   /* destroy list of free space and other temporary array(s) */
505   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
506   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
507   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
508 
509   /* put together the new matrix in MATSEQSBAIJ format */
510   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
511 
512   b = (Mat_SeqSBAIJ*)fact->data;
513   b->singlemalloc = PETSC_FALSE;
514   b->free_a       = PETSC_TRUE;
515   b->free_ij      = PETSC_TRUE;
516   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
517   b->j    = uj;
518   b->i    = ui;
519   b->diag = 0;
520   b->ilen = 0;
521   b->imax = 0;
522   b->row  = perm;
523   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
524   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
525   b->icol = perm;
526   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
527   ierr    = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
528   ierr    = PetscLogObjectMemory(fact,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
529   b->maxnz = b->nz = ui[mbs];
530 
531   fact->info.factor_mallocs    = reallocs;
532   fact->info.fill_ratio_given  = fill;
533   if (ai[mbs] != 0) {
534     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
535   } else {
536     fact->info.fill_ratio_needed = 0.0;
537   }
538 #if defined(PETSC_USE_INFO)
539   if (ai[mbs] != 0) {
540     PetscReal af = fact->info.fill_ratio_needed;
541     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
542     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
543     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
544   } else {
545     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
546   }
547 #endif
548   ierr = MatSeqSBAIJSetNumericFactorization_inplace(fact,perm_identity);CHKERRQ(ierr);
549   PetscFunctionReturn(0);
550 }
551 
552 #undef __FUNCT__
553 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
554 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
555 {
556   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
557   IS             perm = b->row;
558   PetscErrorCode ierr;
559   const PetscInt *ai,*aj,*perm_ptr,mbs=a->mbs,*bi=b->i,*bj=b->j;
560   PetscInt       i,j;
561   PetscInt       *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
562   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,bslog = 0;
563   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
564   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
565   MatScalar      *work;
566   PetscInt       *pivots;
567 
568   PetscFunctionBegin;
569   /* initialization */
570   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
571   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
572   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
573   for (i=0; i<mbs; i++) {
574     jl[i] = mbs; il[0] = 0;
575   }
576   ierr = PetscMalloc3(bs2,MatScalar,&dk,bs2,MatScalar,&uik,bs,MatScalar,&work);CHKERRQ(ierr);
577   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
578 
579   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
580 
581   /* check permutation */
582   if (!a->permute) {
583     ai = a->i; aj = a->j; aa = a->a;
584   } else {
585     ai   = a->inew; aj = a->jnew;
586     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
587     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
588     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
589     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
590 
591     /* flops in while loop */
592     bslog = 2*bs*bs2;
593 
594     for (i=0; i<mbs; i++) {
595       jmin = ai[i]; jmax = ai[i+1];
596       for (j=jmin; j<jmax; j++) {
597         while (a2anew[j] != j) {
598           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
599           for (k1=0; k1<bs2; k1++) {
600             dk[k1]       = aa[k*bs2+k1];
601             aa[k*bs2+k1] = aa[j*bs2+k1];
602             aa[j*bs2+k1] = dk[k1];
603           }
604         }
605         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
606         if (i > aj[j]) {
607           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
608           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
609           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
610           for (k=0; k<bs; k++) {               /* j-th block of aa <- dk^T */
611             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
612           }
613         }
614       }
615     }
616     ierr = PetscFree(a2anew);CHKERRQ(ierr);
617   }
618 
619   /* for each row k */
620   for (k = 0; k<mbs; k++) {
621 
622     /*initialize k-th row with elements nonzero in row perm(k) of A */
623     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
624 
625     ap = aa + jmin*bs2;
626     for (j = jmin; j < jmax; j++) {
627       vj = perm_ptr[aj[j]];         /* block col. index */
628       rtmp_ptr = rtmp + vj*bs2;
629       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
630     }
631 
632     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
633     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
634     i = jl[k]; /* first row to be added to k_th row  */
635 
636     while (i < k) {
637       nexti = jl[i]; /* next row to be added to k_th row */
638 
639       /* compute multiplier */
640       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
641 
642       /* uik = -inv(Di)*U_bar(i,k) */
643       diag = ba + i*bs2;
644       u    = ba + ili*bs2;
645       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
646       PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
647 
648       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
649       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
650       ierr = PetscLogFlops(bslog*2.0);CHKERRQ(ierr);
651 
652       /* update -U(i,k) */
653       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
654 
655       /* add multiple of row i to k-th row ... */
656       jmin = ili + 1; jmax = bi[i+1];
657       if (jmin < jmax) {
658         for (j=jmin; j<jmax; j++) {
659           /* rtmp += -U(i,k)^T * U_bar(i,j) */
660           rtmp_ptr = rtmp + bj[j]*bs2;
661           u = ba + j*bs2;
662           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
663         }
664         ierr = PetscLogFlops(bslog*(jmax-jmin));CHKERRQ(ierr);
665 
666         /* ... add i to row list for next nonzero entry */
667         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
668         j     = bj[jmin];
669         jl[i] = jl[j]; jl[j] = i; /* update jl */
670       }
671       i = nexti;
672     }
673 
674     /* save nonzero entries in k-th row of U ... */
675 
676     /* invert diagonal block */
677     diag = ba+k*bs2;
678     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
679     ierr = PetscKernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
680 
681     jmin = bi[k]; jmax = bi[k+1];
682     if (jmin < jmax) {
683       for (j=jmin; j<jmax; j++) {
684          vj = bj[j];           /* block col. index of U */
685          u   = ba + j*bs2;
686          rtmp_ptr = rtmp + vj*bs2;
687          for (k1=0; k1<bs2; k1++) {
688            *u++        = *rtmp_ptr;
689            *rtmp_ptr++ = 0.0;
690          }
691       }
692 
693       /* ... add k to row list for first nonzero entry in k-th row */
694       il[k] = jmin;
695       i     = bj[jmin];
696       jl[k] = jl[i]; jl[i] = k;
697     }
698   }
699 
700   ierr = PetscFree(rtmp);CHKERRQ(ierr);
701   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
702   ierr = PetscFree3(dk,uik,work);CHKERRQ(ierr);
703   ierr = PetscFree(pivots);CHKERRQ(ierr);
704   if (a->permute) {
705     ierr = PetscFree(aa);CHKERRQ(ierr);
706   }
707 
708   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
709   C->ops->solve          = MatSolve_SeqSBAIJ_N_inplace;
710   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
711   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_inplace;
712   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_inplace;
713 
714   C->assembled    = PETSC_TRUE;
715   C->preallocated = PETSC_TRUE;
716   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
717   PetscFunctionReturn(0);
718 }
719 
720 #undef __FUNCT__
721 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
722 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
723 {
724   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
725   PetscErrorCode ierr;
726   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
727   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
728   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,bslog;
729   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
730   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
731   MatScalar      *work;
732   PetscInt       *pivots;
733 
734   PetscFunctionBegin;
735   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
736   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
737   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
738   for (i=0; i<mbs; i++) {
739     jl[i] = mbs; il[0] = 0;
740   }
741   ierr = PetscMalloc3(bs2,MatScalar,&dk,bs2,MatScalar,&uik,bs,MatScalar,&work);CHKERRQ(ierr);
742   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
743 
744   ai = a->i; aj = a->j; aa = a->a;
745 
746   /* flops in while loop */
747   bslog = 2*bs*bs2;
748 
749   /* for each row k */
750   for (k = 0; k<mbs; k++) {
751 
752     /*initialize k-th row with elements nonzero in row k of A */
753     jmin = ai[k]; jmax = ai[k+1];
754     ap = aa + jmin*bs2;
755     for (j = jmin; j < jmax; j++) {
756       vj = aj[j];         /* block col. index */
757       rtmp_ptr = rtmp + vj*bs2;
758       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
759     }
760 
761     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
762     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
763     i = jl[k]; /* first row to be added to k_th row  */
764 
765     while (i < k) {
766       nexti = jl[i]; /* next row to be added to k_th row */
767 
768       /* compute multiplier */
769       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
770 
771       /* uik = -inv(Di)*U_bar(i,k) */
772       diag = ba + i*bs2;
773       u    = ba + ili*bs2;
774       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
775       PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
776 
777       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
778       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
779       ierr = PetscLogFlops(bslog*2.0);CHKERRQ(ierr);
780 
781       /* update -U(i,k) */
782       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
783 
784       /* add multiple of row i to k-th row ... */
785       jmin = ili + 1; jmax = bi[i+1];
786       if (jmin < jmax) {
787         for (j=jmin; j<jmax; j++) {
788           /* rtmp += -U(i,k)^T * U_bar(i,j) */
789           rtmp_ptr = rtmp + bj[j]*bs2;
790           u = ba + j*bs2;
791           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
792         }
793         ierr = PetscLogFlops(bslog*(jmax-jmin));CHKERRQ(ierr);
794 
795         /* ... add i to row list for next nonzero entry */
796         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
797         j     = bj[jmin];
798         jl[i] = jl[j]; jl[j] = i; /* update jl */
799       }
800       i = nexti;
801     }
802 
803     /* save nonzero entries in k-th row of U ... */
804 
805     /* invert diagonal block */
806     diag = ba+k*bs2;
807     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
808     ierr = PetscKernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
809 
810     jmin = bi[k]; jmax = bi[k+1];
811     if (jmin < jmax) {
812       for (j=jmin; j<jmax; j++) {
813          vj = bj[j];           /* block col. index of U */
814          u   = ba + j*bs2;
815          rtmp_ptr = rtmp + vj*bs2;
816          for (k1=0; k1<bs2; k1++) {
817            *u++        = *rtmp_ptr;
818            *rtmp_ptr++ = 0.0;
819          }
820       }
821 
822       /* ... add k to row list for first nonzero entry in k-th row */
823       il[k] = jmin;
824       i     = bj[jmin];
825       jl[k] = jl[i]; jl[i] = k;
826     }
827   }
828 
829   ierr = PetscFree(rtmp);CHKERRQ(ierr);
830   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
831   ierr = PetscFree3(dk,uik,work);CHKERRQ(ierr);
832   ierr = PetscFree(pivots);CHKERRQ(ierr);
833 
834   C->ops->solve          = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
835   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
836   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
837   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
838   C->assembled = PETSC_TRUE;
839   C->preallocated = PETSC_TRUE;
840   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
841   PetscFunctionReturn(0);
842 }
843 
844 /*
845     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
846     Version for blocks 2 by 2.
847 */
848 #undef __FUNCT__
849 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
850 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C,Mat A,const MatFactorInfo *info)
851 {
852   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
853   IS             perm = b->row;
854   PetscErrorCode ierr;
855   const PetscInt *ai,*aj,*perm_ptr;
856   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
857   PetscInt       *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
858   MatScalar      *ba = b->a,*aa,*ap;
859   MatScalar      *u,*diag,*rtmp,*rtmp_ptr,dk[4],uik[4];
860   PetscReal      shift = info->shiftamount;
861 
862   PetscFunctionBegin;
863   /* initialization */
864   /* il and jl record the first nonzero element in each row of the accessing
865      window U(0:k, k:mbs-1).
866      jl:    list of rows to be added to uneliminated rows
867             i>= k: jl(i) is the first row to be added to row i
868             i<  k: jl(i) is the row following row i in some list of rows
869             jl(i) = mbs indicates the end of a list
870      il(i): points to the first nonzero element in columns k,...,mbs-1 of
871             row i of U */
872   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
873   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
874   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
875   for (i=0; i<mbs; i++) {
876     jl[i] = mbs; il[0] = 0;
877   }
878   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
879 
880   /* check permutation */
881   if (!a->permute) {
882     ai = a->i; aj = a->j; aa = a->a;
883   } else {
884     ai   = a->inew; aj = a->jnew;
885     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
886     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
887     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
888     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
889 
890     for (i=0; i<mbs; i++) {
891       jmin = ai[i]; jmax = ai[i+1];
892       for (j=jmin; j<jmax; j++) {
893         while (a2anew[j] != j) {
894           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
895           for (k1=0; k1<4; k1++) {
896             dk[k1]       = aa[k*4+k1];
897             aa[k*4+k1] = aa[j*4+k1];
898             aa[j*4+k1] = dk[k1];
899           }
900         }
901         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
902         if (i > aj[j]) {
903           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
904           ap = aa + j*4;     /* ptr to the beginning of the block */
905           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
906           ap[1] = ap[2];
907           ap[2] = dk[1];
908         }
909       }
910     }
911     ierr = PetscFree(a2anew);CHKERRQ(ierr);
912   }
913 
914   /* for each row k */
915   for (k = 0; k<mbs; k++) {
916 
917     /*initialize k-th row with elements nonzero in row perm(k) of A */
918     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
919     ap = aa + jmin*4;
920     for (j = jmin; j < jmax; j++) {
921       vj = perm_ptr[aj[j]];         /* block col. index */
922       rtmp_ptr = rtmp + vj*4;
923       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
924     }
925 
926     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
927     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
928     i = jl[k]; /* first row to be added to k_th row  */
929 
930     while (i < k) {
931       nexti = jl[i]; /* next row to be added to k_th row */
932 
933       /* compute multiplier */
934       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
935 
936       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
937       diag = ba + i*4;
938       u    = ba + ili*4;
939       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
940       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
941       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
942       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
943 
944       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
945       dk[0] += uik[0]*u[0] + uik[1]*u[1];
946       dk[1] += uik[2]*u[0] + uik[3]*u[1];
947       dk[2] += uik[0]*u[2] + uik[1]*u[3];
948       dk[3] += uik[2]*u[2] + uik[3]*u[3];
949 
950       ierr = PetscLogFlops(16.0*2.0);CHKERRQ(ierr);
951 
952       /* update -U(i,k): ba[ili] = uik */
953       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
954 
955       /* add multiple of row i to k-th row ... */
956       jmin = ili + 1; jmax = bi[i+1];
957       if (jmin < jmax) {
958         for (j=jmin; j<jmax; j++) {
959           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
960           rtmp_ptr = rtmp + bj[j]*4;
961           u = ba + j*4;
962           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
963           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
964           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
965           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
966         }
967         ierr = PetscLogFlops(16.0*(jmax-jmin));CHKERRQ(ierr);
968 
969         /* ... add i to row list for next nonzero entry */
970         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
971         j     = bj[jmin];
972         jl[i] = jl[j]; jl[j] = i; /* update jl */
973       }
974       i = nexti;
975     }
976 
977     /* save nonzero entries in k-th row of U ... */
978 
979     /* invert diagonal block */
980     diag = ba+k*4;
981     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
982     ierr = PetscKernel_A_gets_inverse_A_2(diag,shift);CHKERRQ(ierr);
983 
984     jmin = bi[k]; jmax = bi[k+1];
985     if (jmin < jmax) {
986       for (j=jmin; j<jmax; j++) {
987          vj = bj[j];           /* block col. index of U */
988          u   = ba + j*4;
989          rtmp_ptr = rtmp + vj*4;
990          for (k1=0; k1<4; k1++) {
991            *u++        = *rtmp_ptr;
992            *rtmp_ptr++ = 0.0;
993          }
994       }
995 
996       /* ... add k to row list for first nonzero entry in k-th row */
997       il[k] = jmin;
998       i     = bj[jmin];
999       jl[k] = jl[i]; jl[i] = k;
1000     }
1001   }
1002 
1003   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1004   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1005   if (a->permute) {
1006     ierr = PetscFree(aa);CHKERRQ(ierr);
1007   }
1008   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1009   C->ops->solve          = MatSolve_SeqSBAIJ_2_inplace;
1010   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1011   C->assembled = PETSC_TRUE;
1012   C->preallocated = PETSC_TRUE;
1013   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 /*
1018       Version for when blocks are 2 by 2 Using natural ordering
1019 */
1020 #undef __FUNCT__
1021 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
1022 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
1023 {
1024   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1025   PetscErrorCode ierr;
1026   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1027   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1028   MatScalar      *ba = b->a,*aa,*ap,dk[8],uik[8];
1029   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
1030   PetscReal      shift = info->shiftamount;
1031 
1032   PetscFunctionBegin;
1033   /* initialization */
1034   /* il and jl record the first nonzero element in each row of the accessing
1035      window U(0:k, k:mbs-1).
1036      jl:    list of rows to be added to uneliminated rows
1037             i>= k: jl(i) is the first row to be added to row i
1038             i<  k: jl(i) is the row following row i in some list of rows
1039             jl(i) = mbs indicates the end of a list
1040      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1041             row i of U */
1042   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1043   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1044   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1045   for (i=0; i<mbs; i++) {
1046     jl[i] = mbs; il[0] = 0;
1047   }
1048   ai = a->i; aj = a->j; aa = a->a;
1049 
1050   /* for each row k */
1051   for (k = 0; k<mbs; k++) {
1052 
1053     /*initialize k-th row with elements nonzero in row k of A */
1054     jmin = ai[k]; jmax = ai[k+1];
1055     ap = aa + jmin*4;
1056     for (j = jmin; j < jmax; j++) {
1057       vj = aj[j];         /* block col. index */
1058       rtmp_ptr = rtmp + vj*4;
1059       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
1060     }
1061 
1062     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1063     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
1064     i = jl[k]; /* first row to be added to k_th row  */
1065 
1066     while (i < k) {
1067       nexti = jl[i]; /* next row to be added to k_th row */
1068 
1069       /* compute multiplier */
1070       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1071 
1072       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1073       diag = ba + i*4;
1074       u    = ba + ili*4;
1075       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1076       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1077       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1078       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1079 
1080       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1081       dk[0] += uik[0]*u[0] + uik[1]*u[1];
1082       dk[1] += uik[2]*u[0] + uik[3]*u[1];
1083       dk[2] += uik[0]*u[2] + uik[1]*u[3];
1084       dk[3] += uik[2]*u[2] + uik[3]*u[3];
1085 
1086       ierr = PetscLogFlops(16.0*2.0);CHKERRQ(ierr);
1087 
1088       /* update -U(i,k): ba[ili] = uik */
1089       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
1090 
1091       /* add multiple of row i to k-th row ... */
1092       jmin = ili + 1; jmax = bi[i+1];
1093       if (jmin < jmax) {
1094         for (j=jmin; j<jmax; j++) {
1095           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1096           rtmp_ptr = rtmp + bj[j]*4;
1097           u = ba + j*4;
1098           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1099           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1100           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1101           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1102         }
1103         ierr = PetscLogFlops(16.0*(jmax-jmin));CHKERRQ(ierr);
1104 
1105         /* ... add i to row list for next nonzero entry */
1106         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1107         j     = bj[jmin];
1108         jl[i] = jl[j]; jl[j] = i; /* update jl */
1109       }
1110       i = nexti;
1111     }
1112 
1113     /* save nonzero entries in k-th row of U ... */
1114 
1115     /* invert diagonal block */
1116     diag = ba+k*4;
1117     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1118     ierr = PetscKernel_A_gets_inverse_A_2(diag,shift);CHKERRQ(ierr);
1119 
1120     jmin = bi[k]; jmax = bi[k+1];
1121     if (jmin < jmax) {
1122       for (j=jmin; j<jmax; j++) {
1123          vj = bj[j];           /* block col. index of U */
1124          u   = ba + j*4;
1125          rtmp_ptr = rtmp + vj*4;
1126          for (k1=0; k1<4; k1++) {
1127            *u++        = *rtmp_ptr;
1128            *rtmp_ptr++ = 0.0;
1129          }
1130       }
1131 
1132       /* ... add k to row list for first nonzero entry in k-th row */
1133       il[k] = jmin;
1134       i     = bj[jmin];
1135       jl[k] = jl[i]; jl[i] = k;
1136     }
1137   }
1138 
1139   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1140   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1141 
1142   C->ops->solve          = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1143   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1144   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1145   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1146   C->assembled = PETSC_TRUE;
1147   C->preallocated = PETSC_TRUE;
1148   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1149   PetscFunctionReturn(0);
1150 }
1151 
1152 /*
1153     Numeric U^T*D*U factorization for SBAIJ format.
1154     Version for blocks are 1 by 1.
1155 */
1156 #undef __FUNCT__
1157 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace"
1158 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo *info)
1159 {
1160   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1161   IS             ip=b->row;
1162   PetscErrorCode ierr;
1163   const PetscInt *ai,*aj,*rip;
1164   PetscInt       *a2anew,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1165   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1166   MatScalar      *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1167   PetscReal      rs;
1168   FactorShiftCtx sctx;
1169 
1170   PetscFunctionBegin;
1171   /* MatPivotSetUp(): initialize shift context sctx */
1172   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1173 
1174   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1175   if (!a->permute) {
1176     ai = a->i; aj = a->j; aa = a->a;
1177   } else {
1178     ai = a->inew; aj = a->jnew;
1179     nz = ai[mbs];
1180     ierr = PetscMalloc(nz*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1181     a2anew = a->a2anew;
1182     bval   = a->a;
1183     for (j=0; j<nz; j++) {
1184       aa[a2anew[j]] = *(bval++);
1185     }
1186   }
1187 
1188   /* initialization */
1189   /* il and jl record the first nonzero element in each row of the accessing
1190      window U(0:k, k:mbs-1).
1191      jl:    list of rows to be added to uneliminated rows
1192             i>= k: jl(i) is the first row to be added to row i
1193             i<  k: jl(i) is the row following row i in some list of rows
1194             jl(i) = mbs indicates the end of a list
1195      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1196             row i of U */
1197   ierr = PetscMalloc3(mbs,MatScalar,&rtmp,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1198 
1199   do {
1200     sctx.newshift = PETSC_FALSE;
1201     for (i=0; i<mbs; i++) {
1202       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1203     }
1204 
1205     for (k = 0; k<mbs; k++) {
1206       /*initialize k-th row by the perm[k]-th row of A */
1207       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1208       bval = ba + bi[k];
1209       for (j = jmin; j < jmax; j++) {
1210         col = rip[aj[j]];
1211         rtmp[col] = aa[j];
1212         *bval++  = 0.0; /* for in-place factorization */
1213       }
1214 
1215       /* shift the diagonal of the matrix */
1216       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1217 
1218       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1219       dk = rtmp[k];
1220       i = jl[k]; /* first row to be added to k_th row  */
1221 
1222       while (i < k) {
1223         nexti = jl[i]; /* next row to be added to k_th row */
1224 
1225         /* compute multiplier, update diag(k) and U(i,k) */
1226         ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1227         uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */
1228         dk += uikdi*ba[ili];
1229         ba[ili] = uikdi; /* -U(i,k) */
1230 
1231         /* add multiple of row i to k-th row */
1232         jmin = ili + 1; jmax = bi[i+1];
1233         if (jmin < jmax) {
1234           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1235           ierr = PetscLogFlops(2.0*(jmax-jmin));CHKERRQ(ierr);
1236 
1237           /* update il and jl for row i */
1238           il[i] = jmin;
1239           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1240         }
1241         i = nexti;
1242       }
1243 
1244       /* shift the diagonals when zero pivot is detected */
1245       /* compute rs=sum of abs(off-diagonal) */
1246       rs   = 0.0;
1247       jmin = bi[k]+1;
1248       nz   = bi[k+1] - jmin;
1249       if (nz) {
1250         bcol = bj + jmin;
1251         while (nz--) {
1252           rs += PetscAbsScalar(rtmp[*bcol]);
1253           bcol++;
1254         }
1255       }
1256 
1257       sctx.rs = rs;
1258       sctx.pv = dk;
1259       ierr = MatPivotCheck(A,info,&sctx,k);CHKERRQ(ierr);
1260       if (sctx.newshift) break;    /* sctx.shift_amount is updated */
1261       dk = sctx.pv;
1262 
1263       /* copy data into U(k,:) */
1264       ba[bi[k]] = 1.0/dk; /* U(k,k) */
1265       jmin = bi[k]+1; jmax = bi[k+1];
1266       if (jmin < jmax) {
1267         for (j=jmin; j<jmax; j++) {
1268           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1269         }
1270         /* add the k-th row into il and jl */
1271         il[k] = jmin;
1272         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1273       }
1274     }
1275   } while (sctx.newshift);
1276   ierr = PetscFree3(rtmp,il,jl);CHKERRQ(ierr);
1277   if (a->permute) {ierr = PetscFree(aa);CHKERRQ(ierr);}
1278 
1279   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1280   C->ops->solve          = MatSolve_SeqSBAIJ_1_inplace;
1281   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1282   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1283   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_inplace;
1284   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_inplace;
1285   C->assembled    = PETSC_TRUE;
1286   C->preallocated = PETSC_TRUE;
1287   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
1288   if (sctx.nshift) {
1289     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1290       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1291     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1292       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1293     }
1294   }
1295   PetscFunctionReturn(0);
1296 }
1297 
1298 /*
1299   Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1300   Modified from MatCholeskyFactorNumeric_SeqAIJ()
1301 */
1302 #undef __FUNCT__
1303 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1304 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
1305 {
1306   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1307   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)B->data;
1308   PetscErrorCode ierr;
1309   PetscInt       i,j,mbs=A->rmap->n,*bi=b->i,*bj=b->j,*bdiag=b->diag,*bjtmp;
1310   PetscInt       *ai=a->i,*aj=a->j,*ajtmp;
1311   PetscInt       k,jmin,jmax,*c2r,*il,col,nexti,ili,nz;
1312   MatScalar      *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
1313   FactorShiftCtx sctx;
1314   PetscReal      rs;
1315   MatScalar      d,*v;
1316 
1317   PetscFunctionBegin;
1318   ierr = PetscMalloc3(mbs,MatScalar,&rtmp,mbs,PetscInt,&il,mbs,PetscInt,&c2r);CHKERRQ(ierr);
1319 
1320   /* MatPivotSetUp(): initialize shift context sctx */
1321   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1322 
1323   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1324     sctx.shift_top = info->zeropivot;
1325     ierr = PetscMemzero(rtmp,mbs*sizeof(MatScalar));CHKERRQ(ierr);
1326     for (i=0; i<mbs; i++) {
1327       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1328       d  = (aa)[a->diag[i]];
1329       rtmp[i] += - PetscRealPart(d); /* diagonal entry */
1330       ajtmp = aj + ai[i] + 1;        /* exclude diagonal */
1331       v     = aa + ai[i] + 1;
1332       nz    = ai[i+1] - ai[i] - 1 ;
1333       for (j=0; j<nz; j++) {
1334         rtmp[i] += PetscAbsScalar(v[j]);
1335         rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1336       }
1337       if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1338     }
1339     sctx.shift_top   *= 1.1;
1340     sctx.nshift_max   = 5;
1341     sctx.shift_lo     = 0.;
1342     sctx.shift_hi     = 1.;
1343   }
1344 
1345   /* allocate working arrays
1346      c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1347      il:  for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1348   */
1349   do {
1350     sctx.newshift = PETSC_FALSE;
1351 
1352     for (i=0; i<mbs; i++)  c2r[i] = mbs;
1353     if (mbs) il[0] = 0;
1354 
1355     for (k = 0; k<mbs; k++) {
1356       /* zero rtmp */
1357       nz = bi[k+1] - bi[k];
1358       bjtmp = bj + bi[k];
1359       for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1360 
1361       /* load in initial unfactored row */
1362       bval = ba + bi[k];
1363       jmin = ai[k]; jmax = ai[k+1];
1364       for (j = jmin; j < jmax; j++) {
1365         col = aj[j];
1366         rtmp[col] = aa[j];
1367         *bval++   = 0.0; /* for in-place factorization */
1368       }
1369       /* shift the diagonal of the matrix: ZeropivotApply() */
1370       rtmp[k] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1371 
1372       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1373       dk = rtmp[k];
1374       i  = c2r[k]; /* first row to be added to k_th row  */
1375 
1376       while (i < k) {
1377         nexti = c2r[i]; /* next row to be added to k_th row */
1378 
1379         /* compute multiplier, update diag(k) and U(i,k) */
1380         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1381         uikdi = - ba[ili]*ba[bdiag[i]];  /* diagonal(k) */
1382         dk   += uikdi*ba[ili]; /* update diag[k] */
1383         ba[ili] = uikdi; /* -U(i,k) */
1384 
1385         /* add multiple of row i to k-th row */
1386         jmin = ili + 1; jmax = bi[i+1];
1387         if (jmin < jmax) {
1388           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1389           /* update il and c2r for row i */
1390           il[i] = jmin;
1391           j = bj[jmin]; c2r[i] = c2r[j]; c2r[j] = i;
1392         }
1393         i = nexti;
1394       }
1395 
1396       /* copy data into U(k,:) */
1397       rs   = 0.0;
1398       jmin = bi[k]; jmax = bi[k+1]-1;
1399       if (jmin < jmax) {
1400         for (j=jmin; j<jmax; j++) {
1401           col = bj[j]; ba[j] = rtmp[col]; rs += PetscAbsScalar(ba[j]);
1402         }
1403         /* add the k-th row into il and c2r */
1404         il[k] = jmin;
1405         i = bj[jmin]; c2r[k] = c2r[i]; c2r[i] = k;
1406       }
1407 
1408       sctx.rs  = rs;
1409       sctx.pv  = dk;
1410       ierr = MatPivotCheck(A,info,&sctx,k);CHKERRQ(ierr);
1411       if (sctx.newshift) break;
1412       dk = sctx.pv;
1413 
1414       ba[bdiag[k]] = 1.0/dk; /* U(k,k) */
1415     }
1416   } while (sctx.newshift);
1417 
1418   ierr = PetscFree3(rtmp,il,c2r);CHKERRQ(ierr);
1419 
1420   B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1421   B->ops->solves          = MatSolves_SeqSBAIJ_1;
1422   B->ops->solvetranspose  = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1423   B->ops->forwardsolve    = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1424   B->ops->backwardsolve   = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1425 
1426   B->assembled    = PETSC_TRUE;
1427   B->preallocated = PETSC_TRUE;
1428   ierr = PetscLogFlops(B->rmap->n);CHKERRQ(ierr);
1429 
1430   /* MatPivotView() */
1431   if (sctx.nshift) {
1432     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1433       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1434     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1435       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1436     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1437       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1438     }
1439   }
1440   PetscFunctionReturn(0);
1441 }
1442 
1443 #undef __FUNCT__
1444 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace"
1445 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
1446 {
1447   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1448   PetscErrorCode ierr;
1449   PetscInt       i,j,mbs = a->mbs;
1450   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1451   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1452   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1453   PetscReal      rs;
1454   FactorShiftCtx sctx;
1455 
1456   PetscFunctionBegin;
1457   /* MatPivotSetUp(): initialize shift context sctx */
1458   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1459 
1460   /* initialization */
1461   /* il and jl record the first nonzero element in each row of the accessing
1462      window U(0:k, k:mbs-1).
1463      jl:    list of rows to be added to uneliminated rows
1464             i>= k: jl(i) is the first row to be added to row i
1465             i<  k: jl(i) is the row following row i in some list of rows
1466             jl(i) = mbs indicates the end of a list
1467      il(i): points to the first nonzero element in U(i,k:mbs-1)
1468   */
1469   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1470   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1471 
1472   do {
1473     sctx.newshift = PETSC_FALSE;
1474     for (i=0; i<mbs; i++) {
1475       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1476     }
1477 
1478     for (k = 0; k<mbs; k++) {
1479       /*initialize k-th row with elements nonzero in row perm(k) of A */
1480       nz   = ai[k+1] - ai[k];
1481       acol = aj + ai[k];
1482       aval = aa + ai[k];
1483       bval = ba + bi[k];
1484       while (nz--) {
1485         rtmp[*acol++] = *aval++;
1486         *bval++       = 0.0; /* for in-place factorization */
1487       }
1488 
1489       /* shift the diagonal of the matrix */
1490       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1491 
1492       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1493       dk = rtmp[k];
1494       i  = jl[k]; /* first row to be added to k_th row  */
1495 
1496       while (i < k) {
1497         nexti = jl[i]; /* next row to be added to k_th row */
1498         /* compute multiplier, update D(k) and U(i,k) */
1499         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1500         uikdi = - ba[ili]*ba[bi[i]];
1501         dk   += uikdi*ba[ili];
1502         ba[ili] = uikdi; /* -U(i,k) */
1503 
1504         /* add multiple of row i to k-th row ... */
1505         jmin = ili + 1;
1506         nz   = bi[i+1] - jmin;
1507         if (nz > 0) {
1508           bcol = bj + jmin;
1509           bval = ba + jmin;
1510           ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1511           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1512 
1513           /* update il and jl for i-th row */
1514           il[i] = jmin;
1515           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1516         }
1517         i = nexti;
1518       }
1519 
1520       /* shift the diagonals when zero pivot is detected */
1521       /* compute rs=sum of abs(off-diagonal) */
1522       rs   = 0.0;
1523       jmin = bi[k]+1;
1524       nz   = bi[k+1] - jmin;
1525       if (nz) {
1526         bcol = bj + jmin;
1527         while (nz--) {
1528           rs += PetscAbsScalar(rtmp[*bcol]);
1529           bcol++;
1530         }
1531       }
1532 
1533       sctx.rs = rs;
1534       sctx.pv = dk;
1535       ierr = MatPivotCheck(A,info,&sctx,k);CHKERRQ(ierr);
1536       if (sctx.newshift) break;    /* sctx.shift_amount is updated */
1537       dk = sctx.pv;
1538 
1539       /* copy data into U(k,:) */
1540       ba[bi[k]] = 1.0/dk;
1541       jmin      = bi[k]+1;
1542       nz        = bi[k+1] - jmin;
1543       if (nz) {
1544         bcol = bj + jmin;
1545         bval = ba + jmin;
1546         while (nz--) {
1547           *bval++       = rtmp[*bcol];
1548           rtmp[*bcol++] = 0.0;
1549         }
1550         /* add k-th row into il and jl */
1551         il[k] = jmin;
1552         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1553       }
1554     } /* end of for (k = 0; k<mbs; k++) */
1555   } while (sctx.newshift);
1556   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1557   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1558 
1559   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1560   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1561   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1562   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1563   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1564 
1565   C->assembled    = PETSC_TRUE;
1566   C->preallocated = PETSC_TRUE;
1567   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
1568   if (sctx.nshift) {
1569     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1570       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1571     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1572       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1573     }
1574   }
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 #undef __FUNCT__
1579 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1580 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,const MatFactorInfo *info)
1581 {
1582   PetscErrorCode ierr;
1583   Mat            C;
1584 
1585   PetscFunctionBegin;
1586   ierr = MatGetFactor(A,"petsc",MAT_FACTOR_CHOLESKY,&C);CHKERRQ(ierr);
1587   ierr = MatCholeskyFactorSymbolic(C,A,perm,info);CHKERRQ(ierr);
1588   ierr = MatCholeskyFactorNumeric(C,A,info);CHKERRQ(ierr);
1589   A->ops->solve            = C->ops->solve;
1590   A->ops->solvetranspose   = C->ops->solvetranspose;
1591   ierr = MatHeaderMerge(A,C);CHKERRQ(ierr);
1592   PetscFunctionReturn(0);
1593 }
1594 
1595 
1596