xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 6849ba73f22fecb8f92ef896a42e4e8bd4cd6965)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/mat/impls/sbaij/seq/sbaij.h"
4 #include "src/inline/ilu.h"
5 #include "include/petscis.h"
6 
7 #if !defined(PETSC_USE_COMPLEX)
8 /*
9   input:
10    F -- numeric factor
11   output:
12    nneg, nzero, npos: matrix inertia
13 */
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
17 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,int *nneig,int *nzero,int *npos)
18 {
19   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
20   PetscScalar  *dd = fact_ptr->a;
21   int          mbs=fact_ptr->mbs,bs=fact_ptr->bs,i,nneig_tmp,npos_tmp,
22                *fi = fact_ptr->i;
23 
24   PetscFunctionBegin;
25   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %d >1 yet",bs);
26   nneig_tmp = 0; npos_tmp = 0;
27   for (i=0; i<mbs; i++){
28     if (PetscRealPart(dd[*fi]) > 0.0){
29       npos_tmp++;
30     } else if (PetscRealPart(dd[*fi]) < 0.0){
31       nneig_tmp++;
32     }
33     fi++;
34   }
35   if (nneig) *nneig = nneig_tmp;
36   if (npos)  *npos  = npos_tmp;
37   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
38 
39   PetscFunctionReturn(0);
40 }
41 #endif /* !defined(PETSC_USE_COMPLEX) */
42 
43 /* Using Modified Sparse Row (MSR) storage.
44 See page 85, "Iterative Methods ..." by Saad. */
45 /*
46     Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
47 */
48 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
49 #undef __FUNCT__
50 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
51 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
52 {
53   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
54   PetscErrorCode ierr;
55   int          *rip,i,mbs = a->mbs,*ai,*aj;
56   int          *jutmp,bs = a->bs,bs2=a->bs2;
57   int          m,realloc = 0,prow;
58   int          *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
59   int          *il,ili,nextprow;
60   PetscReal    f = info->fill;
61   PetscTruth   perm_identity;
62 
63   PetscFunctionBegin;
64   /* check whether perm is the identity mapping */
65   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
66 
67   /* -- inplace factorization, i.e., use sbaij for *B -- */
68   if (perm_identity && bs==1 ){
69     if (!perm_identity) a->permute = PETSC_TRUE;
70 
71   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
72 
73   if (perm_identity){ /* without permutation */
74     ai = a->i; aj = a->j;
75   } else {            /* non-trivial permutation */
76     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
77     ai = a->inew; aj = a->jnew;
78   }
79 
80   /* initialization */
81   ierr  = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr);
82   umax  = (int)(f*ai[mbs] + 1);
83   ierr  = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr);
84   iu[0] = 0;
85   juidx = 0; /* index for ju */
86   ierr  = PetscMalloc((3*mbs+1)*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for getting pivot row */
87   q     = jl + mbs;   /* linked list for col index of active row */
88   il    = q  + mbs;
89   for (i=0; i<mbs; i++){
90     jl[i] = mbs;
91     q[i]  = 0;
92     il[i] = 0;
93   }
94 
95   /* for each row k */
96   for (k=0; k<mbs; k++){
97     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
98     q[k] = mbs;
99     /* initialize nonzero structure of k-th row to row rip[k] of A */
100     jmin = ai[rip[k]] +1; /* exclude diag[k] */
101     jmax = ai[rip[k]+1];
102     for (j=jmin; j<jmax; j++){
103       vj = rip[aj[j]]; /* col. value */
104       if(vj > k){
105         qm = k;
106         do {
107           m  = qm; qm = q[m];
108         } while(qm < vj);
109         if (qm == vj) {
110           SETERRQ(1," error: duplicate entry in A\n");
111         }
112         nzk++;
113         q[m]  = vj;
114         q[vj] = qm;
115       } /* if(vj > k) */
116     } /* for (j=jmin; j<jmax; j++) */
117 
118     /* modify nonzero structure of k-th row by computing fill-in
119        for each row i to be merged in */
120     prow = k;
121     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
122 
123     while (prow < k){
124       nextprow = jl[prow];
125 
126       /* merge row prow into k-th row */
127       ili = il[prow];
128       jmin = ili + 1;  /* points to 2nd nzero entry in U(prow,k:mbs-1) */
129       jmax = iu[prow+1];
130       qm = k;
131       for (j=jmin; j<jmax; j++){
132         vj = ju[j];
133         do {
134           m = qm; qm = q[m];
135         } while (qm < vj);
136         if (qm != vj){  /* a fill */
137           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
138         }
139       } /* end of for (j=jmin; j<jmax; j++) */
140       if (jmin < jmax){
141         il[prow] = jmin;
142         j = ju[jmin];
143         jl[prow] = jl[j]; jl[j] = prow;  /* update jl */
144       }
145       prow = nextprow;
146     }
147 
148     /* update il and jl */
149     if (nzk > 0){
150       i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
151       jl[k] = jl[i]; jl[i] = k;
152       il[k] = iu[k] + 1;
153     }
154     iu[k+1] = iu[k] + nzk + 1;  /* include diag[k] */
155 
156     /* allocate more space to ju if needed */
157     if (iu[k+1] > umax) {
158       /* estimate how much additional space we will need */
159       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
160       /* just double the memory each time */
161       maxadd = umax;
162       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
163       umax += maxadd;
164 
165       /* allocate a longer ju */
166       ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr);
167       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
168       ierr = PetscFree(ju);CHKERRQ(ierr);
169       ju   = jutmp;
170       realloc++; /* count how many times we realloc */
171     }
172 
173     /* save nonzero structure of k-th row in ju */
174     ju[juidx++] = k; /* diag[k] */
175     i = k;
176     while (nzk --) {
177       i           = q[i];
178       ju[juidx++] = i;
179     }
180   }
181 
182   if (ai[mbs] != 0) {
183     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
184     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
185     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
186     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
187     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
188   } else {
189      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
190   }
191 
192   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
193   /* ierr = PetscFree(q);CHKERRQ(ierr); */
194   ierr = PetscFree(jl);CHKERRQ(ierr);
195 
196   /* put together the new matrix */
197   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
198   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
199   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
200 
201   /* PetscLogObjectParent(*B,iperm); */
202   b = (Mat_SeqSBAIJ*)(*B)->data;
203   ierr = PetscFree(b->imax);CHKERRQ(ierr);
204   b->singlemalloc = PETSC_FALSE;
205   /* the next line frees the default space generated by the Create() */
206   ierr = PetscFree(b->a);CHKERRQ(ierr);
207   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
208   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
209   b->j    = ju;
210   b->i    = iu;
211   b->diag = 0;
212   b->ilen = 0;
213   b->imax = 0;
214   b->row  = perm;
215   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
216   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
217   b->icol = perm;
218   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
219   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
220   /* In b structure:  Free imax, ilen, old a, old j.
221      Allocate idnew, solve_work, new a, new j */
222   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
223   b->maxnz = b->nz = iu[mbs];
224 
225   (*B)->factor                 = FACTOR_CHOLESKY;
226   (*B)->info.factor_mallocs    = realloc;
227   (*B)->info.fill_ratio_given  = f;
228   if (ai[mbs] != 0) {
229     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
230   } else {
231     (*B)->info.fill_ratio_needed = 0.0;
232   }
233 
234 
235   (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
236   (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
237   PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
238 
239   PetscFunctionReturn(0);
240   }
241   /* -----------  end of new code --------------------*/
242 
243 
244   if (!perm_identity) a->permute = PETSC_TRUE;
245 
246   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
247 
248   if (perm_identity){ /* without permutation */
249     ai = a->i; aj = a->j;
250   } else {            /* non-trivial permutation */
251     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
252     ai = a->inew; aj = a->jnew;
253   }
254 
255   /* initialization */
256   ierr  = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr);
257   umax  = (int)(f*ai[mbs] + 1); umax += mbs + 1;
258   ierr  = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr);
259   iu[0] = mbs+1;
260   juidx = mbs + 1; /* index for ju */
261   ierr  = PetscMalloc(2*mbs*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for pivot row */
262   q     = jl + mbs;   /* linked list for col index */
263   for (i=0; i<mbs; i++){
264     jl[i] = mbs;
265     q[i] = 0;
266   }
267 
268   /* for each row k */
269   for (k=0; k<mbs; k++){
270     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
271     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
272     q[k] = mbs;
273     /* initialize nonzero structure of k-th row to row rip[k] of A */
274     jmin = ai[rip[k]] +1; /* exclude diag[k] */
275     jmax = ai[rip[k]+1];
276     for (j=jmin; j<jmax; j++){
277       vj = rip[aj[j]]; /* col. value */
278       if(vj > k){
279         qm = k;
280         do {
281           m  = qm; qm = q[m];
282         } while(qm < vj);
283         if (qm == vj) {
284           SETERRQ(1," error: duplicate entry in A\n");
285         }
286         nzk++;
287         q[m]  = vj;
288         q[vj] = qm;
289       } /* if(vj > k) */
290     } /* for (j=jmin; j<jmax; j++) */
291 
292     /* modify nonzero structure of k-th row by computing fill-in
293        for each row i to be merged in */
294     prow = k;
295     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
296 
297     while (prow < k){
298       /* merge row prow into k-th row */
299       jmin = iu[prow] + 1; jmax = iu[prow+1];
300       qm = k;
301       for (j=jmin; j<jmax; j++){
302         vj = ju[j];
303         do {
304           m = qm; qm = q[m];
305         } while (qm < vj);
306         if (qm != vj){
307          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
308         }
309       }
310       prow = jl[prow]; /* next pivot row */
311     }
312 
313     /* add k to row list for first nonzero element in k-th row */
314     if (nzk > 0){
315       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
316       jl[k] = jl[i]; jl[i] = k;
317     }
318     iu[k+1] = iu[k] + nzk;
319 
320     /* allocate more space to ju if needed */
321     if (iu[k+1] > umax) {
322       /* estimate how much additional space we will need */
323       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
324       /* just double the memory each time */
325       maxadd = umax;
326       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
327       umax += maxadd;
328 
329       /* allocate a longer ju */
330       ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr);
331       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
332       ierr = PetscFree(ju);CHKERRQ(ierr);
333       ju   = jutmp;
334       realloc++; /* count how many times we realloc */
335     }
336 
337     /* save nonzero structure of k-th row in ju */
338     i=k;
339     while (nzk --) {
340       i           = q[i];
341       ju[juidx++] = i;
342     }
343   }
344 
345   if (ai[mbs] != 0) {
346     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
347     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
348     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
349     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
350     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
351   } else {
352      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
353   }
354 
355   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
356   /* ierr = PetscFree(q);CHKERRQ(ierr); */
357   ierr = PetscFree(jl);CHKERRQ(ierr);
358 
359   /* put together the new matrix */
360   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
361   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
362   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
363 
364   /* PetscLogObjectParent(*B,iperm); */
365   b = (Mat_SeqSBAIJ*)(*B)->data;
366   ierr = PetscFree(b->imax);CHKERRQ(ierr);
367   b->singlemalloc = PETSC_FALSE;
368   /* the next line frees the default space generated by the Create() */
369   ierr = PetscFree(b->a);CHKERRQ(ierr);
370   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
371   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
372   b->j    = ju;
373   b->i    = iu;
374   b->diag = 0;
375   b->ilen = 0;
376   b->imax = 0;
377   b->row  = perm;
378   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
379   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
380   b->icol = perm;
381   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
382   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
383   /* In b structure:  Free imax, ilen, old a, old j.
384      Allocate idnew, solve_work, new a, new j */
385   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
386   b->maxnz = b->nz = iu[mbs];
387 
388   (*B)->factor                 = FACTOR_CHOLESKY;
389   (*B)->info.factor_mallocs    = realloc;
390   (*B)->info.fill_ratio_given  = f;
391   if (ai[mbs] != 0) {
392     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
393   } else {
394     (*B)->info.fill_ratio_needed = 0.0;
395   }
396 
397   if (perm_identity){
398     switch (bs) {
399       case 1:
400         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
401         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
402         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
403         break;
404       case 2:
405         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
406         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
407         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
408         break;
409       case 3:
410         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
411         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
412         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
413         break;
414       case 4:
415         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
416         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
417         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
418         break;
419       case 5:
420         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
421         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
422         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
423         break;
424       case 6:
425         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
426         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
427         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
428         break;
429       case 7:
430         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
431         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
432         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
433       break;
434       default:
435         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
436         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
437         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
438       break;
439     }
440   }
441 
442   PetscFunctionReturn(0);
443 }
444 
445 
446 #undef __FUNCT__
447 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
448 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
449 {
450   Mat                C = *B;
451   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
452   IS                 perm = b->row;
453   PetscErrorCode ierr;
454   int                *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
455   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
456   int                bs=a->bs,bs2 = a->bs2;
457   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
458   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
459   MatScalar          *work;
460   int                *pivots;
461 
462   PetscFunctionBegin;
463 
464   /* initialization */
465   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
466   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
467   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
468   jl   = il + mbs;
469   for (i=0; i<mbs; i++) {
470     jl[i] = mbs; il[0] = 0;
471   }
472   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
473   uik  = dk + bs2;
474   work = uik + bs2;
475   ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr);
476 
477   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
478 
479   /* check permutation */
480   if (!a->permute){
481     ai = a->i; aj = a->j; aa = a->a;
482   } else {
483     ai   = a->inew; aj = a->jnew;
484     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
485     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
486     ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr);
487     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
488 
489     for (i=0; i<mbs; i++){
490       jmin = ai[i]; jmax = ai[i+1];
491       for (j=jmin; j<jmax; j++){
492         while (a2anew[j] != j){
493           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
494           for (k1=0; k1<bs2; k1++){
495             dk[k1]       = aa[k*bs2+k1];
496             aa[k*bs2+k1] = aa[j*bs2+k1];
497             aa[j*bs2+k1] = dk[k1];
498           }
499         }
500         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
501         if (i > aj[j]){
502           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
503           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
504           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
505           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
506             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
507           }
508         }
509       }
510     }
511     ierr = PetscFree(a2anew);CHKERRQ(ierr);
512   }
513 
514   /* for each row k */
515   for (k = 0; k<mbs; k++){
516 
517     /*initialize k-th row with elements nonzero in row perm(k) of A */
518     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
519 
520     ap = aa + jmin*bs2;
521     for (j = jmin; j < jmax; j++){
522       vj = perm_ptr[aj[j]];         /* block col. index */
523       rtmp_ptr = rtmp + vj*bs2;
524       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
525     }
526 
527     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
528     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
529     i = jl[k]; /* first row to be added to k_th row  */
530 
531     while (i < k){
532       nexti = jl[i]; /* next row to be added to k_th row */
533 
534       /* compute multiplier */
535       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
536 
537       /* uik = -inv(Di)*U_bar(i,k) */
538       diag = ba + i*bs2;
539       u    = ba + ili*bs2;
540       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
541       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
542 
543       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
544       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
545 
546       /* update -U(i,k) */
547       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
548 
549       /* add multiple of row i to k-th row ... */
550       jmin = ili + 1; jmax = bi[i+1];
551       if (jmin < jmax){
552         for (j=jmin; j<jmax; j++) {
553           /* rtmp += -U(i,k)^T * U_bar(i,j) */
554           rtmp_ptr = rtmp + bj[j]*bs2;
555           u = ba + j*bs2;
556           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
557         }
558 
559         /* ... add i to row list for next nonzero entry */
560         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
561         j     = bj[jmin];
562         jl[i] = jl[j]; jl[j] = i; /* update jl */
563       }
564       i = nexti;
565     }
566 
567     /* save nonzero entries in k-th row of U ... */
568 
569     /* invert diagonal block */
570     diag = ba+k*bs2;
571     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
572     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
573 
574     jmin = bi[k]; jmax = bi[k+1];
575     if (jmin < jmax) {
576       for (j=jmin; j<jmax; j++){
577          vj = bj[j];           /* block col. index of U */
578          u   = ba + j*bs2;
579          rtmp_ptr = rtmp + vj*bs2;
580          for (k1=0; k1<bs2; k1++){
581            *u++        = *rtmp_ptr;
582            *rtmp_ptr++ = 0.0;
583          }
584       }
585 
586       /* ... add k to row list for first nonzero entry in k-th row */
587       il[k] = jmin;
588       i     = bj[jmin];
589       jl[k] = jl[i]; jl[i] = k;
590     }
591   }
592 
593   ierr = PetscFree(rtmp);CHKERRQ(ierr);
594   ierr = PetscFree(il);CHKERRQ(ierr);
595   ierr = PetscFree(dk);CHKERRQ(ierr);
596   ierr = PetscFree(pivots);CHKERRQ(ierr);
597   if (a->permute){
598     ierr = PetscFree(aa);CHKERRQ(ierr);
599   }
600 
601   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
602   C->factor       = FACTOR_CHOLESKY;
603   C->assembled    = PETSC_TRUE;
604   C->preallocated = PETSC_TRUE;
605   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
606   PetscFunctionReturn(0);
607 }
608 
609 #undef __FUNCT__
610 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
611 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
612 {
613   Mat                C = *B;
614   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
615   PetscErrorCode ierr;
616   int i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
617   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
618   int                bs=a->bs,bs2 = a->bs2;
619   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
620   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
621   MatScalar          *work;
622   int                *pivots;
623 
624   PetscFunctionBegin;
625 
626   /* initialization */
627 
628   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
629   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
630   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
631   jl   = il + mbs;
632   for (i=0; i<mbs; i++) {
633     jl[i] = mbs; il[0] = 0;
634   }
635   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
636   uik  = dk + bs2;
637   work = uik + bs2;
638   ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr);
639 
640   ai = a->i; aj = a->j; aa = a->a;
641 
642   /* for each row k */
643   for (k = 0; k<mbs; k++){
644 
645     /*initialize k-th row with elements nonzero in row k of A */
646     jmin = ai[k]; jmax = ai[k+1];
647     ap = aa + jmin*bs2;
648     for (j = jmin; j < jmax; j++){
649       vj = aj[j];         /* block col. index */
650       rtmp_ptr = rtmp + vj*bs2;
651       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
652     }
653 
654     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
655     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
656     i = jl[k]; /* first row to be added to k_th row  */
657 
658     while (i < k){
659       nexti = jl[i]; /* next row to be added to k_th row */
660 
661       /* compute multiplier */
662       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
663 
664       /* uik = -inv(Di)*U_bar(i,k) */
665       diag = ba + i*bs2;
666       u    = ba + ili*bs2;
667       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
668       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
669 
670       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
671       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
672 
673       /* update -U(i,k) */
674       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
675 
676       /* add multiple of row i to k-th row ... */
677       jmin = ili + 1; jmax = bi[i+1];
678       if (jmin < jmax){
679         for (j=jmin; j<jmax; j++) {
680           /* rtmp += -U(i,k)^T * U_bar(i,j) */
681           rtmp_ptr = rtmp + bj[j]*bs2;
682           u = ba + j*bs2;
683           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
684         }
685 
686         /* ... add i to row list for next nonzero entry */
687         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
688         j     = bj[jmin];
689         jl[i] = jl[j]; jl[j] = i; /* update jl */
690       }
691       i = nexti;
692     }
693 
694     /* save nonzero entries in k-th row of U ... */
695 
696     /* invert diagonal block */
697     diag = ba+k*bs2;
698     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
699     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
700 
701     jmin = bi[k]; jmax = bi[k+1];
702     if (jmin < jmax) {
703       for (j=jmin; j<jmax; j++){
704          vj = bj[j];           /* block col. index of U */
705          u   = ba + j*bs2;
706          rtmp_ptr = rtmp + vj*bs2;
707          for (k1=0; k1<bs2; k1++){
708            *u++        = *rtmp_ptr;
709            *rtmp_ptr++ = 0.0;
710          }
711       }
712 
713       /* ... add k to row list for first nonzero entry in k-th row */
714       il[k] = jmin;
715       i     = bj[jmin];
716       jl[k] = jl[i]; jl[i] = k;
717     }
718   }
719 
720   ierr = PetscFree(rtmp);CHKERRQ(ierr);
721   ierr = PetscFree(il);CHKERRQ(ierr);
722   ierr = PetscFree(dk);CHKERRQ(ierr);
723   ierr = PetscFree(pivots);CHKERRQ(ierr);
724 
725   C->factor    = FACTOR_CHOLESKY;
726   C->assembled = PETSC_TRUE;
727   C->preallocated = PETSC_TRUE;
728   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
729   PetscFunctionReturn(0);
730 }
731 
732 /*
733     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
734     Version for blocks 2 by 2.
735 */
736 #undef __FUNCT__
737 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
738 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
739 {
740   Mat                C = *B;
741   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
742   IS                 perm = b->row;
743   PetscErrorCode ierr;
744   int                *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
745   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
746   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
747   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
748 
749   PetscFunctionBegin;
750 
751   /* initialization */
752   /* il and jl record the first nonzero element in each row of the accessing
753      window U(0:k, k:mbs-1).
754      jl:    list of rows to be added to uneliminated rows
755             i>= k: jl(i) is the first row to be added to row i
756             i<  k: jl(i) is the row following row i in some list of rows
757             jl(i) = mbs indicates the end of a list
758      il(i): points to the first nonzero element in columns k,...,mbs-1 of
759             row i of U */
760   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
761   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
762   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
763   jl   = il + mbs;
764   for (i=0; i<mbs; i++) {
765     jl[i] = mbs; il[0] = 0;
766   }
767   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
768   uik  = dk + 4;
769   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
770 
771   /* check permutation */
772   if (!a->permute){
773     ai = a->i; aj = a->j; aa = a->a;
774   } else {
775     ai   = a->inew; aj = a->jnew;
776     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
777     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
778     ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr);
779     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
780 
781     for (i=0; i<mbs; i++){
782       jmin = ai[i]; jmax = ai[i+1];
783       for (j=jmin; j<jmax; j++){
784         while (a2anew[j] != j){
785           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
786           for (k1=0; k1<4; k1++){
787             dk[k1]       = aa[k*4+k1];
788             aa[k*4+k1] = aa[j*4+k1];
789             aa[j*4+k1] = dk[k1];
790           }
791         }
792         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
793         if (i > aj[j]){
794           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
795           ap = aa + j*4;     /* ptr to the beginning of the block */
796           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
797           ap[1] = ap[2];
798           ap[2] = dk[1];
799         }
800       }
801     }
802     ierr = PetscFree(a2anew);CHKERRQ(ierr);
803   }
804 
805   /* for each row k */
806   for (k = 0; k<mbs; k++){
807 
808     /*initialize k-th row with elements nonzero in row perm(k) of A */
809     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
810     ap = aa + jmin*4;
811     for (j = jmin; j < jmax; j++){
812       vj = perm_ptr[aj[j]];         /* block col. index */
813       rtmp_ptr = rtmp + vj*4;
814       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
815     }
816 
817     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
818     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
819     i = jl[k]; /* first row to be added to k_th row  */
820 
821     while (i < k){
822       nexti = jl[i]; /* next row to be added to k_th row */
823 
824       /* compute multiplier */
825       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
826 
827       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
828       diag = ba + i*4;
829       u    = ba + ili*4;
830       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
831       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
832       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
833       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
834 
835       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
836       dk[0] += uik[0]*u[0] + uik[1]*u[1];
837       dk[1] += uik[2]*u[0] + uik[3]*u[1];
838       dk[2] += uik[0]*u[2] + uik[1]*u[3];
839       dk[3] += uik[2]*u[2] + uik[3]*u[3];
840 
841       /* update -U(i,k): ba[ili] = uik */
842       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
843 
844       /* add multiple of row i to k-th row ... */
845       jmin = ili + 1; jmax = bi[i+1];
846       if (jmin < jmax){
847         for (j=jmin; j<jmax; j++) {
848           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
849           rtmp_ptr = rtmp + bj[j]*4;
850           u = ba + j*4;
851           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
852           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
853           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
854           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
855         }
856 
857         /* ... add i to row list for next nonzero entry */
858         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
859         j     = bj[jmin];
860         jl[i] = jl[j]; jl[j] = i; /* update jl */
861       }
862       i = nexti;
863     }
864 
865     /* save nonzero entries in k-th row of U ... */
866 
867     /* invert diagonal block */
868     diag = ba+k*4;
869     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
870     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
871 
872     jmin = bi[k]; jmax = bi[k+1];
873     if (jmin < jmax) {
874       for (j=jmin; j<jmax; j++){
875          vj = bj[j];           /* block col. index of U */
876          u   = ba + j*4;
877          rtmp_ptr = rtmp + vj*4;
878          for (k1=0; k1<4; k1++){
879            *u++        = *rtmp_ptr;
880            *rtmp_ptr++ = 0.0;
881          }
882       }
883 
884       /* ... add k to row list for first nonzero entry in k-th row */
885       il[k] = jmin;
886       i     = bj[jmin];
887       jl[k] = jl[i]; jl[i] = k;
888     }
889   }
890 
891   ierr = PetscFree(rtmp);CHKERRQ(ierr);
892   ierr = PetscFree(il);CHKERRQ(ierr);
893   ierr = PetscFree(dk);CHKERRQ(ierr);
894   if (a->permute) {
895     ierr = PetscFree(aa);CHKERRQ(ierr);
896   }
897   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
898   C->factor    = FACTOR_CHOLESKY;
899   C->assembled = PETSC_TRUE;
900   C->preallocated = PETSC_TRUE;
901   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
902   PetscFunctionReturn(0);
903 }
904 
905 /*
906       Version for when blocks are 2 by 2 Using natural ordering
907 */
908 #undef __FUNCT__
909 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
910 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
911 {
912   Mat                C = *B;
913   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
914   PetscErrorCode ierr;
915   int i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
916   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
917   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
918   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
919 
920   PetscFunctionBegin;
921 
922   /* initialization */
923   /* il and jl record the first nonzero element in each row of the accessing
924      window U(0:k, k:mbs-1).
925      jl:    list of rows to be added to uneliminated rows
926             i>= k: jl(i) is the first row to be added to row i
927             i<  k: jl(i) is the row following row i in some list of rows
928             jl(i) = mbs indicates the end of a list
929      il(i): points to the first nonzero element in columns k,...,mbs-1 of
930             row i of U */
931   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
932   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
933   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
934   jl   = il + mbs;
935   for (i=0; i<mbs; i++) {
936     jl[i] = mbs; il[0] = 0;
937   }
938   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
939   uik  = dk + 4;
940 
941   ai = a->i; aj = a->j; aa = a->a;
942 
943   /* for each row k */
944   for (k = 0; k<mbs; k++){
945 
946     /*initialize k-th row with elements nonzero in row k of A */
947     jmin = ai[k]; jmax = ai[k+1];
948     ap = aa + jmin*4;
949     for (j = jmin; j < jmax; j++){
950       vj = aj[j];         /* block col. index */
951       rtmp_ptr = rtmp + vj*4;
952       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
953     }
954 
955     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
956     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
957     i = jl[k]; /* first row to be added to k_th row  */
958 
959     while (i < k){
960       nexti = jl[i]; /* next row to be added to k_th row */
961 
962       /* compute multiplier */
963       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
964 
965       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
966       diag = ba + i*4;
967       u    = ba + ili*4;
968       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
969       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
970       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
971       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
972 
973       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
974       dk[0] += uik[0]*u[0] + uik[1]*u[1];
975       dk[1] += uik[2]*u[0] + uik[3]*u[1];
976       dk[2] += uik[0]*u[2] + uik[1]*u[3];
977       dk[3] += uik[2]*u[2] + uik[3]*u[3];
978 
979       /* update -U(i,k): ba[ili] = uik */
980       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
981 
982       /* add multiple of row i to k-th row ... */
983       jmin = ili + 1; jmax = bi[i+1];
984       if (jmin < jmax){
985         for (j=jmin; j<jmax; j++) {
986           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
987           rtmp_ptr = rtmp + bj[j]*4;
988           u = ba + j*4;
989           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
990           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
991           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
992           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
993         }
994 
995         /* ... add i to row list for next nonzero entry */
996         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
997         j     = bj[jmin];
998         jl[i] = jl[j]; jl[j] = i; /* update jl */
999       }
1000       i = nexti;
1001     }
1002 
1003     /* save nonzero entries in k-th row of U ... */
1004 
1005     /* invert diagonal block */
1006     diag = ba+k*4;
1007     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1008     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1009 
1010     jmin = bi[k]; jmax = bi[k+1];
1011     if (jmin < jmax) {
1012       for (j=jmin; j<jmax; j++){
1013          vj = bj[j];           /* block col. index of U */
1014          u   = ba + j*4;
1015          rtmp_ptr = rtmp + vj*4;
1016          for (k1=0; k1<4; k1++){
1017            *u++        = *rtmp_ptr;
1018            *rtmp_ptr++ = 0.0;
1019          }
1020       }
1021 
1022       /* ... add k to row list for first nonzero entry in k-th row */
1023       il[k] = jmin;
1024       i     = bj[jmin];
1025       jl[k] = jl[i]; jl[i] = k;
1026     }
1027   }
1028 
1029   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1030   ierr = PetscFree(il);CHKERRQ(ierr);
1031   ierr = PetscFree(dk);CHKERRQ(ierr);
1032 
1033   C->factor    = FACTOR_CHOLESKY;
1034   C->assembled = PETSC_TRUE;
1035   C->preallocated = PETSC_TRUE;
1036   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1037   PetscFunctionReturn(0);
1038 }
1039 
1040 /*
1041     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
1042     Version for blocks are 1 by 1.
1043 */
1044 #undef __FUNCT__
1045 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1046 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
1047 {
1048   Mat                C = *B;
1049   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1050   IS                 ip = b->row;
1051   PetscErrorCode ierr;
1052   int                *rip,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
1053   int                *ai,*aj,*r;
1054   int                k,jmin,jmax,*jl,*il,vj,nexti,ili;
1055   MatScalar          *rtmp;
1056   MatScalar          *ba = b->a,*aa,ak;
1057   MatScalar          dk,uikdi;
1058 
1059   PetscFunctionBegin;
1060   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1061   if (!a->permute){
1062     ai = a->i; aj = a->j; aa = a->a;
1063   } else {
1064     ai = a->inew; aj = a->jnew;
1065     ierr = PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1066     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1067     ierr = PetscMalloc(ai[mbs]*sizeof(int),&r);CHKERRQ(ierr);
1068     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1069 
1070     jmin = ai[0]; jmax = ai[mbs];
1071     for (j=jmin; j<jmax; j++){
1072       while (r[j] != j){
1073         k = r[j]; r[j] = r[k]; r[k] = k;
1074         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
1075       }
1076     }
1077     ierr = PetscFree(r);CHKERRQ(ierr);
1078   }
1079 
1080   /* initialization */
1081   /* il and jl record the first nonzero element in each row of the accessing
1082      window U(0:k, k:mbs-1).
1083      jl:    list of rows to be added to uneliminated rows
1084             i>= k: jl(i) is the first row to be added to row i
1085             i<  k: jl(i) is the row following row i in some list of rows
1086             jl(i) = mbs indicates the end of a list
1087      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1088             row i of U */
1089   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1090   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
1091   jl   = il + mbs;
1092   for (i=0; i<mbs; i++) {
1093     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1094   }
1095 
1096   /* for each row k */
1097   for (k = 0; k<mbs; k++){
1098 
1099     /*initialize k-th row with elements nonzero in row perm(k) of A */
1100     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1101 
1102     for (j = jmin; j < jmax; j++){
1103       vj = rip[aj[j]];
1104       rtmp[vj] = aa[j];
1105     }
1106 
1107     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1108     dk = rtmp[k];
1109     i = jl[k]; /* first row to be added to k_th row  */
1110 
1111     while (i < k){
1112       nexti = jl[i]; /* next row to be added to k_th row */
1113 
1114       /* compute multiplier, update D(k) and U(i,k) */
1115       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1116       uikdi = - ba[ili]*ba[i];
1117       dk += uikdi*ba[ili];
1118       ba[ili] = uikdi; /* -U(i,k) */
1119 
1120       /* add multiple of row i to k-th row ... */
1121       jmin = ili + 1; jmax = bi[i+1];
1122       if (jmin < jmax){
1123         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1124         /* ... add i to row list for next nonzero entry */
1125         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1126         j     = bj[jmin];
1127         jl[i] = jl[j]; jl[j] = i; /* update jl */
1128       }
1129       i = nexti;
1130     }
1131 
1132     /* check for zero pivot and save diagoanl element */
1133     if (dk == 0.0){
1134       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1135       /*
1136     } else if (PetscRealPart(dk) < 0.0){
1137       SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Negative pivot: d[%d] = %g\n",k,dk);
1138       */
1139     }
1140 
1141     /* save nonzero entries in k-th row of U ... */
1142     ba[k] = 1.0/dk;
1143     jmin = bi[k]; jmax = bi[k+1];
1144     if (jmin < jmax) {
1145       for (j=jmin; j<jmax; j++){
1146          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
1147       }
1148       /* ... add k to row list for first nonzero entry in k-th row */
1149       il[k] = jmin;
1150       i     = bj[jmin];
1151       jl[k] = jl[i]; jl[i] = k;
1152     }
1153   }
1154 
1155   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1156   ierr = PetscFree(il);CHKERRQ(ierr);
1157   if (a->permute){
1158     ierr = PetscFree(aa);CHKERRQ(ierr);
1159   }
1160 
1161   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1162   C->factor    = FACTOR_CHOLESKY;
1163   C->assembled = PETSC_TRUE;
1164   C->preallocated = PETSC_TRUE;
1165   PetscLogFlops(b->mbs);
1166   PetscFunctionReturn(0);
1167 }
1168 
1169 /*
1170   Version for when blocks are 1 by 1 Using natural ordering
1171 */
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1174 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
1175 {
1176   Mat                C = *B;
1177   Mat_SeqSBAIJ       *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1178   PetscErrorCode ierr;
1179   int i,j,mbs = a->mbs;
1180   int                *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1181   int                k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0;
1182   MatScalar          *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1183   PetscReal          damping=b->factor_damping, zeropivot=b->factor_zeropivot,shift_amount;
1184   PetscTruth         damp,chshift;
1185   int                nshift=0;
1186 
1187   PetscFunctionBegin;
1188   /* initialization */
1189   /* il and jl record the first nonzero element in each row of the accessing
1190      window U(0:k, k:mbs-1).
1191      jl:    list of rows to be added to uneliminated rows
1192             i>= k: jl(i) is the first row to be added to row i
1193             i<  k: jl(i) is the row following row i in some list of rows
1194             jl(i) = mbs indicates the end of a list
1195      il(i): points to the first nonzero element in U(i,k:mbs-1)
1196   */
1197   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1198   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
1199   jl   = il + mbs;
1200 
1201   shift_amount = 0;
1202   do {
1203     damp = PETSC_FALSE;
1204     chshift = PETSC_FALSE;
1205     for (i=0; i<mbs; i++) {
1206       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1207     }
1208 
1209     for (k = 0; k<mbs; k++){ /* row k */
1210     /*initialize k-th row with elements nonzero in row perm(k) of A */
1211       nz   = ai[k+1] - ai[k];
1212       acol = aj + ai[k];
1213       aval = aa + ai[k];
1214       bval = ba + bi[k];
1215       while (nz -- ){
1216         rtmp[*acol++] = *aval++;
1217         *bval++       = 0.0; /* for in-place factorization */
1218       }
1219       /* damp the diagonal of the matrix */
1220       if (ndamp||nshift) rtmp[k] += damping+shift_amount;
1221 
1222       /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1223       dk = rtmp[k];
1224       i  = jl[k]; /* first row to be added to k_th row  */
1225 
1226       while (i < k){
1227         nexti = jl[i]; /* next row to be added to k_th row */
1228 
1229         /* compute multiplier, update D(k) and U(i,k) */
1230         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1231         uikdi = - ba[ili]*ba[bi[i]];
1232         dk   += uikdi*ba[ili];
1233         ba[ili] = uikdi; /* -U(i,k) */
1234 
1235         /* add multiple of row i to k-th row ... */
1236         jmin = ili + 1;
1237         nz   = bi[i+1] - jmin;
1238         if (nz > 0){
1239           bcol = bj + jmin;
1240           bval = ba + jmin;
1241           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1242           /* ... add i to row list for next nonzero entry */
1243           il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1244           j     = bj[jmin];
1245           jl[i] = jl[j]; jl[j] = i; /* update jl */
1246         }
1247         i = nexti;
1248       }
1249 
1250       if (PetscRealPart(dk) < zeropivot && b->factor_shift){
1251 	/* calculate a shift that would make this row diagonally dominant */
1252 	PetscReal rs = PetscAbs(PetscRealPart(dk));
1253 	jmin      = bi[k]+1;
1254 	nz        = bi[k+1] - jmin;
1255 	if (nz){
1256 	  bcol = bj + jmin;
1257 	  bval = ba + jmin;
1258 	  while (nz--){
1259 	    rs += PetscAbsScalar(rtmp[*bcol++]);
1260 	  }
1261 	}
1262 	/* if this shift is less than the previous, just up the previous
1263 	   one by a bit */
1264 	shift_amount = PetscMax(rs,1.1*shift_amount);
1265 	chshift  = PETSC_TRUE;
1266 	/* Unlike in the ILU case there is no exit condition on nshift:
1267 	   we increase the shift until it converges. There is no guarantee that
1268 	   this algorithm converges faster or slower, or is better or worse
1269 	   than the ILU algorithm. */
1270 	nshift++;
1271 	break;
1272       }
1273       if (PetscRealPart(dk) < zeropivot){
1274         if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1);
1275         if (damping > 0.0) {
1276           if (ndamp) damping *= 2.0;
1277           damp = PETSC_TRUE;
1278           ndamp++;
1279           break;
1280         } else if (PetscAbsScalar(dk) < zeropivot){
1281           SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",k,PetscRealPart(dk),zeropivot);
1282         } else {
1283           PetscLogInfo((PetscObject)A,"Negative pivot %g in row %d of Cholesky factorization\n",PetscRealPart(dk),k);
1284         }
1285       }
1286 
1287       /* save nonzero entries in k-th row of U ... */
1288       /* printf("%d, dk: %g, 1/dk: %g\n",k,dk,1/dk); */
1289       ba[bi[k]] = 1.0/dk;
1290       jmin      = bi[k]+1;
1291       nz        = bi[k+1] - jmin;
1292       if (nz){
1293         bcol = bj + jmin;
1294         bval = ba + jmin;
1295         while (nz--){
1296           *bval++       = rtmp[*bcol];
1297           rtmp[*bcol++] = 0.0;
1298         }
1299         /* ... add k to row list for first nonzero entry in k-th row */
1300         il[k] = jmin;
1301         i     = bj[jmin];
1302         jl[k] = jl[i]; jl[i] = k;
1303       }
1304     } /* end of for (k = 0; k<mbs; k++) */
1305   } while (damp||chshift);
1306   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1307   ierr = PetscFree(il);CHKERRQ(ierr);
1308 
1309   C->factor       = FACTOR_CHOLESKY;
1310   C->assembled    = PETSC_TRUE;
1311   C->preallocated = PETSC_TRUE;
1312   PetscLogFlops(b->mbs);
1313   if (ndamp) {
1314     PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1_NaturalOrdering: number of damping tries %d damping value %g\n",ndamp,damping);
1315   }
1316  if (nshift) {
1317     PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering diagonal shifted %d shifts\n",nshift);
1318   }
1319 
1320   PetscFunctionReturn(0);
1321 }
1322 
1323 #undef __FUNCT__
1324 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1325 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1326 {
1327   PetscErrorCode ierr;
1328   Mat C;
1329 
1330   PetscFunctionBegin;
1331   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1332   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
1333   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
1334   PetscFunctionReturn(0);
1335 }
1336 
1337 
1338