xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision f3fe499b4cc4d64bf04aa4f5e4963dcc4eb56541)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2"
11 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B,Mat A,const MatFactorInfo *info)
12 {
13   Mat             C=B;
14   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
15   IS              isrow = b->row,isicol = b->icol;
16   PetscErrorCode  ierr;
17   const PetscInt  *r,*ic;
18   PetscInt        i,j,k,nz,nzL,row,*pj;
19   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,bs2=a->bs2;
20   const PetscInt  *ajtmp,*bjtmp,*bdiag=b->diag;
21   MatScalar       *rtmp,*pc,*mwork,*pv;
22   MatScalar       *aa=a->a,*v;
23   PetscInt        flg;
24   PetscReal       shift = info->shiftamount;
25 
26   PetscFunctionBegin;
27   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
28   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
29 
30   /* generate work space needed by the factorization */
31   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
32   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
33 
34   for (i=0; i<n; i++){
35     /* zero rtmp */
36     /* L part */
37     nz    = bi[i+1] - bi[i];
38     bjtmp = bj + bi[i];
39     for  (j=0; j<nz; j++){
40       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
41     }
42 
43     /* U part */
44     nz = bdiag[i] - bdiag[i+1];
45     bjtmp = bj + bdiag[i+1]+1;
46     for  (j=0; j<nz; j++){
47       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
48     }
49 
50     /* load in initial (unfactored row) */
51     nz    = ai[r[i]+1] - ai[r[i]];
52     ajtmp = aj + ai[r[i]];
53     v     = aa + bs2*ai[r[i]];
54     for (j=0; j<nz; j++) {
55       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
56     }
57 
58     /* elimination */
59     bjtmp = bj + bi[i];
60     nzL   = bi[i+1] - bi[i];
61     for(k=0;k < nzL;k++) {
62       row = bjtmp[k];
63       pc = rtmp + bs2*row;
64       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
65       if (flg) {
66         pv = b->a + bs2*bdiag[row];
67         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
68         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
69 
70         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
71         pv = b->a + bs2*(bdiag[row+1]+1);
72         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
73         for (j=0; j<nz; j++) {
74           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
75           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
76           v    = rtmp + 4*pj[j];
77           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
78           pv  += 4;
79         }
80         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
81       }
82     }
83 
84     /* finished row so stick it into b->a */
85     /* L part */
86     pv   = b->a + bs2*bi[i] ;
87     pj   = b->j + bi[i] ;
88     nz   = bi[i+1] - bi[i];
89     for (j=0; j<nz; j++) {
90       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
91     }
92 
93     /* Mark diagonal and invert diagonal for simplier triangular solves */
94     pv   = b->a + bs2*bdiag[i];
95     pj   = b->j + bdiag[i];
96     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
97     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
98     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
99 
100     /* U part */
101     pv = b->a + bs2*(bdiag[i+1]+1);
102     pj = b->j + bdiag[i+1]+1;
103     nz = bdiag[i] - bdiag[i+1] - 1;
104     for (j=0; j<nz; j++){
105       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
106     }
107   }
108 
109   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
110   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
111   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
112   C->ops->solve          = MatSolve_SeqBAIJ_2;
113   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
114 
115   C->assembled = PETSC_TRUE;
116   ierr = PetscLogFlops(1.333333333333*2*2*2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
117   PetscFunctionReturn(0);
118 }
119 
120 #undef __FUNCT__
121 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering"
122 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
123 {
124   Mat             C=B;
125   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
126   PetscErrorCode  ierr;
127   PetscInt        i,j,k,nz,nzL,row,*pj;
128   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,bs2=a->bs2;
129   const PetscInt  *ajtmp,*bjtmp,*bdiag=b->diag;
130   MatScalar       *rtmp,*pc,*mwork,*pv;
131   MatScalar       *aa=a->a,*v;
132   PetscInt       flg;
133   PetscReal      shift = info->shiftamount;
134 
135   PetscFunctionBegin;
136   /* generate work space needed by the factorization */
137   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
138   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
139 
140   for (i=0; i<n; i++){
141     /* zero rtmp */
142     /* L part */
143     nz    = bi[i+1] - bi[i];
144     bjtmp = bj + bi[i];
145     for  (j=0; j<nz; j++){
146       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
147     }
148 
149     /* U part */
150     nz = bdiag[i] - bdiag[i+1];
151     bjtmp = bj + bdiag[i+1]+1;
152     for  (j=0; j<nz; j++){
153       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
154     }
155 
156     /* load in initial (unfactored row) */
157     nz    = ai[i+1] - ai[i];
158     ajtmp = aj + ai[i];
159     v     = aa + bs2*ai[i];
160     for (j=0; j<nz; j++) {
161       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
162     }
163 
164     /* elimination */
165     bjtmp = bj + bi[i];
166     nzL   = bi[i+1] - bi[i];
167     for(k=0;k < nzL;k++) {
168       row = bjtmp[k];
169       pc = rtmp + bs2*row;
170       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
171       if (flg) {
172         pv = b->a + bs2*bdiag[row];
173         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
174         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
175 
176         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
177 	pv = b->a + bs2*(bdiag[row+1]+1);
178 	nz = bdiag[row]-bdiag[row+1] - 1; /* num of entries in U(row,:) excluding diag */
179         for (j=0; j<nz; j++) {
180           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
181           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
182           v    = rtmp + 4*pj[j];
183           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
184           pv  += 4;
185         }
186         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
187       }
188     }
189 
190     /* finished row so stick it into b->a */
191     /* L part */
192     pv   = b->a + bs2*bi[i] ;
193     pj   = b->j + bi[i] ;
194     nz   = bi[i+1] - bi[i];
195     for (j=0; j<nz; j++) {
196       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
197     }
198 
199     /* Mark diagonal and invert diagonal for simplier triangular solves */
200     pv   = b->a + bs2*bdiag[i];
201     pj   = b->j + bdiag[i];
202     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
203     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
204     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
205 
206     /* U part */
207     /*
208     pv = b->a + bs2*bi[2*n-i];
209     pj = b->j + bi[2*n-i];
210     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
211     */
212     pv = b->a + bs2*(bdiag[i+1]+1);
213     pj = b->j + bdiag[i+1]+1;
214     nz = bdiag[i] - bdiag[i+1] - 1;
215     for (j=0; j<nz; j++){
216       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
217     }
218   }
219   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
220 
221   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
222   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
223   C->assembled = PETSC_TRUE;
224   ierr = PetscLogFlops(1.333333333333*2*2*2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_inplace"
230 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B,Mat A,const MatFactorInfo *info)
231 {
232   Mat            C = B;
233   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
234   IS             isrow = b->row,isicol = b->icol;
235   PetscErrorCode ierr;
236   const PetscInt *r,*ic;
237   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
238   PetscInt       *ajtmpold,*ajtmp,nz,row;
239   PetscInt       *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj;
240   MatScalar      *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4;
241   MatScalar      p1,p2,p3,p4;
242   MatScalar      *ba = b->a,*aa = a->a;
243   PetscReal      shift = info->shiftamount;
244 
245   PetscFunctionBegin;
246   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
247   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
248   ierr  = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
249 
250   for (i=0; i<n; i++) {
251     nz    = bi[i+1] - bi[i];
252     ajtmp = bj + bi[i];
253     for  (j=0; j<nz; j++) {
254       x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
255     }
256     /* load in initial (unfactored row) */
257     idx      = r[i];
258     nz       = ai[idx+1] - ai[idx];
259     ajtmpold = aj + ai[idx];
260     v        = aa + 4*ai[idx];
261     for (j=0; j<nz; j++) {
262       x    = rtmp+4*ic[ajtmpold[j]];
263       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
264       v    += 4;
265     }
266     row = *ajtmp++;
267     while (row < i) {
268       pc = rtmp + 4*row;
269       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
270       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
271         pv = ba + 4*diag_offset[row];
272         pj = bj + diag_offset[row] + 1;
273         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
274         pc[0] = m1 = p1*x1 + p3*x2;
275         pc[1] = m2 = p2*x1 + p4*x2;
276         pc[2] = m3 = p1*x3 + p3*x4;
277         pc[3] = m4 = p2*x3 + p4*x4;
278         nz = bi[row+1] - diag_offset[row] - 1;
279         pv += 4;
280         for (j=0; j<nz; j++) {
281           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
282           x    = rtmp + 4*pj[j];
283           x[0] -= m1*x1 + m3*x2;
284           x[1] -= m2*x1 + m4*x2;
285           x[2] -= m1*x3 + m3*x4;
286           x[3] -= m2*x3 + m4*x4;
287           pv   += 4;
288         }
289         ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr);
290       }
291       row = *ajtmp++;
292     }
293     /* finished row so stick it into b->a */
294     pv = ba + 4*bi[i];
295     pj = bj + bi[i];
296     nz = bi[i+1] - bi[i];
297     for (j=0; j<nz; j++) {
298       x     = rtmp+4*pj[j];
299       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
300       pv   += 4;
301     }
302     /* invert diagonal block */
303     w = ba + 4*diag_offset[i];
304     ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr);
305   }
306 
307   ierr = PetscFree(rtmp);CHKERRQ(ierr);
308   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
309   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
310   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
311   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
312   C->assembled = PETSC_TRUE;
313   ierr = PetscLogFlops(1.333333333333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
314   PetscFunctionReturn(0);
315 }
316 /*
317       Version for when blocks are 2 by 2 Using natural ordering
318 */
319 #undef __FUNCT__
320 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace"
321 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
322 {
323   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
324   PetscErrorCode ierr;
325   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
326   PetscInt       *ajtmpold,*ajtmp,nz,row;
327   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
328   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
329   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4;
330   MatScalar      *ba = b->a,*aa = a->a;
331   PetscReal      shift = info->shiftamount;
332 
333   PetscFunctionBegin;
334   ierr = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
335   for (i=0; i<n; i++) {
336     nz    = bi[i+1] - bi[i];
337     ajtmp = bj + bi[i];
338     for  (j=0; j<nz; j++) {
339       x = rtmp+4*ajtmp[j];
340       x[0]  = x[1]  = x[2]  = x[3]  = 0.0;
341     }
342     /* load in initial (unfactored row) */
343     nz       = ai[i+1] - ai[i];
344     ajtmpold = aj + ai[i];
345     v        = aa + 4*ai[i];
346     for (j=0; j<nz; j++) {
347       x    = rtmp+4*ajtmpold[j];
348       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
349       v    += 4;
350     }
351     row = *ajtmp++;
352     while (row < i) {
353       pc  = rtmp + 4*row;
354       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
355       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
356         pv = ba + 4*diag_offset[row];
357         pj = bj + diag_offset[row] + 1;
358         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
359         pc[0] = m1 = p1*x1 + p3*x2;
360         pc[1] = m2 = p2*x1 + p4*x2;
361         pc[2] = m3 = p1*x3 + p3*x4;
362         pc[3] = m4 = p2*x3 + p4*x4;
363         nz = bi[row+1] - diag_offset[row] - 1;
364         pv += 4;
365         for (j=0; j<nz; j++) {
366           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
367           x    = rtmp + 4*pj[j];
368           x[0] -= m1*x1 + m3*x2;
369           x[1] -= m2*x1 + m4*x2;
370           x[2] -= m1*x3 + m3*x4;
371           x[3] -= m2*x3 + m4*x4;
372           pv   += 4;
373         }
374         ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr);
375       }
376       row = *ajtmp++;
377     }
378     /* finished row so stick it into b->a */
379     pv = ba + 4*bi[i];
380     pj = bj + bi[i];
381     nz = bi[i+1] - bi[i];
382     for (j=0; j<nz; j++) {
383       x      = rtmp+4*pj[j];
384       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
385       /*
386       printf(" col %d:",pj[j]);
387       PetscInt j1;
388       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
389       printf("\n");
390       */
391       pv   += 4;
392     }
393     /* invert diagonal block */
394     w = ba + 4*diag_offset[i];
395     /*
396     printf(" \n%d -th: diag: ",i);
397     for (j=0; j<4; j++){
398       printf(" %g,",w[j]);
399     }
400     printf("\n----------------------------\n");
401     */
402     ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr);
403   }
404 
405   ierr = PetscFree(rtmp);CHKERRQ(ierr);
406   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
407   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
408   C->assembled = PETSC_TRUE;
409   ierr = PetscLogFlops(1.333333333333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
410   PetscFunctionReturn(0);
411 }
412 
413 /* ----------------------------------------------------------- */
414 /*
415      Version for when blocks are 1 by 1.
416 */
417 #undef __FUNCT__
418 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_1"
419 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B,Mat A,const MatFactorInfo *info)
420 {
421   Mat              C=B;
422   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
423   IS               isrow = b->row,isicol = b->icol;
424   PetscErrorCode   ierr;
425   const PetscInt   *r,*ic,*ics;
426   const PetscInt   n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
427   PetscInt         i,j,k,nz,nzL,row,*pj;
428   const PetscInt   *ajtmp,*bjtmp;
429   MatScalar        *rtmp,*pc,multiplier,*pv;
430   const  MatScalar *aa=a->a,*v;
431   PetscTruth       row_identity,col_identity;
432   FactorShiftCtx   sctx;
433   const PetscInt   *ddiag;
434   PetscReal        rs;
435   MatScalar        d;
436 
437   PetscFunctionBegin;
438   /* MatPivotSetUp(): initialize shift context sctx */
439   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
440 
441   if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
442     ddiag          = a->diag;
443     sctx.shift_top = info->zeropivot;
444     for (i=0; i<n; i++) {
445       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
446       d  = (aa)[ddiag[i]];
447       rs = -PetscAbsScalar(d) - PetscRealPart(d);
448       v  = aa+ai[i];
449       nz = ai[i+1] - ai[i];
450       for (j=0; j<nz; j++)
451 	rs += PetscAbsScalar(v[j]);
452       if (rs>sctx.shift_top) sctx.shift_top = rs;
453     }
454     sctx.shift_top   *= 1.1;
455     sctx.nshift_max   = 5;
456     sctx.shift_lo     = 0.;
457     sctx.shift_hi     = 1.;
458   }
459 
460   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
461   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
462   ierr = PetscMalloc((n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
463   ics  = ic;
464 
465   do {
466     sctx.newshift = PETSC_FALSE;
467     for (i=0; i<n; i++){
468       /* zero rtmp */
469       /* L part */
470       nz    = bi[i+1] - bi[i];
471       bjtmp = bj + bi[i];
472       for  (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
473 
474       /* U part */
475       nz = bdiag[i]-bdiag[i+1];
476       bjtmp = bj + bdiag[i+1]+1;
477       for  (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
478 
479       /* load in initial (unfactored row) */
480       nz    = ai[r[i]+1] - ai[r[i]];
481       ajtmp = aj + ai[r[i]];
482       v     = aa + ai[r[i]];
483       for (j=0; j<nz; j++) {
484         rtmp[ics[ajtmp[j]]] = v[j];
485       }
486       /* ZeropivotApply() */
487       rtmp[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
488 
489       /* elimination */
490       bjtmp = bj + bi[i];
491       row   = *bjtmp++;
492       nzL   = bi[i+1] - bi[i];
493       for(k=0; k < nzL;k++) {
494         pc = rtmp + row;
495         if (*pc != 0.0) {
496           pv         = b->a + bdiag[row];
497           multiplier = *pc * (*pv);
498           *pc        = multiplier;
499           pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
500 	  pv = b->a + bdiag[row+1]+1;
501 	  nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
502           for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
503           ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
504         }
505         row = *bjtmp++;
506       }
507 
508       /* finished row so stick it into b->a */
509       rs = 0.0;
510       /* L part */
511       pv   = b->a + bi[i] ;
512       pj   = b->j + bi[i] ;
513       nz   = bi[i+1] - bi[i];
514       for (j=0; j<nz; j++) {
515         pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]);
516       }
517 
518       /* U part */
519       pv = b->a + bdiag[i+1]+1;
520       pj = b->j + bdiag[i+1]+1;
521       nz = bdiag[i] - bdiag[i+1]-1;
522       for (j=0; j<nz; j++) {
523         pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]);
524       }
525 
526       sctx.rs  = rs;
527       sctx.pv  = rtmp[i];
528       ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
529       if(sctx.newshift) break; /* break for-loop */
530       rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
531 
532       /* Mark diagonal and invert diagonal for simplier triangular solves */
533       pv  = b->a + bdiag[i];
534       *pv = 1.0/rtmp[i];
535 
536     } /* endof for (i=0; i<n; i++){ */
537 
538     /* MatPivotRefine() */
539     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
540       /*
541        * if no shift in this attempt & shifting & started shifting & can refine,
542        * then try lower shift
543        */
544       sctx.shift_hi       = sctx.shift_fraction;
545       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
546       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
547       sctx.newshift       = PETSC_TRUE;
548       sctx.nshift++;
549     }
550   } while (sctx.newshift);
551 
552   ierr = PetscFree(rtmp);CHKERRQ(ierr);
553   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
554   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
555 
556   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
557   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
558   if (row_identity && col_identity) {
559     C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
560     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
561   } else {
562     C->ops->solve = MatSolve_SeqBAIJ_1;
563     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
564   }
565   C->assembled     = PETSC_TRUE;
566   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
567 
568   /* MatShiftView(A,info,&sctx) */
569   if (sctx.nshift){
570     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
571       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
572     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
573       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
574     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
575       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
576     }
577   }
578   PetscFunctionReturn(0);
579 }
580 
581 #undef __FUNCT__
582 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_1_inplace"
583 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo *info)
584 {
585   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
586   IS             isrow = b->row,isicol = b->icol;
587   PetscErrorCode ierr;
588   const PetscInt *r,*ic;
589   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
590   PetscInt       *ajtmpold,*ajtmp,nz,row,*ai = a->i,*aj = a->j;
591   PetscInt       *diag_offset = b->diag,diag,*pj;
592   MatScalar      *pv,*v,*rtmp,multiplier,*pc;
593   MatScalar      *ba = b->a,*aa = a->a;
594   PetscTruth     row_identity, col_identity;
595 
596   PetscFunctionBegin;
597   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
598   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
599   ierr  = PetscMalloc((n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
600 
601   for (i=0; i<n; i++) {
602     nz    = bi[i+1] - bi[i];
603     ajtmp = bj + bi[i];
604     for  (j=0; j<nz; j++) rtmp[ajtmp[j]] = 0.0;
605 
606     /* load in initial (unfactored row) */
607     nz       = ai[r[i]+1] - ai[r[i]];
608     ajtmpold = aj + ai[r[i]];
609     v        = aa + ai[r[i]];
610     for (j=0; j<nz; j++) rtmp[ic[ajtmpold[j]]] =  v[j];
611 
612     row = *ajtmp++;
613     while (row < i) {
614       pc = rtmp + row;
615       if (*pc != 0.0) {
616         pv         = ba + diag_offset[row];
617         pj         = bj + diag_offset[row] + 1;
618         multiplier = *pc * *pv++;
619         *pc        = multiplier;
620         nz         = bi[row+1] - diag_offset[row] - 1;
621         for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
622         ierr = PetscLogFlops(1.0+2.0*nz);CHKERRQ(ierr);
623       }
624       row = *ajtmp++;
625     }
626     /* finished row so stick it into b->a */
627     pv = ba + bi[i];
628     pj = bj + bi[i];
629     nz = bi[i+1] - bi[i];
630     for (j=0; j<nz; j++) {pv[j] = rtmp[pj[j]];}
631     diag = diag_offset[i] - bi[i];
632     /* check pivot entry for current row */
633     if (pv[diag] == 0.0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot: row in original ordering %D in permuted ordering %D",r[i],i);
634     pv[diag] = 1.0/pv[diag];
635   }
636 
637   ierr = PetscFree(rtmp);CHKERRQ(ierr);
638   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
639   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
640   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
641   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
642   if (row_identity && col_identity) {
643     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
644     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
645   } else {
646     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
647     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
648   }
649   C->assembled = PETSC_TRUE;
650   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
651   PetscFunctionReturn(0);
652 }
653 
654 EXTERN_C_BEGIN
655 #undef __FUNCT__
656 #define __FUNCT__ "MatGetFactor_seqbaij_petsc"
657 PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A,MatFactorType ftype,Mat *B)
658 {
659   PetscInt           n = A->rmap->n;
660   PetscErrorCode     ierr;
661 
662   PetscFunctionBegin;
663   ierr = MatCreate(((PetscObject)A)->comm,B);CHKERRQ(ierr);
664   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
665   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
666     ierr = MatSetType(*B,MATSEQBAIJ);CHKERRQ(ierr);
667     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
668     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
669   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
670     ierr = MatSetType(*B,MATSEQSBAIJ);CHKERRQ(ierr);
671     ierr = MatSeqSBAIJSetPreallocation(*B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
672     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
673     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
674   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported");
675   (*B)->factortype = ftype;
676   PetscFunctionReturn(0);
677 }
678 EXTERN_C_END
679 
680 EXTERN_C_BEGIN
681 #undef __FUNCT__
682 #define __FUNCT__ "MatGetFactorAvailable_seqaij_petsc"
683 PetscErrorCode MatGetFactorAvailable_seqbaij_petsc(Mat A,MatFactorType ftype,PetscTruth *flg)
684 {
685   PetscFunctionBegin;
686   *flg = PETSC_TRUE;
687   PetscFunctionReturn(0);
688 }
689 EXTERN_C_END
690 
691 /* ----------------------------------------------------------- */
692 #undef __FUNCT__
693 #define __FUNCT__ "MatLUFactor_SeqBAIJ"
694 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,const MatFactorInfo *info)
695 {
696   PetscErrorCode ierr;
697   Mat            C;
698 
699   PetscFunctionBegin;
700   ierr = MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&C);CHKERRQ(ierr);
701   ierr = MatLUFactorSymbolic(C,A,row,col,info);CHKERRQ(ierr);
702   ierr = MatLUFactorNumeric(C,A,info);CHKERRQ(ierr);
703   A->ops->solve            = C->ops->solve;
704   A->ops->solvetranspose   = C->ops->solvetranspose;
705   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
706   ierr = PetscLogObjectParent(A,((Mat_SeqBAIJ*)(A->data))->icol);CHKERRQ(ierr);
707   PetscFunctionReturn(0);
708 }
709 
710 #include "../src/mat/impls/sbaij/seq/sbaij.h"
711 #undef __FUNCT__
712 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N"
713 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
714 {
715   PetscErrorCode ierr;
716   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data;
717   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)C->data;
718   IS             ip=b->row;
719   const PetscInt *rip;
720   PetscInt       i,j,mbs=a->mbs,bs=A->rmap->bs,*bi=b->i,*bj=b->j,*bcol;
721   PetscInt       *ai=a->i,*aj=a->j;
722   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
723   MatScalar      *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
724   PetscReal      rs;
725   FactorShiftCtx sctx;
726 
727   PetscFunctionBegin;
728   if (bs > 1) {/* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
729     if (!a->sbaijMat){
730       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
731     }
732     ierr = (a->sbaijMat)->ops->choleskyfactornumeric(C,a->sbaijMat,info);CHKERRQ(ierr);
733     ierr = MatDestroy(a->sbaijMat);CHKERRQ(ierr);
734     a->sbaijMat = PETSC_NULL;
735     PetscFunctionReturn(0);
736   }
737 
738   /* MatPivotSetUp(): initialize shift context sctx */
739   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
740 
741   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
742   ierr = PetscMalloc3(mbs,MatScalar,&rtmp,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
743 
744   sctx.shift_amount = 0.;
745   sctx.nshift       = 0;
746   do {
747     sctx.newshift = PETSC_FALSE;
748     for (i=0; i<mbs; i++) {
749       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
750     }
751 
752     for (k = 0; k<mbs; k++){
753       bval = ba + bi[k];
754       /* initialize k-th row by the perm[k]-th row of A */
755       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
756       for (j = jmin; j < jmax; j++){
757         col = rip[aj[j]];
758         if (col >= k){ /* only take upper triangular entry */
759           rtmp[col] = aa[j];
760           *bval++  = 0.0; /* for in-place factorization */
761         }
762       }
763 
764       /* shift the diagonal of the matrix */
765       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
766 
767       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
768       dk = rtmp[k];
769       i = jl[k]; /* first row to be added to k_th row  */
770 
771       while (i < k){
772         nexti = jl[i]; /* next row to be added to k_th row */
773 
774         /* compute multiplier, update diag(k) and U(i,k) */
775         ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
776         uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */
777         dk += uikdi*ba[ili];
778         ba[ili] = uikdi; /* -U(i,k) */
779 
780         /* add multiple of row i to k-th row */
781         jmin = ili + 1; jmax = bi[i+1];
782         if (jmin < jmax){
783           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
784           /* update il and jl for row i */
785           il[i] = jmin;
786           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
787         }
788         i = nexti;
789       }
790 
791       /* shift the diagonals when zero pivot is detected */
792       /* compute rs=sum of abs(off-diagonal) */
793       rs   = 0.0;
794       jmin = bi[k]+1;
795       nz   = bi[k+1] - jmin;
796       if (nz){
797         bcol = bj + jmin;
798         while (nz--){
799           rs += PetscAbsScalar(rtmp[*bcol]);
800           bcol++;
801         }
802       }
803 
804       sctx.rs = rs;
805       sctx.pv = dk;
806       ierr = MatPivotCheck(info,&sctx,k);CHKERRQ(ierr);
807       if (sctx.newshift) break;
808       dk = sctx.pv;
809 
810       /* copy data into U(k,:) */
811       ba[bi[k]] = 1.0/dk; /* U(k,k) */
812       jmin = bi[k]+1; jmax = bi[k+1];
813       if (jmin < jmax) {
814         for (j=jmin; j<jmax; j++){
815           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
816         }
817         /* add the k-th row into il and jl */
818         il[k] = jmin;
819         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
820       }
821     }
822   } while (sctx.newshift);
823   ierr = PetscFree3(rtmp,il,jl);CHKERRQ(ierr);
824 
825   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
826   C->assembled    = PETSC_TRUE;
827   C->preallocated = PETSC_TRUE;
828   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
829   if (sctx.nshift){
830     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
831       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
832     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
833       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
834     }
835   }
836   PetscFunctionReturn(0);
837 }
838 
839 #undef __FUNCT__
840 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering"
841 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
842 {
843   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data;
844   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)C->data;
845   PetscErrorCode ierr;
846   PetscInt       i,j,am=a->mbs;
847   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
848   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
849   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
850   PetscReal      rs;
851   FactorShiftCtx sctx;
852 
853   PetscFunctionBegin;
854   /* MatPivotSetUp(): initialize shift context sctx */
855   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
856 
857   ierr = PetscMalloc3(am,MatScalar,&rtmp,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr);
858 
859   do {
860     sctx.newshift = PETSC_FALSE;
861     for (i=0; i<am; i++) {
862       rtmp[i] = 0.0; jl[i] = am; il[0] = 0;
863     }
864 
865     for (k = 0; k<am; k++){
866     /* initialize k-th row with elements nonzero in row perm(k) of A */
867       nz   = ai[k+1] - ai[k];
868       acol = aj + ai[k];
869       aval = aa + ai[k];
870       bval = ba + bi[k];
871       while (nz -- ){
872         if (*acol < k) { /* skip lower triangular entries */
873           acol++; aval++;
874         } else {
875           rtmp[*acol++] = *aval++;
876           *bval++       = 0.0; /* for in-place factorization */
877         }
878       }
879 
880       /* shift the diagonal of the matrix */
881       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
882 
883       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
884       dk = rtmp[k];
885       i  = jl[k]; /* first row to be added to k_th row  */
886 
887       while (i < k){
888         nexti = jl[i]; /* next row to be added to k_th row */
889         /* compute multiplier, update D(k) and U(i,k) */
890         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
891         uikdi = - ba[ili]*ba[bi[i]];
892         dk   += uikdi*ba[ili];
893         ba[ili] = uikdi; /* -U(i,k) */
894 
895         /* add multiple of row i to k-th row ... */
896         jmin = ili + 1;
897         nz   = bi[i+1] - jmin;
898         if (nz > 0){
899           bcol = bj + jmin;
900           bval = ba + jmin;
901           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
902           /* update il and jl for i-th row */
903           il[i] = jmin;
904           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
905         }
906         i = nexti;
907       }
908 
909       /* shift the diagonals when zero pivot is detected */
910       /* compute rs=sum of abs(off-diagonal) */
911       rs   = 0.0;
912       jmin = bi[k]+1;
913       nz   = bi[k+1] - jmin;
914       if (nz){
915         bcol = bj + jmin;
916         while (nz--){
917           rs += PetscAbsScalar(rtmp[*bcol]);
918           bcol++;
919         }
920       }
921 
922       sctx.rs = rs;
923       sctx.pv = dk;
924       ierr = MatPivotCheck(info,&sctx,k);CHKERRQ(ierr);
925       if (sctx.newshift) break;    /* sctx.shift_amount is updated */
926       dk = sctx.pv;
927 
928       /* copy data into U(k,:) */
929       ba[bi[k]] = 1.0/dk;
930       jmin      = bi[k]+1;
931       nz        = bi[k+1] - jmin;
932       if (nz){
933         bcol = bj + jmin;
934         bval = ba + jmin;
935         while (nz--){
936           *bval++       = rtmp[*bcol];
937           rtmp[*bcol++] = 0.0;
938         }
939         /* add k-th row into il and jl */
940         il[k] = jmin;
941         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
942       }
943     }
944   } while (sctx.newshift);
945   ierr = PetscFree3(rtmp,il,jl);CHKERRQ(ierr);
946 
947   C->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
948   C->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
949   C->assembled    = PETSC_TRUE;
950   C->preallocated = PETSC_TRUE;
951   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
952     if (sctx.nshift){
953       if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
954       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
955       } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
956       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
957     }
958   }
959   PetscFunctionReturn(0);
960 }
961 
962 #include "petscbt.h"
963 #include "../src/mat/utils/freespace.h"
964 #undef __FUNCT__
965 #define __FUNCT__ "MatICCFactorSymbolic_SeqBAIJ"
966 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
967 {
968   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
969   Mat_SeqSBAIJ       *b;
970   Mat                B;
971   PetscErrorCode     ierr;
972   PetscTruth         perm_identity;
973   PetscInt           reallocs=0,i,*ai=a->i,*aj=a->j,am=a->mbs,bs=A->rmap->bs,*ui;
974   const PetscInt     *rip;
975   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
976   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,ncols_upper,*cols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
977   PetscReal          fill=info->fill,levels=info->levels;
978   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
979   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
980   PetscBT            lnkbt;
981 
982   PetscFunctionBegin;
983   if (bs > 1){
984     if (!a->sbaijMat){
985       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
986     }
987     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ;  /* undue the change made in MatGetFactor_seqbaij_petsc */
988     ierr = MatICCFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr);
989     PetscFunctionReturn(0);
990   }
991 
992   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
993   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
994 
995   /* special case that simply copies fill pattern */
996   if (!levels && perm_identity) {
997     ierr = MatMarkDiagonal_SeqBAIJ(A);CHKERRQ(ierr);
998     ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
999     for (i=0; i<am; i++) {
1000       ui[i] = ai[i+1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1001     }
1002     B = fact;
1003     ierr = MatSeqSBAIJSetPreallocation(B,1,0,ui);CHKERRQ(ierr);
1004 
1005 
1006     b  = (Mat_SeqSBAIJ*)B->data;
1007     uj = b->j;
1008     for (i=0; i<am; i++) {
1009       aj = a->j + a->diag[i];
1010       for (j=0; j<ui[i]; j++){
1011         *uj++ = *aj++;
1012       }
1013       b->ilen[i] = ui[i];
1014     }
1015     ierr = PetscFree(ui);CHKERRQ(ierr);
1016     B->factortype = MAT_FACTOR_NONE;
1017     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1018     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1019     B->factortype = MAT_FACTOR_ICC;
1020 
1021     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1022     PetscFunctionReturn(0);
1023   }
1024 
1025   /* initialization */
1026   ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
1027   ui[0] = 0;
1028   ierr  = PetscMalloc((2*am+1)*sizeof(PetscInt),&cols_lvl);CHKERRQ(ierr);
1029 
1030   /* jl: linked list for storing indices of the pivot rows
1031      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1032   ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr);
1033   for (i=0; i<am; i++){
1034     jl[i] = am; il[i] = 0;
1035   }
1036 
1037   /* create and initialize a linked list for storing column indices of the active row k */
1038   nlnk = am + 1;
1039   ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
1040 
1041   /* initial FreeSpace size is fill*(ai[am]+1) */
1042   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
1043   current_space = free_space;
1044   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
1045   current_space_lvl = free_space_lvl;
1046 
1047   for (k=0; k<am; k++){  /* for each active row k */
1048     /* initialize lnk by the column indices of row rip[k] of A */
1049     nzk   = 0;
1050     ncols = ai[rip[k]+1] - ai[rip[k]];
1051     ncols_upper = 0;
1052     cols        = cols_lvl + am;
1053     for (j=0; j<ncols; j++){
1054       i = rip[*(aj + ai[rip[k]] + j)];
1055       if (i >= k){ /* only take upper triangular entry */
1056         cols[ncols_upper] = i;
1057         cols_lvl[ncols_upper] = -1;  /* initialize level for nonzero entries */
1058         ncols_upper++;
1059       }
1060     }
1061     ierr = PetscIncompleteLLAdd(ncols_upper,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
1062     nzk += nlnk;
1063 
1064     /* update lnk by computing fill-in for each pivot row to be merged in */
1065     prow = jl[k]; /* 1st pivot row */
1066 
1067     while (prow < k){
1068       nextprow = jl[prow];
1069 
1070       /* merge prow into k-th row */
1071       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
1072       jmax = ui[prow+1];
1073       ncols = jmax-jmin;
1074       i     = jmin - ui[prow];
1075       cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1076       for (j=0; j<ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1077       ierr = PetscIncompleteLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
1078       nzk += nlnk;
1079 
1080       /* update il and jl for prow */
1081       if (jmin < jmax){
1082         il[prow] = jmin;
1083         j = *cols; jl[prow] = jl[j]; jl[j] = prow;
1084       }
1085       prow = nextprow;
1086     }
1087 
1088     /* if free space is not available, make more free space */
1089     if (current_space->local_remaining<nzk) {
1090       i = am - k + 1; /* num of unfactored rows */
1091       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1092       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
1093       ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
1094       reallocs++;
1095     }
1096 
1097     /* copy data into free_space and free_space_lvl, then initialize lnk */
1098     ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
1099 
1100     /* add the k-th row into il and jl */
1101     if (nzk-1 > 0){
1102       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1103       jl[k] = jl[i]; jl[i] = k;
1104       il[k] = ui[k] + 1;
1105     }
1106     uj_ptr[k]     = current_space->array;
1107     uj_lvl_ptr[k] = current_space_lvl->array;
1108 
1109     current_space->array           += nzk;
1110     current_space->local_used      += nzk;
1111     current_space->local_remaining -= nzk;
1112 
1113     current_space_lvl->array           += nzk;
1114     current_space_lvl->local_used      += nzk;
1115     current_space_lvl->local_remaining -= nzk;
1116 
1117     ui[k+1] = ui[k] + nzk;
1118   }
1119 
1120 #if defined(PETSC_USE_INFO)
1121   if (ai[am] != 0) {
1122     PetscReal af = ((PetscReal)(2*ui[am]-am))/((PetscReal)ai[am]);
1123     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
1124     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
1125     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
1126   } else {
1127     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
1128   }
1129 #endif
1130 
1131   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
1132   ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
1133   ierr = PetscFree(cols_lvl);CHKERRQ(ierr);
1134 
1135   /* destroy list of free space and other temporary array(s) */
1136   ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
1137   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
1138   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1139   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
1140 
1141   /* put together the new matrix in MATSEQSBAIJ format */
1142   B = fact;
1143   ierr = MatSeqSBAIJSetPreallocation(B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1144 
1145   b = (Mat_SeqSBAIJ*)B->data;
1146   b->singlemalloc = PETSC_FALSE;
1147   b->free_a       = PETSC_TRUE;
1148   b->free_ij       = PETSC_TRUE;
1149   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
1150   b->j    = uj;
1151   b->i    = ui;
1152   b->diag = 0;
1153   b->ilen = 0;
1154   b->imax = 0;
1155   b->row  = perm;
1156   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1157   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1158   b->icol = perm;
1159   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1160   ierr    = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1161   ierr    = PetscLogObjectMemory(B,(ui[am]-am)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1162   b->maxnz = b->nz = ui[am];
1163 
1164   B->info.factor_mallocs    = reallocs;
1165   B->info.fill_ratio_given  = fill;
1166   if (ai[am] != 0.) {
1167     B->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
1168   } else {
1169     B->info.fill_ratio_needed = 0.0;
1170   }
1171   if (perm_identity){
1172     B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1173     B->ops->solvetranspose  = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1174     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1175   } else {
1176     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1177   }
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqBAIJ"
1183 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
1184 {
1185   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1186   Mat_SeqSBAIJ       *b;
1187   Mat                B;
1188   PetscErrorCode     ierr;
1189   PetscTruth         perm_identity;
1190   PetscReal          fill = info->fill;
1191   const PetscInt     *rip;
1192   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,*ai=a->i,*aj=a->j,reallocs=0,prow;
1193   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
1194   PetscInt           nlnk,*lnk,ncols,ncols_upper,*cols,*uj,**ui_ptr,*uj_ptr;
1195   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
1196   PetscBT            lnkbt;
1197 
1198   PetscFunctionBegin;
1199   if (bs > 1) { /* convert to seqsbaij */
1200     if (!a->sbaijMat){
1201       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
1202     }
1203     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1204     ierr = MatCholeskyFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr);
1205     PetscFunctionReturn(0);
1206   }
1207 
1208   /* check whether perm is the identity mapping */
1209   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
1210   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported");
1211   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
1212 
1213   /* initialization */
1214   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
1215   ui[0] = 0;
1216 
1217   /* jl: linked list for storing indices of the pivot rows
1218      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1219   ierr = PetscMalloc4(mbs,PetscInt*,&ui_ptr,mbs,PetscInt,&il,mbs,PetscInt,&jl,mbs,PetscInt,&cols);CHKERRQ(ierr);
1220   for (i=0; i<mbs; i++){
1221     jl[i] = mbs; il[i] = 0;
1222   }
1223 
1224   /* create and initialize a linked list for storing column indices of the active row k */
1225   nlnk = mbs + 1;
1226   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1227 
1228   /* initial FreeSpace size is fill*(ai[mbs]+1) */
1229   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
1230   current_space = free_space;
1231 
1232   for (k=0; k<mbs; k++){  /* for each active row k */
1233     /* initialize lnk by the column indices of row rip[k] of A */
1234     nzk   = 0;
1235     ncols = ai[rip[k]+1] - ai[rip[k]];
1236     ncols_upper = 0;
1237     for (j=0; j<ncols; j++){
1238       i = rip[*(aj + ai[rip[k]] + j)];
1239       if (i >= k){ /* only take upper triangular entry */
1240         cols[ncols_upper] = i;
1241         ncols_upper++;
1242       }
1243     }
1244     ierr = PetscLLAdd(ncols_upper,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1245     nzk += nlnk;
1246 
1247     /* update lnk by computing fill-in for each pivot row to be merged in */
1248     prow = jl[k]; /* 1st pivot row */
1249 
1250     while (prow < k){
1251       nextprow = jl[prow];
1252       /* merge prow into k-th row */
1253       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1254       jmax = ui[prow+1];
1255       ncols = jmax-jmin;
1256       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1257       ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1258       nzk += nlnk;
1259 
1260       /* update il and jl for prow */
1261       if (jmin < jmax){
1262         il[prow] = jmin;
1263         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
1264       }
1265       prow = nextprow;
1266     }
1267 
1268     /* if free space is not available, make more free space */
1269     if (current_space->local_remaining<nzk) {
1270       i = mbs - k + 1; /* num of unfactored rows */
1271       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1272       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
1273       reallocs++;
1274     }
1275 
1276     /* copy data into free space, then initialize lnk */
1277     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1278 
1279     /* add the k-th row into il and jl */
1280     if (nzk-1 > 0){
1281       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1282       jl[k] = jl[i]; jl[i] = k;
1283       il[k] = ui[k] + 1;
1284     }
1285     ui_ptr[k] = current_space->array;
1286     current_space->array           += nzk;
1287     current_space->local_used      += nzk;
1288     current_space->local_remaining -= nzk;
1289 
1290     ui[k+1] = ui[k] + nzk;
1291   }
1292 
1293 #if defined(PETSC_USE_INFO)
1294   if (ai[mbs] != 0.) {
1295     PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
1296     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
1297     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
1298     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
1299   } else {
1300     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
1301   }
1302 #endif
1303 
1304   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
1305   ierr = PetscFree4(ui_ptr,il,jl,cols);CHKERRQ(ierr);
1306 
1307   /* destroy list of free space and other temporary array(s) */
1308   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
1309   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
1310   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1311 
1312   /* put together the new matrix in MATSEQSBAIJ format */
1313   B    = fact;
1314   ierr = MatSeqSBAIJSetPreallocation(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1315 
1316   b = (Mat_SeqSBAIJ*)B->data;
1317   b->singlemalloc = PETSC_FALSE;
1318   b->free_a       = PETSC_TRUE;
1319   b->free_ij      = PETSC_TRUE;
1320   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
1321   b->j    = uj;
1322   b->i    = ui;
1323   b->diag = 0;
1324   b->ilen = 0;
1325   b->imax = 0;
1326   b->row  = perm;
1327   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1328   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1329   b->icol = perm;
1330   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1331   ierr    = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1332   ierr    = PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1333   b->maxnz = b->nz = ui[mbs];
1334 
1335   B->info.factor_mallocs    = reallocs;
1336   B->info.fill_ratio_given  = fill;
1337   if (ai[mbs] != 0.) {
1338     B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
1339   } else {
1340     B->info.fill_ratio_needed = 0.0;
1341   }
1342   if (perm_identity){
1343     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1344   } else {
1345     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1346   }
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 #undef __FUNCT__
1351 #define __FUNCT__ "MatSolve_SeqBAIJ_N_NaturalOrdering"
1352 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
1353 {
1354   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1355   PetscErrorCode ierr;
1356   const PetscInt *ai=a->i,*aj=a->j,*adiag=a->diag,*vi;
1357   PetscInt       i,k,n=a->mbs;
1358   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1359   MatScalar      *aa=a->a,*v;
1360   PetscScalar    *x,*b,*s,*t,*ls;
1361 
1362   PetscFunctionBegin;
1363   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1364   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1365   t  = a->solve_work;
1366 
1367   /* forward solve the lower triangular */
1368   ierr = PetscMemcpy(t,b,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy 1st block of b to t */
1369 
1370   for (i=1; i<n; i++) {
1371     v   = aa + bs2*ai[i];
1372     vi  = aj + ai[i];
1373     nz = ai[i+1] - ai[i];
1374     s = t + bs*i;
1375     ierr = PetscMemcpy(s,b+bs*i,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy i_th block of b to t */
1376     for(k=0;k<nz;k++){
1377       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[k]);
1378       v += bs2;
1379     }
1380   }
1381 
1382   /* backward solve the upper triangular */
1383   ls = a->solve_work + A->cmap->n;
1384   for (i=n-1; i>=0; i--){
1385     v  = aa + bs2*(adiag[i+1]+1);
1386     vi = aj + adiag[i+1]+1;
1387     nz = adiag[i] - adiag[i+1]-1;
1388     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1389     for(k=0;k<nz;k++){
1390       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[k]);
1391       v += bs2;
1392     }
1393     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*adiag[i],t+i*bs); /* *inv(diagonal[i]) */
1394     ierr = PetscMemcpy(x+i*bs,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1395   }
1396 
1397   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1398   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1399   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 #undef __FUNCT__
1404 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1405 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1406 {
1407   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1408   IS             iscol=a->col,isrow=a->row;
1409   PetscErrorCode ierr;
1410   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi;
1411   PetscInt       i,m,n=a->mbs;
1412   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1413   MatScalar      *aa=a->a,*v;
1414   PetscScalar    *x,*b,*s,*t,*ls;
1415 
1416   PetscFunctionBegin;
1417   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1418   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1419   t  = a->solve_work;
1420 
1421   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1422   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1423 
1424   /* forward solve the lower triangular */
1425   ierr = PetscMemcpy(t,b+bs*r[0],bs*sizeof(PetscScalar));CHKERRQ(ierr);
1426   for (i=1; i<n; i++) {
1427     v   = aa + bs2*ai[i];
1428     vi  = aj + ai[i];
1429     nz = ai[i+1] - ai[i];
1430     s = t + bs*i;
1431     ierr = PetscMemcpy(s,b+bs*r[i],bs*sizeof(PetscScalar));CHKERRQ(ierr);
1432     for(m=0;m<nz;m++){
1433       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[m]);
1434       v += bs2;
1435     }
1436   }
1437 
1438   /* backward solve the upper triangular */
1439   ls = a->solve_work + A->cmap->n;
1440   for (i=n-1; i>=0; i--){
1441     v  = aa + bs2*(adiag[i+1]+1);
1442     vi = aj + adiag[i+1]+1;
1443     nz = adiag[i] - adiag[i+1] - 1;
1444     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1445     for(m=0;m<nz;m++){
1446       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[m]);
1447       v += bs2;
1448     }
1449     Kernel_w_gets_A_times_v(bs,ls,v,t+i*bs); /* *inv(diagonal[i]) */
1450     ierr = PetscMemcpy(x + bs*c[i],t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1451   }
1452   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1453   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1454   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1455   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1456   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1457   PetscFunctionReturn(0);
1458 }
1459 
1460 #undef __FUNCT__
1461 #define __FUNCT__ "BlockAbs_privat"
1462 PetscErrorCode BlockAbs_private(PetscInt nbs,PetscInt bs2,PetscScalar *blockarray,PetscReal *absarray)
1463 {
1464   PetscErrorCode     ierr;
1465   PetscInt           i,j;
1466   PetscFunctionBegin;
1467   ierr = PetscMemzero(absarray,(nbs+1)*sizeof(PetscReal));CHKERRQ(ierr);
1468   for (i=0; i<nbs; i++){
1469     for (j=0; j<bs2; j++){
1470       if (absarray[i] < PetscAbsScalar(blockarray[i*nbs+j])) absarray[i] = PetscAbsScalar(blockarray[i*nbs+j]);
1471     }
1472   }
1473   PetscFunctionReturn(0);
1474 }
1475 
1476 #undef __FUNCT__
1477 #define __FUNCT__ "MatILUDTFactor_SeqBAIJ"
1478 /*
1479      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1480 */
1481 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A,IS isrow,IS iscol,const MatFactorInfo *info,Mat *fact)
1482 {
1483   Mat                B = *fact;
1484   Mat_SeqBAIJ        *a=(Mat_SeqBAIJ*)A->data,*b;
1485   IS                 isicol;
1486   PetscErrorCode     ierr;
1487   const PetscInt     *r,*ic;
1488   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,bs2=a->bs2,*ai=a->i,*aj=a->j,*ajtmp,*adiag;
1489   PetscInt           *bi,*bj,*bdiag;
1490 
1491   PetscInt           row,nzi,nzi_bl,nzi_bu,*im,dtcount,nzi_al,nzi_au;
1492   PetscInt           nlnk,*lnk;
1493   PetscBT            lnkbt;
1494   PetscTruth         row_identity,icol_identity;
1495   MatScalar          *aatmp,*pv,*batmp,*ba,*rtmp,*pc,*multiplier,*vtmp;
1496   PetscInt           j,nz,*pj,*bjtmp,k,ncut,*jtmp;
1497 
1498   PetscReal          dt=info->dt; /* shift=info->shiftamount; */
1499   PetscInt           nnz_max;
1500   PetscTruth         missing;
1501   PetscReal          *vtmp_abs;
1502   MatScalar          *v_work;
1503   PetscInt           *v_pivots;
1504 
1505   PetscFunctionBegin;
1506   /* ------- symbolic factorization, can be reused ---------*/
1507   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
1508   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
1509   adiag=a->diag;
1510 
1511   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
1512 
1513   /* bdiag is location of diagonal in factor */
1514   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
1515 
1516   /* allocate row pointers bi */
1517   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1518 
1519   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1520   dtcount = (PetscInt)info->dtcount;
1521   if (dtcount > mbs-1) dtcount = mbs-1;
1522   nnz_max  = ai[mbs]+2*mbs*dtcount +2;
1523   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1524   ierr = PetscMalloc(nnz_max*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1525   nnz_max = nnz_max*bs2;
1526   ierr = PetscMalloc(nnz_max*sizeof(MatScalar),&ba);CHKERRQ(ierr);
1527 
1528   /* put together the new matrix */
1529   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1530   ierr = PetscLogObjectParent(B,isicol);CHKERRQ(ierr);
1531   b    = (Mat_SeqBAIJ*)(B)->data;
1532   b->free_a       = PETSC_TRUE;
1533   b->free_ij      = PETSC_TRUE;
1534   b->singlemalloc = PETSC_FALSE;
1535   b->a          = ba;
1536   b->j          = bj;
1537   b->i          = bi;
1538   b->diag       = bdiag;
1539   b->ilen       = 0;
1540   b->imax       = 0;
1541   b->row        = isrow;
1542   b->col        = iscol;
1543   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
1544   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
1545   b->icol       = isicol;
1546   ierr = PetscMalloc((bs*(mbs+1))*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1547 
1548   ierr = PetscLogObjectMemory(B,nnz_max*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1549   b->maxnz = nnz_max/bs2;
1550 
1551   (B)->factortype            = MAT_FACTOR_ILUDT;
1552   (B)->info.factor_mallocs   = 0;
1553   (B)->info.fill_ratio_given = ((PetscReal)nnz_max)/((PetscReal)(ai[mbs]*bs2));
1554   CHKMEMQ;
1555   /* ------- end of symbolic factorization ---------*/
1556   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1557   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1558 
1559   /* linked list for storing column indices of the active row */
1560   nlnk = mbs + 1;
1561   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1562 
1563   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1564   ierr = PetscMalloc2(mbs,PetscInt,&im,mbs,PetscInt,&jtmp);CHKERRQ(ierr);
1565   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1566   ierr = PetscMalloc2(mbs*bs2,MatScalar,&rtmp,mbs*bs2,MatScalar,&vtmp);CHKERRQ(ierr);
1567   ierr = PetscMalloc((mbs+1)*sizeof(PetscReal),&vtmp_abs);CHKERRQ(ierr);
1568   ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&multiplier,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
1569 
1570   bi[0]    = 0;
1571   bdiag[0] = (nnz_max/bs2)-1; /* location of diagonal in factor B */
1572   bi[2*mbs+1] = bdiag[0]+1; /* endof bj and ba array */
1573   for (i=0; i<mbs; i++) {
1574     /* copy initial fill into linked list */
1575     nzi = 0; /* nonzeros for active row i */
1576     nzi = ai[r[i]+1] - ai[r[i]];
1577     if (!nzi) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
1578     nzi_al = adiag[r[i]] - ai[r[i]];
1579     nzi_au = ai[r[i]+1] - adiag[r[i]] -1;
1580     /* printf("row %d, nzi_al/au %d %d\n",i,nzi_al,nzi_au); */
1581 
1582     /* load in initial unfactored row */
1583     ajtmp = aj + ai[r[i]];
1584     ierr = PetscLLAddPerm(nzi,ajtmp,ic,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1585     ierr = PetscMemzero(rtmp,mbs*bs2*sizeof(PetscScalar));CHKERRQ(ierr);
1586     aatmp = a->a + bs2*ai[r[i]];
1587     for (j=0; j<nzi; j++) {
1588       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],aatmp+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1589     }
1590 
1591     /* add pivot rows into linked list */
1592     row = lnk[mbs];
1593     while (row < i) {
1594       nzi_bl = bi[row+1] - bi[row] + 1;
1595       bjtmp = bj + bdiag[row+1]+1; /* points to 1st column next to the diagonal in U */
1596       ierr  = PetscLLAddSortedLU(bjtmp,row,nlnk,lnk,lnkbt,i,nzi_bl,im);CHKERRQ(ierr);
1597       nzi  += nlnk;
1598       row   = lnk[row];
1599     }
1600 
1601     /* copy data from lnk into jtmp, then initialize lnk */
1602     ierr = PetscLLClean(mbs,mbs,nzi,lnk,jtmp,lnkbt);CHKERRQ(ierr);
1603 
1604     /* numerical factorization */
1605     bjtmp = jtmp;
1606     row   = *bjtmp++; /* 1st pivot row */
1607 
1608     while  (row < i) {
1609       pc = rtmp + bs2*row;
1610       pv = ba + bs2*bdiag[row]; /* inv(diag) of the pivot row */
1611       Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1612       ierr = BlockAbs_private(1,bs2,pc,vtmp_abs);CHKERRQ(ierr);
1613       if (vtmp_abs[0] > dt){ /* apply tolerance dropping rule */
1614         pj         = bj + bdiag[row+1] + 1; /* point to 1st entry of U(row,:) */
1615         pv         = ba + bs2*(bdiag[row+1] + 1);
1616         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diagonal */
1617         for (j=0; j<nz; j++){
1618           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
1619         }
1620         /* ierr = PetscLogFlops(bslog*(nz+1.0)-bs);CHKERRQ(ierr); */
1621       }
1622       row = *bjtmp++;
1623     }
1624 
1625     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1626     nzi_bl = 0; j = 0;
1627     while (jtmp[j] < i){ /* L-part. Note: jtmp is sorted */
1628       ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1629       nzi_bl++; j++;
1630     }
1631     nzi_bu = nzi - nzi_bl -1;
1632     /* printf("nzi %d, nzi_bl %d, nzi_bu %d\n",nzi,nzi_bl,nzi_bu); */
1633 
1634     while (j < nzi){ /* U-part */
1635       ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1636       /*
1637       printf(" col %d: ",jtmp[j]);
1638       for (j1=0; j1<bs2; j1++) printf(" %g",*(vtmp+bs2*j+j1));
1639       printf(" \n");
1640       */
1641       j++;
1642     }
1643 
1644     ierr = BlockAbs_private(nzi,bs2,vtmp,vtmp_abs);CHKERRQ(ierr);
1645     /*
1646     printf(" row %d, nzi %d, vtmp_abs\n",i,nzi);
1647     for (j1=0; j1<nzi; j1++) printf(" (%d %g),",jtmp[j1],vtmp_abs[j1]);
1648     printf(" \n");
1649     */
1650     bjtmp = bj + bi[i];
1651     batmp = ba + bs2*bi[i];
1652     /* apply level dropping rule to L part */
1653     ncut = nzi_al + dtcount;
1654     if (ncut < nzi_bl){
1655       ierr = PetscSortSplitReal(ncut,nzi_bl,vtmp_abs,jtmp);CHKERRQ(ierr);
1656       ierr = PetscSortIntWithScalarArray(ncut,jtmp,vtmp);CHKERRQ(ierr);
1657     } else {
1658       ncut = nzi_bl;
1659     }
1660     for (j=0; j<ncut; j++){
1661       bjtmp[j] = jtmp[j];
1662       ierr = PetscMemcpy(batmp+bs2*j,rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1663       /*
1664       printf(" col %d: ",bjtmp[j]);
1665       for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*j+j1));
1666       printf("\n");
1667       */
1668     }
1669     bi[i+1] = bi[i] + ncut;
1670     nzi = ncut + 1;
1671 
1672     /* apply level dropping rule to U part */
1673     ncut = nzi_au + dtcount;
1674     if (ncut < nzi_bu){
1675       ierr = PetscSortSplitReal(ncut,nzi_bu,vtmp_abs+nzi_bl+1,jtmp+nzi_bl+1);CHKERRQ(ierr);
1676       ierr = PetscSortIntWithScalarArray(ncut,jtmp+nzi_bl+1,vtmp+nzi_bl+1);CHKERRQ(ierr);
1677     } else {
1678       ncut = nzi_bu;
1679     }
1680     nzi += ncut;
1681 
1682     /* mark bdiagonal */
1683     bdiag[i+1]    = bdiag[i] - (ncut + 1);
1684     bi[2*mbs - i] = bi[2*mbs - i +1] - (ncut + 1);
1685 
1686     bjtmp = bj + bdiag[i];
1687     batmp = ba + bs2*bdiag[i];
1688     ierr = PetscMemcpy(batmp,rtmp+bs2*i,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1689     *bjtmp = i;
1690     /*
1691     printf(" diag %d: ",*bjtmp);
1692     for (j=0; j<bs2; j++){
1693       printf(" %g,",batmp[j]);
1694     }
1695     printf("\n");
1696     */
1697     bjtmp = bj + bdiag[i+1]+1;
1698     batmp = ba + (bdiag[i+1]+1)*bs2;
1699 
1700     for (k=0; k<ncut; k++){
1701       bjtmp[k] = jtmp[nzi_bl+1+k];
1702       ierr = PetscMemcpy(batmp+bs2*k,rtmp+bs2*bjtmp[k],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1703       /*
1704       printf(" col %d:",bjtmp[k]);
1705       for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*k+j1));
1706       printf("\n");
1707       */
1708     }
1709 
1710     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1711 
1712     /* invert diagonal block for simplier triangular solves - add shift??? */
1713     batmp = ba + bs2*bdiag[i];
1714     ierr = Kernel_A_gets_inverse_A(bs,batmp,v_pivots,v_work);CHKERRQ(ierr);
1715   } /* for (i=0; i<mbs; i++) */
1716   ierr = PetscFree3(v_work,multiplier,v_pivots);CHKERRQ(ierr);
1717 
1718   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1719   if (bi[mbs] >= bdiag[mbs]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"end of L array %d cannot >= the beginning of U array %d",bi[mbs],bdiag[mbs]);
1720 
1721   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1722   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1723 
1724   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1725 
1726   ierr = PetscFree2(im,jtmp);CHKERRQ(ierr);
1727   ierr = PetscFree2(rtmp,vtmp);CHKERRQ(ierr);
1728 
1729   ierr = PetscLogFlops(bs2*B->cmap->n);CHKERRQ(ierr);
1730   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1731 
1732   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1733   ierr = ISIdentity(isicol,&icol_identity);CHKERRQ(ierr);
1734   if (row_identity && icol_identity) {
1735     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1736   } else {
1737     B->ops->solve = MatSolve_SeqBAIJ_N;
1738   }
1739 
1740   B->ops->solveadd          = 0;
1741   B->ops->solvetranspose    = 0;
1742   B->ops->solvetransposeadd = 0;
1743   B->ops->matsolve          = 0;
1744   B->assembled              = PETSC_TRUE;
1745   B->preallocated           = PETSC_TRUE;
1746   PetscFunctionReturn(0);
1747 }
1748