xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision bebe2cf65d55febe21a5af8db2bd2e168caaa2e7)
1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_inplace"
12 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
13 {
14   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
15   IS                isrow=a->row;
16   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
17   PetscErrorCode    ierr;
18   const PetscInt    *r;
19   PetscInt          nz,*vj,k,idx,k1;
20   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
21   const MatScalar   *aa=a->a,*v,*diag;
22   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
23   const PetscScalar *b;
24 
25   PetscFunctionBegin;
26   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28   t    = a->solve_work;
29   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
30   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
31 
32   /* solve U^T * D * y = b by forward substitution */
33   xk = t;
34   for (k=0; k<mbs; k++) { /* t <- perm(b) */
35     idx = bs*r[k];
36     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
37   }
38   for (k=0; k<mbs; k++) {
39     v    = aa + bs2*ai[k];
40     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
41     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
42     nz   = ai[k+1] - ai[k];
43     vj   = aj + ai[k];
44     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
45     while (nz--) {
46       /* x(:) += U(k,:)^T*(Dk*xk) */
47       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
48       vj++; xj = t + (*vj)*bs;
49       v       += bs2;
50     }
51     /* xk = inv(Dk)*(Dk*xk) */
52     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
53     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
54   }
55 
56   /* solve U*x = y by back substitution */
57   for (k=mbs-1; k>=0; k--) {
58     v  = aa + bs2*ai[k];
59     xk = t + k*bs;        /* xk */
60     nz = ai[k+1] - ai[k];
61     vj = aj + ai[k];
62     xj = t + (*vj)*bs;
63     while (nz--) {
64       /* xk += U(k,:)*x(:) */
65       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
66       vj++;
67       v += bs2; xj = t + (*vj)*bs;
68     }
69     idx = bs*r[k];
70     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
71   }
72 
73   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
74   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
75   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
76   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
77   ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr);
78   PetscFunctionReturn(0);
79 }
80 
81 #undef __FUNCT__
82 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_inplace"
83 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
84 {
85   PetscFunctionBegin;
86   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
87   PetscFunctionReturn(0);
88 }
89 
90 #undef __FUNCT__
91 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_inplace"
92 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
93 {
94   PetscFunctionBegin;
95   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
96   PetscFunctionReturn(0);
97 }
98 
99 #undef __FUNCT__
100 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering"
101 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
102 {
103   PetscErrorCode  ierr;
104   PetscInt        nz,k;
105   const PetscInt  *vj,bs2 = bs*bs;
106   const MatScalar *v,*diag;
107   PetscScalar     *xk,*xj,*xk_tmp;
108 
109   PetscFunctionBegin;
110   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
111   for (k=0; k<mbs; k++) {
112     v    = aa + bs2*ai[k];
113     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
114     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
115     nz   = ai[k+1] - ai[k];
116     vj   = aj + ai[k];
117     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
118     while (nz--) {
119       /* x(:) += U(k,:)^T*(Dk*xk) */
120       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
121       vj++; xj = x + (*vj)*bs;
122       v       += bs2;
123     }
124     /* xk = inv(Dk)*(Dk*xk) */
125     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
126     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
127   }
128   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
129   PetscFunctionReturn(0);
130 }
131 
132 #undef __FUNCT__
133 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering"
134 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
135 {
136   PetscInt        nz,k;
137   const PetscInt  *vj,bs2 = bs*bs;
138   const MatScalar *v;
139   PetscScalar     *xk,*xj;
140 
141   PetscFunctionBegin;
142   for (k=mbs-1; k>=0; k--) {
143     v  = aa + bs2*ai[k];
144     xk = x + k*bs;        /* xk */
145     nz = ai[k+1] - ai[k];
146     vj = aj + ai[k];
147     xj = x + (*vj)*bs;
148     while (nz--) {
149       /* xk += U(k,:)*x(:) */
150       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
151       vj++;
152       v += bs2; xj = x + (*vj)*bs;
153     }
154   }
155   PetscFunctionReturn(0);
156 }
157 
158 #undef __FUNCT__
159 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
160 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
161 {
162   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
163   PetscErrorCode    ierr;
164   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
165   PetscInt          bs =A->rmap->bs;
166   const MatScalar   *aa=a->a;
167   PetscScalar       *x;
168   const PetscScalar *b;
169 
170   PetscFunctionBegin;
171   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
172   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
173 
174   /* solve U^T * D * y = b by forward substitution */
175   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
176   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
177 
178   /* solve U*x = y by back substitution */
179   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
180 
181   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
182   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
183   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
184   PetscFunctionReturn(0);
185 }
186 
187 #undef __FUNCT__
188 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
189 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
190 {
191   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
192   PetscErrorCode    ierr;
193   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
194   PetscInt          bs =A->rmap->bs;
195   const MatScalar   *aa=a->a;
196   const PetscScalar *b;
197   PetscScalar       *x;
198 
199   PetscFunctionBegin;
200   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
201   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
202   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
203   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
204   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
205   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
206   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
207   PetscFunctionReturn(0);
208 }
209 
210 #undef __FUNCT__
211 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
212 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
213 {
214   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
215   PetscErrorCode    ierr;
216   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
217   PetscInt          bs =A->rmap->bs;
218   const MatScalar   *aa=a->a;
219   const PetscScalar *b;
220   PetscScalar       *x;
221 
222   PetscFunctionBegin;
223   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
224   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
225   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
226   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
227   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
228   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
229   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
230   PetscFunctionReturn(0);
231 }
232 
233 #undef __FUNCT__
234 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_inplace"
235 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
236 {
237   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
238   IS                isrow=a->row;
239   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs,*r,*vj;
240   PetscErrorCode    ierr;
241   PetscInt          nz,k,idx;
242   const MatScalar   *aa=a->a,*v,*d;
243   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
244   const PetscScalar *b;
245 
246   PetscFunctionBegin;
247   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
248   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
249   t    = a->solve_work;
250   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
251 
252   /* solve U^T * D * y = b by forward substitution */
253   tp = t;
254   for (k=0; k<mbs; k++) { /* t <- perm(b) */
255     idx   = 7*r[k];
256     tp[0] = b[idx];
257     tp[1] = b[idx+1];
258     tp[2] = b[idx+2];
259     tp[3] = b[idx+3];
260     tp[4] = b[idx+4];
261     tp[5] = b[idx+5];
262     tp[6] = b[idx+6];
263     tp   += 7;
264   }
265 
266   for (k=0; k<mbs; k++) {
267     v  = aa + 49*ai[k];
268     vj = aj + ai[k];
269     tp = t + k*7;
270     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
271     nz = ai[k+1] - ai[k];
272     tp = t + (*vj)*7;
273     while (nz--) {
274       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
275       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
276       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
277       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
278       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
279       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
280       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
281       vj++;
282       tp = t + (*vj)*7;
283       v += 49;
284     }
285 
286     /* xk = inv(Dk)*(Dk*xk) */
287     d     = aa+k*49;       /* ptr to inv(Dk) */
288     tp    = t + k*7;
289     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
290     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
291     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
292     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
293     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
294     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
295     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
296   }
297 
298   /* solve U*x = y by back substitution */
299   for (k=mbs-1; k>=0; k--) {
300     v  = aa + 49*ai[k];
301     vj = aj + ai[k];
302     tp = t + k*7;
303     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
304     nz = ai[k+1] - ai[k];
305 
306     tp = t + (*vj)*7;
307     while (nz--) {
308       /* xk += U(k,:)*x(:) */
309       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
310       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
311       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
312       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
313       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
314       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
315       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
316       vj++;
317       tp = t + (*vj)*7;
318       v += 49;
319     }
320     tp       = t + k*7;
321     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
322     idx      = 7*r[k];
323     x[idx]   = x0;
324     x[idx+1] = x1;
325     x[idx+2] = x2;
326     x[idx+3] = x3;
327     x[idx+4] = x4;
328     x[idx+5] = x5;
329     x[idx+6] = x6;
330   }
331 
332   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
333   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
334   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
335   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
336   PetscFunctionReturn(0);
337 }
338 
339 #undef __FUNCT__
340 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering"
341 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
342 {
343   const MatScalar *v,*d;
344   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
345   PetscInt        nz,k;
346   const PetscInt  *vj;
347 
348   PetscFunctionBegin;
349   for (k=0; k<mbs; k++) {
350     v  = aa + 49*ai[k];
351     xp = x + k*7;
352     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
353     nz = ai[k+1] - ai[k];
354     vj = aj + ai[k];
355     xp = x + (*vj)*7;
356     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
357     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358     while (nz--) {
359       /* x(:) += U(k,:)^T*(Dk*xk) */
360       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
361       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
362       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
363       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
364       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
365       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
366       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
367       vj++;
368       xp = x + (*vj)*7;
369       v += 49;
370     }
371     /* xk = inv(Dk)*(Dk*xk) */
372     d     = aa+k*49;       /* ptr to inv(Dk) */
373     xp    = x + k*7;
374     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
375     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
376     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
377     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
378     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
379     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
380     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
381   }
382   PetscFunctionReturn(0);
383 }
384 
385 #undef __FUNCT__
386 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering"
387 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
388 {
389   const MatScalar *v;
390   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
391   PetscInt        nz,k;
392   const PetscInt  *vj;
393 
394   PetscFunctionBegin;
395   for (k=mbs-1; k>=0; k--) {
396     v  = aa + 49*ai[k];
397     xp = x + k*7;
398     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
399     nz = ai[k+1] - ai[k];
400     vj = aj + ai[k];
401     xp = x + (*vj)*7;
402     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
403     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
404     while (nz--) {
405       /* xk += U(k,:)*x(:) */
406       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
407       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
408       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
409       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
410       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
411       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
412       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
413       vj++;
414       v += 49; xp = x + (*vj)*7;
415     }
416     xp   = x + k*7;
417     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
418   }
419   PetscFunctionReturn(0);
420 }
421 
422 #undef __FUNCT__
423 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
424 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425 {
426   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
427   PetscErrorCode    ierr;
428   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
429   const MatScalar   *aa=a->a;
430   PetscScalar       *x;
431   const PetscScalar *b;
432 
433   PetscFunctionBegin;
434   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
435   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
436 
437   /* solve U^T * D * y = b by forward substitution */
438   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
439   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
440 
441   /* solve U*x = y by back substitution */
442   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
443 
444   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
445   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
446   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
447   PetscFunctionReturn(0);
448 }
449 
450 #undef __FUNCT__
451 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
452 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
453 {
454   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
455   PetscErrorCode    ierr;
456   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
457   const MatScalar   *aa=a->a;
458   PetscScalar       *x;
459   const PetscScalar *b;
460 
461   PetscFunctionBegin;
462   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
463   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
464   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
465   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
466   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
467   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
468   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
469   PetscFunctionReturn(0);
470 }
471 
472 #undef __FUNCT__
473 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
474 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
475 {
476   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
477   PetscErrorCode    ierr;
478   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
479   const MatScalar   *aa=a->a;
480   PetscScalar       *x;
481   const PetscScalar *b;
482 
483   PetscFunctionBegin;
484   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
485   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
486   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
487   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
488   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
489   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
490   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
491   PetscFunctionReturn(0);
492 }
493 
494 #undef __FUNCT__
495 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_inplace"
496 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
497 {
498   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
499   IS                isrow=a->row;
500   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2,*r,*vj;
501   PetscErrorCode    ierr;
502   PetscInt          nz,k,idx;
503   const MatScalar   *aa=a->a,*v,*d;
504   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
505   const PetscScalar *b;
506 
507   PetscFunctionBegin;
508   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
509   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
510   t    = a->solve_work;
511   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
512 
513   /* solve U^T * D * y = b by forward substitution */
514   tp = t;
515   for (k=0; k<mbs; k++) { /* t <- perm(b) */
516     idx   = 6*r[k];
517     tp[0] = b[idx];
518     tp[1] = b[idx+1];
519     tp[2] = b[idx+2];
520     tp[3] = b[idx+3];
521     tp[4] = b[idx+4];
522     tp[5] = b[idx+5];
523     tp   += 6;
524   }
525 
526   for (k=0; k<mbs; k++) {
527     v  = aa + 36*ai[k];
528     vj = aj + ai[k];
529     tp = t + k*6;
530     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
531     nz = ai[k+1] - ai[k];
532     tp = t + (*vj)*6;
533     while (nz--) {
534       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
535       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
536       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
537       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
538       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
539       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
540       vj++;
541       tp = t + (*vj)*6;
542       v += 36;
543     }
544 
545     /* xk = inv(Dk)*(Dk*xk) */
546     d     = aa+k*36;       /* ptr to inv(Dk) */
547     tp    = t + k*6;
548     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
549     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
550     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
551     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
552     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
553     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
554   }
555 
556   /* solve U*x = y by back substitution */
557   for (k=mbs-1; k>=0; k--) {
558     v  = aa + 36*ai[k];
559     vj = aj + ai[k];
560     tp = t + k*6;
561     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
562     nz = ai[k+1] - ai[k];
563 
564     tp = t + (*vj)*6;
565     while (nz--) {
566       /* xk += U(k,:)*x(:) */
567       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
568       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
569       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
570       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
571       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
572       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
573       vj++;
574       tp = t + (*vj)*6;
575       v += 36;
576     }
577     tp       = t + k*6;
578     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
579     idx      = 6*r[k];
580     x[idx]   = x0;
581     x[idx+1] = x1;
582     x[idx+2] = x2;
583     x[idx+3] = x3;
584     x[idx+4] = x4;
585     x[idx+5] = x5;
586   }
587 
588   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
589   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
590   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
591   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
592   PetscFunctionReturn(0);
593 }
594 
595 #undef __FUNCT__
596 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering"
597 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
598 {
599   const MatScalar *v,*d;
600   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
601   PetscInt        nz,k;
602   const PetscInt  *vj;
603 
604   PetscFunctionBegin;
605   for (k=0; k<mbs; k++) {
606     v  = aa + 36*ai[k];
607     xp = x + k*6;
608     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
609     nz = ai[k+1] - ai[k];
610     vj = aj + ai[k];
611     xp = x + (*vj)*6;
612     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
613     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
614     while (nz--) {
615       /* x(:) += U(k,:)^T*(Dk*xk) */
616       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
617       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
618       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
619       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
620       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
621       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
622       vj++;
623       xp = x + (*vj)*6;
624       v += 36;
625     }
626     /* xk = inv(Dk)*(Dk*xk) */
627     d     = aa+k*36;       /* ptr to inv(Dk) */
628     xp    = x + k*6;
629     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
630     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
631     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
632     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
633     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
634     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
635   }
636   PetscFunctionReturn(0);
637 }
638 #undef __FUNCT__
639 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering"
640 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
641 {
642   const MatScalar   *v;
643   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
644   PetscInt          nz,k;
645   const PetscInt    *vj;
646 
647   PetscFunctionBegin;
648   for (k=mbs-1; k>=0; k--) {
649     v  = aa + 36*ai[k];
650     xp = x + k*6;
651     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
652     nz = ai[k+1] - ai[k];
653     vj = aj + ai[k];
654     xp = x + (*vj)*6;
655     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
656     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
657     while (nz--) {
658       /* xk += U(k,:)*x(:) */
659       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
660       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
661       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
662       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
663       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
664       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
665       vj++;
666       v += 36; xp = x + (*vj)*6;
667     }
668     xp   = x + k*6;
669     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
670   }
671   PetscFunctionReturn(0);
672 }
673 
674 
675 #undef __FUNCT__
676 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
677 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
678 {
679   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
680   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
681   const MatScalar   *aa=a->a;
682   PetscScalar       *x;
683   const PetscScalar *b;
684   PetscErrorCode    ierr;
685 
686   PetscFunctionBegin;
687   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
688   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
689 
690   /* solve U^T * D * y = b by forward substitution */
691   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
692   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
693 
694   /* solve U*x = y by back substitution */
695   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
696 
697   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
699   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
700   PetscFunctionReturn(0);
701 }
702 
703 #undef __FUNCT__
704 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
705 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
706 {
707   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
708   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
709   const MatScalar   *aa=a->a;
710   PetscScalar       *x;
711   const PetscScalar *b;
712   PetscErrorCode    ierr;
713 
714   PetscFunctionBegin;
715   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
716   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
717   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
718   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
719   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
720   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
721   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
722   PetscFunctionReturn(0);
723 }
724 
725 #undef __FUNCT__
726 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
727 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
728 {
729   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
730   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
731   const MatScalar   *aa=a->a;
732   PetscScalar       *x;
733   const PetscScalar *b;
734   PetscErrorCode    ierr;
735 
736   PetscFunctionBegin;
737   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
738   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
739   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
740   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
741   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
742   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
743   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
744   PetscFunctionReturn(0);
745 }
746 
747 #undef __FUNCT__
748 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_inplace"
749 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
750 {
751   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
752   IS                isrow=a->row;
753   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
754   PetscErrorCode    ierr;
755   const PetscInt    *r,*vj;
756   PetscInt          nz,k,idx;
757   const MatScalar   *aa=a->a,*v,*diag;
758   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
759   const PetscScalar *b;
760 
761   PetscFunctionBegin;
762   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
764   t    = a->solve_work;
765   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
766 
767   /* solve U^T * D * y = b by forward substitution */
768   tp = t;
769   for (k=0; k<mbs; k++) { /* t <- perm(b) */
770     idx   = 5*r[k];
771     tp[0] = b[idx];
772     tp[1] = b[idx+1];
773     tp[2] = b[idx+2];
774     tp[3] = b[idx+3];
775     tp[4] = b[idx+4];
776     tp   += 5;
777   }
778 
779   for (k=0; k<mbs; k++) {
780     v  = aa + 25*ai[k];
781     vj = aj + ai[k];
782     tp = t + k*5;
783     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
784     nz = ai[k+1] - ai[k];
785 
786     tp = t + (*vj)*5;
787     while (nz--) {
788       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
789       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
790       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
791       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
792       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
793       vj++;
794       tp = t + (*vj)*5;
795       v += 25;
796     }
797 
798     /* xk = inv(Dk)*(Dk*xk) */
799     diag  = aa+k*25;          /* ptr to inv(Dk) */
800     tp    = t + k*5;
801     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
802     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
803     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
804     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
805     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
806   }
807 
808   /* solve U*x = y by back substitution */
809   for (k=mbs-1; k>=0; k--) {
810     v  = aa + 25*ai[k];
811     vj = aj + ai[k];
812     tp = t + k*5;
813     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
814     nz = ai[k+1] - ai[k];
815 
816     tp = t + (*vj)*5;
817     while (nz--) {
818       /* xk += U(k,:)*x(:) */
819       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
820       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
821       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
822       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
823       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
824       vj++;
825       tp = t + (*vj)*5;
826       v += 25;
827     }
828     tp       = t + k*5;
829     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
830     idx      = 5*r[k];
831     x[idx]   = x0;
832     x[idx+1] = x1;
833     x[idx+2] = x2;
834     x[idx+3] = x3;
835     x[idx+4] = x4;
836   }
837 
838   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
839   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
840   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
841   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
842   PetscFunctionReturn(0);
843 }
844 
845 #undef __FUNCT__
846 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering"
847 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
848 {
849   const MatScalar *v,*diag;
850   PetscScalar     *xp,x0,x1,x2,x3,x4;
851   PetscInt        nz,k;
852   const PetscInt  *vj;
853 
854   PetscFunctionBegin;
855   for (k=0; k<mbs; k++) {
856     v  = aa + 25*ai[k];
857     xp = x + k*5;
858     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
859     nz = ai[k+1] - ai[k];
860     vj = aj + ai[k];
861     xp = x + (*vj)*5;
862     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
863     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
864     while (nz--) {
865       /* x(:) += U(k,:)^T*(Dk*xk) */
866       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
867       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
868       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
869       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
870       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
871       vj++;
872       xp = x + (*vj)*5;
873       v += 25;
874     }
875     /* xk = inv(Dk)*(Dk*xk) */
876     diag  = aa+k*25;         /* ptr to inv(Dk) */
877     xp    = x + k*5;
878     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
879     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
880     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
881     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
882     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
883   }
884   PetscFunctionReturn(0);
885 }
886 
887 #undef __FUNCT__
888 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering"
889 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
890 {
891   const MatScalar *v;
892   PetscScalar     *xp,x0,x1,x2,x3,x4;
893   PetscInt        nz,k;
894   const PetscInt  *vj;
895 
896   PetscFunctionBegin;
897   for (k=mbs-1; k>=0; k--) {
898     v  = aa + 25*ai[k];
899     xp = x + k*5;
900     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
901     nz = ai[k+1] - ai[k];
902     vj = aj + ai[k];
903     xp = x + (*vj)*5;
904     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
905     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
906     while (nz--) {
907       /* xk += U(k,:)*x(:) */
908       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
909       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
910       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
911       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
912       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
913       vj++;
914       v += 25; xp = x + (*vj)*5;
915     }
916     xp   = x + k*5;
917     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
918   }
919   PetscFunctionReturn(0);
920 }
921 
922 #undef __FUNCT__
923 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
924 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
925 {
926   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
927   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
928   const MatScalar   *aa=a->a;
929   PetscScalar       *x;
930   const PetscScalar *b;
931   PetscErrorCode    ierr;
932 
933   PetscFunctionBegin;
934   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
935   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
936 
937   /* solve U^T * D * y = b by forward substitution */
938   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
939   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
940 
941   /* solve U*x = y by back substitution */
942   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
943 
944   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
945   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
946   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
947   PetscFunctionReturn(0);
948 }
949 
950 #undef __FUNCT__
951 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
952 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
953 {
954   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
955   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
956   const MatScalar   *aa=a->a;
957   PetscScalar       *x;
958   const PetscScalar *b;
959   PetscErrorCode    ierr;
960 
961   PetscFunctionBegin;
962   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
963   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
964   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
965   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
966   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
967   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
968   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
969   PetscFunctionReturn(0);
970 }
971 
972 #undef __FUNCT__
973 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
974 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
975 {
976   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
977   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
978   const MatScalar   *aa=a->a;
979   PetscScalar       *x;
980   const PetscScalar *b;
981   PetscErrorCode    ierr;
982 
983   PetscFunctionBegin;
984   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
985   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
986   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
987   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
988   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
989   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
990   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
991   PetscFunctionReturn(0);
992 }
993 
994 #undef __FUNCT__
995 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_inplace"
996 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
997 {
998   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
999   IS                isrow=a->row;
1000   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1001   PetscErrorCode    ierr;
1002   const PetscInt    *r,*vj;
1003   PetscInt          nz,k,idx;
1004   const MatScalar   *aa=a->a,*v,*diag;
1005   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
1006   const PetscScalar *b;
1007 
1008   PetscFunctionBegin;
1009   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1010   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1011   t    = a->solve_work;
1012   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1013 
1014   /* solve U^T * D * y = b by forward substitution */
1015   tp = t;
1016   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1017     idx   = 4*r[k];
1018     tp[0] = b[idx];
1019     tp[1] = b[idx+1];
1020     tp[2] = b[idx+2];
1021     tp[3] = b[idx+3];
1022     tp   += 4;
1023   }
1024 
1025   for (k=0; k<mbs; k++) {
1026     v  = aa + 16*ai[k];
1027     vj = aj + ai[k];
1028     tp = t + k*4;
1029     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1030     nz = ai[k+1] - ai[k];
1031 
1032     tp = t + (*vj)*4;
1033     while (nz--) {
1034       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1035       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1036       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1037       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1038       vj++;
1039       tp = t + (*vj)*4;
1040       v += 16;
1041     }
1042 
1043     /* xk = inv(Dk)*(Dk*xk) */
1044     diag  = aa+k*16;          /* ptr to inv(Dk) */
1045     tp    = t + k*4;
1046     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1047     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1048     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1049     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1050   }
1051 
1052   /* solve U*x = y by back substitution */
1053   for (k=mbs-1; k>=0; k--) {
1054     v  = aa + 16*ai[k];
1055     vj = aj + ai[k];
1056     tp = t + k*4;
1057     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1058     nz = ai[k+1] - ai[k];
1059 
1060     tp = t + (*vj)*4;
1061     while (nz--) {
1062       /* xk += U(k,:)*x(:) */
1063       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1064       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1065       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1066       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1067       vj++;
1068       tp = t + (*vj)*4;
1069       v += 16;
1070     }
1071     tp       = t + k*4;
1072     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1073     idx      = 4*r[k];
1074     x[idx]   = x0;
1075     x[idx+1] = x1;
1076     x[idx+2] = x2;
1077     x[idx+3] = x3;
1078   }
1079 
1080   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1081   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1082   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1083   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1084   PetscFunctionReturn(0);
1085 }
1086 
1087 #undef __FUNCT__
1088 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering"
1089 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1090 {
1091   const MatScalar *v,*diag;
1092   PetscScalar     *xp,x0,x1,x2,x3;
1093   PetscInt        nz,k;
1094   const PetscInt  *vj;
1095 
1096   PetscFunctionBegin;
1097   for (k=0; k<mbs; k++) {
1098     v  = aa + 16*ai[k];
1099     xp = x + k*4;
1100     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1101     nz = ai[k+1] - ai[k];
1102     vj = aj + ai[k];
1103     xp = x + (*vj)*4;
1104     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1105     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1106     while (nz--) {
1107       /* x(:) += U(k,:)^T*(Dk*xk) */
1108       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1109       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1110       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1111       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1112       vj++;
1113       xp = x + (*vj)*4;
1114       v += 16;
1115     }
1116     /* xk = inv(Dk)*(Dk*xk) */
1117     diag  = aa+k*16;         /* ptr to inv(Dk) */
1118     xp    = x + k*4;
1119     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1120     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1121     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1122     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1123   }
1124   PetscFunctionReturn(0);
1125 }
1126 
1127 #undef __FUNCT__
1128 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering"
1129 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1130 {
1131   const MatScalar *v;
1132   PetscScalar     *xp,x0,x1,x2,x3;
1133   PetscInt        nz,k;
1134   const PetscInt  *vj;
1135 
1136   PetscFunctionBegin;
1137   for (k=mbs-1; k>=0; k--) {
1138     v  = aa + 16*ai[k];
1139     xp = x + k*4;
1140     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1141     nz = ai[k+1] - ai[k];
1142     vj = aj + ai[k];
1143     xp = x + (*vj)*4;
1144     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1145     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1146     while (nz--) {
1147       /* xk += U(k,:)*x(:) */
1148       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1149       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1150       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1151       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1152       vj++;
1153       v += 16; xp = x + (*vj)*4;
1154     }
1155     xp    = x + k*4;
1156     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1157   }
1158   PetscFunctionReturn(0);
1159 }
1160 
1161 #undef __FUNCT__
1162 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1163 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1164 {
1165   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1166   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1167   const MatScalar   *aa=a->a;
1168   PetscScalar       *x;
1169   const PetscScalar *b;
1170   PetscErrorCode    ierr;
1171 
1172   PetscFunctionBegin;
1173   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1174   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1175 
1176   /* solve U^T * D * y = b by forward substitution */
1177   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1178   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1179 
1180   /* solve U*x = y by back substitution */
1181   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1182   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1183   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1184   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1185   PetscFunctionReturn(0);
1186 }
1187 
1188 #undef __FUNCT__
1189 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1190 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1191 {
1192   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1193   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1194   const MatScalar   *aa=a->a;
1195   PetscScalar       *x;
1196   const PetscScalar *b;
1197   PetscErrorCode    ierr;
1198 
1199   PetscFunctionBegin;
1200   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1201   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1202   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1203   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1204   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1205   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1206   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1207   PetscFunctionReturn(0);
1208 }
1209 
1210 #undef __FUNCT__
1211 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1212 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1213 {
1214   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1215   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1216   const MatScalar   *aa=a->a;
1217   PetscScalar       *x;
1218   const PetscScalar *b;
1219   PetscErrorCode    ierr;
1220 
1221   PetscFunctionBegin;
1222   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1223   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1224   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1225   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1226   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1227   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1228   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1229   PetscFunctionReturn(0);
1230 }
1231 
1232 #undef __FUNCT__
1233 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_inplace"
1234 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1235 {
1236   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1237   IS                isrow=a->row;
1238   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1239   PetscErrorCode    ierr;
1240   const PetscInt    *r;
1241   PetscInt          nz,k,idx;
1242   const PetscInt    *vj;
1243   const MatScalar   *aa=a->a,*v,*diag;
1244   PetscScalar       *x,x0,x1,x2,*t,*tp;
1245   const PetscScalar *b;
1246 
1247   PetscFunctionBegin;
1248   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1249   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1250   t    = a->solve_work;
1251   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1252 
1253   /* solve U^T * D * y = b by forward substitution */
1254   tp = t;
1255   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1256     idx   = 3*r[k];
1257     tp[0] = b[idx];
1258     tp[1] = b[idx+1];
1259     tp[2] = b[idx+2];
1260     tp   += 3;
1261   }
1262 
1263   for (k=0; k<mbs; k++) {
1264     v  = aa + 9*ai[k];
1265     vj = aj + ai[k];
1266     tp = t + k*3;
1267     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1268     nz = ai[k+1] - ai[k];
1269 
1270     tp = t + (*vj)*3;
1271     while (nz--) {
1272       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1273       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1274       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1275       vj++;
1276       tp = t + (*vj)*3;
1277       v += 9;
1278     }
1279 
1280     /* xk = inv(Dk)*(Dk*xk) */
1281     diag  = aa+k*9;          /* ptr to inv(Dk) */
1282     tp    = t + k*3;
1283     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1284     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1285     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1286   }
1287 
1288   /* solve U*x = y by back substitution */
1289   for (k=mbs-1; k>=0; k--) {
1290     v  = aa + 9*ai[k];
1291     vj = aj + ai[k];
1292     tp = t + k*3;
1293     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1294     nz = ai[k+1] - ai[k];
1295 
1296     tp = t + (*vj)*3;
1297     while (nz--) {
1298       /* xk += U(k,:)*x(:) */
1299       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1300       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1301       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1302       vj++;
1303       tp = t + (*vj)*3;
1304       v += 9;
1305     }
1306     tp       = t + k*3;
1307     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1308     idx      = 3*r[k];
1309     x[idx]   = x0;
1310     x[idx+1] = x1;
1311     x[idx+2] = x2;
1312   }
1313 
1314   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1315   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1316   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1317   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1318   PetscFunctionReturn(0);
1319 }
1320 
1321 #undef __FUNCT__
1322 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering"
1323 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1324 {
1325   const MatScalar *v,*diag;
1326   PetscScalar     *xp,x0,x1,x2;
1327   PetscInt        nz,k;
1328   const PetscInt  *vj;
1329 
1330   PetscFunctionBegin;
1331   for (k=0; k<mbs; k++) {
1332     v  = aa + 9*ai[k];
1333     xp = x + k*3;
1334     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1335     nz = ai[k+1] - ai[k];
1336     vj = aj + ai[k];
1337     xp = x + (*vj)*3;
1338     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1339     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1340     while (nz--) {
1341       /* x(:) += U(k,:)^T*(Dk*xk) */
1342       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1343       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1344       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1345       vj++;
1346       xp = x + (*vj)*3;
1347       v += 9;
1348     }
1349     /* xk = inv(Dk)*(Dk*xk) */
1350     diag  = aa+k*9;         /* ptr to inv(Dk) */
1351     xp    = x + k*3;
1352     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1353     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1354     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1355   }
1356   PetscFunctionReturn(0);
1357 }
1358 
1359 #undef __FUNCT__
1360 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering"
1361 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1362 {
1363   const MatScalar *v;
1364   PetscScalar     *xp,x0,x1,x2;
1365   PetscInt        nz,k;
1366   const PetscInt  *vj;
1367 
1368   PetscFunctionBegin;
1369   for (k=mbs-1; k>=0; k--) {
1370     v  = aa + 9*ai[k];
1371     xp = x + k*3;
1372     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1373     nz = ai[k+1] - ai[k];
1374     vj = aj + ai[k];
1375     xp = x + (*vj)*3;
1376     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1377     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1378     while (nz--) {
1379       /* xk += U(k,:)*x(:) */
1380       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1381       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1382       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1383       vj++;
1384       v += 9; xp = x + (*vj)*3;
1385     }
1386     xp    = x + k*3;
1387     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1388   }
1389   PetscFunctionReturn(0);
1390 }
1391 
1392 #undef __FUNCT__
1393 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1394 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1395 {
1396   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1397   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1398   const MatScalar   *aa=a->a;
1399   PetscScalar       *x;
1400   const PetscScalar *b;
1401   PetscErrorCode    ierr;
1402 
1403   PetscFunctionBegin;
1404   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1405   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1406 
1407   /* solve U^T * D * y = b by forward substitution */
1408   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1409   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1410 
1411   /* solve U*x = y by back substitution */
1412   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1413 
1414   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1415   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1416   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1417   PetscFunctionReturn(0);
1418 }
1419 
1420 #undef __FUNCT__
1421 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1422 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1423 {
1424   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1425   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1426   const MatScalar   *aa=a->a;
1427   PetscScalar       *x;
1428   const PetscScalar *b;
1429   PetscErrorCode    ierr;
1430 
1431   PetscFunctionBegin;
1432   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1433   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1434   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1435   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1436   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1437   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1438   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1439   PetscFunctionReturn(0);
1440 }
1441 
1442 #undef __FUNCT__
1443 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1444 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1445 {
1446   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1447   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1448   const MatScalar   *aa=a->a;
1449   PetscScalar       *x;
1450   const PetscScalar *b;
1451   PetscErrorCode    ierr;
1452 
1453   PetscFunctionBegin;
1454   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1455   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1456   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1457   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1458   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1459   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1460   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1461   PetscFunctionReturn(0);
1462 }
1463 
1464 #undef __FUNCT__
1465 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_inplace"
1466 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1467 {
1468   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1469   IS                isrow=a->row;
1470   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1471   PetscErrorCode    ierr;
1472   const PetscInt    *r,*vj;
1473   PetscInt          nz,k,k2,idx;
1474   const MatScalar   *aa=a->a,*v,*diag;
1475   PetscScalar       *x,x0,x1,*t;
1476   const PetscScalar *b;
1477 
1478   PetscFunctionBegin;
1479   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1480   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1481   t    = a->solve_work;
1482   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1483 
1484   /* solve U^T * D * y = perm(b) by forward substitution */
1485   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1486     idx      = 2*r[k];
1487     t[k*2]   = b[idx];
1488     t[k*2+1] = b[idx+1];
1489   }
1490   for (k=0; k<mbs; k++) {
1491     v  = aa + 4*ai[k];
1492     vj = aj + ai[k];
1493     k2 = k*2;
1494     x0 = t[k2]; x1 = t[k2+1];
1495     nz = ai[k+1] - ai[k];
1496     while (nz--) {
1497       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1498       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1499       vj++; v      += 4;
1500     }
1501     diag    = aa+k*4; /* ptr to inv(Dk) */
1502     t[k2]   = diag[0]*x0 + diag[2]*x1;
1503     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1504   }
1505 
1506   /* solve U*x = y by back substitution */
1507   for (k=mbs-1; k>=0; k--) {
1508     v  = aa + 4*ai[k];
1509     vj = aj + ai[k];
1510     k2 = k*2;
1511     x0 = t[k2]; x1 = t[k2+1];
1512     nz = ai[k+1] - ai[k];
1513     while (nz--) {
1514       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1515       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1516       vj++;
1517       v += 4;
1518     }
1519     t[k2]    = x0;
1520     t[k2+1]  = x1;
1521     idx      = 2*r[k];
1522     x[idx]   = x0;
1523     x[idx+1] = x1;
1524   }
1525 
1526   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1527   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1528   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1529   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1530   PetscFunctionReturn(0);
1531 }
1532 
1533 #undef __FUNCT__
1534 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering"
1535 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1536 {
1537   const MatScalar *v,*diag;
1538   PetscScalar     x0,x1;
1539   PetscInt        nz,k,k2;
1540   const PetscInt  *vj;
1541 
1542   PetscFunctionBegin;
1543   for (k=0; k<mbs; k++) {
1544     v  = aa + 4*ai[k];
1545     vj = aj + ai[k];
1546     k2 = k*2;
1547     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1548     nz = ai[k+1] - ai[k];
1549     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1550     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1551     while (nz--) {
1552       /* x(:) += U(k,:)^T*(Dk*xk) */
1553       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1554       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1555       vj++; v      += 4;
1556     }
1557     /* xk = inv(Dk)*(Dk*xk) */
1558     diag    = aa+k*4;       /* ptr to inv(Dk) */
1559     x[k2]   = diag[0]*x0 + diag[2]*x1;
1560     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1561   }
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 #undef __FUNCT__
1566 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering"
1567 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1568 {
1569   const MatScalar *v;
1570   PetscScalar     x0,x1;
1571   PetscInt        nz,k,k2;
1572   const PetscInt  *vj;
1573 
1574   PetscFunctionBegin;
1575   for (k=mbs-1; k>=0; k--) {
1576     v  = aa + 4*ai[k];
1577     vj = aj + ai[k];
1578     k2 = k*2;
1579     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1580     nz = ai[k+1] - ai[k];
1581     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1582     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1583     while (nz--) {
1584       /* xk += U(k,:)*x(:) */
1585       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1586       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1587       vj++;
1588       v += 4;
1589     }
1590     x[k2]   = x0;
1591     x[k2+1] = x1;
1592   }
1593   PetscFunctionReturn(0);
1594 }
1595 
1596 #undef __FUNCT__
1597 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1598 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1599 {
1600   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1601   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1602   const MatScalar   *aa=a->a;
1603   PetscScalar       *x;
1604   const PetscScalar *b;
1605   PetscErrorCode    ierr;
1606 
1607   PetscFunctionBegin;
1608   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1609   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1610 
1611   /* solve U^T * D * y = b by forward substitution */
1612   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1613   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1614 
1615   /* solve U*x = y by back substitution */
1616   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1617 
1618   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1619   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1620   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1621   PetscFunctionReturn(0);
1622 }
1623 
1624 #undef __FUNCT__
1625 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1626 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1627 {
1628   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1629   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1630   const MatScalar   *aa=a->a;
1631   PetscScalar       *x;
1632   const PetscScalar *b;
1633   PetscErrorCode    ierr;
1634 
1635   PetscFunctionBegin;
1636   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1637   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1638   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1639   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1640   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1641   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1642   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1643   PetscFunctionReturn(0);
1644 }
1645 
1646 #undef __FUNCT__
1647 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1648 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1649 {
1650   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1651   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1652   const MatScalar   *aa=a->a;
1653   PetscScalar       *x;
1654   const PetscScalar *b;
1655   PetscErrorCode    ierr;
1656 
1657   PetscFunctionBegin;
1658   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1659   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1660   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1661   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1662   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1663   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1664   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
1665   PetscFunctionReturn(0);
1666 }
1667 
1668 #undef __FUNCT__
1669 #define __FUNCT__ "MatSolve_SeqSBAIJ_1"
1670 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1671 {
1672   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1673   IS                isrow=a->row;
1674   PetscErrorCode    ierr;
1675   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1676   const MatScalar   *aa=a->a,*v;
1677   const PetscScalar *b;
1678   PetscScalar       *x,xk,*t;
1679   PetscInt          nz,k,j;
1680 
1681   PetscFunctionBegin;
1682   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1683   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1684   t    = a->solve_work;
1685   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1686 
1687   /* solve U^T*D*y = perm(b) by forward substitution */
1688   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1689   for (k=0; k<mbs; k++) {
1690     v  = aa + ai[k];
1691     vj = aj + ai[k];
1692     xk = t[k];
1693     nz = ai[k+1] - ai[k] - 1;
1694     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1695     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1696   }
1697 
1698   /* solve U*perm(x) = y by back substitution */
1699   for (k=mbs-1; k>=0; k--) {
1700     v  = aa + adiag[k] - 1;
1701     vj = aj + adiag[k] - 1;
1702     nz = ai[k+1] - ai[k] - 1;
1703     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1704     x[rp[k]] = t[k];
1705   }
1706 
1707   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1708   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1709   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1710   ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr);
1711   PetscFunctionReturn(0);
1712 }
1713 
1714 #undef __FUNCT__
1715 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_inplace"
1716 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1717 {
1718   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1719   IS                isrow=a->row;
1720   PetscErrorCode    ierr;
1721   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1722   const MatScalar   *aa=a->a,*v;
1723   PetscScalar       *x,xk,*t;
1724   const PetscScalar *b;
1725   PetscInt          nz,k;
1726 
1727   PetscFunctionBegin;
1728   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1729   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1730   t    = a->solve_work;
1731   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1732 
1733   /* solve U^T*D*y = perm(b) by forward substitution */
1734   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1735   for (k=0; k<mbs; k++) {
1736     v  = aa + ai[k] + 1;
1737     vj = aj + ai[k] + 1;
1738     xk = t[k];
1739     nz = ai[k+1] - ai[k] - 1;
1740     while (nz--) t[*vj++] += (*v++) * xk;
1741     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1742   }
1743 
1744   /* solve U*perm(x) = y by back substitution */
1745   for (k=mbs-1; k>=0; k--) {
1746     v  = aa + ai[k] + 1;
1747     vj = aj + ai[k] + 1;
1748     nz = ai[k+1] - ai[k] - 1;
1749     while (nz--) t[k] += (*v++) * t[*vj++];
1750     x[rp[k]] = t[k];
1751   }
1752 
1753   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1754   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1755   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1756   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1757   PetscFunctionReturn(0);
1758 }
1759 
1760 #undef __FUNCT__
1761 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1"
1762 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1763 {
1764   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1765   IS                isrow=a->row;
1766   PetscErrorCode    ierr;
1767   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1768   const MatScalar   *aa=a->a,*v;
1769   PetscReal         diagk;
1770   PetscScalar       *x,xk;
1771   const PetscScalar *b;
1772   PetscInt          nz,k;
1773 
1774   PetscFunctionBegin;
1775   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1776   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1777   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1778   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1779 
1780   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1781   for (k=0; k<mbs; k++) {
1782     v  = aa + ai[k];
1783     vj = aj + ai[k];
1784     xk = x[k];
1785     nz = ai[k+1] - ai[k] - 1;
1786     while (nz--) x[*vj++] += (*v++) * xk;
1787 
1788     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1789     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1790     x[k] = xk*PetscSqrtReal(diagk);
1791   }
1792   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1793   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1794   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1795   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1796   PetscFunctionReturn(0);
1797 }
1798 
1799 #undef __FUNCT__
1800 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_inplace"
1801 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1802 {
1803   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1804   IS                isrow=a->row;
1805   PetscErrorCode    ierr;
1806   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1807   const MatScalar   *aa=a->a,*v;
1808   PetscReal         diagk;
1809   PetscScalar       *x,xk;
1810   const PetscScalar *b;
1811   PetscInt          nz,k;
1812 
1813   PetscFunctionBegin;
1814   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1815   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1816   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1817   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1818 
1819   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1820   for (k=0; k<mbs; k++) {
1821     v  = aa + ai[k] + 1;
1822     vj = aj + ai[k] + 1;
1823     xk = x[k];
1824     nz = ai[k+1] - ai[k] - 1;
1825     while (nz--) x[*vj++] += (*v++) * xk;
1826 
1827     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1828     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1829     x[k] = xk*PetscSqrtReal(diagk);
1830   }
1831   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1832   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1833   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1834   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1835   PetscFunctionReturn(0);
1836 }
1837 
1838 #undef __FUNCT__
1839 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1"
1840 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1841 {
1842   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1843   IS                isrow=a->row;
1844   PetscErrorCode    ierr;
1845   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1846   const MatScalar   *aa=a->a,*v;
1847   PetscReal         diagk;
1848   PetscScalar       *x,*t;
1849   const PetscScalar *b;
1850   PetscInt          nz,k;
1851 
1852   PetscFunctionBegin;
1853   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1854   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1855   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1856   t    = a->solve_work;
1857   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1858 
1859   for (k=mbs-1; k>=0; k--) {
1860     v     = aa + ai[k];
1861     vj    = aj + ai[k];
1862     diagk = PetscRealPart(aa[adiag[k]]);
1863     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1864     t[k] = b[k] * PetscSqrtReal(diagk);
1865     nz   = ai[k+1] - ai[k] - 1;
1866     while (nz--) t[k] += (*v++) * t[*vj++];
1867     x[rp[k]] = t[k];
1868   }
1869   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1870   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1871   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1872   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1873   PetscFunctionReturn(0);
1874 }
1875 
1876 #undef __FUNCT__
1877 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_inplace"
1878 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1879 {
1880   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1881   IS                isrow=a->row;
1882   PetscErrorCode    ierr;
1883   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1884   const MatScalar   *aa=a->a,*v;
1885   PetscReal         diagk;
1886   PetscScalar       *x,*t;
1887   const PetscScalar *b;
1888   PetscInt          nz,k;
1889 
1890   PetscFunctionBegin;
1891   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1892   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1893   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1894   t    = a->solve_work;
1895   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1896 
1897   for (k=mbs-1; k>=0; k--) {
1898     v     = aa + ai[k] + 1;
1899     vj    = aj + ai[k] + 1;
1900     diagk = PetscRealPart(aa[ai[k]]);
1901     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1902     t[k] = b[k] * PetscSqrtReal(diagk);
1903     nz   = ai[k+1] - ai[k] - 1;
1904     while (nz--) t[k] += (*v++) * t[*vj++];
1905     x[rp[k]] = t[k];
1906   }
1907   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1908   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1909   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1910   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1911   PetscFunctionReturn(0);
1912 }
1913 
1914 #undef __FUNCT__
1915 #define __FUNCT__ "MatSolves_SeqSBAIJ_1"
1916 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1917 {
1918   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1919   PetscErrorCode ierr;
1920 
1921   PetscFunctionBegin;
1922   if (A->rmap->bs == 1) {
1923     ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr);
1924   } else {
1925     IS                isrow=a->row;
1926     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1927     const MatScalar   *aa=a->a,*v;
1928     PetscScalar       *x,*t;
1929     const PetscScalar *b;
1930     PetscInt          nz,k,n,i,j;
1931 
1932     if (bb->n > a->solves_work_n) {
1933       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1934       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
1935       a->solves_work_n = bb->n;
1936     }
1937     n    = bb->n;
1938     ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr);
1939     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1940     t    = a->solves_work;
1941 
1942     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1943 
1944     /* solve U^T*D*y = perm(b) by forward substitution */
1945     for (k=0; k<mbs; k++) {
1946       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1947     }
1948     for (k=0; k<mbs; k++) {
1949       v  = aa + ai[k];
1950       vj = aj + ai[k];
1951       nz = ai[k+1] - ai[k] - 1;
1952       for (j=0; j<nz; j++) {
1953         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1954         v++;vj++;
1955       }
1956       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1957     }
1958 
1959     /* solve U*perm(x) = y by back substitution */
1960     for (k=mbs-1; k>=0; k--) {
1961       v  = aa + ai[k] - 1;
1962       vj = aj + ai[k] - 1;
1963       nz = ai[k+1] - ai[k] - 1;
1964       for (j=0; j<nz; j++) {
1965         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1966         v++;vj++;
1967       }
1968       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1969     }
1970 
1971     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1972     ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr);
1973     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1974     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1975   }
1976   PetscFunctionReturn(0);
1977 }
1978 
1979 #undef __FUNCT__
1980 #define __FUNCT__ "MatSolves_SeqSBAIJ_1_inplace"
1981 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1982 {
1983   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1984   PetscErrorCode ierr;
1985 
1986   PetscFunctionBegin;
1987   if (A->rmap->bs == 1) {
1988     ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr);
1989   } else {
1990     IS                isrow=a->row;
1991     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1992     const MatScalar   *aa=a->a,*v;
1993     PetscScalar       *x,*t;
1994     const PetscScalar *b;
1995     PetscInt          nz,k,n,i;
1996 
1997     if (bb->n > a->solves_work_n) {
1998       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1999       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
2000       a->solves_work_n = bb->n;
2001     }
2002     n    = bb->n;
2003     ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr);
2004     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
2005     t    = a->solves_work;
2006 
2007     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
2008 
2009     /* solve U^T*D*y = perm(b) by forward substitution */
2010     for (k=0; k<mbs; k++) {
2011       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
2012     }
2013     for (k=0; k<mbs; k++) {
2014       v  = aa + ai[k];
2015       vj = aj + ai[k];
2016       nz = ai[k+1] - ai[k];
2017       while (nz--) {
2018         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
2019         v++;vj++;
2020       }
2021       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
2022     }
2023 
2024     /* solve U*perm(x) = y by back substitution */
2025     for (k=mbs-1; k>=0; k--) {
2026       v  = aa + ai[k];
2027       vj = aj + ai[k];
2028       nz = ai[k+1] - ai[k];
2029       while (nz--) {
2030         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
2031         v++;vj++;
2032       }
2033       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
2034     }
2035 
2036     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
2037     ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr);
2038     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
2039     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
2040   }
2041   PetscFunctionReturn(0);
2042 }
2043 
2044 #undef __FUNCT__
2045 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering"
2046 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2047 {
2048   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2049   PetscErrorCode    ierr;
2050   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
2051   const MatScalar   *aa=a->a,*v;
2052   const PetscScalar *b;
2053   PetscScalar       *x,xi;
2054   PetscInt          nz,i,j;
2055 
2056   PetscFunctionBegin;
2057   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2058   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2059 
2060   /* solve U^T*D*y = b by forward substitution */
2061   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2062   for (i=0; i<mbs; i++) {
2063     v  = aa + ai[i];
2064     vj = aj + ai[i];
2065     xi = x[i];
2066     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2067     for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi;
2068     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2069   }
2070 
2071   /* solve U*x = y by backward substitution */
2072   for (i=mbs-2; i>=0; i--) {
2073     xi = x[i];
2074     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2075     vj = aj + adiag[i] - 1;
2076     nz = ai[i+1] - ai[i] - 1;
2077     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2078     x[i] = xi;
2079   }
2080 
2081   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2082   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2083   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2084   PetscFunctionReturn(0);
2085 }
2086 
2087 #undef __FUNCT__
2088 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2089 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2090 {
2091   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2092   PetscErrorCode    ierr;
2093   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2094   const MatScalar   *aa=a->a,*v;
2095   PetscScalar       *x,xk;
2096   const PetscScalar *b;
2097   PetscInt          nz,k;
2098 
2099   PetscFunctionBegin;
2100   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2101   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2102 
2103   /* solve U^T*D*y = b by forward substitution */
2104   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2105   for (k=0; k<mbs; k++) {
2106     v  = aa + ai[k] + 1;
2107     vj = aj + ai[k] + 1;
2108     xk = x[k];
2109     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2110     while (nz--) x[*vj++] += (*v++) * xk;
2111     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2112   }
2113 
2114   /* solve U*x = y by back substitution */
2115   for (k=mbs-2; k>=0; k--) {
2116     v  = aa + ai[k] + 1;
2117     vj = aj + ai[k] + 1;
2118     xk = x[k];
2119     nz = ai[k+1] - ai[k] - 1;
2120     while (nz--) {
2121       xk += (*v++) * x[*vj++];
2122     }
2123     x[k] = xk;
2124   }
2125 
2126   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2127   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2128   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2129   PetscFunctionReturn(0);
2130 }
2131 
2132 #undef __FUNCT__
2133 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering"
2134 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2135 {
2136   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2137   PetscErrorCode    ierr;
2138   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2139   const MatScalar   *aa=a->a,*v;
2140   PetscReal         diagk;
2141   PetscScalar       *x;
2142   const PetscScalar *b;
2143   PetscInt          nz,k;
2144 
2145   PetscFunctionBegin;
2146   /* solve U^T*D^(1/2)*x = b by forward substitution */
2147   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2148   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2149   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2150   for (k=0; k<mbs; k++) {
2151     v  = aa + ai[k];
2152     vj = aj + ai[k];
2153     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2154     while (nz--) x[*vj++] += (*v++) * x[k];
2155     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2156     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2157     x[k] *= PetscSqrtReal(diagk);
2158   }
2159   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2160   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2161   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2162   PetscFunctionReturn(0);
2163 }
2164 
2165 #undef __FUNCT__
2166 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2167 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2168 {
2169   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2170   PetscErrorCode    ierr;
2171   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2172   const MatScalar   *aa=a->a,*v;
2173   PetscReal         diagk;
2174   PetscScalar       *x;
2175   const PetscScalar *b;
2176   PetscInt          nz,k;
2177 
2178   PetscFunctionBegin;
2179   /* solve U^T*D^(1/2)*x = b by forward substitution */
2180   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2181   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2182   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2183   for (k=0; k<mbs; k++) {
2184     v  = aa + ai[k] + 1;
2185     vj = aj + ai[k] + 1;
2186     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2187     while (nz--) x[*vj++] += (*v++) * x[k];
2188     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2189     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2190     x[k] *= PetscSqrtReal(diagk);
2191   }
2192   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2193   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2194   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2195   PetscFunctionReturn(0);
2196 }
2197 
2198 #undef __FUNCT__
2199 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering"
2200 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2201 {
2202   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2203   PetscErrorCode    ierr;
2204   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2205   const MatScalar   *aa=a->a,*v;
2206   PetscReal         diagk;
2207   PetscScalar       *x;
2208   const PetscScalar *b;
2209   PetscInt          nz,k;
2210 
2211   PetscFunctionBegin;
2212   /* solve D^(1/2)*U*x = b by back substitution */
2213   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2214   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2215 
2216   for (k=mbs-1; k>=0; k--) {
2217     v     = aa + ai[k];
2218     vj    = aj + ai[k];
2219     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2220     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2221     x[k] = PetscSqrtReal(diagk)*b[k];
2222     nz   = ai[k+1] - ai[k] - 1;
2223     while (nz--) x[k] += (*v++) * x[*vj++];
2224   }
2225   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2226   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2227   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2228   PetscFunctionReturn(0);
2229 }
2230 
2231 #undef __FUNCT__
2232 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2233 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2234 {
2235   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2236   PetscErrorCode    ierr;
2237   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2238   const MatScalar   *aa=a->a,*v;
2239   PetscReal         diagk;
2240   PetscScalar       *x;
2241   const PetscScalar *b;
2242   PetscInt          nz,k;
2243 
2244   PetscFunctionBegin;
2245   /* solve D^(1/2)*U*x = b by back substitution */
2246   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2247   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2248 
2249   for (k=mbs-1; k>=0; k--) {
2250     v     = aa + ai[k] + 1;
2251     vj    = aj + ai[k] + 1;
2252     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2253     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2254     x[k] = PetscSqrtReal(diagk)*b[k];
2255     nz   = ai[k+1] - ai[k] - 1;
2256     while (nz--) x[k] += (*v++) * x[*vj++];
2257   }
2258   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2259   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2260   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2261   PetscFunctionReturn(0);
2262 }
2263 
2264 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2265 #undef __FUNCT__
2266 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_MSR"
2267 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2268 {
2269   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2270   PetscErrorCode ierr;
2271   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2272   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2273   PetscInt       m,reallocs = 0,*levtmp;
2274   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2275   PetscInt       incrlev,*lev,shift,prow,nz;
2276   PetscReal      f = info->fill,levels = info->levels;
2277   PetscBool      perm_identity;
2278 
2279   PetscFunctionBegin;
2280   /* check whether perm is the identity mapping */
2281   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2282 
2283   if (perm_identity) {
2284     a->permute = PETSC_FALSE;
2285     ai         = a->i; aj = a->j;
2286   } else { /*  non-trivial permutation */
2287     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2288     a->permute = PETSC_TRUE;
2289     ierr       = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr);
2290     ai         = a->inew; aj = a->jnew;
2291   }
2292 
2293   /* initialization */
2294   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2295   umax  = (PetscInt)(f*ai[mbs] + 1);
2296   ierr  = PetscMalloc1(umax,&lev);CHKERRQ(ierr);
2297   umax += mbs + 1;
2298   shift = mbs + 1;
2299   ierr  = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr);
2300   ierr  = PetscMalloc1(umax,&ju);CHKERRQ(ierr);
2301   iu[0] = mbs + 1;
2302   juidx = mbs + 1;
2303   /* prowl: linked list for pivot row */
2304   ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr);
2305   /* q: linked list for col index */
2306 
2307   for (i=0; i<mbs; i++) {
2308     prowl[i] = mbs;
2309     q[i]     = 0;
2310   }
2311 
2312   /* for each row k */
2313   for (k=0; k<mbs; k++) {
2314     nzk  = 0;
2315     q[k] = mbs;
2316     /* copy current row into linked list */
2317     nz = ai[rip[k]+1] - ai[rip[k]];
2318     j  = ai[rip[k]];
2319     while (nz--) {
2320       vj = rip[aj[j++]];
2321       if (vj > k) {
2322         qm = k;
2323         do {
2324           m = qm; qm = q[m];
2325         } while (qm < vj);
2326         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2327         nzk++;
2328         q[m]       = vj;
2329         q[vj]      = qm;
2330         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2331       }
2332     }
2333 
2334     /* modify nonzero structure of k-th row by computing fill-in
2335        for each row prow to be merged in */
2336     prow = k;
2337     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2338 
2339     while (prow < k) {
2340       /* merge row prow into k-th row */
2341       jmin = iu[prow] + 1;
2342       jmax = iu[prow+1];
2343       qm   = k;
2344       for (j=jmin; j<jmax; j++) {
2345         incrlev = lev[j-shift] + 1;
2346         if (incrlev > levels) continue;
2347         vj = ju[j];
2348         do {
2349           m = qm; qm = q[m];
2350         } while (qm < vj);
2351         if (qm != vj) {      /* a fill */
2352           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2353           levtmp[vj] = incrlev;
2354         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2355       }
2356       prow = prowl[prow]; /* next pivot row */
2357     }
2358 
2359     /* add k to row list for first nonzero element in k-th row */
2360     if (nzk > 1) {
2361       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2362       prowl[k] = prowl[i]; prowl[i] = k;
2363     }
2364     iu[k+1] = iu[k] + nzk;
2365 
2366     /* allocate more space to ju and lev if needed */
2367     if (iu[k+1] > umax) {
2368       /* estimate how much additional space we will need */
2369       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2370       /* just double the memory each time */
2371       maxadd = umax;
2372       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2373       umax += maxadd;
2374 
2375       /* allocate a longer ju */
2376       ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2377       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
2378       ierr = PetscFree(ju);CHKERRQ(ierr);
2379       ju   = jutmp;
2380 
2381       ierr      = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2382       ierr      = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr);
2383       ierr      = PetscFree(lev);CHKERRQ(ierr);
2384       lev       = jutmp;
2385       reallocs += 2; /* count how many times we realloc */
2386     }
2387 
2388     /* save nonzero structure of k-th row in ju */
2389     i=k;
2390     while (nzk--) {
2391       i                = q[i];
2392       ju[juidx]        = i;
2393       lev[juidx-shift] = levtmp[i];
2394       juidx++;
2395     }
2396   }
2397 
2398 #if defined(PETSC_USE_INFO)
2399   if (ai[mbs] != 0) {
2400     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2401     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr);
2402     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2403     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr);
2404     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
2405   } else {
2406     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2407   }
2408 #endif
2409 
2410   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2411   ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr);
2412   ierr = PetscFree(lev);CHKERRQ(ierr);
2413 
2414   /* put together the new matrix */
2415   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,NULL);CHKERRQ(ierr);
2416 
2417   /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */
2418   b    = (Mat_SeqSBAIJ*)(B)->data;
2419   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2420 
2421   b->singlemalloc = PETSC_FALSE;
2422   b->free_a       = PETSC_TRUE;
2423   b->free_ij      = PETSC_TRUE;
2424   /* the next line frees the default space generated by the Create() */
2425   ierr    = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
2426   ierr    = PetscMalloc1((iu[mbs]+1)*bs2,&b->a);CHKERRQ(ierr);
2427   b->j    = ju;
2428   b->i    = iu;
2429   b->diag = 0;
2430   b->ilen = 0;
2431   b->imax = 0;
2432 
2433   ierr    = ISDestroy(&b->row);CHKERRQ(ierr);
2434   ierr    = ISDestroy(&b->icol);CHKERRQ(ierr);
2435   b->row  = perm;
2436   b->icol = perm;
2437   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2438   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2439   ierr    = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr);
2440   /* In b structure:  Free imax, ilen, old a, old j.
2441      Allocate idnew, solve_work, new a, new j */
2442   ierr     = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2443   b->maxnz = b->nz = iu[mbs];
2444 
2445   (B)->info.factor_mallocs   = reallocs;
2446   (B)->info.fill_ratio_given = f;
2447   if (ai[mbs] != 0) {
2448     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2449   } else {
2450     (B)->info.fill_ratio_needed = 0.0;
2451   }
2452   ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr);
2453   PetscFunctionReturn(0);
2454 }
2455 
2456 /*
2457   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2458 */
2459 #include <petscbt.h>
2460 #include <../src/mat/utils/freespace.h>
2461 #undef __FUNCT__
2462 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ"
2463 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2464 {
2465   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2466   PetscErrorCode     ierr;
2467   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2468   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2469   const PetscInt     *rip;
2470   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2471   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2472   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2473   PetscReal          fill          =info->fill,levels=info->levels;
2474   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2475   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2476   PetscBT            lnkbt;
2477 
2478   PetscFunctionBegin;
2479   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2480   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2481   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2482   if (bs > 1) {
2483     ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
2484     PetscFunctionReturn(0);
2485   }
2486 
2487   /* check whether perm is the identity mapping */
2488   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2489   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2490   a->permute = PETSC_FALSE;
2491 
2492   ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2493   ierr  = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr);
2494   ui[0] = 0;
2495 
2496   /* ICC(0) without matrix ordering: simply rearrange column indices */
2497   if (!levels) {
2498     /* reuse the column pointers and row offsets for memory savings */
2499     for (i=0; i<am; i++) {
2500       ncols    = ai[i+1] - ai[i];
2501       ui[i+1]  = ui[i] + ncols;
2502       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2503     }
2504     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2505     cols = uj;
2506     for (i=0; i<am; i++) {
2507       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2508       ncols = ai[i+1] - ai[i] -1;
2509       for (j=0; j<ncols; j++) *cols++ = aj[j];
2510       *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2511     }
2512   } else { /* case: levels>0 */
2513     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2514 
2515     /* initialization */
2516     /* jl: linked list for storing indices of the pivot rows
2517        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2518     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2519     for (i=0; i<am; i++) {
2520       jl[i] = am; il[i] = 0;
2521     }
2522 
2523     /* create and initialize a linked list for storing column indices of the active row k */
2524     nlnk = am + 1;
2525     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2526 
2527     /* initial FreeSpace size is fill*(ai[am]+1) */
2528     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2529 
2530     current_space = free_space;
2531 
2532     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2533 
2534     current_space_lvl = free_space_lvl;
2535 
2536     for (k=0; k<am; k++) {  /* for each active row k */
2537       /* initialize lnk by the column indices of row k */
2538       nzk   = 0;
2539       ncols = ai[k+1] - ai[k];
2540       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2541       cols = aj+ai[k];
2542       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2543       nzk += nlnk;
2544 
2545       /* update lnk by computing fill-in for each pivot row to be merged in */
2546       prow = jl[k]; /* 1st pivot row */
2547 
2548       while (prow < k) {
2549         nextprow = jl[prow];
2550 
2551         /* merge prow into k-th row */
2552         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2553         jmax  = ui[prow+1];
2554         ncols = jmax-jmin;
2555         i     = jmin - ui[prow];
2556 
2557         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2558         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2559         j    = *(uj - 1);
2560         ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2561         nzk += nlnk;
2562 
2563         /* update il and jl for prow */
2564         if (jmin < jmax) {
2565           il[prow] = jmin;
2566 
2567           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2568         }
2569         prow = nextprow;
2570       }
2571 
2572       /* if free space is not available, make more free space */
2573       if (current_space->local_remaining<nzk) {
2574         i    = am - k + 1; /* num of unfactored rows */
2575         i   *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2576         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2577         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2578         reallocs++;
2579       }
2580 
2581       /* copy data into free_space and free_space_lvl, then initialize lnk */
2582       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2583       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2584 
2585       /* add the k-th row into il and jl */
2586       if (nzk > 1) {
2587         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2588         jl[k] = jl[i]; jl[i] = k;
2589         il[k] = ui[k] + 1;
2590       }
2591       uj_ptr[k]     = current_space->array;
2592       uj_lvl_ptr[k] = current_space_lvl->array;
2593 
2594       current_space->array               += nzk;
2595       current_space->local_used          += nzk;
2596       current_space->local_remaining     -= nzk;
2597       current_space_lvl->array           += nzk;
2598       current_space_lvl->local_used      += nzk;
2599       current_space_lvl->local_remaining -= nzk;
2600 
2601       ui[k+1] = ui[k] + nzk;
2602     }
2603 
2604     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2605     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2606 
2607     /* destroy list of free space and other temporary array(s) */
2608     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2609     ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor  */
2610     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2611     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2612 
2613   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2614 
2615   /* put together the new matrix in MATSEQSBAIJ format */
2616   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2617 
2618   b    = (Mat_SeqSBAIJ*)(fact)->data;
2619   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2620 
2621   b->singlemalloc = PETSC_FALSE;
2622   b->free_a       = PETSC_TRUE;
2623   b->free_ij      = free_ij;
2624 
2625   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2626 
2627   b->j         = uj;
2628   b->i         = ui;
2629   b->diag      = udiag;
2630   b->free_diag = PETSC_TRUE;
2631   b->ilen      = 0;
2632   b->imax      = 0;
2633   b->row       = perm;
2634   b->col       = perm;
2635 
2636   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2637   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2638 
2639   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2640 
2641   ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2642   ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2643 
2644   b->maxnz = b->nz = ui[am];
2645 
2646   fact->info.factor_mallocs   = reallocs;
2647   fact->info.fill_ratio_given = fill;
2648   if (ai[am] != 0) {
2649     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2650   } else {
2651     fact->info.fill_ratio_needed = 0.0;
2652   }
2653 #if defined(PETSC_USE_INFO)
2654   if (ai[am] != 0) {
2655     PetscReal af = fact->info.fill_ratio_needed;
2656     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2657     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2658     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2659   } else {
2660     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2661   }
2662 #endif
2663   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2664   PetscFunctionReturn(0);
2665 }
2666 
2667 #undef __FUNCT__
2668 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_inplace"
2669 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2670 {
2671   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2672   Mat_SeqSBAIJ       *b;
2673   PetscErrorCode     ierr;
2674   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2675   PetscInt           bs=A->rmap->bs,am=a->mbs;
2676   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2677   PetscInt           reallocs=0,i,*ui;
2678   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2679   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2680   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2681   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2682   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2683   PetscBT            lnkbt;
2684 
2685   PetscFunctionBegin;
2686   /*
2687    This code originally uses Modified Sparse Row (MSR) storage
2688    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2689    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2690    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2691    thus the original code in MSR format is still used for these cases.
2692    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2693    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2694   */
2695   if (bs > 1) {
2696     ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
2697     PetscFunctionReturn(0);
2698   }
2699 
2700   /* check whether perm is the identity mapping */
2701   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2702   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2703   a->permute = PETSC_FALSE;
2704 
2705   /* special case that simply copies fill pattern */
2706   if (!levels) {
2707     /* reuse the column pointers and row offsets for memory savings */
2708     ui           = a->i;
2709     uj           = a->j;
2710     free_ij      = PETSC_FALSE;
2711     ratio_needed = 1.0;
2712   } else { /* case: levels>0 */
2713     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2714 
2715     /* initialization */
2716     ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2717     ui[0] = 0;
2718 
2719     /* jl: linked list for storing indices of the pivot rows
2720        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2721     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2722     for (i=0; i<am; i++) {
2723       jl[i] = am; il[i] = 0;
2724     }
2725 
2726     /* create and initialize a linked list for storing column indices of the active row k */
2727     nlnk = am + 1;
2728     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2729 
2730     /* initial FreeSpace size is fill*(ai[am]+1) */
2731     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2732 
2733     current_space = free_space;
2734 
2735     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2736 
2737     current_space_lvl = free_space_lvl;
2738 
2739     for (k=0; k<am; k++) {  /* for each active row k */
2740       /* initialize lnk by the column indices of row rip[k] */
2741       nzk   = 0;
2742       ncols = ai[rip[k]+1] - ai[rip[k]];
2743       cols  = aj+ai[rip[k]];
2744       ierr  = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2745       nzk  += nlnk;
2746 
2747       /* update lnk by computing fill-in for each pivot row to be merged in */
2748       prow = jl[k]; /* 1st pivot row */
2749 
2750       while (prow < k) {
2751         nextprow = jl[prow];
2752 
2753         /* merge prow into k-th row */
2754         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2755         jmax     = ui[prow+1];
2756         ncols    = jmax-jmin;
2757         i        = jmin - ui[prow];
2758         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2759         j        = *(uj_lvl_ptr[prow] + i - 1);
2760         cols_lvl = uj_lvl_ptr[prow]+i;
2761         ierr     = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2762         nzk     += nlnk;
2763 
2764         /* update il and jl for prow */
2765         if (jmin < jmax) {
2766           il[prow] = jmin;
2767           j        = *cols;
2768           jl[prow] = jl[j];
2769           jl[j]    = prow;
2770         }
2771         prow = nextprow;
2772       }
2773 
2774       /* if free space is not available, make more free space */
2775       if (current_space->local_remaining<nzk) {
2776         i    = am - k + 1; /* num of unfactored rows */
2777         i    = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2778         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2779         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2780         reallocs++;
2781       }
2782 
2783       /* copy data into free_space and free_space_lvl, then initialize lnk */
2784       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2785 
2786       /* add the k-th row into il and jl */
2787       if (nzk-1 > 0) {
2788         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2789         jl[k] = jl[i]; jl[i] = k;
2790         il[k] = ui[k] + 1;
2791       }
2792       uj_ptr[k]     = current_space->array;
2793       uj_lvl_ptr[k] = current_space_lvl->array;
2794 
2795       current_space->array               += nzk;
2796       current_space->local_used          += nzk;
2797       current_space->local_remaining     -= nzk;
2798       current_space_lvl->array           += nzk;
2799       current_space_lvl->local_used      += nzk;
2800       current_space_lvl->local_remaining -= nzk;
2801 
2802       ui[k+1] = ui[k] + nzk;
2803     }
2804 
2805     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2806     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2807 
2808     /* destroy list of free space and other temporary array(s) */
2809     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2810     ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
2811     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2812     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2813     if (ai[am] != 0) {
2814       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2815     } else {
2816       ratio_needed = 0.0;
2817     }
2818   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2819 
2820   /* put together the new matrix in MATSEQSBAIJ format */
2821   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2822 
2823   b = (Mat_SeqSBAIJ*)(fact)->data;
2824 
2825   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2826 
2827   b->singlemalloc = PETSC_FALSE;
2828   b->free_a       = PETSC_TRUE;
2829   b->free_ij      = free_ij;
2830 
2831   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2832 
2833   b->j             = uj;
2834   b->i             = ui;
2835   b->diag          = 0;
2836   b->ilen          = 0;
2837   b->imax          = 0;
2838   b->row           = perm;
2839   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2840 
2841   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2842   b->icol = perm;
2843   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2844   ierr    = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2845 
2846   b->maxnz = b->nz = ui[am];
2847 
2848   fact->info.factor_mallocs    = reallocs;
2849   fact->info.fill_ratio_given  = fill;
2850   fact->info.fill_ratio_needed = ratio_needed;
2851 #if defined(PETSC_USE_INFO)
2852   if (ai[am] != 0) {
2853     PetscReal af = fact->info.fill_ratio_needed;
2854     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2855     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2856     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2857   } else {
2858     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2859   }
2860 #endif
2861   if (perm_identity) {
2862     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2863   } else {
2864     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2865   }
2866   PetscFunctionReturn(0);
2867 }
2868 
2869