xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision 3e7ff0edd3573be01c8c0fa32db97c3db8fa5c8d)
1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <petsc-private/kernels/blockinvert.h>
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_inplace"
12 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
13 {
14   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
15   IS             isrow=a->row;
16   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j;
17   PetscErrorCode ierr;
18   const PetscInt *r;
19   PetscInt       nz,*vj,k,idx,k1;
20   PetscInt       bs =A->rmap->bs,bs2 = a->bs2;
21   MatScalar      *aa=a->a,*v,*diag;
22   PetscScalar    *x,*xk,*xj,*b,*xk_tmp,*t;
23 
24   PetscFunctionBegin;
25   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
26   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27   t    = a->solve_work;
28   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
29   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
30 
31   /* solve U^T * D * y = b by forward substitution */
32   xk = t;
33   for (k=0; k<mbs; k++) { /* t <- perm(b) */
34     idx = bs*r[k];
35     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
36   }
37   for (k=0; k<mbs; k++) {
38     v    = aa + bs2*ai[k];
39     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
40     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
41     nz   = ai[k+1] - ai[k];
42     vj   = aj + ai[k];
43     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
44     while (nz--) {
45       /* x(:) += U(k,:)^T*(Dk*xk) */
46       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
47       vj++; xj = t + (*vj)*bs;
48       v       += bs2;
49     }
50     /* xk = inv(Dk)*(Dk*xk) */
51     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
52     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
53   }
54 
55   /* solve U*x = y by back substitution */
56   for (k=mbs-1; k>=0; k--) {
57     v  = aa + bs2*ai[k];
58     xk = t + k*bs;        /* xk */
59     nz = ai[k+1] - ai[k];
60     vj = aj + ai[k];
61     xj = t + (*vj)*bs;
62     while (nz--) {
63       /* xk += U(k,:)*x(:) */
64       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
65       vj++;
66       v += bs2; xj = t + (*vj)*bs;
67     }
68     idx = bs*r[k];
69     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
70   }
71 
72   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
73   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
74   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
75   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
76   ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr);
77   PetscFunctionReturn(0);
78 }
79 
80 #undef __FUNCT__
81 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_inplace"
82 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
83 {
84   PetscFunctionBegin;
85   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
86   PetscFunctionReturn(0);
87 }
88 
89 #undef __FUNCT__
90 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_inplace"
91 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
92 {
93   PetscFunctionBegin;
94   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
95   PetscFunctionReturn(0);
96 }
97 
98 #undef __FUNCT__
99 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering"
100 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
101 {
102   PetscErrorCode ierr;
103   PetscInt       nz,*vj,k;
104   PetscInt       bs2 = bs*bs;
105   MatScalar      *v,*diag;
106   PetscScalar    *xk,*xj,*xk_tmp;
107 
108   PetscFunctionBegin;
109   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
110   for (k=0; k<mbs; k++) {
111     v    = aa + bs2*ai[k];
112     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
113     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
114     nz   = ai[k+1] - ai[k];
115     vj   = aj + ai[k];
116     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
117     while (nz--) {
118       /* x(:) += U(k,:)^T*(Dk*xk) */
119       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
120       vj++; xj = x + (*vj)*bs;
121       v       += bs2;
122     }
123     /* xk = inv(Dk)*(Dk*xk) */
124     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
125     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
126   }
127   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
128   PetscFunctionReturn(0);
129 }
130 
131 #undef __FUNCT__
132 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering"
133 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
134 {
135   PetscInt    nz,*vj,k;
136   PetscInt    bs2 = bs*bs;
137   MatScalar   *v;
138   PetscScalar *xk,*xj;
139 
140   PetscFunctionBegin;
141   for (k=mbs-1; k>=0; k--) {
142     v  = aa + bs2*ai[k];
143     xk = x + k*bs;        /* xk */
144     nz = ai[k+1] - ai[k];
145     vj = aj + ai[k];
146     xj = x + (*vj)*bs;
147     while (nz--) {
148       /* xk += U(k,:)*x(:) */
149       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
150       vj++;
151       v += bs2; xj = x + (*vj)*bs;
152     }
153   }
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNCT__
158 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
159 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
160 {
161   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
162   PetscErrorCode ierr;
163   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
164   PetscInt       bs =A->rmap->bs;
165   MatScalar      *aa=a->a;
166   PetscScalar    *x,*b;
167 
168   PetscFunctionBegin;
169   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
170   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
171 
172   /* solve U^T * D * y = b by forward substitution */
173   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
174   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
175 
176   /* solve U*x = y by back substitution */
177   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
178 
179   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
180   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
181   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
182   PetscFunctionReturn(0);
183 }
184 
185 #undef __FUNCT__
186 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
187 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
188 {
189   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
190   PetscErrorCode ierr;
191   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
192   PetscInt       bs =A->rmap->bs;
193   MatScalar      *aa=a->a;
194   PetscScalar    *x,*b;
195 
196   PetscFunctionBegin;
197   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
198   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
199   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
200   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
201   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
202   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
203   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
204   PetscFunctionReturn(0);
205 }
206 
207 #undef __FUNCT__
208 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
209 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
210 {
211   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
212   PetscErrorCode ierr;
213   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
214   PetscInt       bs =A->rmap->bs;
215   MatScalar      *aa=a->a;
216   PetscScalar    *x,*b;
217 
218   PetscFunctionBegin;
219   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
220   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
221   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
222   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
223   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
224   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
225   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
226   PetscFunctionReturn(0);
227 }
228 
229 #undef __FUNCT__
230 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_inplace"
231 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
232 {
233   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
234   IS             isrow=a->row;
235   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
236   PetscErrorCode ierr;
237   const PetscInt *r;
238   PetscInt       nz,*vj,k,idx;
239   MatScalar      *aa=a->a,*v,*d;
240   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
241 
242   PetscFunctionBegin;
243   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
244   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
245   t    = a->solve_work;
246   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
247 
248   /* solve U^T * D * y = b by forward substitution */
249   tp = t;
250   for (k=0; k<mbs; k++) { /* t <- perm(b) */
251     idx   = 7*r[k];
252     tp[0] = b[idx];
253     tp[1] = b[idx+1];
254     tp[2] = b[idx+2];
255     tp[3] = b[idx+3];
256     tp[4] = b[idx+4];
257     tp[5] = b[idx+5];
258     tp[6] = b[idx+6];
259     tp   += 7;
260   }
261 
262   for (k=0; k<mbs; k++) {
263     v  = aa + 49*ai[k];
264     vj = aj + ai[k];
265     tp = t + k*7;
266     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
267     nz = ai[k+1] - ai[k];
268     tp = t + (*vj)*7;
269     while (nz--) {
270       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
271       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
272       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
273       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
274       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
275       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
276       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
277       vj++;
278       tp = t + (*vj)*7;
279       v += 49;
280     }
281 
282     /* xk = inv(Dk)*(Dk*xk) */
283     d     = aa+k*49;       /* ptr to inv(Dk) */
284     tp    = t + k*7;
285     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
286     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
287     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
288     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
289     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
290     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
291     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
292   }
293 
294   /* solve U*x = y by back substitution */
295   for (k=mbs-1; k>=0; k--) {
296     v  = aa + 49*ai[k];
297     vj = aj + ai[k];
298     tp = t + k*7;
299     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
300     nz = ai[k+1] - ai[k];
301 
302     tp = t + (*vj)*7;
303     while (nz--) {
304       /* xk += U(k,:)*x(:) */
305       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
306       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
307       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
308       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
309       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
310       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
311       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
312       vj++;
313       tp = t + (*vj)*7;
314       v += 49;
315     }
316     tp       = t + k*7;
317     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
318     idx      = 7*r[k];
319     x[idx]   = x0;
320     x[idx+1] = x1;
321     x[idx+2] = x2;
322     x[idx+3] = x3;
323     x[idx+4] = x4;
324     x[idx+5] = x5;
325     x[idx+6] = x6;
326   }
327 
328   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
329   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
330   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
331   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
332   PetscFunctionReturn(0);
333 }
334 
335 #undef __FUNCT__
336 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering"
337 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
338 {
339   MatScalar   *v,*d;
340   PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
341   PetscInt    nz,*vj,k;
342 
343   PetscFunctionBegin;
344   for (k=0; k<mbs; k++) {
345     v  = aa + 49*ai[k];
346     xp = x + k*7;
347     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
348     nz = ai[k+1] - ai[k];
349     vj = aj + ai[k];
350     xp = x + (*vj)*7;
351     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
352     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
353     while (nz--) {
354       /* x(:) += U(k,:)^T*(Dk*xk) */
355       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
356       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
357       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
358       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
359       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
360       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
361       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
362       vj++;
363       xp = x + (*vj)*7;
364       v += 49;
365     }
366     /* xk = inv(Dk)*(Dk*xk) */
367     d     = aa+k*49;       /* ptr to inv(Dk) */
368     xp    = x + k*7;
369     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
370     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
371     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
372     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
373     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
374     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
375     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
376   }
377   PetscFunctionReturn(0);
378 }
379 
380 #undef __FUNCT__
381 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering"
382 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
383 {
384   MatScalar   *v;
385   PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
386   PetscInt    nz,*vj,k;
387 
388   PetscFunctionBegin;
389   for (k=mbs-1; k>=0; k--) {
390     v  = aa + 49*ai[k];
391     xp = x + k*7;
392     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
393     nz = ai[k+1] - ai[k];
394     vj = aj + ai[k];
395     xp = x + (*vj)*7;
396     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
397     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
398     while (nz--) {
399       /* xk += U(k,:)*x(:) */
400       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
401       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
402       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
403       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
404       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
405       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
406       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
407       vj++;
408       v += 49; xp = x + (*vj)*7;
409     }
410     xp   = x + k*7;
411     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
412   }
413   PetscFunctionReturn(0);
414 }
415 
416 #undef __FUNCT__
417 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
418 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
419 {
420   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
421   PetscErrorCode ierr;
422   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
423   MatScalar      *aa=a->a;
424   PetscScalar    *x,*b;
425 
426   PetscFunctionBegin;
427   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
428   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
429 
430   /* solve U^T * D * y = b by forward substitution */
431   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
432   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
433 
434   /* solve U*x = y by back substitution */
435   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
436 
437   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
438   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
439   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
440   PetscFunctionReturn(0);
441 }
442 
443 #undef __FUNCT__
444 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
445 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
446 {
447   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
448   PetscErrorCode ierr;
449   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
450   MatScalar      *aa=a->a;
451   PetscScalar    *x,*b;
452 
453   PetscFunctionBegin;
454   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
455   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
456   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
457   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
458   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
459   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
460   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
461   PetscFunctionReturn(0);
462 }
463 
464 #undef __FUNCT__
465 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
466 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
467 {
468   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
469   PetscErrorCode ierr;
470   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
471   MatScalar      *aa=a->a;
472   PetscScalar    *x,*b;
473 
474   PetscFunctionBegin;
475   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
476   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
477   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
478   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
479   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
480   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
481   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
482   PetscFunctionReturn(0);
483 }
484 
485 #undef __FUNCT__
486 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_inplace"
487 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
488 {
489   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
490   IS             isrow=a->row;
491   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
492   PetscErrorCode ierr;
493   const PetscInt *r;
494   PetscInt       nz,*vj,k,idx;
495   MatScalar      *aa=a->a,*v,*d;
496   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
497 
498   PetscFunctionBegin;
499   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
500   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
501   t    = a->solve_work;
502   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
503 
504   /* solve U^T * D * y = b by forward substitution */
505   tp = t;
506   for (k=0; k<mbs; k++) { /* t <- perm(b) */
507     idx   = 6*r[k];
508     tp[0] = b[idx];
509     tp[1] = b[idx+1];
510     tp[2] = b[idx+2];
511     tp[3] = b[idx+3];
512     tp[4] = b[idx+4];
513     tp[5] = b[idx+5];
514     tp   += 6;
515   }
516 
517   for (k=0; k<mbs; k++) {
518     v  = aa + 36*ai[k];
519     vj = aj + ai[k];
520     tp = t + k*6;
521     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
522     nz = ai[k+1] - ai[k];
523     tp = t + (*vj)*6;
524     while (nz--) {
525       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
526       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
527       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
528       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
529       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
530       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
531       vj++;
532       tp = t + (*vj)*6;
533       v += 36;
534     }
535 
536     /* xk = inv(Dk)*(Dk*xk) */
537     d     = aa+k*36;       /* ptr to inv(Dk) */
538     tp    = t + k*6;
539     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
540     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
541     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
542     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
543     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
544     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
545   }
546 
547   /* solve U*x = y by back substitution */
548   for (k=mbs-1; k>=0; k--) {
549     v  = aa + 36*ai[k];
550     vj = aj + ai[k];
551     tp = t + k*6;
552     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
553     nz = ai[k+1] - ai[k];
554 
555     tp = t + (*vj)*6;
556     while (nz--) {
557       /* xk += U(k,:)*x(:) */
558       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
559       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
560       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
561       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
562       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
563       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
564       vj++;
565       tp = t + (*vj)*6;
566       v += 36;
567     }
568     tp       = t + k*6;
569     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
570     idx      = 6*r[k];
571     x[idx]   = x0;
572     x[idx+1] = x1;
573     x[idx+2] = x2;
574     x[idx+3] = x3;
575     x[idx+4] = x4;
576     x[idx+5] = x5;
577   }
578 
579   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
580   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
581   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
582   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
583   PetscFunctionReturn(0);
584 }
585 
586 #undef __FUNCT__
587 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering"
588 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
589 {
590   MatScalar   *v,*d;
591   PetscScalar *xp,x0,x1,x2,x3,x4,x5;
592   PetscInt    nz,*vj,k;
593 
594   PetscFunctionBegin;
595   for (k=0; k<mbs; k++) {
596     v  = aa + 36*ai[k];
597     xp = x + k*6;
598     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
599     nz = ai[k+1] - ai[k];
600     vj = aj + ai[k];
601     xp = x + (*vj)*6;
602     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
603     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
604     while (nz--) {
605       /* x(:) += U(k,:)^T*(Dk*xk) */
606       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
607       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
608       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
609       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
610       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
611       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
612       vj++;
613       xp = x + (*vj)*6;
614       v += 36;
615     }
616     /* xk = inv(Dk)*(Dk*xk) */
617     d     = aa+k*36;       /* ptr to inv(Dk) */
618     xp    = x + k*6;
619     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
620     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
621     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
622     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
623     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
624     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
625   }
626   PetscFunctionReturn(0);
627 }
628 #undef __FUNCT__
629 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering"
630 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
631 {
632   MatScalar   *v;
633   PetscScalar *xp,x0,x1,x2,x3,x4,x5;
634   PetscInt    nz,*vj,k;
635 
636   PetscFunctionBegin;
637   for (k=mbs-1; k>=0; k--) {
638     v  = aa + 36*ai[k];
639     xp = x + k*6;
640     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
641     nz = ai[k+1] - ai[k];
642     vj = aj + ai[k];
643     xp = x + (*vj)*6;
644     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
645     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
646     while (nz--) {
647       /* xk += U(k,:)*x(:) */
648       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
649       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
650       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
651       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
652       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
653       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
654       vj++;
655       v += 36; xp = x + (*vj)*6;
656     }
657     xp   = x + k*6;
658     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
659   }
660   PetscFunctionReturn(0);
661 }
662 
663 
664 #undef __FUNCT__
665 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
666 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
667 {
668   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
669   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
670   MatScalar      *aa=a->a;
671   PetscScalar    *x,*b;
672   PetscErrorCode ierr;
673 
674   PetscFunctionBegin;
675   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
676   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
677 
678   /* solve U^T * D * y = b by forward substitution */
679   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
680   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
681 
682   /* solve U*x = y by back substitution */
683   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
684 
685   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
686   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
687   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
688   PetscFunctionReturn(0);
689 }
690 
691 #undef __FUNCT__
692 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
693 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
694 {
695   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
696   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
697   MatScalar      *aa=a->a;
698   PetscScalar    *x,*b;
699   PetscErrorCode ierr;
700 
701   PetscFunctionBegin;
702   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
703   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
704   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
705   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
706   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
707   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
708   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
712 #undef __FUNCT__
713 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
714 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
715 {
716   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
717   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
718   MatScalar      *aa=a->a;
719   PetscScalar    *x,*b;
720   PetscErrorCode ierr;
721 
722   PetscFunctionBegin;
723   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
724   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
725   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
726   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
727   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
728   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
729   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
730   PetscFunctionReturn(0);
731 }
732 
733 #undef __FUNCT__
734 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_inplace"
735 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
736 {
737   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
738   IS             isrow=a->row;
739   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
740   PetscErrorCode ierr;
741   const PetscInt *r;
742   PetscInt       nz,*vj,k,idx;
743   MatScalar      *aa=a->a,*v,*diag;
744   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;
745 
746   PetscFunctionBegin;
747   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
748   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
749   t    = a->solve_work;
750   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
751 
752   /* solve U^T * D * y = b by forward substitution */
753   tp = t;
754   for (k=0; k<mbs; k++) { /* t <- perm(b) */
755     idx   = 5*r[k];
756     tp[0] = b[idx];
757     tp[1] = b[idx+1];
758     tp[2] = b[idx+2];
759     tp[3] = b[idx+3];
760     tp[4] = b[idx+4];
761     tp   += 5;
762   }
763 
764   for (k=0; k<mbs; k++) {
765     v  = aa + 25*ai[k];
766     vj = aj + ai[k];
767     tp = t + k*5;
768     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
769     nz = ai[k+1] - ai[k];
770 
771     tp = t + (*vj)*5;
772     while (nz--) {
773       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
774       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
775       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
776       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
777       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
778       vj++;
779       tp = t + (*vj)*5;
780       v += 25;
781     }
782 
783     /* xk = inv(Dk)*(Dk*xk) */
784     diag  = aa+k*25;          /* ptr to inv(Dk) */
785     tp    = t + k*5;
786     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
787     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
788     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
789     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
790     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
791   }
792 
793   /* solve U*x = y by back substitution */
794   for (k=mbs-1; k>=0; k--) {
795     v  = aa + 25*ai[k];
796     vj = aj + ai[k];
797     tp = t + k*5;
798     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
799     nz = ai[k+1] - ai[k];
800 
801     tp = t + (*vj)*5;
802     while (nz--) {
803       /* xk += U(k,:)*x(:) */
804       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
805       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
806       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
807       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
808       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
809       vj++;
810       tp = t + (*vj)*5;
811       v += 25;
812     }
813     tp       = t + k*5;
814     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
815     idx      = 5*r[k];
816     x[idx]   = x0;
817     x[idx+1] = x1;
818     x[idx+2] = x2;
819     x[idx+3] = x3;
820     x[idx+4] = x4;
821   }
822 
823   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
824   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
825   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
826   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
827   PetscFunctionReturn(0);
828 }
829 
830 #undef __FUNCT__
831 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering"
832 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
833 {
834   MatScalar   *v,*diag;
835   PetscScalar *xp,x0,x1,x2,x3,x4;
836   PetscInt    nz,*vj,k;
837 
838   PetscFunctionBegin;
839   for (k=0; k<mbs; k++) {
840     v  = aa + 25*ai[k];
841     xp = x + k*5;
842     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
843     nz = ai[k+1] - ai[k];
844     vj = aj + ai[k];
845     xp = x + (*vj)*5;
846     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
847     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
848     while (nz--) {
849       /* x(:) += U(k,:)^T*(Dk*xk) */
850       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
851       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
852       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
853       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
854       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
855       vj++;
856       xp = x + (*vj)*5;
857       v += 25;
858     }
859     /* xk = inv(Dk)*(Dk*xk) */
860     diag  = aa+k*25;         /* ptr to inv(Dk) */
861     xp    = x + k*5;
862     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
863     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
864     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
865     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
866     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
867   }
868   PetscFunctionReturn(0);
869 }
870 
871 #undef __FUNCT__
872 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering"
873 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
874 {
875   MatScalar   *v;
876   PetscScalar *xp,x0,x1,x2,x3,x4;
877   PetscInt    nz,*vj,k;
878 
879   PetscFunctionBegin;
880   for (k=mbs-1; k>=0; k--) {
881     v  = aa + 25*ai[k];
882     xp = x + k*5;
883     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
884     nz = ai[k+1] - ai[k];
885     vj = aj + ai[k];
886     xp = x + (*vj)*5;
887     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
888     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
889     while (nz--) {
890       /* xk += U(k,:)*x(:) */
891       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
892       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
893       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
894       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
895       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
896       vj++;
897       v += 25; xp = x + (*vj)*5;
898     }
899     xp   = x + k*5;
900     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
901   }
902   PetscFunctionReturn(0);
903 }
904 
905 #undef __FUNCT__
906 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
907 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
908 {
909   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
910   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
911   MatScalar      *aa=a->a;
912   PetscScalar    *x,*b;
913   PetscErrorCode ierr;
914 
915   PetscFunctionBegin;
916   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
917   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
918 
919   /* solve U^T * D * y = b by forward substitution */
920   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
921   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
922 
923   /* solve U*x = y by back substitution */
924   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
925 
926   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
927   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
928   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
929   PetscFunctionReturn(0);
930 }
931 
932 #undef __FUNCT__
933 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
934 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
935 {
936   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
937   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
938   MatScalar      *aa=a->a;
939   PetscScalar    *x,*b;
940   PetscErrorCode ierr;
941 
942   PetscFunctionBegin;
943   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
944   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
945   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
946   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
947   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
948   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
949   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
950   PetscFunctionReturn(0);
951 }
952 
953 #undef __FUNCT__
954 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
955 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
956 {
957   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
958   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
959   MatScalar      *aa=a->a;
960   PetscScalar    *x,*b;
961   PetscErrorCode ierr;
962 
963   PetscFunctionBegin;
964   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
965   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
966   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
967   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
968   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
969   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
970   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
971   PetscFunctionReturn(0);
972 }
973 
974 #undef __FUNCT__
975 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_inplace"
976 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
977 {
978   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
979   IS             isrow=a->row;
980   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
981   PetscErrorCode ierr;
982   const PetscInt *r;
983   PetscInt       nz,*vj,k,idx;
984   MatScalar      *aa=a->a,*v,*diag;
985   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;
986 
987   PetscFunctionBegin;
988   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
989   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
990   t    = a->solve_work;
991   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
992 
993   /* solve U^T * D * y = b by forward substitution */
994   tp = t;
995   for (k=0; k<mbs; k++) { /* t <- perm(b) */
996     idx   = 4*r[k];
997     tp[0] = b[idx];
998     tp[1] = b[idx+1];
999     tp[2] = b[idx+2];
1000     tp[3] = b[idx+3];
1001     tp   += 4;
1002   }
1003 
1004   for (k=0; k<mbs; k++) {
1005     v  = aa + 16*ai[k];
1006     vj = aj + ai[k];
1007     tp = t + k*4;
1008     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1009     nz = ai[k+1] - ai[k];
1010 
1011     tp = t + (*vj)*4;
1012     while (nz--) {
1013       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1014       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1015       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1016       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1017       vj++;
1018       tp = t + (*vj)*4;
1019       v += 16;
1020     }
1021 
1022     /* xk = inv(Dk)*(Dk*xk) */
1023     diag  = aa+k*16;          /* ptr to inv(Dk) */
1024     tp    = t + k*4;
1025     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1026     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1027     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1028     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1029   }
1030 
1031   /* solve U*x = y by back substitution */
1032   for (k=mbs-1; k>=0; k--) {
1033     v  = aa + 16*ai[k];
1034     vj = aj + ai[k];
1035     tp = t + k*4;
1036     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1037     nz = ai[k+1] - ai[k];
1038 
1039     tp = t + (*vj)*4;
1040     while (nz--) {
1041       /* xk += U(k,:)*x(:) */
1042       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1043       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1044       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1045       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1046       vj++;
1047       tp = t + (*vj)*4;
1048       v += 16;
1049     }
1050     tp       = t + k*4;
1051     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1052     idx      = 4*r[k];
1053     x[idx]   = x0;
1054     x[idx+1] = x1;
1055     x[idx+2] = x2;
1056     x[idx+3] = x3;
1057   }
1058 
1059   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1060   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1061   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1062   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1063   PetscFunctionReturn(0);
1064 }
1065 
1066 #undef __FUNCT__
1067 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering"
1068 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1069 {
1070   MatScalar   *v,*diag;
1071   PetscScalar *xp,x0,x1,x2,x3;
1072   PetscInt    nz,*vj,k;
1073 
1074   PetscFunctionBegin;
1075   for (k=0; k<mbs; k++) {
1076     v  = aa + 16*ai[k];
1077     xp = x + k*4;
1078     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1079     nz = ai[k+1] - ai[k];
1080     vj = aj + ai[k];
1081     xp = x + (*vj)*4;
1082     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1083     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1084     while (nz--) {
1085       /* x(:) += U(k,:)^T*(Dk*xk) */
1086       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1087       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1088       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1089       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1090       vj++;
1091       xp = x + (*vj)*4;
1092       v += 16;
1093     }
1094     /* xk = inv(Dk)*(Dk*xk) */
1095     diag  = aa+k*16;         /* ptr to inv(Dk) */
1096     xp    = x + k*4;
1097     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1098     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1099     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1100     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1101   }
1102   PetscFunctionReturn(0);
1103 }
1104 
1105 #undef __FUNCT__
1106 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering"
1107 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1108 {
1109   MatScalar   *v;
1110   PetscScalar *xp,x0,x1,x2,x3;
1111   PetscInt    nz,*vj,k;
1112 
1113   PetscFunctionBegin;
1114   for (k=mbs-1; k>=0; k--) {
1115     v  = aa + 16*ai[k];
1116     xp = x + k*4;
1117     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1118     nz = ai[k+1] - ai[k];
1119     vj = aj + ai[k];
1120     xp = x + (*vj)*4;
1121     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1122     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1123     while (nz--) {
1124       /* xk += U(k,:)*x(:) */
1125       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1126       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1127       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1128       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1129       vj++;
1130       v += 16; xp = x + (*vj)*4;
1131     }
1132     xp    = x + k*4;
1133     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1134   }
1135   PetscFunctionReturn(0);
1136 }
1137 
1138 #undef __FUNCT__
1139 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1140 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1141 {
1142   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1143   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1144   MatScalar      *aa=a->a;
1145   PetscScalar    *x,*b;
1146   PetscErrorCode ierr;
1147 
1148   PetscFunctionBegin;
1149   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1150   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1151 
1152   /* solve U^T * D * y = b by forward substitution */
1153   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1154   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1155 
1156   /* solve U*x = y by back substitution */
1157   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1158   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1159   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1160   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1161   PetscFunctionReturn(0);
1162 }
1163 
1164 #undef __FUNCT__
1165 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1166 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1167 {
1168   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1169   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1170   MatScalar      *aa=a->a;
1171   PetscScalar    *x,*b;
1172   PetscErrorCode ierr;
1173 
1174   PetscFunctionBegin;
1175   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1176   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1178   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1179   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1180   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1181   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1182   PetscFunctionReturn(0);
1183 }
1184 
1185 #undef __FUNCT__
1186 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1187 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1188 {
1189   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1190   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1191   MatScalar      *aa=a->a;
1192   PetscScalar    *x,*b;
1193   PetscErrorCode ierr;
1194 
1195   PetscFunctionBegin;
1196   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1197   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1198   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1199   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1200   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1201   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1202   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1203   PetscFunctionReturn(0);
1204 }
1205 
1206 #undef __FUNCT__
1207 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_inplace"
1208 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1209 {
1210   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
1211   IS             isrow=a->row;
1212   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1213   PetscErrorCode ierr;
1214   const PetscInt *r;
1215   PetscInt       nz,*vj,k,idx;
1216   MatScalar      *aa=a->a,*v,*diag;
1217   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;
1218 
1219   PetscFunctionBegin;
1220   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1221   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1222   t    = a->solve_work;
1223   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1224 
1225   /* solve U^T * D * y = b by forward substitution */
1226   tp = t;
1227   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1228     idx   = 3*r[k];
1229     tp[0] = b[idx];
1230     tp[1] = b[idx+1];
1231     tp[2] = b[idx+2];
1232     tp   += 3;
1233   }
1234 
1235   for (k=0; k<mbs; k++) {
1236     v  = aa + 9*ai[k];
1237     vj = aj + ai[k];
1238     tp = t + k*3;
1239     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1240     nz = ai[k+1] - ai[k];
1241 
1242     tp = t + (*vj)*3;
1243     while (nz--) {
1244       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1245       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1246       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1247       vj++;
1248       tp = t + (*vj)*3;
1249       v += 9;
1250     }
1251 
1252     /* xk = inv(Dk)*(Dk*xk) */
1253     diag  = aa+k*9;          /* ptr to inv(Dk) */
1254     tp    = t + k*3;
1255     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1256     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1257     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1258   }
1259 
1260   /* solve U*x = y by back substitution */
1261   for (k=mbs-1; k>=0; k--) {
1262     v  = aa + 9*ai[k];
1263     vj = aj + ai[k];
1264     tp = t + k*3;
1265     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1266     nz = ai[k+1] - ai[k];
1267 
1268     tp = t + (*vj)*3;
1269     while (nz--) {
1270       /* xk += U(k,:)*x(:) */
1271       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1272       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1273       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1274       vj++;
1275       tp = t + (*vj)*3;
1276       v += 9;
1277     }
1278     tp       = t + k*3;
1279     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1280     idx      = 3*r[k];
1281     x[idx]   = x0;
1282     x[idx+1] = x1;
1283     x[idx+2] = x2;
1284   }
1285 
1286   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1287   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1288   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1289   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1290   PetscFunctionReturn(0);
1291 }
1292 
1293 #undef __FUNCT__
1294 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering"
1295 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1296 {
1297   MatScalar   *v,*diag;
1298   PetscScalar *xp,x0,x1,x2;
1299   PetscInt    nz,*vj,k;
1300 
1301   PetscFunctionBegin;
1302   for (k=0; k<mbs; k++) {
1303     v  = aa + 9*ai[k];
1304     xp = x + k*3;
1305     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1306     nz = ai[k+1] - ai[k];
1307     vj = aj + ai[k];
1308     xp = x + (*vj)*3;
1309     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1310     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1311     while (nz--) {
1312       /* x(:) += U(k,:)^T*(Dk*xk) */
1313       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1314       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1315       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1316       vj++;
1317       xp = x + (*vj)*3;
1318       v += 9;
1319     }
1320     /* xk = inv(Dk)*(Dk*xk) */
1321     diag  = aa+k*9;         /* ptr to inv(Dk) */
1322     xp    = x + k*3;
1323     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1324     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1325     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1326   }
1327   PetscFunctionReturn(0);
1328 }
1329 
1330 #undef __FUNCT__
1331 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering"
1332 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1333 {
1334   MatScalar   *v;
1335   PetscScalar *xp,x0,x1,x2;
1336   PetscInt    nz,*vj,k;
1337 
1338   PetscFunctionBegin;
1339   for (k=mbs-1; k>=0; k--) {
1340     v  = aa + 9*ai[k];
1341     xp = x + k*3;
1342     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1343     nz = ai[k+1] - ai[k];
1344     vj = aj + ai[k];
1345     xp = x + (*vj)*3;
1346     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1347     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1348     while (nz--) {
1349       /* xk += U(k,:)*x(:) */
1350       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1351       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1352       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1353       vj++;
1354       v += 9; xp = x + (*vj)*3;
1355     }
1356     xp    = x + k*3;
1357     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1358   }
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 #undef __FUNCT__
1363 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1364 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1365 {
1366   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1367   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1368   MatScalar      *aa=a->a;
1369   PetscScalar    *x,*b;
1370   PetscErrorCode ierr;
1371 
1372   PetscFunctionBegin;
1373   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1374   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1375 
1376   /* solve U^T * D * y = b by forward substitution */
1377   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1378   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1379 
1380   /* solve U*x = y by back substitution */
1381   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1382 
1383   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1384   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1385   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1386   PetscFunctionReturn(0);
1387 }
1388 
1389 #undef __FUNCT__
1390 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1391 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1392 {
1393   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1394   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1395   MatScalar      *aa=a->a;
1396   PetscScalar    *x,*b;
1397   PetscErrorCode ierr;
1398 
1399   PetscFunctionBegin;
1400   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1401   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1402   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1403   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1404   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1405   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1406   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1407   PetscFunctionReturn(0);
1408 }
1409 
1410 #undef __FUNCT__
1411 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1412 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1413 {
1414   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1415   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1416   MatScalar      *aa=a->a;
1417   PetscScalar    *x,*b;
1418   PetscErrorCode ierr;
1419 
1420   PetscFunctionBegin;
1421   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1422   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1423   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1424   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1425   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1426   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1427   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1428   PetscFunctionReturn(0);
1429 }
1430 
1431 #undef __FUNCT__
1432 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_inplace"
1433 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1434 {
1435   Mat_SeqSBAIJ   *a   =(Mat_SeqSBAIJ*)A->data;
1436   IS             isrow=a->row;
1437   PetscInt       mbs  =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1438   PetscErrorCode ierr;
1439   const PetscInt *r;
1440   PetscInt       nz,*vj,k,k2,idx;
1441   MatScalar      *aa=a->a,*v,*diag;
1442   PetscScalar    *x,*b,x0,x1,*t;
1443 
1444   PetscFunctionBegin;
1445   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1446   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1447   t    = a->solve_work;
1448   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1449 
1450   /* solve U^T * D * y = perm(b) by forward substitution */
1451   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1452     idx      = 2*r[k];
1453     t[k*2]   = b[idx];
1454     t[k*2+1] = b[idx+1];
1455   }
1456   for (k=0; k<mbs; k++) {
1457     v  = aa + 4*ai[k];
1458     vj = aj + ai[k];
1459     k2 = k*2;
1460     x0 = t[k2]; x1 = t[k2+1];
1461     nz = ai[k+1] - ai[k];
1462     while (nz--) {
1463       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1464       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1465       vj++; v      += 4;
1466     }
1467     diag    = aa+k*4; /* ptr to inv(Dk) */
1468     t[k2]   = diag[0]*x0 + diag[2]*x1;
1469     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1470   }
1471 
1472   /* solve U*x = y by back substitution */
1473   for (k=mbs-1; k>=0; k--) {
1474     v  = aa + 4*ai[k];
1475     vj = aj + ai[k];
1476     k2 = k*2;
1477     x0 = t[k2]; x1 = t[k2+1];
1478     nz = ai[k+1] - ai[k];
1479     while (nz--) {
1480       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1481       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1482       vj++;
1483       v += 4;
1484     }
1485     t[k2]    = x0;
1486     t[k2+1]  = x1;
1487     idx      = 2*r[k];
1488     x[idx]   = x0;
1489     x[idx+1] = x1;
1490   }
1491 
1492   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1493   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1494   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1495   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1496   PetscFunctionReturn(0);
1497 }
1498 
1499 #undef __FUNCT__
1500 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering"
1501 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1502 {
1503   MatScalar   *v,*diag;
1504   PetscScalar x0,x1;
1505   PetscInt    nz,*vj,k,k2;
1506 
1507   PetscFunctionBegin;
1508   for (k=0; k<mbs; k++) {
1509     v  = aa + 4*ai[k];
1510     vj = aj + ai[k];
1511     k2 = k*2;
1512     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1513     nz = ai[k+1] - ai[k];
1514     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1515     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1516     while (nz--) {
1517       /* x(:) += U(k,:)^T*(Dk*xk) */
1518       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1519       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1520       vj++; v      += 4;
1521     }
1522     /* xk = inv(Dk)*(Dk*xk) */
1523     diag    = aa+k*4;       /* ptr to inv(Dk) */
1524     x[k2]   = diag[0]*x0 + diag[2]*x1;
1525     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1526   }
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 #undef __FUNCT__
1531 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering"
1532 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1533 {
1534   MatScalar   *v;
1535   PetscScalar x0,x1;
1536   PetscInt    nz,*vj,k,k2;
1537 
1538   PetscFunctionBegin;
1539   for (k=mbs-1; k>=0; k--) {
1540     v  = aa + 4*ai[k];
1541     vj = aj + ai[k];
1542     k2 = k*2;
1543     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1544     nz = ai[k+1] - ai[k];
1545     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1546     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1547     while (nz--) {
1548       /* xk += U(k,:)*x(:) */
1549       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1550       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1551       vj++;
1552       v += 4;
1553     }
1554     x[k2]   = x0;
1555     x[k2+1] = x1;
1556   }
1557   PetscFunctionReturn(0);
1558 }
1559 
1560 #undef __FUNCT__
1561 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1562 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1563 {
1564   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1565   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1566   MatScalar      *aa=a->a;
1567   PetscScalar    *x,*b;
1568   PetscErrorCode ierr;
1569 
1570   PetscFunctionBegin;
1571   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1572   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1573 
1574   /* solve U^T * D * y = b by forward substitution */
1575   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1576   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1577 
1578   /* solve U*x = y by back substitution */
1579   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1580 
1581   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1582   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1583   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1584   PetscFunctionReturn(0);
1585 }
1586 
1587 #undef __FUNCT__
1588 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1589 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1590 {
1591   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1592   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1593   MatScalar      *aa=a->a;
1594   PetscScalar    *x,*b;
1595   PetscErrorCode ierr;
1596 
1597   PetscFunctionBegin;
1598   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1600   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1601   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1602   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1603   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1604   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1605   PetscFunctionReturn(0);
1606 }
1607 
1608 #undef __FUNCT__
1609 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1610 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1611 {
1612   Mat_SeqSBAIJ   *a =(Mat_SeqSBAIJ*)A->data;
1613   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1614   MatScalar      *aa=a->a;
1615   PetscScalar    *x,*b;
1616   PetscErrorCode ierr;
1617 
1618   PetscFunctionBegin;
1619   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1620   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1621   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1622   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1623   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1624   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1625   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
1626   PetscFunctionReturn(0);
1627 }
1628 
1629 #undef __FUNCT__
1630 #define __FUNCT__ "MatSolve_SeqSBAIJ_1"
1631 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1632 {
1633   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1634   IS                isrow=a->row;
1635   PetscErrorCode    ierr;
1636   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1637   const MatScalar   *aa=a->a,*v;
1638   const PetscScalar *b;
1639   PetscScalar       *x,xk,*t;
1640   PetscInt          nz,k,j;
1641 
1642   PetscFunctionBegin;
1643   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1644   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1645   t    = a->solve_work;
1646   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1647 
1648   /* solve U^T*D*y = perm(b) by forward substitution */
1649   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1650   for (k=0; k<mbs; k++) {
1651     v  = aa + ai[k];
1652     vj = aj + ai[k];
1653     xk = t[k];
1654     nz = ai[k+1] - ai[k] - 1;
1655     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1656     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1657   }
1658 
1659   /* solve U*perm(x) = y by back substitution */
1660   for (k=mbs-1; k>=0; k--) {
1661     v  = aa + adiag[k] - 1;
1662     vj = aj + adiag[k] - 1;
1663     nz = ai[k+1] - ai[k] - 1;
1664     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1665     x[rp[k]] = t[k];
1666   }
1667 
1668   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1669   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1670   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1671   ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr);
1672   PetscFunctionReturn(0);
1673 }
1674 
1675 #undef __FUNCT__
1676 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_inplace"
1677 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1678 {
1679   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1680   IS              isrow=a->row;
1681   PetscErrorCode  ierr;
1682   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1683   const MatScalar *aa=a->a,*v;
1684   PetscScalar     *x,*b,xk,*t;
1685   PetscInt        nz,k;
1686 
1687   PetscFunctionBegin;
1688   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1689   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1690   t    = a->solve_work;
1691   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1692 
1693   /* solve U^T*D*y = perm(b) by forward substitution */
1694   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1695   for (k=0; k<mbs; k++) {
1696     v  = aa + ai[k] + 1;
1697     vj = aj + ai[k] + 1;
1698     xk = t[k];
1699     nz = ai[k+1] - ai[k] - 1;
1700     while (nz--) t[*vj++] += (*v++) * xk;
1701     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1702   }
1703 
1704   /* solve U*perm(x) = y by back substitution */
1705   for (k=mbs-1; k>=0; k--) {
1706     v  = aa + ai[k] + 1;
1707     vj = aj + ai[k] + 1;
1708     nz = ai[k+1] - ai[k] - 1;
1709     while (nz--) t[k] += (*v++) * t[*vj++];
1710     x[rp[k]] = t[k];
1711   }
1712 
1713   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1714   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1715   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1716   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1717   PetscFunctionReturn(0);
1718 }
1719 
1720 #undef __FUNCT__
1721 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1"
1722 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1723 {
1724   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1725   IS              isrow=a->row;
1726   PetscErrorCode  ierr;
1727   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1728   const MatScalar *aa=a->a,*v;
1729   PetscReal       diagk;
1730   PetscScalar     *x,*b,xk;
1731   PetscInt        nz,k;
1732 
1733   PetscFunctionBegin;
1734   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1735   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1736   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1737   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1738 
1739   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1740   for (k=0; k<mbs; k++) {
1741     v  = aa + ai[k];
1742     vj = aj + ai[k];
1743     xk = x[k];
1744     nz = ai[k+1] - ai[k] - 1;
1745     while (nz--) x[*vj++] += (*v++) * xk;
1746 
1747     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1748     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1749     x[k] = xk*PetscSqrtReal(diagk);
1750   }
1751   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1752   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1753   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1754   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1755   PetscFunctionReturn(0);
1756 }
1757 
1758 #undef __FUNCT__
1759 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_inplace"
1760 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1761 {
1762   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1763   IS              isrow=a->row;
1764   PetscErrorCode  ierr;
1765   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1766   const MatScalar *aa=a->a,*v;
1767   PetscReal       diagk;
1768   PetscScalar     *x,*b,xk;
1769   PetscInt        nz,k;
1770 
1771   PetscFunctionBegin;
1772   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1773   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1774   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1775   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1776 
1777   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1778   for (k=0; k<mbs; k++) {
1779     v  = aa + ai[k] + 1;
1780     vj = aj + ai[k] + 1;
1781     xk = x[k];
1782     nz = ai[k+1] - ai[k] - 1;
1783     while (nz--) x[*vj++] += (*v++) * xk;
1784 
1785     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1786     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1787     x[k] = xk*PetscSqrtReal(diagk);
1788   }
1789   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1790   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1791   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1792   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1793   PetscFunctionReturn(0);
1794 }
1795 
1796 #undef __FUNCT__
1797 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1"
1798 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1799 {
1800   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1801   IS              isrow=a->row;
1802   PetscErrorCode  ierr;
1803   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1804   const MatScalar *aa=a->a,*v;
1805   PetscReal       diagk;
1806   PetscScalar     *x,*b,*t;
1807   PetscInt        nz,k;
1808 
1809   PetscFunctionBegin;
1810   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1811   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1812   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1813   t    = a->solve_work;
1814   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1815 
1816   for (k=mbs-1; k>=0; k--) {
1817     v     = aa + ai[k];
1818     vj    = aj + ai[k];
1819     diagk = PetscRealPart(aa[adiag[k]]);
1820     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1821     t[k] = b[k] * PetscSqrtReal(diagk);
1822     nz   = ai[k+1] - ai[k] - 1;
1823     while (nz--) t[k] += (*v++) * t[*vj++];
1824     x[rp[k]] = t[k];
1825   }
1826   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1827   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1828   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1829   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1830   PetscFunctionReturn(0);
1831 }
1832 
1833 #undef __FUNCT__
1834 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_inplace"
1835 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1836 {
1837   Mat_SeqSBAIJ    *a   = (Mat_SeqSBAIJ*)A->data;
1838   IS              isrow=a->row;
1839   PetscErrorCode  ierr;
1840   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1841   const MatScalar *aa=a->a,*v;
1842   PetscReal       diagk;
1843   PetscScalar     *x,*b,*t;
1844   PetscInt        nz,k;
1845 
1846   PetscFunctionBegin;
1847   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1848   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1849   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1850   t    = a->solve_work;
1851   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1852 
1853   for (k=mbs-1; k>=0; k--) {
1854     v     = aa + ai[k] + 1;
1855     vj    = aj + ai[k] + 1;
1856     diagk = PetscRealPart(aa[ai[k]]);
1857     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1858     t[k] = b[k] * PetscSqrtReal(diagk);
1859     nz   = ai[k+1] - ai[k] - 1;
1860     while (nz--) t[k] += (*v++) * t[*vj++];
1861     x[rp[k]] = t[k];
1862   }
1863   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1864   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1865   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1866   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1867   PetscFunctionReturn(0);
1868 }
1869 
1870 #undef __FUNCT__
1871 #define __FUNCT__ "MatSolves_SeqSBAIJ_1"
1872 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1873 {
1874   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1875   PetscErrorCode ierr;
1876 
1877   PetscFunctionBegin;
1878   if (A->rmap->bs == 1) {
1879     ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr);
1880   } else {
1881     IS              isrow=a->row;
1882     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1883     const MatScalar *aa=a->a,*v;
1884     PetscScalar     *x,*b,*t;
1885     PetscInt        nz,k,n,i,j;
1886     if (bb->n > a->solves_work_n) {
1887       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1888       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
1889 
1890       a->solves_work_n = bb->n;
1891     }
1892     n    = bb->n;
1893     ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr);
1894     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1895     t    = a->solves_work;
1896 
1897     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1898 
1899     /* solve U^T*D*y = perm(b) by forward substitution */
1900     for (k=0; k<mbs; k++) {
1901       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1902     }
1903     for (k=0; k<mbs; k++) {
1904       v  = aa + ai[k];
1905       vj = aj + ai[k];
1906       nz = ai[k+1] - ai[k] - 1;
1907       for (j=0; j<nz; j++) {
1908         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1909         v++;vj++;
1910       }
1911       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1912     }
1913 
1914     /* solve U*perm(x) = y by back substitution */
1915     for (k=mbs-1; k>=0; k--) {
1916       v  = aa + ai[k] - 1;
1917       vj = aj + ai[k] - 1;
1918       nz = ai[k+1] - ai[k] - 1;
1919       for (j=0; j<nz; j++) {
1920         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1921         v++;vj++;
1922       }
1923       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1924     }
1925 
1926     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1927     ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr);
1928     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1929     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1930   }
1931   PetscFunctionReturn(0);
1932 }
1933 
1934 #undef __FUNCT__
1935 #define __FUNCT__ "MatSolves_SeqSBAIJ_1_inplace"
1936 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1937 {
1938   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1939   PetscErrorCode ierr;
1940 
1941   PetscFunctionBegin;
1942   if (A->rmap->bs == 1) {
1943     ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr);
1944   } else {
1945     IS              isrow=a->row;
1946     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1947     const MatScalar *aa=a->a,*v;
1948     PetscScalar     *x,*b,*t;
1949     PetscInt        nz,k,n,i;
1950     if (bb->n > a->solves_work_n) {
1951       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1952       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
1953 
1954       a->solves_work_n = bb->n;
1955     }
1956     n    = bb->n;
1957     ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr);
1958     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1959     t    = a->solves_work;
1960 
1961     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1962 
1963     /* solve U^T*D*y = perm(b) by forward substitution */
1964     for (k=0; k<mbs; k++) {
1965       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1966     }
1967     for (k=0; k<mbs; k++) {
1968       v  = aa + ai[k];
1969       vj = aj + ai[k];
1970       nz = ai[k+1] - ai[k];
1971       while (nz--) {
1972         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1973         v++;vj++;
1974       }
1975       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1976     }
1977 
1978     /* solve U*perm(x) = y by back substitution */
1979     for (k=mbs-1; k>=0; k--) {
1980       v  = aa + ai[k];
1981       vj = aj + ai[k];
1982       nz = ai[k+1] - ai[k];
1983       while (nz--) {
1984         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1985         v++;vj++;
1986       }
1987       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1988     }
1989 
1990     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1991     ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr);
1992     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1993     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1994   }
1995   PetscFunctionReturn(0);
1996 }
1997 
1998 #undef __FUNCT__
1999 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering"
2000 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2001 {
2002   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2003   PetscErrorCode    ierr;
2004   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
2005   const MatScalar   *aa=a->a,*v;
2006   const PetscScalar *b;
2007   PetscScalar       *x,xi;
2008   PetscInt          nz,i,j;
2009 
2010   PetscFunctionBegin;
2011   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2012   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2013 
2014   /* solve U^T*D*y = b by forward substitution */
2015   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2016   for (i=0; i<mbs; i++) {
2017     v  = aa + ai[i];
2018     vj = aj + ai[i];
2019     xi = x[i];
2020     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2021     for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi;
2022     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2023   }
2024 
2025   /* solve U*x = y by backward substitution */
2026   for (i=mbs-2; i>=0; i--) {
2027     xi = x[i];
2028     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2029     vj = aj + adiag[i] - 1;
2030     nz = ai[i+1] - ai[i] - 1;
2031     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2032     x[i] = xi;
2033   }
2034 
2035   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2036   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2037   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2038   PetscFunctionReturn(0);
2039 }
2040 
2041 #undef __FUNCT__
2042 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2043 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2044 {
2045   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
2046   PetscErrorCode ierr;
2047   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2048   MatScalar      *aa=a->a,*v;
2049   PetscScalar    *x,*b,xk;
2050   PetscInt       nz,*vj,k;
2051 
2052   PetscFunctionBegin;
2053   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2054   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2055 
2056   /* solve U^T*D*y = b by forward substitution */
2057   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2058   for (k=0; k<mbs; k++) {
2059     v  = aa + ai[k] + 1;
2060     vj = aj + ai[k] + 1;
2061     xk = x[k];
2062     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2063     while (nz--) x[*vj++] += (*v++) * xk;
2064     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2065   }
2066 
2067   /* solve U*x = y by back substitution */
2068   for (k=mbs-2; k>=0; k--) {
2069     v  = aa + ai[k] + 1;
2070     vj = aj + ai[k] + 1;
2071     xk = x[k];
2072     nz = ai[k+1] - ai[k] - 1;
2073     while (nz--) {
2074       xk += (*v++) * x[*vj++];
2075     }
2076     x[k] = xk;
2077   }
2078 
2079   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2080   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2081   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2082   PetscFunctionReturn(0);
2083 }
2084 
2085 #undef __FUNCT__
2086 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering"
2087 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2088 {
2089   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
2090   PetscErrorCode  ierr;
2091   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2092   const MatScalar *aa=a->a,*v;
2093   PetscReal       diagk;
2094   PetscScalar     *x,*b;
2095   PetscInt        nz,*vj,k;
2096 
2097   PetscFunctionBegin;
2098   /* solve U^T*D^(1/2)*x = b by forward substitution */
2099   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2100   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2101   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2102   for (k=0; k<mbs; k++) {
2103     v  = aa + ai[k];
2104     vj = aj + ai[k];
2105     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2106     while (nz--) x[*vj++] += (*v++) * x[k];
2107     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2108     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2109     x[k] *= PetscSqrtReal(diagk);
2110   }
2111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2113   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2114   PetscFunctionReturn(0);
2115 }
2116 
2117 #undef __FUNCT__
2118 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2119 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2120 {
2121   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
2122   PetscErrorCode ierr;
2123   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2124   MatScalar      *aa=a->a,*v;
2125   PetscReal      diagk;
2126   PetscScalar    *x,*b;
2127   PetscInt       nz,*vj,k;
2128 
2129   PetscFunctionBegin;
2130   /* solve U^T*D^(1/2)*x = b by forward substitution */
2131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2133   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2134   for (k=0; k<mbs; k++) {
2135     v  = aa + ai[k] + 1;
2136     vj = aj + ai[k] + 1;
2137     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2138     while (nz--) x[*vj++] += (*v++) * x[k];
2139     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2140     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2141     x[k] *= PetscSqrtReal(diagk);
2142   }
2143   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2144   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2145   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2146   PetscFunctionReturn(0);
2147 }
2148 
2149 #undef __FUNCT__
2150 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering"
2151 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2152 {
2153   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
2154   PetscErrorCode ierr;
2155   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2156   MatScalar      *aa=a->a,*v;
2157   PetscReal      diagk;
2158   PetscScalar    *x,*b;
2159   PetscInt       nz,*vj,k;
2160 
2161   PetscFunctionBegin;
2162   /* solve D^(1/2)*U*x = b by back substitution */
2163   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2164   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2165 
2166   for (k=mbs-1; k>=0; k--) {
2167     v     = aa + ai[k];
2168     vj    = aj + ai[k];
2169     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2170     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2171     x[k] = PetscSqrtReal(diagk)*b[k];
2172     nz   = ai[k+1] - ai[k] - 1;
2173     while (nz--) x[k] += (*v++) * x[*vj++];
2174   }
2175   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2177   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2178   PetscFunctionReturn(0);
2179 }
2180 
2181 #undef __FUNCT__
2182 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2183 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2184 {
2185   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
2186   PetscErrorCode ierr;
2187   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2188   MatScalar      *aa=a->a,*v;
2189   PetscReal      diagk;
2190   PetscScalar    *x,*b;
2191   PetscInt       nz,*vj,k;
2192 
2193   PetscFunctionBegin;
2194   /* solve D^(1/2)*U*x = b by back substitution */
2195   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2196   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2197 
2198   for (k=mbs-1; k>=0; k--) {
2199     v     = aa + ai[k] + 1;
2200     vj    = aj + ai[k] + 1;
2201     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2202     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2203     x[k] = PetscSqrtReal(diagk)*b[k];
2204     nz   = ai[k+1] - ai[k] - 1;
2205     while (nz--) x[k] += (*v++) * x[*vj++];
2206   }
2207   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2208   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2209   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2210   PetscFunctionReturn(0);
2211 }
2212 
2213 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2214 #undef __FUNCT__
2215 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_MSR"
2216 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2217 {
2218   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2219   PetscErrorCode ierr;
2220   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2221   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2222   PetscInt       m,reallocs = 0,*levtmp;
2223   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2224   PetscInt       incrlev,*lev,shift,prow,nz;
2225   PetscReal      f = info->fill,levels = info->levels;
2226   PetscBool      perm_identity;
2227 
2228   PetscFunctionBegin;
2229   /* check whether perm is the identity mapping */
2230   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2231 
2232   if (perm_identity) {
2233     a->permute = PETSC_FALSE;
2234     ai         = a->i; aj = a->j;
2235   } else { /*  non-trivial permutation */
2236     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2237     a->permute = PETSC_TRUE;
2238     ierr       = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr);
2239     ai         = a->inew; aj = a->jnew;
2240   }
2241 
2242   /* initialization */
2243   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2244   umax  = (PetscInt)(f*ai[mbs] + 1);
2245   ierr  = PetscMalloc1(umax,&lev);CHKERRQ(ierr);
2246   umax += mbs + 1;
2247   shift = mbs + 1;
2248   ierr  = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr);
2249   ierr  = PetscMalloc1(umax,&ju);CHKERRQ(ierr);
2250   iu[0] = mbs + 1;
2251   juidx = mbs + 1;
2252   /* prowl: linked list for pivot row */
2253   ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr);
2254   /* q: linked list for col index */
2255 
2256   for (i=0; i<mbs; i++) {
2257     prowl[i] = mbs;
2258     q[i]     = 0;
2259   }
2260 
2261   /* for each row k */
2262   for (k=0; k<mbs; k++) {
2263     nzk  = 0;
2264     q[k] = mbs;
2265     /* copy current row into linked list */
2266     nz = ai[rip[k]+1] - ai[rip[k]];
2267     j  = ai[rip[k]];
2268     while (nz--) {
2269       vj = rip[aj[j++]];
2270       if (vj > k) {
2271         qm = k;
2272         do {
2273           m = qm; qm = q[m];
2274         } while (qm < vj);
2275         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2276         nzk++;
2277         q[m]       = vj;
2278         q[vj]      = qm;
2279         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2280       }
2281     }
2282 
2283     /* modify nonzero structure of k-th row by computing fill-in
2284        for each row prow to be merged in */
2285     prow = k;
2286     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2287 
2288     while (prow < k) {
2289       /* merge row prow into k-th row */
2290       jmin = iu[prow] + 1;
2291       jmax = iu[prow+1];
2292       qm   = k;
2293       for (j=jmin; j<jmax; j++) {
2294         incrlev = lev[j-shift] + 1;
2295         if (incrlev > levels) continue;
2296         vj = ju[j];
2297         do {
2298           m = qm; qm = q[m];
2299         } while (qm < vj);
2300         if (qm != vj) {      /* a fill */
2301           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2302           levtmp[vj] = incrlev;
2303         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2304       }
2305       prow = prowl[prow]; /* next pivot row */
2306     }
2307 
2308     /* add k to row list for first nonzero element in k-th row */
2309     if (nzk > 1) {
2310       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2311       prowl[k] = prowl[i]; prowl[i] = k;
2312     }
2313     iu[k+1] = iu[k] + nzk;
2314 
2315     /* allocate more space to ju and lev if needed */
2316     if (iu[k+1] > umax) {
2317       /* estimate how much additional space we will need */
2318       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2319       /* just double the memory each time */
2320       maxadd = umax;
2321       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2322       umax += maxadd;
2323 
2324       /* allocate a longer ju */
2325       ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2326       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
2327       ierr = PetscFree(ju);CHKERRQ(ierr);
2328       ju   = jutmp;
2329 
2330       ierr      = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2331       ierr      = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr);
2332       ierr      = PetscFree(lev);CHKERRQ(ierr);
2333       lev       = jutmp;
2334       reallocs += 2; /* count how many times we realloc */
2335     }
2336 
2337     /* save nonzero structure of k-th row in ju */
2338     i=k;
2339     while (nzk--) {
2340       i                = q[i];
2341       ju[juidx]        = i;
2342       lev[juidx-shift] = levtmp[i];
2343       juidx++;
2344     }
2345   }
2346 
2347 #if defined(PETSC_USE_INFO)
2348   if (ai[mbs] != 0) {
2349     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2350     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr);
2351     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2352     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr);
2353     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
2354   } else {
2355     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2356   }
2357 #endif
2358 
2359   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2360   ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr);
2361   ierr = PetscFree(lev);CHKERRQ(ierr);
2362 
2363   /* put together the new matrix */
2364   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,NULL);CHKERRQ(ierr);
2365 
2366   /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */
2367   b    = (Mat_SeqSBAIJ*)(B)->data;
2368   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2369 
2370   b->singlemalloc = PETSC_FALSE;
2371   b->free_a       = PETSC_TRUE;
2372   b->free_ij      = PETSC_TRUE;
2373   /* the next line frees the default space generated by the Create() */
2374   ierr    = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
2375   ierr    = PetscMalloc1((iu[mbs]+1)*bs2,&b->a);CHKERRQ(ierr);
2376   b->j    = ju;
2377   b->i    = iu;
2378   b->diag = 0;
2379   b->ilen = 0;
2380   b->imax = 0;
2381 
2382   ierr    = ISDestroy(&b->row);CHKERRQ(ierr);
2383   ierr    = ISDestroy(&b->icol);CHKERRQ(ierr);
2384   b->row  = perm;
2385   b->icol = perm;
2386   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2387   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2388   ierr    = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr);
2389   /* In b structure:  Free imax, ilen, old a, old j.
2390      Allocate idnew, solve_work, new a, new j */
2391   ierr     = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2392   b->maxnz = b->nz = iu[mbs];
2393 
2394   (B)->info.factor_mallocs   = reallocs;
2395   (B)->info.fill_ratio_given = f;
2396   if (ai[mbs] != 0) {
2397     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2398   } else {
2399     (B)->info.fill_ratio_needed = 0.0;
2400   }
2401   ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr);
2402   PetscFunctionReturn(0);
2403 }
2404 
2405 /*
2406   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2407 */
2408 #include <petscbt.h>
2409 #include <../src/mat/utils/freespace.h>
2410 #undef __FUNCT__
2411 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ"
2412 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2413 {
2414   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2415   PetscErrorCode     ierr;
2416   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2417   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2418   const PetscInt     *rip;
2419   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2420   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2421   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2422   PetscReal          fill          =info->fill,levels=info->levels;
2423   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2424   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2425   PetscBT            lnkbt;
2426 
2427   PetscFunctionBegin;
2428   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2429   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2430   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2431   if (bs > 1) {
2432     ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
2433     PetscFunctionReturn(0);
2434   }
2435 
2436   /* check whether perm is the identity mapping */
2437   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2438   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2439   a->permute = PETSC_FALSE;
2440 
2441   ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2442   ierr  = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr);
2443   ui[0] = 0;
2444 
2445   /* ICC(0) without matrix ordering: simply rearrange column indices */
2446   if (!levels) {
2447     /* reuse the column pointers and row offsets for memory savings */
2448     for (i=0; i<am; i++) {
2449       ncols    = ai[i+1] - ai[i];
2450       ui[i+1]  = ui[i] + ncols;
2451       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2452     }
2453     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2454     cols = uj;
2455     for (i=0; i<am; i++) {
2456       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2457       ncols = ai[i+1] - ai[i] -1;
2458       for (j=0; j<ncols; j++) *cols++ = aj[j];
2459       *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2460     }
2461   } else { /* case: levels>0 */
2462     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2463 
2464     /* initialization */
2465     /* jl: linked list for storing indices of the pivot rows
2466        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2467     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2468     for (i=0; i<am; i++) {
2469       jl[i] = am; il[i] = 0;
2470     }
2471 
2472     /* create and initialize a linked list for storing column indices of the active row k */
2473     nlnk = am + 1;
2474     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2475 
2476     /* initial FreeSpace size is fill*(ai[am]+1) */
2477     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2478 
2479     current_space = free_space;
2480 
2481     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2482 
2483     current_space_lvl = free_space_lvl;
2484 
2485     for (k=0; k<am; k++) {  /* for each active row k */
2486       /* initialize lnk by the column indices of row k */
2487       nzk   = 0;
2488       ncols = ai[k+1] - ai[k];
2489       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2490       cols = aj+ai[k];
2491       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2492       nzk += nlnk;
2493 
2494       /* update lnk by computing fill-in for each pivot row to be merged in */
2495       prow = jl[k]; /* 1st pivot row */
2496 
2497       while (prow < k) {
2498         nextprow = jl[prow];
2499 
2500         /* merge prow into k-th row */
2501         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2502         jmax  = ui[prow+1];
2503         ncols = jmax-jmin;
2504         i     = jmin - ui[prow];
2505 
2506         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2507         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2508         j    = *(uj - 1);
2509         ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2510         nzk += nlnk;
2511 
2512         /* update il and jl for prow */
2513         if (jmin < jmax) {
2514           il[prow] = jmin;
2515 
2516           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2517         }
2518         prow = nextprow;
2519       }
2520 
2521       /* if free space is not available, make more free space */
2522       if (current_space->local_remaining<nzk) {
2523         i    = am - k + 1; /* num of unfactored rows */
2524         i   *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2525         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2526         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2527         reallocs++;
2528       }
2529 
2530       /* copy data into free_space and free_space_lvl, then initialize lnk */
2531       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2532       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2533 
2534       /* add the k-th row into il and jl */
2535       if (nzk > 1) {
2536         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2537         jl[k] = jl[i]; jl[i] = k;
2538         il[k] = ui[k] + 1;
2539       }
2540       uj_ptr[k]     = current_space->array;
2541       uj_lvl_ptr[k] = current_space_lvl->array;
2542 
2543       current_space->array               += nzk;
2544       current_space->local_used          += nzk;
2545       current_space->local_remaining     -= nzk;
2546       current_space_lvl->array           += nzk;
2547       current_space_lvl->local_used      += nzk;
2548       current_space_lvl->local_remaining -= nzk;
2549 
2550       ui[k+1] = ui[k] + nzk;
2551     }
2552 
2553     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2554     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2555 
2556     /* destroy list of free space and other temporary array(s) */
2557     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2558     ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor  */
2559     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2560     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2561 
2562   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2563 
2564   /* put together the new matrix in MATSEQSBAIJ format */
2565   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2566 
2567   b    = (Mat_SeqSBAIJ*)(fact)->data;
2568   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2569 
2570   b->singlemalloc = PETSC_FALSE;
2571   b->free_a       = PETSC_TRUE;
2572   b->free_ij      = free_ij;
2573 
2574   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2575 
2576   b->j         = uj;
2577   b->i         = ui;
2578   b->diag      = udiag;
2579   b->free_diag = PETSC_TRUE;
2580   b->ilen      = 0;
2581   b->imax      = 0;
2582   b->row       = perm;
2583   b->col       = perm;
2584 
2585   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2586   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2587 
2588   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2589 
2590   ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2591   ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2592 
2593   b->maxnz = b->nz = ui[am];
2594 
2595   fact->info.factor_mallocs   = reallocs;
2596   fact->info.fill_ratio_given = fill;
2597   if (ai[am] != 0) {
2598     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2599   } else {
2600     fact->info.fill_ratio_needed = 0.0;
2601   }
2602 #if defined(PETSC_USE_INFO)
2603   if (ai[am] != 0) {
2604     PetscReal af = fact->info.fill_ratio_needed;
2605     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2606     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2607     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2608   } else {
2609     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2610   }
2611 #endif
2612   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2613   PetscFunctionReturn(0);
2614 }
2615 
2616 #undef __FUNCT__
2617 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_inplace"
2618 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2619 {
2620   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2621   Mat_SeqSBAIJ       *b;
2622   PetscErrorCode     ierr;
2623   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2624   PetscInt           bs=A->rmap->bs,am=a->mbs;
2625   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2626   PetscInt           reallocs=0,i,*ui;
2627   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2628   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2629   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2630   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2631   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2632   PetscBT            lnkbt;
2633 
2634   PetscFunctionBegin;
2635   /*
2636    This code originally uses Modified Sparse Row (MSR) storage
2637    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2638    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2639    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2640    thus the original code in MSR format is still used for these cases.
2641    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2642    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2643   */
2644   if (bs > 1) {
2645     ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
2646     PetscFunctionReturn(0);
2647   }
2648 
2649   /* check whether perm is the identity mapping */
2650   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2651   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2652   a->permute = PETSC_FALSE;
2653 
2654   /* special case that simply copies fill pattern */
2655   if (!levels) {
2656     /* reuse the column pointers and row offsets for memory savings */
2657     ui           = a->i;
2658     uj           = a->j;
2659     free_ij      = PETSC_FALSE;
2660     ratio_needed = 1.0;
2661   } else { /* case: levels>0 */
2662     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2663 
2664     /* initialization */
2665     ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2666     ui[0] = 0;
2667 
2668     /* jl: linked list for storing indices of the pivot rows
2669        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2670     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2671     for (i=0; i<am; i++) {
2672       jl[i] = am; il[i] = 0;
2673     }
2674 
2675     /* create and initialize a linked list for storing column indices of the active row k */
2676     nlnk = am + 1;
2677     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2678 
2679     /* initial FreeSpace size is fill*(ai[am]+1) */
2680     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2681 
2682     current_space = free_space;
2683 
2684     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2685 
2686     current_space_lvl = free_space_lvl;
2687 
2688     for (k=0; k<am; k++) {  /* for each active row k */
2689       /* initialize lnk by the column indices of row rip[k] */
2690       nzk   = 0;
2691       ncols = ai[rip[k]+1] - ai[rip[k]];
2692       cols  = aj+ai[rip[k]];
2693       ierr  = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2694       nzk  += nlnk;
2695 
2696       /* update lnk by computing fill-in for each pivot row to be merged in */
2697       prow = jl[k]; /* 1st pivot row */
2698 
2699       while (prow < k) {
2700         nextprow = jl[prow];
2701 
2702         /* merge prow into k-th row */
2703         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2704         jmax     = ui[prow+1];
2705         ncols    = jmax-jmin;
2706         i        = jmin - ui[prow];
2707         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2708         j        = *(uj_lvl_ptr[prow] + i - 1);
2709         cols_lvl = uj_lvl_ptr[prow]+i;
2710         ierr     = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2711         nzk     += nlnk;
2712 
2713         /* update il and jl for prow */
2714         if (jmin < jmax) {
2715           il[prow] = jmin;
2716           j        = *cols;
2717           jl[prow] = jl[j];
2718           jl[j]    = prow;
2719         }
2720         prow = nextprow;
2721       }
2722 
2723       /* if free space is not available, make more free space */
2724       if (current_space->local_remaining<nzk) {
2725         i    = am - k + 1; /* num of unfactored rows */
2726         i    = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2727         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2728         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2729         reallocs++;
2730       }
2731 
2732       /* copy data into free_space and free_space_lvl, then initialize lnk */
2733       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2734 
2735       /* add the k-th row into il and jl */
2736       if (nzk-1 > 0) {
2737         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2738         jl[k] = jl[i]; jl[i] = k;
2739         il[k] = ui[k] + 1;
2740       }
2741       uj_ptr[k]     = current_space->array;
2742       uj_lvl_ptr[k] = current_space_lvl->array;
2743 
2744       current_space->array               += nzk;
2745       current_space->local_used          += nzk;
2746       current_space->local_remaining     -= nzk;
2747       current_space_lvl->array           += nzk;
2748       current_space_lvl->local_used      += nzk;
2749       current_space_lvl->local_remaining -= nzk;
2750 
2751       ui[k+1] = ui[k] + nzk;
2752     }
2753 
2754     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2755     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2756 
2757     /* destroy list of free space and other temporary array(s) */
2758     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2759     ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
2760     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2761     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2762     if (ai[am] != 0) {
2763       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2764     } else {
2765       ratio_needed = 0.0;
2766     }
2767   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2768 
2769   /* put together the new matrix in MATSEQSBAIJ format */
2770   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2771 
2772   b = (Mat_SeqSBAIJ*)(fact)->data;
2773 
2774   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2775 
2776   b->singlemalloc = PETSC_FALSE;
2777   b->free_a       = PETSC_TRUE;
2778   b->free_ij      = free_ij;
2779 
2780   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2781 
2782   b->j             = uj;
2783   b->i             = ui;
2784   b->diag          = 0;
2785   b->ilen          = 0;
2786   b->imax          = 0;
2787   b->row           = perm;
2788   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2789 
2790   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2791   b->icol = perm;
2792   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2793   ierr    = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2794 
2795   b->maxnz = b->nz = ui[am];
2796 
2797   fact->info.factor_mallocs    = reallocs;
2798   fact->info.fill_ratio_given  = fill;
2799   fact->info.fill_ratio_needed = ratio_needed;
2800 #if defined(PETSC_USE_INFO)
2801   if (ai[am] != 0) {
2802     PetscReal af = fact->info.fill_ratio_needed;
2803     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2804     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2805     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2806   } else {
2807     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2808   }
2809 #endif
2810   if (perm_identity) {
2811     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2812   } else {
2813     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2814   }
2815   PetscFunctionReturn(0);
2816 }
2817 
2818