xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision 2205254efee3a00a594e5e2a3a70f74dcb40bc03)
1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <../src/mat/blockinvert.h>
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_inplace"
12 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
13 {
14   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
15   IS              isrow=a->row;
16   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j;
17   PetscErrorCode  ierr;
18   const PetscInt  *r;
19   PetscInt        nz,*vj,k,idx,k1;
20   PetscInt        bs=A->rmap->bs,bs2 = a->bs2;
21   MatScalar       *aa=a->a,*v,*diag;
22   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;
23 
24   PetscFunctionBegin;
25   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
26   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27   t  = a->solve_work;
28   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
29   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
30 
31   /* solve U^T * D * y = b by forward substitution */
32   xk = t;
33   for (k=0; k<mbs; k++) { /* t <- perm(b) */
34     idx   = bs*r[k];
35     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
36   }
37   for (k=0; k<mbs; k++) {
38     v  = aa + bs2*ai[k];
39     xk = t + k*bs;      /* Dk*xk = k-th block of x */
40     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
41     nz = ai[k+1] - ai[k];
42     vj = aj + ai[k];
43     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
44     while (nz--) {
45       /* x(:) += U(k,:)^T*(Dk*xk) */
46       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
47       vj++; xj = t + (*vj)*bs;
48       v += bs2;
49     }
50     /* xk = inv(Dk)*(Dk*xk) */
51     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
52     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
53   }
54 
55   /* solve U*x = y by back substitution */
56   for (k=mbs-1; k>=0; k--) {
57     v  = aa + bs2*ai[k];
58     xk = t + k*bs;        /* xk */
59     nz = ai[k+1] - ai[k];
60     vj = aj + ai[k];
61     xj = t + (*vj)*bs;
62     while (nz--) {
63       /* xk += U(k,:)*x(:) */
64       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
65       vj++;
66       v += bs2; xj = t + (*vj)*bs;
67     }
68     idx   = bs*r[k];
69     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
70   }
71 
72   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
73   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
74   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
75   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
76   ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr);
77   PetscFunctionReturn(0);
78 }
79 
80 #undef __FUNCT__
81 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_inplace"
82 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
83 {
84   PetscFunctionBegin;
85   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
86   PetscFunctionReturn(0);
87 }
88 
89 #undef __FUNCT__
90 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_inplace"
91 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
92 {
93   PetscFunctionBegin;
94   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
95   PetscFunctionReturn(0);
96 }
97 
98 #undef __FUNCT__
99 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering"
100 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
101 {
102   PetscErrorCode ierr;
103   PetscInt       nz,*vj,k;
104   PetscInt       bs2 = bs*bs;
105   MatScalar      *v,*diag;
106   PetscScalar    *xk,*xj,*xk_tmp;
107 
108   PetscFunctionBegin;
109   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
110   for (k=0; k<mbs; k++) {
111     v  = aa + bs2*ai[k];
112     xk = x + k*bs;      /* Dk*xk = k-th block of x */
113     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
114     nz = ai[k+1] - ai[k];
115     vj = aj + ai[k];
116     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
117     while (nz--) {
118       /* x(:) += U(k,:)^T*(Dk*xk) */
119       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
120       vj++; xj = x + (*vj)*bs;
121       v += bs2;
122     }
123     /* xk = inv(Dk)*(Dk*xk) */
124     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
125     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
126   }
127   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
128   PetscFunctionReturn(0);
129 }
130 
131 #undef __FUNCT__
132 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering"
133 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
134 {
135   PetscInt       nz,*vj,k;
136   PetscInt       bs2 = bs*bs;
137   MatScalar      *v;
138   PetscScalar    *xk,*xj;
139 
140   PetscFunctionBegin;
141   for (k=mbs-1; k>=0; k--) {
142     v  = aa + bs2*ai[k];
143     xk = x + k*bs;        /* xk */
144     nz = ai[k+1] - ai[k];
145     vj = aj + ai[k];
146     xj = x + (*vj)*bs;
147     while (nz--) {
148       /* xk += U(k,:)*x(:) */
149       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
150       vj++;
151       v += bs2; xj = x + (*vj)*bs;
152     }
153   }
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNCT__
158 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
159 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
160 {
161   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
162   PetscErrorCode ierr;
163   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
164   PetscInt       bs=A->rmap->bs;
165   MatScalar      *aa=a->a;
166   PetscScalar    *x,*b;
167 
168   PetscFunctionBegin;
169   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
170   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
171 
172   /* solve U^T * D * y = b by forward substitution */
173   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
174   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
175 
176   /* solve U*x = y by back substitution */
177   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
178 
179   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
180   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
181   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
182   PetscFunctionReturn(0);
183 }
184 
185 #undef __FUNCT__
186 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
187 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
188 {
189   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
190   PetscErrorCode ierr;
191   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
192   PetscInt       bs=A->rmap->bs;
193   MatScalar      *aa=a->a;
194   PetscScalar    *x,*b;
195 
196   PetscFunctionBegin;
197   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
198   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
199   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
200   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
201   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
202   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
203   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
204   PetscFunctionReturn(0);
205 }
206 
207 #undef __FUNCT__
208 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
209 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
210 {
211   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
212   PetscErrorCode ierr;
213   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
214   PetscInt       bs=A->rmap->bs;
215   MatScalar      *aa=a->a;
216   PetscScalar    *x,*b;
217 
218   PetscFunctionBegin;
219   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
220   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
221   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
222   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
223   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
224   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
225   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
226   PetscFunctionReturn(0);
227 }
228 
229 #undef __FUNCT__
230 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_inplace"
231 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
232 {
233   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
234   IS             isrow=a->row;
235   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
236   PetscErrorCode ierr;
237   const PetscInt *r;
238   PetscInt       nz,*vj,k,idx;
239   MatScalar      *aa=a->a,*v,*d;
240   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
241 
242   PetscFunctionBegin;
243   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
244   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
245   t  = a->solve_work;
246   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
247 
248   /* solve U^T * D * y = b by forward substitution */
249   tp = t;
250   for (k=0; k<mbs; k++) { /* t <- perm(b) */
251     idx   = 7*r[k];
252     tp[0] = b[idx];
253     tp[1] = b[idx+1];
254     tp[2] = b[idx+2];
255     tp[3] = b[idx+3];
256     tp[4] = b[idx+4];
257     tp[5] = b[idx+5];
258     tp[6] = b[idx+6];
259     tp += 7;
260   }
261 
262   for (k=0; k<mbs; k++) {
263     v  = aa + 49*ai[k];
264     vj = aj + ai[k];
265     tp = t + k*7;
266     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
267     nz = ai[k+1] - ai[k];
268     tp = t + (*vj)*7;
269     while (nz--) {
270       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
271       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
272       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
273       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
274       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
275       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
276       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
277       vj++; tp = t + (*vj)*7;
278       v += 49;
279     }
280 
281     /* xk = inv(Dk)*(Dk*xk) */
282     d  = aa+k*49;          /* ptr to inv(Dk) */
283     tp    = t + k*7;
284     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
285     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
286     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
287     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
288     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
289     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
290     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
291   }
292 
293   /* solve U*x = y by back substitution */
294   for (k=mbs-1; k>=0; k--) {
295     v  = aa + 49*ai[k];
296     vj = aj + ai[k];
297     tp    = t + k*7;
298     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
299     nz = ai[k+1] - ai[k];
300 
301     tp = t + (*vj)*7;
302     while (nz--) {
303       /* xk += U(k,:)*x(:) */
304       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
305       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
306       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
307       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
308       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
309       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
310       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
311       vj++; tp = t + (*vj)*7;
312       v += 49;
313     }
314     tp    = t + k*7;
315     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
316     idx   = 7*r[k];
317     x[idx]     = x0;
318     x[idx+1]   = x1;
319     x[idx+2]   = x2;
320     x[idx+3]   = x3;
321     x[idx+4]   = x4;
322     x[idx+5]   = x5;
323     x[idx+6]   = x6;
324   }
325 
326   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
327   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
328   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
329   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
330   PetscFunctionReturn(0);
331 }
332 
333 #undef __FUNCT__
334 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering"
335 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
336 {
337   MatScalar      *v,*d;
338   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
339   PetscInt       nz,*vj,k;
340 
341   PetscFunctionBegin;
342   for (k=0; k<mbs; k++) {
343     v  = aa + 49*ai[k];
344     xp = x + k*7;
345     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
346     nz = ai[k+1] - ai[k];
347     vj = aj + ai[k];
348     xp = x + (*vj)*7;
349     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
350     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
351     while (nz--) {
352       /* x(:) += U(k,:)^T*(Dk*xk) */
353       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
354       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
355       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
356       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
357       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
358       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
359       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
360       vj++; xp = x + (*vj)*7;
361       v += 49;
362     }
363     /* xk = inv(Dk)*(Dk*xk) */
364     d  = aa+k*49;          /* ptr to inv(Dk) */
365     xp = x + k*7;
366     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
367     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
368     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
369     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
370     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
371     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
372     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
373   }
374   PetscFunctionReturn(0);
375 }
376 
377 #undef __FUNCT__
378 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering"
379 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
380 {
381   MatScalar      *v;
382   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
383   PetscInt       nz,*vj,k;
384 
385   PetscFunctionBegin;
386   for (k=mbs-1; k>=0; k--) {
387     v  = aa + 49*ai[k];
388     xp = x + k*7;
389     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
390     nz = ai[k+1] - ai[k];
391     vj = aj + ai[k];
392     xp = x + (*vj)*7;
393     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
394     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
395     while (nz--) {
396       /* xk += U(k,:)*x(:) */
397       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
398       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
399       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
400       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
401       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
402       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
403       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
404       vj++;
405       v += 49; xp = x + (*vj)*7;
406     }
407     xp = x + k*7;
408     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
409   }
410   PetscFunctionReturn(0);
411 }
412 
413 #undef __FUNCT__
414 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
415 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
416 {
417   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
418   PetscErrorCode ierr;
419   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
420   MatScalar      *aa=a->a;
421   PetscScalar    *x,*b;
422 
423   PetscFunctionBegin;
424   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
425   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
426 
427   /* solve U^T * D * y = b by forward substitution */
428   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
429   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
430 
431   /* solve U*x = y by back substitution */
432   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
433 
434   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
435   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
436   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
437   PetscFunctionReturn(0);
438 }
439 
440 #undef __FUNCT__
441 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
442 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
443 {
444   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
445   PetscErrorCode ierr;
446   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
447   MatScalar      *aa=a->a;
448   PetscScalar    *x,*b;
449 
450   PetscFunctionBegin;
451   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
452   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
453   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
454   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
455   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
458   PetscFunctionReturn(0);
459 }
460 
461 #undef __FUNCT__
462 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
463 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
464 {
465   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
466   PetscErrorCode ierr;
467   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
468   MatScalar      *aa=a->a;
469   PetscScalar    *x,*b;
470 
471   PetscFunctionBegin;
472   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
473   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
475   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
476   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
477   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
478   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
479   PetscFunctionReturn(0);
480 }
481 
482 #undef __FUNCT__
483 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_inplace"
484 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
485 {
486   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
487   IS             isrow=a->row;
488   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
489   PetscErrorCode ierr;
490   const PetscInt *r;
491   PetscInt       nz,*vj,k,idx;
492   MatScalar      *aa=a->a,*v,*d;
493   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
494 
495   PetscFunctionBegin;
496   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
497   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
498   t  = a->solve_work;
499   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
500 
501   /* solve U^T * D * y = b by forward substitution */
502   tp = t;
503   for (k=0; k<mbs; k++) { /* t <- perm(b) */
504     idx   = 6*r[k];
505     tp[0] = b[idx];
506     tp[1] = b[idx+1];
507     tp[2] = b[idx+2];
508     tp[3] = b[idx+3];
509     tp[4] = b[idx+4];
510     tp[5] = b[idx+5];
511     tp += 6;
512   }
513 
514   for (k=0; k<mbs; k++) {
515     v  = aa + 36*ai[k];
516     vj = aj + ai[k];
517     tp = t + k*6;
518     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
519     nz = ai[k+1] - ai[k];
520     tp = t + (*vj)*6;
521     while (nz--) {
522       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
523       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
524       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
525       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
526       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
527       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
528       vj++; tp = t + (*vj)*6;
529       v += 36;
530     }
531 
532     /* xk = inv(Dk)*(Dk*xk) */
533     d  = aa+k*36;          /* ptr to inv(Dk) */
534     tp    = t + k*6;
535     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
536     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
537     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
538     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
539     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
540     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
541   }
542 
543   /* solve U*x = y by back substitution */
544   for (k=mbs-1; k>=0; k--) {
545     v  = aa + 36*ai[k];
546     vj = aj + ai[k];
547     tp    = t + k*6;
548     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
549     nz = ai[k+1] - ai[k];
550 
551     tp = t + (*vj)*6;
552     while (nz--) {
553       /* xk += U(k,:)*x(:) */
554       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
555       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
556       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
557       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
558       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
559       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
560       vj++; tp = t + (*vj)*6;
561       v += 36;
562     }
563     tp    = t + k*6;
564     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
565     idx   = 6*r[k];
566     x[idx]     = x0;
567     x[idx+1]   = x1;
568     x[idx+2]   = x2;
569     x[idx+3]   = x3;
570     x[idx+4]   = x4;
571     x[idx+5]   = x5;
572   }
573 
574   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
575   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
576   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
577   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
578   PetscFunctionReturn(0);
579 }
580 
581 #undef __FUNCT__
582 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering"
583 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
584 {
585   MatScalar      *v,*d;
586   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
587   PetscInt       nz,*vj,k;
588 
589   PetscFunctionBegin;
590   for (k=0; k<mbs; k++) {
591     v  = aa + 36*ai[k];
592     xp = x + k*6;
593     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
594     nz = ai[k+1] - ai[k];
595     vj = aj + ai[k];
596     xp = x + (*vj)*6;
597     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
598     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
599     while (nz--) {
600       /* x(:) += U(k,:)^T*(Dk*xk) */
601       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
602       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
603       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
604       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
605       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
606       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
607       vj++; xp = x + (*vj)*6;
608       v += 36;
609     }
610     /* xk = inv(Dk)*(Dk*xk) */
611     d  = aa+k*36;          /* ptr to inv(Dk) */
612     xp = x + k*6;
613     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
614     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
615     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
616     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
617     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
618     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
619   }
620   PetscFunctionReturn(0);
621 }
622 #undef __FUNCT__
623 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering"
624 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
625 {
626   MatScalar      *v;
627   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
628   PetscInt       nz,*vj,k;
629 
630   PetscFunctionBegin;
631   for (k=mbs-1; k>=0; k--) {
632     v  = aa + 36*ai[k];
633     xp = x + k*6;
634     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
635     nz = ai[k+1] - ai[k];
636     vj = aj + ai[k];
637     xp = x + (*vj)*6;
638     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
639     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
640     while (nz--) {
641       /* xk += U(k,:)*x(:) */
642       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
643       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
644       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
645       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
646       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
647       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
648       vj++;
649       v += 36; xp = x + (*vj)*6;
650     }
651     xp = x + k*6;
652     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
653   }
654   PetscFunctionReturn(0);
655 }
656 
657 
658 #undef __FUNCT__
659 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
660 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
661 {
662   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
663   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
664   MatScalar      *aa=a->a;
665   PetscScalar    *x,*b;
666   PetscErrorCode ierr;
667 
668   PetscFunctionBegin;
669   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
670   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
671 
672   /* solve U^T * D * y = b by forward substitution */
673   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
674   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
675 
676   /* solve U*x = y by back substitution */
677   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
678 
679   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
680   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
681   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
682   PetscFunctionReturn(0);
683 }
684 
685 #undef __FUNCT__
686 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
687 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
688 {
689   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
690   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
691   MatScalar      *aa=a->a;
692   PetscScalar    *x,*b;
693   PetscErrorCode ierr;
694 
695   PetscFunctionBegin;
696   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
697   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
698   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
699   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
700   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
701   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
702   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
703   PetscFunctionReturn(0);
704 }
705 
706 #undef __FUNCT__
707 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
708 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
709 {
710   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
711   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
712   MatScalar      *aa=a->a;
713   PetscScalar    *x,*b;
714   PetscErrorCode ierr;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
720   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
721   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
722   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
723   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
724   PetscFunctionReturn(0);
725 }
726 
727 #undef __FUNCT__
728 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_inplace"
729 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
730 {
731   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
732   IS             isrow=a->row;
733   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
734   PetscErrorCode ierr;
735   const PetscInt *r;
736   PetscInt       nz,*vj,k,idx;
737   MatScalar      *aa=a->a,*v,*diag;
738   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;
739 
740   PetscFunctionBegin;
741   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
742   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
743   t  = a->solve_work;
744   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
745 
746   /* solve U^T * D * y = b by forward substitution */
747   tp = t;
748   for (k=0; k<mbs; k++) { /* t <- perm(b) */
749     idx   = 5*r[k];
750     tp[0] = b[idx];
751     tp[1] = b[idx+1];
752     tp[2] = b[idx+2];
753     tp[3] = b[idx+3];
754     tp[4] = b[idx+4];
755     tp += 5;
756   }
757 
758   for (k=0; k<mbs; k++) {
759     v  = aa + 25*ai[k];
760     vj = aj + ai[k];
761     tp = t + k*5;
762     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
763     nz = ai[k+1] - ai[k];
764 
765     tp = t + (*vj)*5;
766     while (nz--) {
767       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
768       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
769       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
770       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
771       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
772       vj++; tp = t + (*vj)*5;
773       v += 25;
774     }
775 
776     /* xk = inv(Dk)*(Dk*xk) */
777     diag  = aa+k*25;          /* ptr to inv(Dk) */
778     tp    = t + k*5;
779       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
780       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
781       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
782       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
783       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
784   }
785 
786   /* solve U*x = y by back substitution */
787   for (k=mbs-1; k>=0; k--) {
788     v  = aa + 25*ai[k];
789     vj = aj + ai[k];
790     tp    = t + k*5;
791     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
792     nz = ai[k+1] - ai[k];
793 
794     tp = t + (*vj)*5;
795     while (nz--) {
796       /* xk += U(k,:)*x(:) */
797       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
798       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
799       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
800       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
801       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
802       vj++; tp = t + (*vj)*5;
803       v += 25;
804     }
805     tp    = t + k*5;
806     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
807     idx   = 5*r[k];
808     x[idx]     = x0;
809     x[idx+1]   = x1;
810     x[idx+2]   = x2;
811     x[idx+3]   = x3;
812     x[idx+4]   = x4;
813   }
814 
815   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
816   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
817   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
818   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
819   PetscFunctionReturn(0);
820 }
821 
822 #undef __FUNCT__
823 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering"
824 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
825 {
826   MatScalar      *v,*diag;
827   PetscScalar    *xp,x0,x1,x2,x3,x4;
828   PetscInt       nz,*vj,k;
829 
830   PetscFunctionBegin;
831   for (k=0; k<mbs; k++) {
832     v  = aa + 25*ai[k];
833     xp = x + k*5;
834     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
835     nz = ai[k+1] - ai[k];
836     vj = aj + ai[k];
837     xp = x + (*vj)*5;
838     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
839     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
840     while (nz--) {
841       /* x(:) += U(k,:)^T*(Dk*xk) */
842       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
843       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
844       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
845       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
846       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
847       vj++; xp = x + (*vj)*5;
848       v += 25;
849     }
850     /* xk = inv(Dk)*(Dk*xk) */
851     diag = aa+k*25;          /* ptr to inv(Dk) */
852     xp   = x + k*5;
853     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
854     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
855     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
856     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
857     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
858   }
859   PetscFunctionReturn(0);
860 }
861 
862 #undef __FUNCT__
863 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering"
864 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
865 {
866   MatScalar      *v;
867   PetscScalar    *xp,x0,x1,x2,x3,x4;
868   PetscInt       nz,*vj,k;
869 
870   PetscFunctionBegin;
871   for (k=mbs-1; k>=0; k--) {
872     v  = aa + 25*ai[k];
873     xp = x + k*5;
874     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
875     nz = ai[k+1] - ai[k];
876     vj = aj + ai[k];
877     xp = x + (*vj)*5;
878     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
879     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
880     while (nz--) {
881       /* xk += U(k,:)*x(:) */
882       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
883       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
884       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
885       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
886       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
887       vj++;
888       v += 25; xp = x + (*vj)*5;
889     }
890     xp = x + k*5;
891     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
892   }
893   PetscFunctionReturn(0);
894 }
895 
896 #undef __FUNCT__
897 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
898 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
899 {
900   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
901   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
902   MatScalar      *aa=a->a;
903   PetscScalar    *x,*b;
904   PetscErrorCode ierr;
905 
906   PetscFunctionBegin;
907   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
908   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
909 
910   /* solve U^T * D * y = b by forward substitution */
911   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
912   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
913 
914   /* solve U*x = y by back substitution */
915   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
916 
917   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
918   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
919   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
920   PetscFunctionReturn(0);
921 }
922 
923 #undef __FUNCT__
924 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
925 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
926 {
927   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
928   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
929   MatScalar      *aa=a->a;
930   PetscScalar    *x,*b;
931   PetscErrorCode ierr;
932 
933   PetscFunctionBegin;
934   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
935   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
936   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
937   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
938   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
939   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
940   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
941   PetscFunctionReturn(0);
942 }
943 
944 #undef __FUNCT__
945 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
946 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
947 {
948   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
949   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
950   MatScalar      *aa=a->a;
951   PetscScalar    *x,*b;
952   PetscErrorCode ierr;
953 
954   PetscFunctionBegin;
955   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
956   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
957   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
958   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
959   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
960   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
961   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
962   PetscFunctionReturn(0);
963 }
964 
965 #undef __FUNCT__
966 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_inplace"
967 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
968 {
969   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
970   IS             isrow=a->row;
971   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
972   PetscErrorCode ierr;
973   const PetscInt *r;
974   PetscInt       nz,*vj,k,idx;
975   MatScalar      *aa=a->a,*v,*diag;
976   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;
977 
978   PetscFunctionBegin;
979   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
980   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
981   t  = a->solve_work;
982   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
983 
984   /* solve U^T * D * y = b by forward substitution */
985   tp = t;
986   for (k=0; k<mbs; k++) { /* t <- perm(b) */
987     idx   = 4*r[k];
988     tp[0] = b[idx];
989     tp[1] = b[idx+1];
990     tp[2] = b[idx+2];
991     tp[3] = b[idx+3];
992     tp += 4;
993   }
994 
995   for (k=0; k<mbs; k++) {
996     v  = aa + 16*ai[k];
997     vj = aj + ai[k];
998     tp = t + k*4;
999     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1000     nz = ai[k+1] - ai[k];
1001 
1002     tp = t + (*vj)*4;
1003     while (nz--) {
1004       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1005       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1006       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1007       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1008       vj++; tp = t + (*vj)*4;
1009       v += 16;
1010     }
1011 
1012     /* xk = inv(Dk)*(Dk*xk) */
1013     diag  = aa+k*16;          /* ptr to inv(Dk) */
1014     tp    = t + k*4;
1015     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1016     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1017     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1018     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1019   }
1020 
1021   /* solve U*x = y by back substitution */
1022   for (k=mbs-1; k>=0; k--) {
1023     v  = aa + 16*ai[k];
1024     vj = aj + ai[k];
1025     tp    = t + k*4;
1026     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1027     nz = ai[k+1] - ai[k];
1028 
1029     tp = t + (*vj)*4;
1030     while (nz--) {
1031       /* xk += U(k,:)*x(:) */
1032       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1033       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1034       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1035       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1036       vj++; tp = t + (*vj)*4;
1037       v += 16;
1038     }
1039     tp    = t + k*4;
1040     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1041     idx        = 4*r[k];
1042     x[idx]     = x0;
1043     x[idx+1]   = x1;
1044     x[idx+2]   = x2;
1045     x[idx+3]   = x3;
1046   }
1047 
1048   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1049   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1050   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1051   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1052    PetscFunctionReturn(0);
1053 }
1054 
1055 #undef __FUNCT__
1056 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering"
1057 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1058 {
1059   MatScalar      *v,*diag;
1060   PetscScalar    *xp,x0,x1,x2,x3;
1061   PetscInt       nz,*vj,k;
1062 
1063   PetscFunctionBegin;
1064   for (k=0; k<mbs; k++) {
1065     v  = aa + 16*ai[k];
1066     xp = x + k*4;
1067     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1068     nz = ai[k+1] - ai[k];
1069     vj = aj + ai[k];
1070     xp = x + (*vj)*4;
1071     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1072     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1073     while (nz--) {
1074       /* x(:) += U(k,:)^T*(Dk*xk) */
1075       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1076       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1077       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1078       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1079       vj++; xp = x + (*vj)*4;
1080       v += 16;
1081     }
1082     /* xk = inv(Dk)*(Dk*xk) */
1083     diag = aa+k*16;          /* ptr to inv(Dk) */
1084     xp   = x + k*4;
1085     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1086     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1087     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1088     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1089   }
1090   PetscFunctionReturn(0);
1091 }
1092 
1093 #undef __FUNCT__
1094 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering"
1095 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1096 {
1097   MatScalar      *v;
1098   PetscScalar    *xp,x0,x1,x2,x3;
1099   PetscInt       nz,*vj,k;
1100 
1101   PetscFunctionBegin;
1102   for (k=mbs-1; k>=0; k--) {
1103     v  = aa + 16*ai[k];
1104     xp = x + k*4;
1105     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1106     nz = ai[k+1] - ai[k];
1107     vj = aj + ai[k];
1108     xp = x + (*vj)*4;
1109     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1110     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1111     while (nz--) {
1112       /* xk += U(k,:)*x(:) */
1113       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1114       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1115       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1116       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1117       vj++;
1118       v += 16; xp = x + (*vj)*4;
1119     }
1120     xp = x + k*4;
1121     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1122   }
1123   PetscFunctionReturn(0);
1124 }
1125 
1126 #undef __FUNCT__
1127 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1128 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1129 {
1130   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1131   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1132   MatScalar      *aa=a->a;
1133   PetscScalar    *x,*b;
1134   PetscErrorCode ierr;
1135 
1136   PetscFunctionBegin;
1137   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1138   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1139 
1140   /* solve U^T * D * y = b by forward substitution */
1141   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1142   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1143 
1144   /* solve U*x = y by back substitution */
1145   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1146   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1147   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1148   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1149   PetscFunctionReturn(0);
1150 }
1151 
1152 #undef __FUNCT__
1153 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1154 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1155 {
1156   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1157   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1158   MatScalar      *aa=a->a;
1159   PetscScalar    *x,*b;
1160   PetscErrorCode ierr;
1161 
1162   PetscFunctionBegin;
1163   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1164   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1165   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1166   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1168   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1169   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1170   PetscFunctionReturn(0);
1171 }
1172 
1173 #undef __FUNCT__
1174 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1175 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1176 {
1177   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1178   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1179   MatScalar      *aa=a->a;
1180   PetscScalar    *x,*b;
1181   PetscErrorCode ierr;
1182 
1183   PetscFunctionBegin;
1184   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1185   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1186   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1187   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1188   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1189   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1190   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1191   PetscFunctionReturn(0);
1192 }
1193 
1194 #undef __FUNCT__
1195 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_inplace"
1196 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1197 {
1198   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1199   IS             isrow=a->row;
1200   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1201   PetscErrorCode ierr;
1202   const PetscInt *r;
1203   PetscInt       nz,*vj,k,idx;
1204   MatScalar      *aa=a->a,*v,*diag;
1205   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;
1206 
1207   PetscFunctionBegin;
1208   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1209   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1210   t  = a->solve_work;
1211   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1212 
1213   /* solve U^T * D * y = b by forward substitution */
1214   tp = t;
1215   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1216     idx   = 3*r[k];
1217     tp[0] = b[idx];
1218     tp[1] = b[idx+1];
1219     tp[2] = b[idx+2];
1220     tp += 3;
1221   }
1222 
1223   for (k=0; k<mbs; k++) {
1224     v  = aa + 9*ai[k];
1225     vj = aj + ai[k];
1226     tp = t + k*3;
1227     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1228     nz = ai[k+1] - ai[k];
1229 
1230     tp = t + (*vj)*3;
1231     while (nz--) {
1232       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1233       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1234       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1235       vj++; tp = t + (*vj)*3;
1236       v += 9;
1237     }
1238 
1239     /* xk = inv(Dk)*(Dk*xk) */
1240     diag  = aa+k*9;          /* ptr to inv(Dk) */
1241     tp    = t + k*3;
1242     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1243     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1244     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1245   }
1246 
1247   /* solve U*x = y by back substitution */
1248   for (k=mbs-1; k>=0; k--) {
1249     v  = aa + 9*ai[k];
1250     vj = aj + ai[k];
1251     tp    = t + k*3;
1252     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1253     nz = ai[k+1] - ai[k];
1254 
1255     tp = t + (*vj)*3;
1256     while (nz--) {
1257       /* xk += U(k,:)*x(:) */
1258       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1259       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1260       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1261       vj++; tp = t + (*vj)*3;
1262       v += 9;
1263     }
1264     tp    = t + k*3;
1265     tp[0] = x0; tp[1] = x1; tp[2] = x2;
1266     idx      = 3*r[k];
1267     x[idx]   = x0;
1268     x[idx+1] = x1;
1269     x[idx+2] = x2;
1270   }
1271 
1272   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1273   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1274   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1275   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 #undef __FUNCT__
1280 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering"
1281 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1282 {
1283   MatScalar      *v,*diag;
1284   PetscScalar    *xp,x0,x1,x2;
1285   PetscInt       nz,*vj,k;
1286 
1287   PetscFunctionBegin;
1288   for (k=0; k<mbs; k++) {
1289     v  = aa + 9*ai[k];
1290     xp = x + k*3;
1291     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1292     nz = ai[k+1] - ai[k];
1293     vj = aj + ai[k];
1294     xp = x + (*vj)*3;
1295     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1296     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1297     while (nz--) {
1298       /* x(:) += U(k,:)^T*(Dk*xk) */
1299       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1300       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1301       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1302       vj++; xp = x + (*vj)*3;
1303       v += 9;
1304     }
1305     /* xk = inv(Dk)*(Dk*xk) */
1306     diag = aa+k*9;          /* ptr to inv(Dk) */
1307     xp   = x + k*3;
1308     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1309     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1310     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1311   }
1312   PetscFunctionReturn(0);
1313 }
1314 
1315 #undef __FUNCT__
1316 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering"
1317 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1318 {
1319   MatScalar      *v;
1320   PetscScalar    *xp,x0,x1,x2;
1321   PetscInt       nz,*vj,k;
1322 
1323   PetscFunctionBegin;
1324   for (k=mbs-1; k>=0; k--) {
1325     v  = aa + 9*ai[k];
1326     xp = x + k*3;
1327     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1328     nz = ai[k+1] - ai[k];
1329     vj = aj + ai[k];
1330     xp = x + (*vj)*3;
1331     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1332     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1333     while (nz--) {
1334       /* xk += U(k,:)*x(:) */
1335       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1336       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1337       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1338       vj++;
1339       v += 9; xp = x + (*vj)*3;
1340     }
1341     xp = x + k*3;
1342     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1343   }
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 #undef __FUNCT__
1348 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1349 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1350 {
1351   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1352   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1353   MatScalar      *aa=a->a;
1354   PetscScalar    *x,*b;
1355   PetscErrorCode ierr;
1356 
1357   PetscFunctionBegin;
1358   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1359   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1360 
1361   /* solve U^T * D * y = b by forward substitution */
1362   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1363   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1364 
1365   /* solve U*x = y by back substitution */
1366   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1367 
1368   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1369   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1370   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1371   PetscFunctionReturn(0);
1372 }
1373 
1374 #undef __FUNCT__
1375 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1376 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1377 {
1378   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1379   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1380   MatScalar      *aa=a->a;
1381   PetscScalar    *x,*b;
1382   PetscErrorCode ierr;
1383 
1384   PetscFunctionBegin;
1385   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1386   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1387   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1388   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1389   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1390   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1391   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1392   PetscFunctionReturn(0);
1393 }
1394 
1395 #undef __FUNCT__
1396 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1397 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1398 {
1399   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1400   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1401   MatScalar      *aa=a->a;
1402   PetscScalar    *x,*b;
1403   PetscErrorCode ierr;
1404 
1405   PetscFunctionBegin;
1406   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1407   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1408   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1409   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1410   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1411   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1412   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1413   PetscFunctionReturn(0);
1414 }
1415 
1416 #undef __FUNCT__
1417 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_inplace"
1418 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1419 {
1420   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ *)A->data;
1421   IS             isrow=a->row;
1422   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1423   PetscErrorCode ierr;
1424   const PetscInt *r;
1425   PetscInt       nz,*vj,k,k2,idx;
1426   MatScalar      *aa=a->a,*v,*diag;
1427   PetscScalar    *x,*b,x0,x1,*t;
1428 
1429   PetscFunctionBegin;
1430   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1431   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1432   t  = a->solve_work;
1433   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1434 
1435   /* solve U^T * D * y = perm(b) by forward substitution */
1436   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1437     idx = 2*r[k];
1438     t[k*2]   = b[idx];
1439     t[k*2+1] = b[idx+1];
1440   }
1441   for (k=0; k<mbs; k++) {
1442     v  = aa + 4*ai[k];
1443     vj = aj + ai[k];
1444     k2 = k*2;
1445     x0 = t[k2]; x1 = t[k2+1];
1446     nz = ai[k+1] - ai[k];
1447     while (nz--) {
1448       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1449       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1450       vj++; v += 4;
1451     }
1452     diag = aa+k*4;  /* ptr to inv(Dk) */
1453     t[k2]   = diag[0]*x0 + diag[2]*x1;
1454     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1455   }
1456 
1457   /* solve U*x = y by back substitution */
1458   for (k=mbs-1; k>=0; k--) {
1459     v  = aa + 4*ai[k];
1460     vj = aj + ai[k];
1461     k2 = k*2;
1462     x0 = t[k2]; x1 = t[k2+1];
1463     nz = ai[k+1] - ai[k];
1464     while (nz--) {
1465       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1466       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1467       vj++; v += 4;
1468     }
1469     t[k2]    = x0;
1470     t[k2+1]  = x1;
1471     idx      = 2*r[k];
1472     x[idx]   = x0;
1473     x[idx+1] = x1;
1474   }
1475 
1476   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1477   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1478   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1479   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1480   PetscFunctionReturn(0);
1481 }
1482 
1483 #undef __FUNCT__
1484 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering"
1485 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1486 {
1487   MatScalar      *v,*diag;
1488   PetscScalar    x0,x1;
1489   PetscInt       nz,*vj,k,k2;
1490 
1491   PetscFunctionBegin;
1492   for (k=0; k<mbs; k++) {
1493     v  = aa + 4*ai[k];
1494     vj = aj + ai[k];
1495     k2 = k*2;
1496     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1497     nz = ai[k+1] - ai[k];
1498     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1499     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1500     while (nz--) {
1501       /* x(:) += U(k,:)^T*(Dk*xk) */
1502       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1503       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1504       vj++; v += 4;
1505     }
1506     /* xk = inv(Dk)*(Dk*xk) */
1507     diag = aa+k*4;          /* ptr to inv(Dk) */
1508     x[k2]   = diag[0]*x0 + diag[2]*x1;
1509     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1510   }
1511   PetscFunctionReturn(0);
1512 }
1513 
1514 #undef __FUNCT__
1515 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering"
1516 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1517 {
1518   MatScalar      *v;
1519   PetscScalar    x0,x1;
1520   PetscInt       nz,*vj,k,k2;
1521 
1522   PetscFunctionBegin;
1523   for (k=mbs-1; k>=0; k--) {
1524     v  = aa + 4*ai[k];
1525     vj = aj + ai[k];
1526     k2 = k*2;
1527     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1528     nz = ai[k+1] - ai[k];
1529     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1530     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1531     while (nz--) {
1532       /* xk += U(k,:)*x(:) */
1533       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1534       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1535       vj++; v += 4;
1536     }
1537     x[k2]     = x0;
1538     x[k2+1]   = x1;
1539   }
1540   PetscFunctionReturn(0);
1541 }
1542 
1543 #undef __FUNCT__
1544 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1545 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1546 {
1547   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1548   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1549   MatScalar      *aa=a->a;
1550   PetscScalar    *x,*b;
1551   PetscErrorCode ierr;
1552 
1553   PetscFunctionBegin;
1554   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1555   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1556 
1557   /* solve U^T * D * y = b by forward substitution */
1558   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1559   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1560 
1561   /* solve U*x = y by back substitution */
1562   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1563 
1564   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1565   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1566   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1567   PetscFunctionReturn(0);
1568 }
1569 
1570 #undef __FUNCT__
1571 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1572 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1573 {
1574   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1575   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1576   MatScalar      *aa=a->a;
1577   PetscScalar    *x,*b;
1578   PetscErrorCode ierr;
1579 
1580   PetscFunctionBegin;
1581   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1583   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1584   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1585   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1586   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1587   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1588   PetscFunctionReturn(0);
1589 }
1590 
1591 #undef __FUNCT__
1592 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1593 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1594 {
1595   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1596   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1597   MatScalar      *aa=a->a;
1598   PetscScalar    *x,*b;
1599   PetscErrorCode ierr;
1600 
1601   PetscFunctionBegin;
1602   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1603   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1604   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1605   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1606   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1608   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
1609   PetscFunctionReturn(0);
1610 }
1611 
1612 #undef __FUNCT__
1613 #define __FUNCT__ "MatSolve_SeqSBAIJ_1"
1614 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1615 {
1616   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1617   IS                isrow=a->row;
1618   PetscErrorCode    ierr;
1619   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1620   const MatScalar   *aa=a->a,*v;
1621   const PetscScalar *b;
1622   PetscScalar       *x,xk,*t;
1623   PetscInt          nz,k,j;
1624 
1625   PetscFunctionBegin;
1626   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1627   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1628   t    = a->solve_work;
1629   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1630 
1631   /* solve U^T*D*y = perm(b) by forward substitution */
1632   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1633   for (k=0; k<mbs; k++) {
1634     v  = aa + ai[k];
1635     vj = aj + ai[k];
1636     xk = t[k];
1637     nz = ai[k+1] - ai[k] - 1;
1638     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1639     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1640   }
1641 
1642   /* solve U*perm(x) = y by back substitution */
1643   for (k=mbs-1; k>=0; k--) {
1644     v  = aa + adiag[k] - 1;
1645     vj = aj + adiag[k] - 1;
1646     nz = ai[k+1] - ai[k] - 1;
1647     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1648     x[rp[k]] = t[k];
1649   }
1650 
1651   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1652   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1653   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1654   ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr);
1655   PetscFunctionReturn(0);
1656 }
1657 
1658 #undef __FUNCT__
1659 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_inplace"
1660 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1661 {
1662   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1663   IS              isrow=a->row;
1664   PetscErrorCode  ierr;
1665   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1666   const MatScalar *aa=a->a,*v;
1667   PetscScalar     *x,*b,xk,*t;
1668   PetscInt        nz,k;
1669 
1670   PetscFunctionBegin;
1671   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1672   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1673   t    = a->solve_work;
1674   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1675 
1676   /* solve U^T*D*y = perm(b) by forward substitution */
1677   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1678   for (k=0; k<mbs; k++) {
1679     v  = aa + ai[k] + 1;
1680     vj = aj + ai[k] + 1;
1681     xk = t[k];
1682     nz = ai[k+1] - ai[k] - 1;
1683     while (nz--) t[*vj++] += (*v++) * xk;
1684     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1685   }
1686 
1687   /* solve U*perm(x) = y by back substitution */
1688   for (k=mbs-1; k>=0; k--) {
1689     v  = aa + ai[k] + 1;
1690     vj = aj + ai[k] + 1;
1691     nz = ai[k+1] - ai[k] - 1;
1692     while (nz--) t[k] += (*v++) * t[*vj++];
1693     x[rp[k]] = t[k];
1694   }
1695 
1696   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1697   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1699   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1700   PetscFunctionReturn(0);
1701 }
1702 
1703 #undef __FUNCT__
1704 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1"
1705 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1706 {
1707   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1708   IS              isrow=a->row;
1709   PetscErrorCode  ierr;
1710   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1711   const MatScalar *aa=a->a,*v;
1712   PetscReal       diagk;
1713   PetscScalar     *x,*b,xk;
1714   PetscInt        nz,k;
1715 
1716   PetscFunctionBegin;
1717   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1718   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1719   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1720   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1721 
1722   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1723   for (k=0; k<mbs; k++) {
1724     v  = aa + ai[k];
1725     vj = aj + ai[k];
1726     xk = x[k];
1727     nz = ai[k+1] - ai[k] - 1;
1728     while (nz--) x[*vj++] += (*v++) * xk;
1729 
1730     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1731     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1732     x[k] = xk*PetscSqrtReal(diagk);
1733   }
1734   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1735   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1736   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1737   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1738   PetscFunctionReturn(0);
1739 }
1740 
1741 #undef __FUNCT__
1742 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_inplace"
1743 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1744 {
1745   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1746   IS              isrow=a->row;
1747   PetscErrorCode  ierr;
1748   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1749   const MatScalar *aa=a->a,*v;
1750   PetscReal       diagk;
1751   PetscScalar     *x,*b,xk;
1752   PetscInt        nz,k;
1753 
1754   PetscFunctionBegin;
1755   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1756   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1757   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1758   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1759 
1760   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1761   for (k=0; k<mbs; k++) {
1762     v  = aa + ai[k] + 1;
1763     vj = aj + ai[k] + 1;
1764     xk = x[k];
1765     nz = ai[k+1] - ai[k] - 1;
1766     while (nz--) x[*vj++] += (*v++) * xk;
1767 
1768     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1769     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1770     x[k] = xk*PetscSqrtReal(diagk);
1771   }
1772   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1773   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1774   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1775   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1776   PetscFunctionReturn(0);
1777 }
1778 
1779 #undef __FUNCT__
1780 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1"
1781 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1782 {
1783   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1784   IS              isrow=a->row;
1785   PetscErrorCode  ierr;
1786   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1787   const MatScalar *aa=a->a,*v;
1788   PetscReal       diagk;
1789   PetscScalar     *x,*b,*t;
1790   PetscInt        nz,k;
1791 
1792   PetscFunctionBegin;
1793   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1794   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1796   t    = a->solve_work;
1797   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1798 
1799   for (k=mbs-1; k>=0; k--) {
1800     v  = aa + ai[k];
1801     vj = aj + ai[k];
1802     diagk = PetscRealPart(aa[adiag[k]]);
1803     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1804     t[k] = b[k] * PetscSqrtReal(diagk);
1805     nz = ai[k+1] - ai[k] - 1;
1806     while (nz--) t[k] += (*v++) * t[*vj++];
1807     x[rp[k]] = t[k];
1808   }
1809   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1810   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1811   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1812   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1813   PetscFunctionReturn(0);
1814 }
1815 
1816 #undef __FUNCT__
1817 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_inplace"
1818 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1819 {
1820   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1821   IS              isrow=a->row;
1822   PetscErrorCode  ierr;
1823   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1824   const MatScalar *aa=a->a,*v;
1825   PetscReal       diagk;
1826   PetscScalar     *x,*b,*t;
1827   PetscInt        nz,k;
1828 
1829   PetscFunctionBegin;
1830   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1831   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1832   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1833   t    = a->solve_work;
1834   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1835 
1836   for (k=mbs-1; k>=0; k--) {
1837     v  = aa + ai[k] + 1;
1838     vj = aj + ai[k] + 1;
1839     diagk = PetscRealPart(aa[ai[k]]);
1840     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1841     t[k] = b[k] * PetscSqrtReal(diagk);
1842     nz = ai[k+1] - ai[k] - 1;
1843     while (nz--) t[k] += (*v++) * t[*vj++];
1844     x[rp[k]] = t[k];
1845   }
1846   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1847   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1848   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1849   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1850   PetscFunctionReturn(0);
1851 }
1852 
1853 #undef __FUNCT__
1854 #define __FUNCT__ "MatSolves_SeqSBAIJ_1"
1855 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1856 {
1857   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1858   PetscErrorCode ierr;
1859 
1860   PetscFunctionBegin;
1861   if (A->rmap->bs == 1) {
1862     ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr);
1863   } else {
1864     IS              isrow=a->row;
1865     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1866     const MatScalar *aa=a->a,*v;
1867     PetscScalar     *x,*b,*t;
1868     PetscInt        nz,k,n,i,j;
1869     if (bb->n > a->solves_work_n) {
1870       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1871       ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr);
1872       a->solves_work_n = bb->n;
1873     }
1874     n    = bb->n;
1875     ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr);
1876     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1877     t    = a->solves_work;
1878 
1879     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1880 
1881     /* solve U^T*D*y = perm(b) by forward substitution */
1882     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1883     for (k=0; k<mbs; k++) {
1884       v  = aa + ai[k];
1885       vj = aj + ai[k];
1886       nz = ai[k+1] - ai[k] - 1;
1887       for (j=0; j<nz; j++) {
1888         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1889         v++;vj++;
1890       }
1891       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1892     }
1893 
1894     /* solve U*perm(x) = y by back substitution */
1895     for (k=mbs-1; k>=0; k--) {
1896       v  = aa + ai[k] - 1;
1897       vj = aj + ai[k] - 1;
1898       nz = ai[k+1] - ai[k] - 1;
1899       for (j=0; j<nz; j++) {
1900         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1901         v++;vj++;
1902       }
1903       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1904     }
1905 
1906     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1907     ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr);
1908     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1909     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1910   }
1911   PetscFunctionReturn(0);
1912 }
1913 
1914 #undef __FUNCT__
1915 #define __FUNCT__ "MatSolves_SeqSBAIJ_1_inplace"
1916 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1917 {
1918   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1919   PetscErrorCode ierr;
1920 
1921   PetscFunctionBegin;
1922   if (A->rmap->bs == 1) {
1923     ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr);
1924   } else {
1925     IS              isrow=a->row;
1926     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1927     const MatScalar *aa=a->a,*v;
1928     PetscScalar     *x,*b,*t;
1929     PetscInt        nz,k,n,i;
1930     if (bb->n > a->solves_work_n) {
1931       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1932       ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr);
1933       a->solves_work_n = bb->n;
1934     }
1935     n    = bb->n;
1936     ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr);
1937     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1938     t    = a->solves_work;
1939 
1940     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1941 
1942     /* solve U^T*D*y = perm(b) by forward substitution */
1943     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1944     for (k=0; k<mbs; k++) {
1945       v  = aa + ai[k];
1946       vj = aj + ai[k];
1947       nz = ai[k+1] - ai[k];
1948       while (nz--) {
1949         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1950         v++;vj++;
1951       }
1952       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1953     }
1954 
1955     /* solve U*perm(x) = y by back substitution */
1956     for (k=mbs-1; k>=0; k--) {
1957       v  = aa + ai[k];
1958       vj = aj + ai[k];
1959       nz = ai[k+1] - ai[k];
1960       while (nz--) {
1961         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1962         v++;vj++;
1963       }
1964       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1965     }
1966 
1967     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1968     ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr);
1969     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1970     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1971   }
1972   PetscFunctionReturn(0);
1973 }
1974 
1975 #undef __FUNCT__
1976 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering"
1977 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1978 {
1979   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1980   PetscErrorCode    ierr;
1981   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1982   const MatScalar   *aa=a->a,*v;
1983   const PetscScalar *b;
1984   PetscScalar       *x,xi;
1985   PetscInt          nz,i,j;
1986 
1987   PetscFunctionBegin;
1988   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1989   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1990 
1991   /* solve U^T*D*y = b by forward substitution */
1992   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1993   for (i=0; i<mbs; i++) {
1994     v  = aa + ai[i];
1995     vj = aj + ai[i];
1996     xi = x[i];
1997     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1998     for (j=0; j<nz; j++) {
1999       x[vj[j]] += v[j]* xi;
2000     }
2001     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2002   }
2003 
2004   /* solve U*x = y by backward substitution */
2005   for (i=mbs-2; i>=0; i--) {
2006     xi = x[i];
2007     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2008     vj = aj + adiag[i] - 1;
2009     nz = ai[i+1] - ai[i] - 1;
2010     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2011     x[i] = xi;
2012   }
2013 
2014   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2015   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2016   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2017   PetscFunctionReturn(0);
2018 }
2019 
2020 #undef __FUNCT__
2021 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2022 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2023 {
2024   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2025   PetscErrorCode ierr;
2026   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2027   MatScalar      *aa=a->a,*v;
2028   PetscScalar    *x,*b,xk;
2029   PetscInt       nz,*vj,k;
2030 
2031   PetscFunctionBegin;
2032   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2033   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2034 
2035   /* solve U^T*D*y = b by forward substitution */
2036   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2037   for (k=0; k<mbs; k++) {
2038     v  = aa + ai[k] + 1;
2039     vj = aj + ai[k] + 1;
2040     xk = x[k];
2041     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2042     while (nz--) x[*vj++] += (*v++) * xk;
2043     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2044   }
2045 
2046   /* solve U*x = y by back substitution */
2047   for (k=mbs-2; k>=0; k--) {
2048     v  = aa + ai[k] + 1;
2049     vj = aj + ai[k] + 1;
2050     xk = x[k];
2051     nz = ai[k+1] - ai[k] - 1;
2052     while (nz--) {
2053       xk += (*v++) * x[*vj++];
2054     }
2055     x[k] = xk;
2056   }
2057 
2058   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2059   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2060   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2061   PetscFunctionReturn(0);
2062 }
2063 
2064 #undef __FUNCT__
2065 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering"
2066 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2067 {
2068   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
2069   PetscErrorCode  ierr;
2070   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2071   const MatScalar *aa=a->a,*v;
2072   PetscReal       diagk;
2073   PetscScalar     *x,*b;
2074   PetscInt        nz,*vj,k;
2075 
2076   PetscFunctionBegin;
2077   /* solve U^T*D^(1/2)*x = b by forward substitution */
2078   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2079   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2080   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2081   for (k=0; k<mbs; k++) {
2082     v  = aa + ai[k];
2083     vj = aj + ai[k];
2084     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2085     while (nz--) x[*vj++] += (*v++) * x[k];
2086     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2087     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2088     x[k] *= PetscSqrtReal(diagk);
2089   }
2090   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2091   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2092   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2093   PetscFunctionReturn(0);
2094 }
2095 
2096 #undef __FUNCT__
2097 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2098 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2099 {
2100   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2101   PetscErrorCode ierr;
2102   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2103   MatScalar      *aa=a->a,*v;
2104   PetscReal      diagk;
2105   PetscScalar    *x,*b;
2106   PetscInt       nz,*vj,k;
2107 
2108   PetscFunctionBegin;
2109   /* solve U^T*D^(1/2)*x = b by forward substitution */
2110   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2111   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2112   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2113   for (k=0; k<mbs; k++) {
2114     v  = aa + ai[k] + 1;
2115     vj = aj + ai[k] + 1;
2116     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2117     while (nz--) x[*vj++] += (*v++) * x[k];
2118     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2119     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2120     x[k] *= PetscSqrtReal(diagk);
2121   }
2122   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2123   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2124   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2125   PetscFunctionReturn(0);
2126 }
2127 
2128 #undef __FUNCT__
2129 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering"
2130 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2131 {
2132   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2133   PetscErrorCode ierr;
2134   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2135   MatScalar      *aa=a->a,*v;
2136   PetscReal      diagk;
2137   PetscScalar    *x,*b;
2138   PetscInt       nz,*vj,k;
2139 
2140   PetscFunctionBegin;
2141   /* solve D^(1/2)*U*x = b by back substitution */
2142   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2143   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2144 
2145   for (k=mbs-1; k>=0; k--) {
2146     v  = aa + ai[k];
2147     vj = aj + ai[k];
2148     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2149     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2150     x[k] = PetscSqrtReal(diagk)*b[k];
2151     nz = ai[k+1] - ai[k] - 1;
2152     while (nz--) x[k] += (*v++) * x[*vj++];
2153   }
2154   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2155   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2156   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2157   PetscFunctionReturn(0);
2158 }
2159 
2160 #undef __FUNCT__
2161 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2162 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2163 {
2164   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2165   PetscErrorCode ierr;
2166   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2167   MatScalar      *aa=a->a,*v;
2168   PetscReal      diagk;
2169   PetscScalar    *x,*b;
2170   PetscInt       nz,*vj,k;
2171 
2172   PetscFunctionBegin;
2173   /* solve D^(1/2)*U*x = b by back substitution */
2174   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2175   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2176 
2177   for (k=mbs-1; k>=0; k--) {
2178     v  = aa + ai[k] + 1;
2179     vj = aj + ai[k] + 1;
2180     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2181     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2182     x[k] = PetscSqrtReal(diagk)*b[k];
2183     nz = ai[k+1] - ai[k] - 1;
2184     while (nz--) x[k] += (*v++) * x[*vj++];
2185   }
2186   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2187   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2188   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2189   PetscFunctionReturn(0);
2190 }
2191 
2192 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2193 #undef __FUNCT__
2194 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_MSR"
2195 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2196 {
2197   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2198   PetscErrorCode ierr;
2199   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2200   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2201   PetscInt       m,reallocs = 0,*levtmp;
2202   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2203   PetscInt       incrlev,*lev,shift,prow,nz;
2204   PetscReal      f = info->fill,levels = info->levels;
2205   PetscBool      perm_identity;
2206 
2207   PetscFunctionBegin;
2208   /* check whether perm is the identity mapping */
2209   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2210 
2211   if (perm_identity) {
2212     a->permute = PETSC_FALSE;
2213     ai = a->i; aj = a->j;
2214   } else { /*  non-trivial permutation */
2215     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2216     a->permute = PETSC_TRUE;
2217     ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr);
2218     ai = a->inew; aj = a->jnew;
2219   }
2220 
2221   /* initialization */
2222   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2223   umax  = (PetscInt)(f*ai[mbs] + 1);
2224   ierr  = PetscMalloc(umax*sizeof(PetscInt),&lev);CHKERRQ(ierr);
2225   umax += mbs + 1;
2226   shift = mbs + 1;
2227   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
2228   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
2229   iu[0] = mbs + 1;
2230   juidx = mbs + 1;
2231   /* prowl: linked list for pivot row */
2232   ierr    = PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);CHKERRQ(ierr);
2233   /* q: linked list for col index */
2234 
2235   for (i=0; i<mbs; i++) {
2236     prowl[i] = mbs;
2237     q[i] = 0;
2238   }
2239 
2240   /* for each row k */
2241   for (k=0; k<mbs; k++) {
2242     nzk  = 0;
2243     q[k] = mbs;
2244     /* copy current row into linked list */
2245     nz = ai[rip[k]+1] - ai[rip[k]];
2246     j = ai[rip[k]];
2247     while (nz--) {
2248       vj = rip[aj[j++]];
2249       if (vj > k) {
2250         qm = k;
2251         do {
2252           m  = qm; qm = q[m];
2253         } while (qm < vj);
2254         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2255         nzk++;
2256         q[m]       = vj;
2257         q[vj]      = qm;
2258         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2259       }
2260     }
2261 
2262     /* modify nonzero structure of k-th row by computing fill-in
2263        for each row prow to be merged in */
2264     prow = k;
2265     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2266 
2267     while (prow < k) {
2268       /* merge row prow into k-th row */
2269       jmin = iu[prow] + 1;
2270       jmax = iu[prow+1];
2271       qm = k;
2272       for (j=jmin; j<jmax; j++) {
2273         incrlev = lev[j-shift] + 1;
2274         if (incrlev > levels) continue;
2275         vj      = ju[j];
2276         do {
2277           m = qm; qm = q[m];
2278         } while (qm < vj);
2279         if (qm != vj) {      /* a fill */
2280           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2281           levtmp[vj] = incrlev;
2282         } else {
2283           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2284         }
2285       }
2286       prow = prowl[prow]; /* next pivot row */
2287     }
2288 
2289     /* add k to row list for first nonzero element in k-th row */
2290     if (nzk > 1) {
2291       i = q[k]; /* col value of first nonzero element in k_th row of U */
2292       prowl[k] = prowl[i]; prowl[i] = k;
2293     }
2294     iu[k+1] = iu[k] + nzk;
2295 
2296     /* allocate more space to ju and lev if needed */
2297     if (iu[k+1] > umax) {
2298       /* estimate how much additional space we will need */
2299       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2300       /* just double the memory each time */
2301       maxadd = umax;
2302       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2303       umax += maxadd;
2304 
2305       /* allocate a longer ju */
2306       ierr     = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
2307       ierr     = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
2308       ierr     = PetscFree(ju);CHKERRQ(ierr);
2309       ju       = jutmp;
2310 
2311       ierr     = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
2312       ierr     = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr);
2313       ierr     = PetscFree(lev);CHKERRQ(ierr);
2314       lev      = jutmp;
2315       reallocs += 2; /* count how many times we realloc */
2316     }
2317 
2318     /* save nonzero structure of k-th row in ju */
2319     i=k;
2320     while (nzk --) {
2321       i                = q[i];
2322       ju[juidx]        = i;
2323       lev[juidx-shift] = levtmp[i];
2324       juidx++;
2325     }
2326   }
2327 
2328 #if defined(PETSC_USE_INFO)
2329   if (ai[mbs] != 0) {
2330     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2331     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
2332     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2333     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
2334     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
2335   } else {
2336     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2337   }
2338 #endif
2339 
2340   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2341   ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr);
2342   ierr = PetscFree(lev);CHKERRQ(ierr);
2343 
2344   /* put together the new matrix */
2345   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);CHKERRQ(ierr);
2346 
2347   /* ierr = PetscLogObjectParent(B,iperm);CHKERRQ(ierr); */
2348   b    = (Mat_SeqSBAIJ*)(B)->data;
2349   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2350   b->singlemalloc = PETSC_FALSE;
2351   b->free_a       = PETSC_TRUE;
2352   b->free_ij      = PETSC_TRUE;
2353   /* the next line frees the default space generated by the Create() */
2354   ierr    = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
2355   ierr    = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
2356   b->j    = ju;
2357   b->i    = iu;
2358   b->diag = 0;
2359   b->ilen = 0;
2360   b->imax = 0;
2361 
2362   ierr = ISDestroy(&b->row);CHKERRQ(ierr);
2363   ierr = ISDestroy(&b->icol);CHKERRQ(ierr);
2364   b->row  = perm;
2365   b->icol = perm;
2366   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2367   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2368   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2369   /* In b structure:  Free imax, ilen, old a, old j.
2370      Allocate idnew, solve_work, new a, new j */
2371   ierr    = PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2372   b->maxnz = b->nz = iu[mbs];
2373 
2374   (B)->info.factor_mallocs    = reallocs;
2375   (B)->info.fill_ratio_given  = f;
2376   if (ai[mbs] != 0) {
2377     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2378   } else {
2379     (B)->info.fill_ratio_needed = 0.0;
2380   }
2381   ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr);
2382   PetscFunctionReturn(0);
2383 }
2384 
2385 /*
2386   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2387 */
2388 #include <petscbt.h>
2389 #include <../src/mat/utils/freespace.h>
2390 #undef __FUNCT__
2391 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ"
2392 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2393 {
2394   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2395   PetscErrorCode     ierr;
2396   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2397   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2398   const PetscInt     *rip;
2399   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2400   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2401   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2402   PetscReal          fill=info->fill,levels=info->levels;
2403   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2404   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2405   PetscBT            lnkbt;
2406 
2407   PetscFunctionBegin;
2408   if (bs > 1) {
2409     ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
2410     PetscFunctionReturn(0);
2411   }
2412   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2413   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2414   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2415 
2416   /* check whether perm is the identity mapping */
2417   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2418   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2419   a->permute = PETSC_FALSE;
2420 
2421   ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
2422   ierr = PetscMalloc((am+1)*sizeof(PetscInt),&udiag);CHKERRQ(ierr);
2423   ui[0] = 0;
2424 
2425   /* ICC(0) without matrix ordering: simply rearrange column indices */
2426   if (!levels) {
2427     /* reuse the column pointers and row offsets for memory savings */
2428     for (i=0; i<am; i++) {
2429       ncols    = ai[i+1] - ai[i];
2430       ui[i+1]  = ui[i] + ncols;
2431       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2432     }
2433     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2434     cols = uj;
2435     for (i=0; i<am; i++) {
2436       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2437       ncols = ai[i+1] - ai[i] -1;
2438       for (j=0; j<ncols; j++) *cols++ = aj[j];
2439       *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2440     }
2441   } else { /* case: levels>0 */
2442     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2443 
2444     /* initialization */
2445     /* jl: linked list for storing indices of the pivot rows
2446        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2447     ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr);
2448     for (i=0; i<am; i++) {
2449       jl[i] = am; il[i] = 0;
2450     }
2451 
2452     /* create and initialize a linked list for storing column indices of the active row k */
2453     nlnk = am + 1;
2454     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2455 
2456     /* initial FreeSpace size is fill*(ai[am]+1) */
2457     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2458     current_space = free_space;
2459     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2460     current_space_lvl = free_space_lvl;
2461 
2462     for (k=0; k<am; k++) {  /* for each active row k */
2463       /* initialize lnk by the column indices of row k */
2464       nzk   = 0;
2465       ncols = ai[k+1] - ai[k];
2466       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2467       cols  = aj+ai[k];
2468       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2469       nzk += nlnk;
2470 
2471       /* update lnk by computing fill-in for each pivot row to be merged in */
2472       prow = jl[k]; /* 1st pivot row */
2473 
2474       while (prow < k) {
2475         nextprow = jl[prow];
2476 
2477         /* merge prow into k-th row */
2478         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2479         jmax = ui[prow+1];
2480         ncols = jmax-jmin;
2481         i     = jmin - ui[prow];
2482 
2483         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2484         uj    = uj_lvl_ptr[prow] + i; /* levels of cols */
2485         j     = *(uj - 1);
2486         ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2487         nzk += nlnk;
2488 
2489         /* update il and jl for prow */
2490         if (jmin < jmax) {
2491           il[prow] = jmin;
2492           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2493         }
2494         prow = nextprow;
2495       }
2496 
2497       /* if free space is not available, make more free space */
2498       if (current_space->local_remaining<nzk) {
2499         i = am - k + 1; /* num of unfactored rows */
2500         i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2501         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2502         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2503         reallocs++;
2504       }
2505 
2506       /* copy data into free_space and free_space_lvl, then initialize lnk */
2507       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2508       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2509 
2510       /* add the k-th row into il and jl */
2511       if (nzk > 1) {
2512         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2513         jl[k] = jl[i]; jl[i] = k;
2514         il[k] = ui[k] + 1;
2515       }
2516       uj_ptr[k]     = current_space->array;
2517       uj_lvl_ptr[k] = current_space_lvl->array;
2518 
2519       current_space->array           += nzk;
2520       current_space->local_used      += nzk;
2521       current_space->local_remaining -= nzk;
2522       current_space_lvl->array           += nzk;
2523       current_space_lvl->local_used      += nzk;
2524       current_space_lvl->local_remaining -= nzk;
2525 
2526       ui[k+1] = ui[k] + nzk;
2527     }
2528 
2529     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2530     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2531 
2532     /* destroy list of free space and other temporary array(s) */
2533     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2534     ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor  */
2535     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2536     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2537 
2538   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2539 
2540   /* put together the new matrix in MATSEQSBAIJ format */
2541   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
2542 
2543   b = (Mat_SeqSBAIJ*)(fact)->data;
2544   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2545   b->singlemalloc = PETSC_FALSE;
2546   b->free_a  = PETSC_TRUE;
2547   b->free_ij = free_ij;
2548   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
2549   b->j     = uj;
2550   b->i     = ui;
2551   b->diag  = udiag;
2552   b->free_diag = PETSC_TRUE;
2553   b->ilen  = 0;
2554   b->imax  = 0;
2555   b->row   = perm;
2556   b->col   = perm;
2557   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2558   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2559   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2560   ierr = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2561   ierr = PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2562   b->maxnz = b->nz = ui[am];
2563 
2564   fact->info.factor_mallocs    = reallocs;
2565   fact->info.fill_ratio_given  = fill;
2566   if (ai[am] != 0) {
2567     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2568   } else {
2569     fact->info.fill_ratio_needed = 0.0;
2570   }
2571 #if defined(PETSC_USE_INFO)
2572     if (ai[am] != 0) {
2573       PetscReal af = fact->info.fill_ratio_needed;
2574       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
2575       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2576       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
2577     } else {
2578       ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2579     }
2580 #endif
2581   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2582   PetscFunctionReturn(0);
2583 }
2584 
2585 #undef __FUNCT__
2586 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_inplace"
2587 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2588 {
2589   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2590   Mat_SeqSBAIJ       *b;
2591   PetscErrorCode     ierr;
2592   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2593   PetscInt           bs=A->rmap->bs,am=a->mbs,d;
2594   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2595   PetscInt           reallocs=0,i,*ui;
2596   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2597   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2598   PetscReal          fill=info->fill,levels=info->levels,ratio_needed;
2599   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2600   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2601   PetscBT            lnkbt;
2602 
2603   PetscFunctionBegin;
2604   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2605   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2606 
2607   /*
2608    This code originally uses Modified Sparse Row (MSR) storage
2609    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2610    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2611    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2612    thus the original code in MSR format is still used for these cases.
2613    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2614    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2615   */
2616   if (bs > 1) {
2617     ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
2618     PetscFunctionReturn(0);
2619   }
2620 
2621   /* check whether perm is the identity mapping */
2622   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2623   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2624   a->permute = PETSC_FALSE;
2625 
2626   /* special case that simply copies fill pattern */
2627   if (!levels) {
2628     /* reuse the column pointers and row offsets for memory savings */
2629     ui           = a->i;
2630     uj           = a->j;
2631     free_ij      = PETSC_FALSE;
2632     ratio_needed = 1.0;
2633   } else { /* case: levels>0 */
2634     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2635 
2636     /* initialization */
2637     ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
2638     ui[0] = 0;
2639 
2640     /* jl: linked list for storing indices of the pivot rows
2641        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2642     ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr);
2643     for (i=0; i<am; i++) {
2644       jl[i] = am; il[i] = 0;
2645     }
2646 
2647     /* create and initialize a linked list for storing column indices of the active row k */
2648     nlnk = am + 1;
2649     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2650 
2651     /* initial FreeSpace size is fill*(ai[am]+1) */
2652     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2653     current_space = free_space;
2654     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2655     current_space_lvl = free_space_lvl;
2656 
2657     for (k=0; k<am; k++) {  /* for each active row k */
2658       /* initialize lnk by the column indices of row rip[k] */
2659       nzk   = 0;
2660       ncols = ai[rip[k]+1] - ai[rip[k]];
2661       cols  = aj+ai[rip[k]];
2662       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2663       nzk += nlnk;
2664 
2665       /* update lnk by computing fill-in for each pivot row to be merged in */
2666       prow = jl[k]; /* 1st pivot row */
2667 
2668       while (prow < k) {
2669         nextprow = jl[prow];
2670 
2671         /* merge prow into k-th row */
2672         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2673         jmax = ui[prow+1];
2674         ncols = jmax-jmin;
2675         i     = jmin - ui[prow];
2676         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2677         j     = *(uj_lvl_ptr[prow] + i - 1);
2678         cols_lvl = uj_lvl_ptr[prow]+i;
2679         ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2680         nzk += nlnk;
2681 
2682         /* update il and jl for prow */
2683         if (jmin < jmax) {
2684           il[prow] = jmin;
2685           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2686         }
2687         prow = nextprow;
2688       }
2689 
2690       /* if free space is not available, make more free space */
2691       if (current_space->local_remaining<nzk) {
2692         i = am - k + 1; /* num of unfactored rows */
2693         i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2694         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2695         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2696         reallocs++;
2697       }
2698 
2699       /* copy data into free_space and free_space_lvl, then initialize lnk */
2700       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2701 
2702       /* add the k-th row into il and jl */
2703       if (nzk-1 > 0) {
2704         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2705         jl[k] = jl[i]; jl[i] = k;
2706         il[k] = ui[k] + 1;
2707       }
2708       uj_ptr[k]     = current_space->array;
2709       uj_lvl_ptr[k] = current_space_lvl->array;
2710 
2711       current_space->array           += nzk;
2712       current_space->local_used      += nzk;
2713       current_space->local_remaining -= nzk;
2714       current_space_lvl->array           += nzk;
2715       current_space_lvl->local_used      += nzk;
2716       current_space_lvl->local_remaining -= nzk;
2717 
2718       ui[k+1] = ui[k] + nzk;
2719     }
2720 
2721     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2722     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2723 
2724     /* destroy list of free space and other temporary array(s) */
2725     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2726     ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
2727     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2728     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2729     if (ai[am] != 0) {
2730       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2731     } else {
2732       ratio_needed = 0.0;
2733     }
2734   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2735 
2736   /* put together the new matrix in MATSEQSBAIJ format */
2737   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
2738 
2739   b = (Mat_SeqSBAIJ*)(fact)->data;
2740   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2741   b->singlemalloc = PETSC_FALSE;
2742   b->free_a  = PETSC_TRUE;
2743   b->free_ij = free_ij;
2744   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
2745   b->j    = uj;
2746   b->i    = ui;
2747   b->diag    = 0;
2748   b->ilen    = 0;
2749   b->imax    = 0;
2750   b->row     = perm;
2751   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2752   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2753   b->icol = perm;
2754   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2755   ierr    = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2756   b->maxnz = b->nz = ui[am];
2757 
2758   fact->info.factor_mallocs    = reallocs;
2759   fact->info.fill_ratio_given  = fill;
2760   fact->info.fill_ratio_needed = ratio_needed;
2761 #if defined(PETSC_USE_INFO)
2762     if (ai[am] != 0) {
2763       PetscReal af = fact->info.fill_ratio_needed;
2764       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
2765       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2766       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
2767     } else {
2768       ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2769     }
2770 #endif
2771   if (perm_identity) {
2772     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2773   } else {
2774     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2775   }
2776   PetscFunctionReturn(0);
2777 }
2778 
2779