xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision 7d5fd1e4d9337468ad3f05b65b7facdcd2dfd2a4)
1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 
10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
11 {
12   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
13   IS                isrow=a->row;
14   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
15   PetscErrorCode    ierr;
16   const PetscInt    *r;
17   PetscInt          nz,*vj,k,idx,k1;
18   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
19   const MatScalar   *aa=a->a,*v,*diag;
20   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
21   const PetscScalar *b;
22 
23   PetscFunctionBegin;
24   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26   t    = a->solve_work;
27   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
28   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
29 
30   /* solve U^T * D * y = b by forward substitution */
31   xk = t;
32   for (k=0; k<mbs; k++) { /* t <- perm(b) */
33     idx = bs*r[k];
34     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
35   }
36   for (k=0; k<mbs; k++) {
37     v    = aa + bs2*ai[k];
38     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
39     ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */
40     nz   = ai[k+1] - ai[k];
41     vj   = aj + ai[k];
42     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
43     while (nz--) {
44       /* x(:) += U(k,:)^T*(Dk*xk) */
45       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
46       vj++; xj = t + (*vj)*bs;
47       v       += bs2;
48     }
49     /* xk = inv(Dk)*(Dk*xk) */
50     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
51     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
52   }
53 
54   /* solve U*x = y by back substitution */
55   for (k=mbs-1; k>=0; k--) {
56     v  = aa + bs2*ai[k];
57     xk = t + k*bs;        /* xk */
58     nz = ai[k+1] - ai[k];
59     vj = aj + ai[k];
60     xj = t + (*vj)*bs;
61     while (nz--) {
62       /* xk += U(k,:)*x(:) */
63       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
64       vj++;
65       v += bs2; xj = t + (*vj)*bs;
66     }
67     idx = bs*r[k];
68     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
69   }
70 
71   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
72   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
73   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
74   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
75   ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr);
76   PetscFunctionReturn(0);
77 }
78 
79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
80 {
81   PetscFunctionBegin;
82   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
83 }
84 
85 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
86 {
87   PetscFunctionBegin;
88   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
89 }
90 
91 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
92 {
93   PetscErrorCode  ierr;
94   PetscInt        nz,k;
95   const PetscInt  *vj,bs2 = bs*bs;
96   const MatScalar *v,*diag;
97   PetscScalar     *xk,*xj,*xk_tmp;
98 
99   PetscFunctionBegin;
100   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
101   for (k=0; k<mbs; k++) {
102     v    = aa + bs2*ai[k];
103     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
104     ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */
105     nz   = ai[k+1] - ai[k];
106     vj   = aj + ai[k];
107     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
108     while (nz--) {
109       /* x(:) += U(k,:)^T*(Dk*xk) */
110       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
111       vj++; xj = x + (*vj)*bs;
112       v       += bs2;
113     }
114     /* xk = inv(Dk)*(Dk*xk) */
115     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
116     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
117   }
118   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
123 {
124   PetscInt        nz,k;
125   const PetscInt  *vj,bs2 = bs*bs;
126   const MatScalar *v;
127   PetscScalar     *xk,*xj;
128 
129   PetscFunctionBegin;
130   for (k=mbs-1; k>=0; k--) {
131     v  = aa + bs2*ai[k];
132     xk = x + k*bs;        /* xk */
133     nz = ai[k+1] - ai[k];
134     vj = aj + ai[k];
135     xj = x + (*vj)*bs;
136     while (nz--) {
137       /* xk += U(k,:)*x(:) */
138       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
139       vj++;
140       v += bs2; xj = x + (*vj)*bs;
141     }
142   }
143   PetscFunctionReturn(0);
144 }
145 
146 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
147 {
148   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
149   PetscErrorCode    ierr;
150   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
151   PetscInt          bs =A->rmap->bs;
152   const MatScalar   *aa=a->a;
153   PetscScalar       *x;
154   const PetscScalar *b;
155 
156   PetscFunctionBegin;
157   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
158   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
159 
160   /* solve U^T * D * y = b by forward substitution */
161   ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */
162   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
163 
164   /* solve U*x = y by back substitution */
165   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
166 
167   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
168   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
169   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
170   PetscFunctionReturn(0);
171 }
172 
173 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
174 {
175   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
176   PetscErrorCode    ierr;
177   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
178   PetscInt          bs =A->rmap->bs;
179   const MatScalar   *aa=a->a;
180   const PetscScalar *b;
181   PetscScalar       *x;
182 
183   PetscFunctionBegin;
184   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
185   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
186   ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */
187   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
188   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
189   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
190   ierr = PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);CHKERRQ(ierr);
191   PetscFunctionReturn(0);
192 }
193 
194 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
195 {
196   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
197   PetscErrorCode    ierr;
198   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
199   PetscInt          bs =A->rmap->bs;
200   const MatScalar   *aa=a->a;
201   const PetscScalar *b;
202   PetscScalar       *x;
203 
204   PetscFunctionBegin;
205   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
206   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
207   ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr);
208   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
209   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
210   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
211   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
212   PetscFunctionReturn(0);
213 }
214 
215 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
216 {
217   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
218   IS                isrow=a->row;
219   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
220   PetscErrorCode    ierr;
221   PetscInt          nz,k,idx;
222   const MatScalar   *aa=a->a,*v,*d;
223   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
224   const PetscScalar *b;
225 
226   PetscFunctionBegin;
227   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
228   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
229   t    = a->solve_work;
230   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
231 
232   /* solve U^T * D * y = b by forward substitution */
233   tp = t;
234   for (k=0; k<mbs; k++) { /* t <- perm(b) */
235     idx   = 7*r[k];
236     tp[0] = b[idx];
237     tp[1] = b[idx+1];
238     tp[2] = b[idx+2];
239     tp[3] = b[idx+3];
240     tp[4] = b[idx+4];
241     tp[5] = b[idx+5];
242     tp[6] = b[idx+6];
243     tp   += 7;
244   }
245 
246   for (k=0; k<mbs; k++) {
247     v  = aa + 49*ai[k];
248     vj = aj + ai[k];
249     tp = t + k*7;
250     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
251     nz = ai[k+1] - ai[k];
252     tp = t + (*vj)*7;
253     while (nz--) {
254       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
255       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
256       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
257       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
258       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
259       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
260       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
261       vj++;
262       tp = t + (*vj)*7;
263       v += 49;
264     }
265 
266     /* xk = inv(Dk)*(Dk*xk) */
267     d     = aa+k*49;       /* ptr to inv(Dk) */
268     tp    = t + k*7;
269     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
270     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
271     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
272     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
273     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
274     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
275     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
276   }
277 
278   /* solve U*x = y by back substitution */
279   for (k=mbs-1; k>=0; k--) {
280     v  = aa + 49*ai[k];
281     vj = aj + ai[k];
282     tp = t + k*7;
283     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
284     nz = ai[k+1] - ai[k];
285 
286     tp = t + (*vj)*7;
287     while (nz--) {
288       /* xk += U(k,:)*x(:) */
289       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
290       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
291       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
292       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
293       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
294       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
295       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
296       vj++;
297       tp = t + (*vj)*7;
298       v += 49;
299     }
300     tp       = t + k*7;
301     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
302     idx      = 7*r[k];
303     x[idx]   = x0;
304     x[idx+1] = x1;
305     x[idx+2] = x2;
306     x[idx+3] = x3;
307     x[idx+4] = x4;
308     x[idx+5] = x5;
309     x[idx+6] = x6;
310   }
311 
312   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
313   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
314   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
315   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
316   PetscFunctionReturn(0);
317 }
318 
319 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
320 {
321   const MatScalar *v,*d;
322   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
323   PetscInt        nz,k;
324   const PetscInt  *vj;
325 
326   PetscFunctionBegin;
327   for (k=0; k<mbs; k++) {
328     v  = aa + 49*ai[k];
329     xp = x + k*7;
330     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
331     nz = ai[k+1] - ai[k];
332     vj = aj + ai[k];
333     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
334     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
335     while (nz--) {
336       xp = x + (*vj)*7;
337       /* x(:) += U(k,:)^T*(Dk*xk) */
338       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
339       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
340       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
341       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
342       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
343       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
344       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
345       vj++;
346       v += 49;
347     }
348     /* xk = inv(Dk)*(Dk*xk) */
349     d     = aa+k*49;       /* ptr to inv(Dk) */
350     xp    = x + k*7;
351     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
352     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
353     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
354     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
355     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
356     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
357     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
358   }
359   PetscFunctionReturn(0);
360 }
361 
362 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
363 {
364   const MatScalar *v;
365   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
366   PetscInt        nz,k;
367   const PetscInt  *vj;
368 
369   PetscFunctionBegin;
370   for (k=mbs-1; k>=0; k--) {
371     v  = aa + 49*ai[k];
372     xp = x + k*7;
373     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
374     nz = ai[k+1] - ai[k];
375     vj = aj + ai[k];
376     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
377     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
378     while (nz--) {
379       xp = x + (*vj)*7;
380       /* xk += U(k,:)*x(:) */
381       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
382       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
383       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
384       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
385       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
386       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
387       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
388       vj++;
389       v += 49;
390     }
391     xp = x + k*7;
392     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
393   }
394   PetscFunctionReturn(0);
395 }
396 
397 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
398 {
399   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
400   PetscErrorCode    ierr;
401   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
402   const MatScalar   *aa=a->a;
403   PetscScalar       *x;
404   const PetscScalar *b;
405 
406   PetscFunctionBegin;
407   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
408   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
409 
410   /* solve U^T * D * y = b by forward substitution */
411   ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); /* x <- b */
412   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
413 
414   /* solve U*x = y by back substitution */
415   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
416 
417   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
418   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
419   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
420   PetscFunctionReturn(0);
421 }
422 
423 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
424 {
425   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
426   PetscErrorCode    ierr;
427   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
428   const MatScalar   *aa=a->a;
429   PetscScalar       *x;
430   const PetscScalar *b;
431 
432   PetscFunctionBegin;
433   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
434   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
435   ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr);
436   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
437   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
438   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
439   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
440   PetscFunctionReturn(0);
441 }
442 
443 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
444 {
445   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
446   PetscErrorCode    ierr;
447   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
448   const MatScalar   *aa=a->a;
449   PetscScalar       *x;
450   const PetscScalar *b;
451 
452   PetscFunctionBegin;
453   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
454   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
455   ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr);
456   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
457   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
464 {
465   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
466   IS                isrow=a->row;
467   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
468   PetscErrorCode    ierr;
469   PetscInt          nz,k,idx;
470   const MatScalar   *aa=a->a,*v,*d;
471   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
472   const PetscScalar *b;
473 
474   PetscFunctionBegin;
475   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
476   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
477   t    = a->solve_work;
478   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
479 
480   /* solve U^T * D * y = b by forward substitution */
481   tp = t;
482   for (k=0; k<mbs; k++) { /* t <- perm(b) */
483     idx   = 6*r[k];
484     tp[0] = b[idx];
485     tp[1] = b[idx+1];
486     tp[2] = b[idx+2];
487     tp[3] = b[idx+3];
488     tp[4] = b[idx+4];
489     tp[5] = b[idx+5];
490     tp   += 6;
491   }
492 
493   for (k=0; k<mbs; k++) {
494     v  = aa + 36*ai[k];
495     vj = aj + ai[k];
496     tp = t + k*6;
497     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
498     nz = ai[k+1] - ai[k];
499     tp = t + (*vj)*6;
500     while (nz--) {
501       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
502       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
503       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
504       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
505       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
506       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
507       vj++;
508       tp = t + (*vj)*6;
509       v += 36;
510     }
511 
512     /* xk = inv(Dk)*(Dk*xk) */
513     d     = aa+k*36;       /* ptr to inv(Dk) */
514     tp    = t + k*6;
515     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
516     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
517     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
518     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
519     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
520     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
521   }
522 
523   /* solve U*x = y by back substitution */
524   for (k=mbs-1; k>=0; k--) {
525     v  = aa + 36*ai[k];
526     vj = aj + ai[k];
527     tp = t + k*6;
528     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
529     nz = ai[k+1] - ai[k];
530 
531     tp = t + (*vj)*6;
532     while (nz--) {
533       /* xk += U(k,:)*x(:) */
534       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
535       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
536       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
537       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
538       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
539       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
540       vj++;
541       tp = t + (*vj)*6;
542       v += 36;
543     }
544     tp       = t + k*6;
545     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
546     idx      = 6*r[k];
547     x[idx]   = x0;
548     x[idx+1] = x1;
549     x[idx+2] = x2;
550     x[idx+3] = x3;
551     x[idx+4] = x4;
552     x[idx+5] = x5;
553   }
554 
555   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
556   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
557   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
558   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
559   PetscFunctionReturn(0);
560 }
561 
562 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
563 {
564   const MatScalar *v,*d;
565   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
566   PetscInt        nz,k;
567   const PetscInt  *vj;
568 
569   PetscFunctionBegin;
570   for (k=0; k<mbs; k++) {
571     v  = aa + 36*ai[k];
572     xp = x + k*6;
573     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
574     nz = ai[k+1] - ai[k];
575     vj = aj + ai[k];
576     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
577     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
578     while (nz--) {
579       xp = x + (*vj)*6;
580       /* x(:) += U(k,:)^T*(Dk*xk) */
581       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
582       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
583       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
584       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
585       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
586       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
587       vj++;
588       v += 36;
589     }
590     /* xk = inv(Dk)*(Dk*xk) */
591     d     = aa+k*36;       /* ptr to inv(Dk) */
592     xp    = x + k*6;
593     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
594     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
595     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
596     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
597     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
598     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
599   }
600   PetscFunctionReturn(0);
601 }
602 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
603 {
604   const MatScalar   *v;
605   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
606   PetscInt          nz,k;
607   const PetscInt    *vj;
608 
609   PetscFunctionBegin;
610   for (k=mbs-1; k>=0; k--) {
611     v  = aa + 36*ai[k];
612     xp = x + k*6;
613     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
614     nz = ai[k+1] - ai[k];
615     vj = aj + ai[k];
616     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
617     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
618     while (nz--) {
619       xp = x + (*vj)*6;
620       /* xk += U(k,:)*x(:) */
621       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
622       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
623       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
624       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
625       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
626       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
627       vj++;
628       v += 36;
629     }
630     xp   = x + k*6;
631     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
632   }
633   PetscFunctionReturn(0);
634 }
635 
636 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
637 {
638   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
639   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
640   const MatScalar   *aa=a->a;
641   PetscScalar       *x;
642   const PetscScalar *b;
643   PetscErrorCode    ierr;
644 
645   PetscFunctionBegin;
646   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
647   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
648 
649   /* solve U^T * D * y = b by forward substitution */
650   ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */
651   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
652 
653   /* solve U*x = y by back substitution */
654   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
655 
656   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
657   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
658   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
659   PetscFunctionReturn(0);
660 }
661 
662 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
663 {
664   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
665   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
666   const MatScalar   *aa=a->a;
667   PetscScalar       *x;
668   const PetscScalar *b;
669   PetscErrorCode    ierr;
670 
671   PetscFunctionBegin;
672   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
673   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
674   ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */
675   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
676   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
677   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
678   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
679   PetscFunctionReturn(0);
680 }
681 
682 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
683 {
684   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
685   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
686   const MatScalar   *aa=a->a;
687   PetscScalar       *x;
688   const PetscScalar *b;
689   PetscErrorCode    ierr;
690 
691   PetscFunctionBegin;
692   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
693   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
694   ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */
695   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
696   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
697   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698   ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr);
699   PetscFunctionReturn(0);
700 }
701 
702 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
703 {
704   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
705   IS                isrow=a->row;
706   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
707   PetscErrorCode    ierr;
708   const PetscInt    *r,*vj;
709   PetscInt          nz,k,idx;
710   const MatScalar   *aa=a->a,*v,*diag;
711   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
712   const PetscScalar *b;
713 
714   PetscFunctionBegin;
715   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
716   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
717   t    = a->solve_work;
718   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
719 
720   /* solve U^T * D * y = b by forward substitution */
721   tp = t;
722   for (k=0; k<mbs; k++) { /* t <- perm(b) */
723     idx   = 5*r[k];
724     tp[0] = b[idx];
725     tp[1] = b[idx+1];
726     tp[2] = b[idx+2];
727     tp[3] = b[idx+3];
728     tp[4] = b[idx+4];
729     tp   += 5;
730   }
731 
732   for (k=0; k<mbs; k++) {
733     v  = aa + 25*ai[k];
734     vj = aj + ai[k];
735     tp = t + k*5;
736     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
737     nz = ai[k+1] - ai[k];
738 
739     tp = t + (*vj)*5;
740     while (nz--) {
741       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
742       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
743       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
744       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
745       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
746       vj++;
747       tp = t + (*vj)*5;
748       v += 25;
749     }
750 
751     /* xk = inv(Dk)*(Dk*xk) */
752     diag  = aa+k*25;          /* ptr to inv(Dk) */
753     tp    = t + k*5;
754     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
755     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
756     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
757     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
758     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
759   }
760 
761   /* solve U*x = y by back substitution */
762   for (k=mbs-1; k>=0; k--) {
763     v  = aa + 25*ai[k];
764     vj = aj + ai[k];
765     tp = t + k*5;
766     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
767     nz = ai[k+1] - ai[k];
768 
769     tp = t + (*vj)*5;
770     while (nz--) {
771       /* xk += U(k,:)*x(:) */
772       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
773       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
774       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
775       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
776       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
777       vj++;
778       tp = t + (*vj)*5;
779       v += 25;
780     }
781     tp       = t + k*5;
782     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
783     idx      = 5*r[k];
784     x[idx]   = x0;
785     x[idx+1] = x1;
786     x[idx+2] = x2;
787     x[idx+3] = x3;
788     x[idx+4] = x4;
789   }
790 
791   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
792   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
793   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
794   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
795   PetscFunctionReturn(0);
796 }
797 
798 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
799 {
800   const MatScalar *v,*diag;
801   PetscScalar     *xp,x0,x1,x2,x3,x4;
802   PetscInt        nz,k;
803   const PetscInt  *vj;
804 
805   PetscFunctionBegin;
806   for (k=0; k<mbs; k++) {
807     v  = aa + 25*ai[k];
808     xp = x + k*5;
809     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
810     nz = ai[k+1] - ai[k];
811     vj = aj + ai[k];
812     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
813     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
814     while (nz--) {
815       xp = x + (*vj)*5;
816       /* x(:) += U(k,:)^T*(Dk*xk) */
817       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
818       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
819       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
820       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
821       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
822       vj++;
823       v += 25;
824     }
825     /* xk = inv(Dk)*(Dk*xk) */
826     diag  = aa+k*25;         /* ptr to inv(Dk) */
827     xp    = x + k*5;
828     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
829     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
830     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
831     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
832     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
833   }
834   PetscFunctionReturn(0);
835 }
836 
837 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
838 {
839   const MatScalar *v;
840   PetscScalar     *xp,x0,x1,x2,x3,x4;
841   PetscInt        nz,k;
842   const PetscInt  *vj;
843 
844   PetscFunctionBegin;
845   for (k=mbs-1; k>=0; k--) {
846     v  = aa + 25*ai[k];
847     xp = x + k*5;
848     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
849     nz = ai[k+1] - ai[k];
850     vj = aj + ai[k];
851     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
852     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
853     while (nz--) {
854       xp = x + (*vj)*5;
855       /* xk += U(k,:)*x(:) */
856       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
857       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
858       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
859       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
860       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
861       vj++;
862       v += 25;
863     }
864     xp   = x + k*5;
865     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
866   }
867   PetscFunctionReturn(0);
868 }
869 
870 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
871 {
872   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
873   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
874   const MatScalar   *aa=a->a;
875   PetscScalar       *x;
876   const PetscScalar *b;
877   PetscErrorCode    ierr;
878 
879   PetscFunctionBegin;
880   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
881   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
882 
883   /* solve U^T * D * y = b by forward substitution */
884   ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */
885   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
886 
887   /* solve U*x = y by back substitution */
888   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
889 
890   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
891   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
892   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
893   PetscFunctionReturn(0);
894 }
895 
896 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
897 {
898   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
899   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
900   const MatScalar   *aa=a->a;
901   PetscScalar       *x;
902   const PetscScalar *b;
903   PetscErrorCode    ierr;
904 
905   PetscFunctionBegin;
906   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
907   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
908   ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */
909   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
910   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
911   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
912   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
913   PetscFunctionReturn(0);
914 }
915 
916 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
917 {
918   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
919   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
920   const MatScalar   *aa=a->a;
921   PetscScalar       *x;
922   const PetscScalar *b;
923   PetscErrorCode    ierr;
924 
925   PetscFunctionBegin;
926   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
927   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
928   ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr);
929   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
930   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
931   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
932   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
933   PetscFunctionReturn(0);
934 }
935 
936 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
937 {
938   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
939   IS                isrow=a->row;
940   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
941   PetscErrorCode    ierr;
942   const PetscInt    *r,*vj;
943   PetscInt          nz,k,idx;
944   const MatScalar   *aa=a->a,*v,*diag;
945   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
946   const PetscScalar *b;
947 
948   PetscFunctionBegin;
949   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
950   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
951   t    = a->solve_work;
952   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
953 
954   /* solve U^T * D * y = b by forward substitution */
955   tp = t;
956   for (k=0; k<mbs; k++) { /* t <- perm(b) */
957     idx   = 4*r[k];
958     tp[0] = b[idx];
959     tp[1] = b[idx+1];
960     tp[2] = b[idx+2];
961     tp[3] = b[idx+3];
962     tp   += 4;
963   }
964 
965   for (k=0; k<mbs; k++) {
966     v  = aa + 16*ai[k];
967     vj = aj + ai[k];
968     tp = t + k*4;
969     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
970     nz = ai[k+1] - ai[k];
971 
972     tp = t + (*vj)*4;
973     while (nz--) {
974       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
975       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
976       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
977       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
978       vj++;
979       tp = t + (*vj)*4;
980       v += 16;
981     }
982 
983     /* xk = inv(Dk)*(Dk*xk) */
984     diag  = aa+k*16;          /* ptr to inv(Dk) */
985     tp    = t + k*4;
986     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
987     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
988     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
989     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
990   }
991 
992   /* solve U*x = y by back substitution */
993   for (k=mbs-1; k>=0; k--) {
994     v  = aa + 16*ai[k];
995     vj = aj + ai[k];
996     tp = t + k*4;
997     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
998     nz = ai[k+1] - ai[k];
999 
1000     tp = t + (*vj)*4;
1001     while (nz--) {
1002       /* xk += U(k,:)*x(:) */
1003       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1004       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1005       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1006       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1007       vj++;
1008       tp = t + (*vj)*4;
1009       v += 16;
1010     }
1011     tp       = t + k*4;
1012     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1013     idx      = 4*r[k];
1014     x[idx]   = x0;
1015     x[idx+1] = x1;
1016     x[idx+2] = x2;
1017     x[idx+3] = x3;
1018   }
1019 
1020   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1021   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1022   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1023   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1024   PetscFunctionReturn(0);
1025 }
1026 
1027 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1028 {
1029   const MatScalar *v,*diag;
1030   PetscScalar     *xp,x0,x1,x2,x3;
1031   PetscInt        nz,k;
1032   const PetscInt  *vj;
1033 
1034   PetscFunctionBegin;
1035   for (k=0; k<mbs; k++) {
1036     v  = aa + 16*ai[k];
1037     xp = x + k*4;
1038     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1039     nz = ai[k+1] - ai[k];
1040     vj = aj + ai[k];
1041     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1042     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1043     while (nz--) {
1044       xp = x + (*vj)*4;
1045       /* x(:) += U(k,:)^T*(Dk*xk) */
1046       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1047       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1048       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1049       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1050       vj++;
1051       v += 16;
1052     }
1053     /* xk = inv(Dk)*(Dk*xk) */
1054     diag  = aa+k*16;         /* ptr to inv(Dk) */
1055     xp    = x + k*4;
1056     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1057     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1058     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1059     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1060   }
1061   PetscFunctionReturn(0);
1062 }
1063 
1064 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1065 {
1066   const MatScalar *v;
1067   PetscScalar     *xp,x0,x1,x2,x3;
1068   PetscInt        nz,k;
1069   const PetscInt  *vj;
1070 
1071   PetscFunctionBegin;
1072   for (k=mbs-1; k>=0; k--) {
1073     v  = aa + 16*ai[k];
1074     xp = x + k*4;
1075     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1076     nz = ai[k+1] - ai[k];
1077     vj = aj + ai[k];
1078     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1079     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1080     while (nz--) {
1081       xp = x + (*vj)*4;
1082       /* xk += U(k,:)*x(:) */
1083       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1084       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1085       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1086       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1087       vj++;
1088       v += 16;
1089     }
1090     xp    = x + k*4;
1091     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1092   }
1093   PetscFunctionReturn(0);
1094 }
1095 
1096 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1097 {
1098   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1099   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1100   const MatScalar   *aa=a->a;
1101   PetscScalar       *x;
1102   const PetscScalar *b;
1103   PetscErrorCode    ierr;
1104 
1105   PetscFunctionBegin;
1106   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1107   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1108 
1109   /* solve U^T * D * y = b by forward substitution */
1110   ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */
1111   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1112 
1113   /* solve U*x = y by back substitution */
1114   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1115   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1116   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1117   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1118   PetscFunctionReturn(0);
1119 }
1120 
1121 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1122 {
1123   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1124   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1125   const MatScalar   *aa=a->a;
1126   PetscScalar       *x;
1127   const PetscScalar *b;
1128   PetscErrorCode    ierr;
1129 
1130   PetscFunctionBegin;
1131   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133   ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */
1134   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1135   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1136   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1137   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
1138   PetscFunctionReturn(0);
1139 }
1140 
1141 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1142 {
1143   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1144   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1145   const MatScalar   *aa=a->a;
1146   PetscScalar       *x;
1147   const PetscScalar *b;
1148   PetscErrorCode    ierr;
1149 
1150   PetscFunctionBegin;
1151   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1152   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1153   ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr);
1154   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1155   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1156   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1157   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
1158   PetscFunctionReturn(0);
1159 }
1160 
1161 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1162 {
1163   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1164   IS                isrow=a->row;
1165   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1166   PetscErrorCode    ierr;
1167   const PetscInt    *r;
1168   PetscInt          nz,k,idx;
1169   const PetscInt    *vj;
1170   const MatScalar   *aa=a->a,*v,*diag;
1171   PetscScalar       *x,x0,x1,x2,*t,*tp;
1172   const PetscScalar *b;
1173 
1174   PetscFunctionBegin;
1175   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1176   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1177   t    = a->solve_work;
1178   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1179 
1180   /* solve U^T * D * y = b by forward substitution */
1181   tp = t;
1182   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1183     idx   = 3*r[k];
1184     tp[0] = b[idx];
1185     tp[1] = b[idx+1];
1186     tp[2] = b[idx+2];
1187     tp   += 3;
1188   }
1189 
1190   for (k=0; k<mbs; k++) {
1191     v  = aa + 9*ai[k];
1192     vj = aj + ai[k];
1193     tp = t + k*3;
1194     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1195     nz = ai[k+1] - ai[k];
1196 
1197     tp = t + (*vj)*3;
1198     while (nz--) {
1199       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1200       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1201       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1202       vj++;
1203       tp = t + (*vj)*3;
1204       v += 9;
1205     }
1206 
1207     /* xk = inv(Dk)*(Dk*xk) */
1208     diag  = aa+k*9;          /* ptr to inv(Dk) */
1209     tp    = t + k*3;
1210     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1211     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1212     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1213   }
1214 
1215   /* solve U*x = y by back substitution */
1216   for (k=mbs-1; k>=0; k--) {
1217     v  = aa + 9*ai[k];
1218     vj = aj + ai[k];
1219     tp = t + k*3;
1220     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1221     nz = ai[k+1] - ai[k];
1222 
1223     tp = t + (*vj)*3;
1224     while (nz--) {
1225       /* xk += U(k,:)*x(:) */
1226       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1227       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1228       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1229       vj++;
1230       tp = t + (*vj)*3;
1231       v += 9;
1232     }
1233     tp       = t + k*3;
1234     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1235     idx      = 3*r[k];
1236     x[idx]   = x0;
1237     x[idx+1] = x1;
1238     x[idx+2] = x2;
1239   }
1240 
1241   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1242   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1243   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1244   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1245   PetscFunctionReturn(0);
1246 }
1247 
1248 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1249 {
1250   const MatScalar *v,*diag;
1251   PetscScalar     *xp,x0,x1,x2;
1252   PetscInt        nz,k;
1253   const PetscInt  *vj;
1254 
1255   PetscFunctionBegin;
1256   for (k=0; k<mbs; k++) {
1257     v  = aa + 9*ai[k];
1258     xp = x + k*3;
1259     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1260     nz = ai[k+1] - ai[k];
1261     vj = aj + ai[k];
1262     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1263     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1264     while (nz--) {
1265       xp = x + (*vj)*3;
1266       /* x(:) += U(k,:)^T*(Dk*xk) */
1267       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1268       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1269       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1270       vj++;
1271       v += 9;
1272     }
1273     /* xk = inv(Dk)*(Dk*xk) */
1274     diag  = aa+k*9;         /* ptr to inv(Dk) */
1275     xp    = x + k*3;
1276     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1277     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1278     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1279   }
1280   PetscFunctionReturn(0);
1281 }
1282 
1283 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1284 {
1285   const MatScalar *v;
1286   PetscScalar     *xp,x0,x1,x2;
1287   PetscInt        nz,k;
1288   const PetscInt  *vj;
1289 
1290   PetscFunctionBegin;
1291   for (k=mbs-1; k>=0; k--) {
1292     v  = aa + 9*ai[k];
1293     xp = x + k*3;
1294     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1295     nz = ai[k+1] - ai[k];
1296     vj = aj + ai[k];
1297     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1298     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1299     while (nz--) {
1300       xp = x + (*vj)*3;
1301       /* xk += U(k,:)*x(:) */
1302       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1303       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1304       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1305       vj++;
1306       v += 9;
1307     }
1308     xp    = x + k*3;
1309     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1310   }
1311   PetscFunctionReturn(0);
1312 }
1313 
1314 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1315 {
1316   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1317   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1318   const MatScalar   *aa=a->a;
1319   PetscScalar       *x;
1320   const PetscScalar *b;
1321   PetscErrorCode    ierr;
1322 
1323   PetscFunctionBegin;
1324   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1325   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1326 
1327   /* solve U^T * D * y = b by forward substitution */
1328   ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr);
1329   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1330 
1331   /* solve U*x = y by back substitution */
1332   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1333 
1334   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1335   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1336   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1341 {
1342   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1343   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1344   const MatScalar   *aa=a->a;
1345   PetscScalar       *x;
1346   const PetscScalar *b;
1347   PetscErrorCode    ierr;
1348 
1349   PetscFunctionBegin;
1350   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1351   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1352   ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr);
1353   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1354   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1355   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1356   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
1357   PetscFunctionReturn(0);
1358 }
1359 
1360 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1361 {
1362   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1363   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1364   const MatScalar   *aa=a->a;
1365   PetscScalar       *x;
1366   const PetscScalar *b;
1367   PetscErrorCode    ierr;
1368 
1369   PetscFunctionBegin;
1370   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1371   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1372   ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr);
1373   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1374   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1375   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1376   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1381 {
1382   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1383   IS                isrow=a->row;
1384   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1385   PetscErrorCode    ierr;
1386   const PetscInt    *r,*vj;
1387   PetscInt          nz,k,k2,idx;
1388   const MatScalar   *aa=a->a,*v,*diag;
1389   PetscScalar       *x,x0,x1,*t;
1390   const PetscScalar *b;
1391 
1392   PetscFunctionBegin;
1393   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1394   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1395   t    = a->solve_work;
1396   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1397 
1398   /* solve U^T * D * y = perm(b) by forward substitution */
1399   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1400     idx      = 2*r[k];
1401     t[k*2]   = b[idx];
1402     t[k*2+1] = b[idx+1];
1403   }
1404   for (k=0; k<mbs; k++) {
1405     v  = aa + 4*ai[k];
1406     vj = aj + ai[k];
1407     k2 = k*2;
1408     x0 = t[k2]; x1 = t[k2+1];
1409     nz = ai[k+1] - ai[k];
1410     while (nz--) {
1411       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1412       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1413       vj++; v      += 4;
1414     }
1415     diag    = aa+k*4; /* ptr to inv(Dk) */
1416     t[k2]   = diag[0]*x0 + diag[2]*x1;
1417     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1418   }
1419 
1420   /* solve U*x = y by back substitution */
1421   for (k=mbs-1; k>=0; k--) {
1422     v  = aa + 4*ai[k];
1423     vj = aj + ai[k];
1424     k2 = k*2;
1425     x0 = t[k2]; x1 = t[k2+1];
1426     nz = ai[k+1] - ai[k];
1427     while (nz--) {
1428       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1429       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1430       vj++;
1431       v += 4;
1432     }
1433     t[k2]    = x0;
1434     t[k2+1]  = x1;
1435     idx      = 2*r[k];
1436     x[idx]   = x0;
1437     x[idx+1] = x1;
1438   }
1439 
1440   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1441   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1443   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1448 {
1449   const MatScalar *v,*diag;
1450   PetscScalar     x0,x1;
1451   PetscInt        nz,k,k2;
1452   const PetscInt  *vj;
1453 
1454   PetscFunctionBegin;
1455   for (k=0; k<mbs; k++) {
1456     v  = aa + 4*ai[k];
1457     vj = aj + ai[k];
1458     k2 = k*2;
1459     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1460     nz = ai[k+1] - ai[k];
1461     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1462     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1463     while (nz--) {
1464       /* x(:) += U(k,:)^T*(Dk*xk) */
1465       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1466       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1467       vj++; v      += 4;
1468     }
1469     /* xk = inv(Dk)*(Dk*xk) */
1470     diag    = aa+k*4;       /* ptr to inv(Dk) */
1471     x[k2]   = diag[0]*x0 + diag[2]*x1;
1472     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1473   }
1474   PetscFunctionReturn(0);
1475 }
1476 
1477 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1478 {
1479   const MatScalar *v;
1480   PetscScalar     x0,x1;
1481   PetscInt        nz,k,k2;
1482   const PetscInt  *vj;
1483 
1484   PetscFunctionBegin;
1485   for (k=mbs-1; k>=0; k--) {
1486     v  = aa + 4*ai[k];
1487     vj = aj + ai[k];
1488     k2 = k*2;
1489     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1490     nz = ai[k+1] - ai[k];
1491     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1492     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1493     while (nz--) {
1494       /* xk += U(k,:)*x(:) */
1495       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1496       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1497       vj++;
1498       v += 4;
1499     }
1500     x[k2]   = x0;
1501     x[k2+1] = x1;
1502   }
1503   PetscFunctionReturn(0);
1504 }
1505 
1506 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1507 {
1508   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1509   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1510   const MatScalar   *aa=a->a;
1511   PetscScalar       *x;
1512   const PetscScalar *b;
1513   PetscErrorCode    ierr;
1514 
1515   PetscFunctionBegin;
1516   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1517   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1518 
1519   /* solve U^T * D * y = b by forward substitution */
1520   ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr);
1521   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1522 
1523   /* solve U*x = y by back substitution */
1524   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1525 
1526   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1527   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1528   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1529   PetscFunctionReturn(0);
1530 }
1531 
1532 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1533 {
1534   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1535   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1536   const MatScalar   *aa=a->a;
1537   PetscScalar       *x;
1538   const PetscScalar *b;
1539   PetscErrorCode    ierr;
1540 
1541   PetscFunctionBegin;
1542   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1543   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1544   ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr);
1545   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1546   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1547   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1548   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1553 {
1554   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1555   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1556   const MatScalar   *aa=a->a;
1557   PetscScalar       *x;
1558   const PetscScalar *b;
1559   PetscErrorCode    ierr;
1560 
1561   PetscFunctionBegin;
1562   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1563   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1564   ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr);
1565   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1566   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1567   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1568   ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr);
1569   PetscFunctionReturn(0);
1570 }
1571 
1572 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1573 {
1574   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1575   IS                isrow=a->row;
1576   PetscErrorCode    ierr;
1577   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1578   const MatScalar   *aa=a->a,*v;
1579   const PetscScalar *b;
1580   PetscScalar       *x,xk,*t;
1581   PetscInt          nz,k,j;
1582 
1583   PetscFunctionBegin;
1584   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1585   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1586   t    = a->solve_work;
1587   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1588 
1589   /* solve U^T*D*y = perm(b) by forward substitution */
1590   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1591   for (k=0; k<mbs; k++) {
1592     v  = aa + ai[k];
1593     vj = aj + ai[k];
1594     xk = t[k];
1595     nz = ai[k+1] - ai[k] - 1;
1596     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1597     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1598   }
1599 
1600   /* solve U*perm(x) = y by back substitution */
1601   for (k=mbs-1; k>=0; k--) {
1602     v  = aa + adiag[k] - 1;
1603     vj = aj + adiag[k] - 1;
1604     nz = ai[k+1] - ai[k] - 1;
1605     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1606     x[rp[k]] = t[k];
1607   }
1608 
1609   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1610   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1611   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1612   ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr);
1613   PetscFunctionReturn(0);
1614 }
1615 
1616 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1617 {
1618   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1619   IS                isrow=a->row;
1620   PetscErrorCode    ierr;
1621   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1622   const MatScalar   *aa=a->a,*v;
1623   PetscScalar       *x,xk,*t;
1624   const PetscScalar *b;
1625   PetscInt          nz,k;
1626 
1627   PetscFunctionBegin;
1628   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1629   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1630   t    = a->solve_work;
1631   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1632 
1633   /* solve U^T*D*y = perm(b) by forward substitution */
1634   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1635   for (k=0; k<mbs; k++) {
1636     v  = aa + ai[k] + 1;
1637     vj = aj + ai[k] + 1;
1638     xk = t[k];
1639     nz = ai[k+1] - ai[k] - 1;
1640     while (nz--) t[*vj++] += (*v++) * xk;
1641     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1642   }
1643 
1644   /* solve U*perm(x) = y by back substitution */
1645   for (k=mbs-1; k>=0; k--) {
1646     v  = aa + ai[k] + 1;
1647     vj = aj + ai[k] + 1;
1648     nz = ai[k+1] - ai[k] - 1;
1649     while (nz--) t[k] += (*v++) * t[*vj++];
1650     x[rp[k]] = t[k];
1651   }
1652 
1653   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1654   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1655   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1656   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1657   PetscFunctionReturn(0);
1658 }
1659 
1660 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1661 {
1662   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1663   IS                isrow=a->row;
1664   PetscErrorCode    ierr;
1665   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1666   const MatScalar   *aa=a->a,*v;
1667   PetscReal         diagk;
1668   PetscScalar       *x,xk;
1669   const PetscScalar *b;
1670   PetscInt          nz,k;
1671 
1672   PetscFunctionBegin;
1673   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1674   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1675   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1676   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1677 
1678   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1679   for (k=0; k<mbs; k++) {
1680     v  = aa + ai[k];
1681     vj = aj + ai[k];
1682     xk = x[k];
1683     nz = ai[k+1] - ai[k] - 1;
1684     while (nz--) x[*vj++] += (*v++) * xk;
1685 
1686     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1687     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1688     x[k] = xk*PetscSqrtReal(diagk);
1689   }
1690   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1691   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1692   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1693   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1694   PetscFunctionReturn(0);
1695 }
1696 
1697 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1698 {
1699   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1700   IS                isrow=a->row;
1701   PetscErrorCode    ierr;
1702   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1703   const MatScalar   *aa=a->a,*v;
1704   PetscReal         diagk;
1705   PetscScalar       *x,xk;
1706   const PetscScalar *b;
1707   PetscInt          nz,k;
1708 
1709   PetscFunctionBegin;
1710   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1711   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1712   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1713   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1714 
1715   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1716   for (k=0; k<mbs; k++) {
1717     v  = aa + ai[k] + 1;
1718     vj = aj + ai[k] + 1;
1719     xk = x[k];
1720     nz = ai[k+1] - ai[k] - 1;
1721     while (nz--) x[*vj++] += (*v++) * xk;
1722 
1723     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1724     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1725     x[k] = xk*PetscSqrtReal(diagk);
1726   }
1727   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1728   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1729   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1730   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1731   PetscFunctionReturn(0);
1732 }
1733 
1734 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1735 {
1736   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1737   IS                isrow=a->row;
1738   PetscErrorCode    ierr;
1739   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1740   const MatScalar   *aa=a->a,*v;
1741   PetscReal         diagk;
1742   PetscScalar       *x,*t;
1743   const PetscScalar *b;
1744   PetscInt          nz,k;
1745 
1746   PetscFunctionBegin;
1747   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1748   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1749   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1750   t    = a->solve_work;
1751   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1752 
1753   for (k=mbs-1; k>=0; k--) {
1754     v     = aa + ai[k];
1755     vj    = aj + ai[k];
1756     diagk = PetscRealPart(aa[adiag[k]]);
1757     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1758     t[k] = b[k] * PetscSqrtReal(diagk);
1759     nz   = ai[k+1] - ai[k] - 1;
1760     while (nz--) t[k] += (*v++) * t[*vj++];
1761     x[rp[k]] = t[k];
1762   }
1763   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1764   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1765   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1766   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1767   PetscFunctionReturn(0);
1768 }
1769 
1770 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1771 {
1772   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1773   IS                isrow=a->row;
1774   PetscErrorCode    ierr;
1775   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1776   const MatScalar   *aa=a->a,*v;
1777   PetscReal         diagk;
1778   PetscScalar       *x,*t;
1779   const PetscScalar *b;
1780   PetscInt          nz,k;
1781 
1782   PetscFunctionBegin;
1783   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1784   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1785   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1786   t    = a->solve_work;
1787   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1788 
1789   for (k=mbs-1; k>=0; k--) {
1790     v     = aa + ai[k] + 1;
1791     vj    = aj + ai[k] + 1;
1792     diagk = PetscRealPart(aa[ai[k]]);
1793     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1794     t[k] = b[k] * PetscSqrtReal(diagk);
1795     nz   = ai[k+1] - ai[k] - 1;
1796     while (nz--) t[k] += (*v++) * t[*vj++];
1797     x[rp[k]] = t[k];
1798   }
1799   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1800   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1801   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1802   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1803   PetscFunctionReturn(0);
1804 }
1805 
1806 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1807 {
1808   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1809   PetscErrorCode ierr;
1810 
1811   PetscFunctionBegin;
1812   if (A->rmap->bs == 1) {
1813     ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr);
1814   } else {
1815     IS                isrow=a->row;
1816     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1817     const MatScalar   *aa=a->a,*v;
1818     PetscScalar       *x,*t;
1819     const PetscScalar *b;
1820     PetscInt          nz,k,n,i,j;
1821 
1822     if (bb->n > a->solves_work_n) {
1823       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1824       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
1825       a->solves_work_n = bb->n;
1826     }
1827     n    = bb->n;
1828     ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr);
1829     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1830     t    = a->solves_work;
1831 
1832     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1833 
1834     /* solve U^T*D*y = perm(b) by forward substitution */
1835     for (k=0; k<mbs; k++) {
1836       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1837     }
1838     for (k=0; k<mbs; k++) {
1839       v  = aa + ai[k];
1840       vj = aj + ai[k];
1841       nz = ai[k+1] - ai[k] - 1;
1842       for (j=0; j<nz; j++) {
1843         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1844         v++;vj++;
1845       }
1846       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1847     }
1848 
1849     /* solve U*perm(x) = y by back substitution */
1850     for (k=mbs-1; k>=0; k--) {
1851       v  = aa + ai[k] - 1;
1852       vj = aj + ai[k] - 1;
1853       nz = ai[k+1] - ai[k] - 1;
1854       for (j=0; j<nz; j++) {
1855         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1856         v++;vj++;
1857       }
1858       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1859     }
1860 
1861     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1862     ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr);
1863     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1864     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1865   }
1866   PetscFunctionReturn(0);
1867 }
1868 
1869 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1870 {
1871   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1872   PetscErrorCode ierr;
1873 
1874   PetscFunctionBegin;
1875   if (A->rmap->bs == 1) {
1876     ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr);
1877   } else {
1878     IS                isrow=a->row;
1879     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1880     const MatScalar   *aa=a->a,*v;
1881     PetscScalar       *x,*t;
1882     const PetscScalar *b;
1883     PetscInt          nz,k,n,i;
1884 
1885     if (bb->n > a->solves_work_n) {
1886       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1887       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
1888       a->solves_work_n = bb->n;
1889     }
1890     n    = bb->n;
1891     ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr);
1892     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1893     t    = a->solves_work;
1894 
1895     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1896 
1897     /* solve U^T*D*y = perm(b) by forward substitution */
1898     for (k=0; k<mbs; k++) {
1899       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1900     }
1901     for (k=0; k<mbs; k++) {
1902       v  = aa + ai[k];
1903       vj = aj + ai[k];
1904       nz = ai[k+1] - ai[k];
1905       while (nz--) {
1906         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1907         v++;vj++;
1908       }
1909       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1910     }
1911 
1912     /* solve U*perm(x) = y by back substitution */
1913     for (k=mbs-1; k>=0; k--) {
1914       v  = aa + ai[k];
1915       vj = aj + ai[k];
1916       nz = ai[k+1] - ai[k];
1917       while (nz--) {
1918         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1919         v++;vj++;
1920       }
1921       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1922     }
1923 
1924     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1925     ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr);
1926     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1927     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1928   }
1929   PetscFunctionReturn(0);
1930 }
1931 
1932 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1933 {
1934   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1935   PetscErrorCode    ierr;
1936   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1937   const MatScalar   *aa=a->a,*v;
1938   const PetscScalar *b;
1939   PetscScalar       *x,xi;
1940   PetscInt          nz,i,j;
1941 
1942   PetscFunctionBegin;
1943   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1944   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1945   /* solve U^T*D*y = b by forward substitution */
1946   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
1947   for (i=0; i<mbs; i++) {
1948     v  = aa + ai[i];
1949     vj = aj + ai[i];
1950     xi = x[i];
1951     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1952     for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1953     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1954   }
1955   /* solve U*x = y by backward substitution */
1956   for (i=mbs-2; i>=0; i--) {
1957     xi = x[i];
1958     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1959     vj = aj + adiag[i] - 1;
1960     nz = ai[i+1] - ai[i] - 1;
1961     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1962     x[i] = xi;
1963   }
1964   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1965   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1966   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1967   PetscFunctionReturn(0);
1968 }
1969 
1970 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1971 {
1972   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1973   PetscErrorCode    ierr;
1974   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1975   const MatScalar   *aa=a->a,*v;
1976   const PetscScalar *b;
1977   PetscScalar       *x,xi;
1978   PetscInt          nz,i,j,neq,ldb,ldx;
1979   PetscBool         isdense;
1980 
1981   PetscFunctionBegin;
1982   if (!mbs) PetscFunctionReturn(0);
1983   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
1984   if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1985   if (X != B) {
1986     ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);CHKERRQ(ierr);
1987     if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1988   }
1989   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1990   ierr = MatDenseGetLDA(B,&ldb);CHKERRQ(ierr);
1991   ierr = MatDenseGetArray(X,&x);CHKERRQ(ierr);
1992   ierr = MatDenseGetLDA(X,&ldx);CHKERRQ(ierr);
1993   for (neq=0; neq<B->cmap->n; neq++) {
1994     /* solve U^T*D*y = b by forward substitution */
1995     ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
1996     for (i=0; i<mbs; i++) {
1997       v  = aa + ai[i];
1998       vj = aj + ai[i];
1999       xi = x[i];
2000       nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2001       for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
2002       x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2003     }
2004     /* solve U*x = y by backward substitution */
2005     for (i=mbs-2; i>=0; i--) {
2006       xi = x[i];
2007       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2008       vj = aj + adiag[i] - 1;
2009       nz = ai[i+1] - ai[i] - 1;
2010       for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2011       x[i] = xi;
2012     }
2013     b += ldb;
2014     x += ldx;
2015   }
2016   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
2017   ierr = MatDenseRestoreArray(X,&x);CHKERRQ(ierr);
2018   ierr = PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));CHKERRQ(ierr);
2019   PetscFunctionReturn(0);
2020 }
2021 
2022 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2023 {
2024   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2025   PetscErrorCode    ierr;
2026   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2027   const MatScalar   *aa=a->a,*v;
2028   PetscScalar       *x,xk;
2029   const PetscScalar *b;
2030   PetscInt          nz,k;
2031 
2032   PetscFunctionBegin;
2033   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2034   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2035 
2036   /* solve U^T*D*y = b by forward substitution */
2037   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2038   for (k=0; k<mbs; k++) {
2039     v  = aa + ai[k] + 1;
2040     vj = aj + ai[k] + 1;
2041     xk = x[k];
2042     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2043     while (nz--) x[*vj++] += (*v++) * xk;
2044     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2045   }
2046 
2047   /* solve U*x = y by back substitution */
2048   for (k=mbs-2; k>=0; k--) {
2049     v  = aa + ai[k] + 1;
2050     vj = aj + ai[k] + 1;
2051     xk = x[k];
2052     nz = ai[k+1] - ai[k] - 1;
2053     while (nz--) {
2054       xk += (*v++) * x[*vj++];
2055     }
2056     x[k] = xk;
2057   }
2058 
2059   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2060   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2061   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2066 {
2067   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2068   PetscErrorCode    ierr;
2069   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2070   const MatScalar   *aa=a->a,*v;
2071   PetscReal         diagk;
2072   PetscScalar       *x;
2073   const PetscScalar *b;
2074   PetscInt          nz,k;
2075 
2076   PetscFunctionBegin;
2077   /* solve U^T*D^(1/2)*x = b by forward substitution */
2078   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2079   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2080   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2081   for (k=0; k<mbs; k++) {
2082     v  = aa + ai[k];
2083     vj = aj + ai[k];
2084     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2085     while (nz--) x[*vj++] += (*v++) * x[k];
2086     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2087     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2088     x[k] *= PetscSqrtReal(diagk);
2089   }
2090   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2091   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2092   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2093   PetscFunctionReturn(0);
2094 }
2095 
2096 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2097 {
2098   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2099   PetscErrorCode    ierr;
2100   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2101   const MatScalar   *aa=a->a,*v;
2102   PetscReal         diagk;
2103   PetscScalar       *x;
2104   const PetscScalar *b;
2105   PetscInt          nz,k;
2106 
2107   PetscFunctionBegin;
2108   /* solve U^T*D^(1/2)*x = b by forward substitution */
2109   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2110   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2111   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2112   for (k=0; k<mbs; k++) {
2113     v  = aa + ai[k] + 1;
2114     vj = aj + ai[k] + 1;
2115     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2116     while (nz--) x[*vj++] += (*v++) * x[k];
2117     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2118     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2119     x[k] *= PetscSqrtReal(diagk);
2120   }
2121   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2122   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2123   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2124   PetscFunctionReturn(0);
2125 }
2126 
2127 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2128 {
2129   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2130   PetscErrorCode    ierr;
2131   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2132   const MatScalar   *aa=a->a,*v;
2133   PetscReal         diagk;
2134   PetscScalar       *x;
2135   const PetscScalar *b;
2136   PetscInt          nz,k;
2137 
2138   PetscFunctionBegin;
2139   /* solve D^(1/2)*U*x = b by back substitution */
2140   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2141   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2142 
2143   for (k=mbs-1; k>=0; k--) {
2144     v     = aa + ai[k];
2145     vj    = aj + ai[k];
2146     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2147     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2148     x[k] = PetscSqrtReal(diagk)*b[k];
2149     nz   = ai[k+1] - ai[k] - 1;
2150     while (nz--) x[k] += (*v++) * x[*vj++];
2151   }
2152   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2153   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2154   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2155   PetscFunctionReturn(0);
2156 }
2157 
2158 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2159 {
2160   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2161   PetscErrorCode    ierr;
2162   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2163   const MatScalar   *aa=a->a,*v;
2164   PetscReal         diagk;
2165   PetscScalar       *x;
2166   const PetscScalar *b;
2167   PetscInt          nz,k;
2168 
2169   PetscFunctionBegin;
2170   /* solve D^(1/2)*U*x = b by back substitution */
2171   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2172   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2173 
2174   for (k=mbs-1; k>=0; k--) {
2175     v     = aa + ai[k] + 1;
2176     vj    = aj + ai[k] + 1;
2177     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2178     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2179     x[k] = PetscSqrtReal(diagk)*b[k];
2180     nz   = ai[k+1] - ai[k] - 1;
2181     while (nz--) x[k] += (*v++) * x[*vj++];
2182   }
2183   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2184   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2185   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2186   PetscFunctionReturn(0);
2187 }
2188 
2189 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2190 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2191 {
2192   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2193   PetscErrorCode ierr;
2194   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2195   PetscInt       *jutmp,bs = A->rmap->bs,i;
2196   PetscInt       m,reallocs = 0,*levtmp;
2197   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2198   PetscInt       incrlev,*lev,shift,prow,nz;
2199   PetscReal      f = info->fill,levels = info->levels;
2200   PetscBool      perm_identity;
2201 
2202   PetscFunctionBegin;
2203   /* check whether perm is the identity mapping */
2204   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2205 
2206   if (perm_identity) {
2207     a->permute = PETSC_FALSE;
2208     ai         = a->i; aj = a->j;
2209   } else { /*  non-trivial permutation */
2210     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2211   }
2212 
2213   /* initialization */
2214   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2215   umax  = (PetscInt)(f*ai[mbs] + 1);
2216   ierr  = PetscMalloc1(umax,&lev);CHKERRQ(ierr);
2217   umax += mbs + 1;
2218   shift = mbs + 1;
2219   ierr  = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr);
2220   ierr  = PetscMalloc1(umax,&ju);CHKERRQ(ierr);
2221   iu[0] = mbs + 1;
2222   juidx = mbs + 1;
2223   /* prowl: linked list for pivot row */
2224   ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr);
2225   /* q: linked list for col index */
2226 
2227   for (i=0; i<mbs; i++) {
2228     prowl[i] = mbs;
2229     q[i]     = 0;
2230   }
2231 
2232   /* for each row k */
2233   for (k=0; k<mbs; k++) {
2234     nzk  = 0;
2235     q[k] = mbs;
2236     /* copy current row into linked list */
2237     nz = ai[rip[k]+1] - ai[rip[k]];
2238     j  = ai[rip[k]];
2239     while (nz--) {
2240       vj = rip[aj[j++]];
2241       if (vj > k) {
2242         qm = k;
2243         do {
2244           m = qm; qm = q[m];
2245         } while (qm < vj);
2246         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2247         nzk++;
2248         q[m]       = vj;
2249         q[vj]      = qm;
2250         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2251       }
2252     }
2253 
2254     /* modify nonzero structure of k-th row by computing fill-in
2255        for each row prow to be merged in */
2256     prow = k;
2257     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2258 
2259     while (prow < k) {
2260       /* merge row prow into k-th row */
2261       jmin = iu[prow] + 1;
2262       jmax = iu[prow+1];
2263       qm   = k;
2264       for (j=jmin; j<jmax; j++) {
2265         incrlev = lev[j-shift] + 1;
2266         if (incrlev > levels) continue;
2267         vj = ju[j];
2268         do {
2269           m = qm; qm = q[m];
2270         } while (qm < vj);
2271         if (qm != vj) {      /* a fill */
2272           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2273           levtmp[vj] = incrlev;
2274         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2275       }
2276       prow = prowl[prow]; /* next pivot row */
2277     }
2278 
2279     /* add k to row list for first nonzero element in k-th row */
2280     if (nzk > 1) {
2281       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2282       prowl[k] = prowl[i]; prowl[i] = k;
2283     }
2284     iu[k+1] = iu[k] + nzk;
2285 
2286     /* allocate more space to ju and lev if needed */
2287     if (iu[k+1] > umax) {
2288       /* estimate how much additional space we will need */
2289       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2290       /* just double the memory each time */
2291       maxadd = umax;
2292       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2293       umax += maxadd;
2294 
2295       /* allocate a longer ju */
2296       ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2297       ierr = PetscArraycpy(jutmp,ju,iu[k]);CHKERRQ(ierr);
2298       ierr = PetscFree(ju);CHKERRQ(ierr);
2299       ju   = jutmp;
2300 
2301       ierr      = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2302       ierr      = PetscArraycpy(jutmp,lev,iu[k]-shift);CHKERRQ(ierr);
2303       ierr      = PetscFree(lev);CHKERRQ(ierr);
2304       lev       = jutmp;
2305       reallocs += 2; /* count how many times we realloc */
2306     }
2307 
2308     /* save nonzero structure of k-th row in ju */
2309     i=k;
2310     while (nzk--) {
2311       i                = q[i];
2312       ju[juidx]        = i;
2313       lev[juidx-shift] = levtmp[i];
2314       juidx++;
2315     }
2316   }
2317 
2318 #if defined(PETSC_USE_INFO)
2319   if (ai[mbs] != 0) {
2320     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2321     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr);
2322     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2323     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr);
2324     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
2325   } else {
2326     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
2327   }
2328 #endif
2329 
2330   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2331   ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr);
2332   ierr = PetscFree(lev);CHKERRQ(ierr);
2333 
2334   /* put together the new matrix */
2335   ierr = MatSeqSBAIJSetPreallocation(B,bs,0,NULL);CHKERRQ(ierr);
2336 
2337   /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */
2338   b    = (Mat_SeqSBAIJ*)(B)->data;
2339   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2340 
2341   b->singlemalloc = PETSC_FALSE;
2342   b->free_a       = PETSC_TRUE;
2343   b->free_ij      = PETSC_TRUE;
2344   /* the next line frees the default space generated by the Create() */
2345   ierr    = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
2346   ierr    = PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);CHKERRQ(ierr);
2347   b->j    = ju;
2348   b->i    = iu;
2349   b->diag = NULL;
2350   b->ilen = NULL;
2351   b->imax = NULL;
2352 
2353   ierr    = ISDestroy(&b->row);CHKERRQ(ierr);
2354   ierr    = ISDestroy(&b->icol);CHKERRQ(ierr);
2355   b->row  = perm;
2356   b->icol = perm;
2357   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2358   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2359   ierr    = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr);
2360   /* In b structure:  Free imax, ilen, old a, old j.
2361      Allocate idnew, solve_work, new a, new j */
2362   ierr     = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2363   b->maxnz = b->nz = iu[mbs];
2364 
2365   (B)->info.factor_mallocs   = reallocs;
2366   (B)->info.fill_ratio_given = f;
2367   if (ai[mbs] != 0) {
2368     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2369   } else {
2370     (B)->info.fill_ratio_needed = 0.0;
2371   }
2372   ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr);
2373   PetscFunctionReturn(0);
2374 }
2375 
2376 /*
2377   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2378 */
2379 #include <petscbt.h>
2380 #include <../src/mat/utils/freespace.h>
2381 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2382 {
2383   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2384   PetscErrorCode     ierr;
2385   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2386   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2387   const PetscInt     *rip;
2388   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2389   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2390   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2391   PetscReal          fill          =info->fill,levels=info->levels;
2392   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2393   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2394   PetscBT            lnkbt;
2395 
2396   PetscFunctionBegin;
2397   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2398   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2399   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2400   if (bs > 1) {
2401     ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
2402     PetscFunctionReturn(0);
2403   }
2404 
2405   /* check whether perm is the identity mapping */
2406   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2407   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2408   a->permute = PETSC_FALSE;
2409 
2410   ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2411   ierr  = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr);
2412   ui[0] = 0;
2413 
2414   /* ICC(0) without matrix ordering: simply rearrange column indices */
2415   if (!levels) {
2416     /* reuse the column pointers and row offsets for memory savings */
2417     for (i=0; i<am; i++) {
2418       ncols    = ai[i+1] - ai[i];
2419       ui[i+1]  = ui[i] + ncols;
2420       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2421     }
2422     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2423     cols = uj;
2424     for (i=0; i<am; i++) {
2425       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2426       ncols = ai[i+1] - ai[i] -1;
2427       for (j=0; j<ncols; j++) *cols++ = aj[j];
2428       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2429     }
2430   } else { /* case: levels>0 */
2431     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2432 
2433     /* initialization */
2434     /* jl: linked list for storing indices of the pivot rows
2435        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2436     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2437     for (i=0; i<am; i++) {
2438       jl[i] = am; il[i] = 0;
2439     }
2440 
2441     /* create and initialize a linked list for storing column indices of the active row k */
2442     nlnk = am + 1;
2443     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2444 
2445     /* initial FreeSpace size is fill*(ai[am]+1) */
2446     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr);
2447 
2448     current_space = free_space;
2449 
2450     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr);
2451 
2452     current_space_lvl = free_space_lvl;
2453 
2454     for (k=0; k<am; k++) {  /* for each active row k */
2455       /* initialize lnk by the column indices of row k */
2456       nzk   = 0;
2457       ncols = ai[k+1] - ai[k];
2458       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2459       cols = aj+ai[k];
2460       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2461       nzk += nlnk;
2462 
2463       /* update lnk by computing fill-in for each pivot row to be merged in */
2464       prow = jl[k]; /* 1st pivot row */
2465 
2466       while (prow < k) {
2467         nextprow = jl[prow];
2468 
2469         /* merge prow into k-th row */
2470         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2471         jmax  = ui[prow+1];
2472         ncols = jmax-jmin;
2473         i     = jmin - ui[prow];
2474 
2475         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2476         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2477         j    = *(uj - 1);
2478         ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2479         nzk += nlnk;
2480 
2481         /* update il and jl for prow */
2482         if (jmin < jmax) {
2483           il[prow] = jmin;
2484 
2485           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2486         }
2487         prow = nextprow;
2488       }
2489 
2490       /* if free space is not available, make more free space */
2491       if (current_space->local_remaining<nzk) {
2492         i    = am - k + 1; /* num of unfactored rows */
2493         i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2494         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2495         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2496         reallocs++;
2497       }
2498 
2499       /* copy data into free_space and free_space_lvl, then initialize lnk */
2500       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2501       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2502 
2503       /* add the k-th row into il and jl */
2504       if (nzk > 1) {
2505         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2506         jl[k] = jl[i]; jl[i] = k;
2507         il[k] = ui[k] + 1;
2508       }
2509       uj_ptr[k]     = current_space->array;
2510       uj_lvl_ptr[k] = current_space_lvl->array;
2511 
2512       current_space->array               += nzk;
2513       current_space->local_used          += nzk;
2514       current_space->local_remaining     -= nzk;
2515       current_space_lvl->array           += nzk;
2516       current_space_lvl->local_used      += nzk;
2517       current_space_lvl->local_remaining -= nzk;
2518 
2519       ui[k+1] = ui[k] + nzk;
2520     }
2521 
2522     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2523     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2524 
2525     /* destroy list of free space and other temporary array(s) */
2526     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2527     ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor  */
2528     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2529     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2530 
2531   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2532 
2533   /* put together the new matrix in MATSEQSBAIJ format */
2534   ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2535 
2536   b    = (Mat_SeqSBAIJ*)(fact)->data;
2537   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2538 
2539   b->singlemalloc = PETSC_FALSE;
2540   b->free_a       = PETSC_TRUE;
2541   b->free_ij      = free_ij;
2542 
2543   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2544 
2545   b->j         = uj;
2546   b->i         = ui;
2547   b->diag      = udiag;
2548   b->free_diag = PETSC_TRUE;
2549   b->ilen      = NULL;
2550   b->imax      = NULL;
2551   b->row       = perm;
2552   b->col       = perm;
2553 
2554   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2555   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2556 
2557   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2558 
2559   ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2560   ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2561 
2562   b->maxnz = b->nz = ui[am];
2563 
2564   fact->info.factor_mallocs   = reallocs;
2565   fact->info.fill_ratio_given = fill;
2566   if (ai[am] != 0) {
2567     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2568   } else {
2569     fact->info.fill_ratio_needed = 0.0;
2570   }
2571 #if defined(PETSC_USE_INFO)
2572   if (ai[am] != 0) {
2573     PetscReal af = fact->info.fill_ratio_needed;
2574     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2575     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2576     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2577   } else {
2578     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
2579   }
2580 #endif
2581   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2582   PetscFunctionReturn(0);
2583 }
2584 
2585 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2586 {
2587   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2588   Mat_SeqSBAIJ       *b;
2589   PetscErrorCode     ierr;
2590   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2591   PetscInt           bs=A->rmap->bs,am=a->mbs;
2592   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2593   PetscInt           reallocs=0,i,*ui;
2594   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2595   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2596   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2597   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2598   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2599   PetscBT            lnkbt;
2600 
2601   PetscFunctionBegin;
2602   /*
2603    This code originally uses Modified Sparse Row (MSR) storage
2604    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2605    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2606    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2607    thus the original code in MSR format is still used for these cases.
2608    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2609    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2610   */
2611   if (bs > 1) {
2612     ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
2613     PetscFunctionReturn(0);
2614   }
2615 
2616   /* check whether perm is the identity mapping */
2617   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2618   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2619   a->permute = PETSC_FALSE;
2620 
2621   /* special case that simply copies fill pattern */
2622   if (!levels) {
2623     /* reuse the column pointers and row offsets for memory savings */
2624     ui           = a->i;
2625     uj           = a->j;
2626     free_ij      = PETSC_FALSE;
2627     ratio_needed = 1.0;
2628   } else { /* case: levels>0 */
2629     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2630 
2631     /* initialization */
2632     ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2633     ui[0] = 0;
2634 
2635     /* jl: linked list for storing indices of the pivot rows
2636        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2637     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2638     for (i=0; i<am; i++) {
2639       jl[i] = am; il[i] = 0;
2640     }
2641 
2642     /* create and initialize a linked list for storing column indices of the active row k */
2643     nlnk = am + 1;
2644     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2645 
2646     /* initial FreeSpace size is fill*(ai[am]+1) */
2647     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr);
2648 
2649     current_space = free_space;
2650 
2651     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr);
2652 
2653     current_space_lvl = free_space_lvl;
2654 
2655     for (k=0; k<am; k++) {  /* for each active row k */
2656       /* initialize lnk by the column indices of row rip[k] */
2657       nzk   = 0;
2658       ncols = ai[rip[k]+1] - ai[rip[k]];
2659       cols  = aj+ai[rip[k]];
2660       ierr  = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2661       nzk  += nlnk;
2662 
2663       /* update lnk by computing fill-in for each pivot row to be merged in */
2664       prow = jl[k]; /* 1st pivot row */
2665 
2666       while (prow < k) {
2667         nextprow = jl[prow];
2668 
2669         /* merge prow into k-th row */
2670         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2671         jmax     = ui[prow+1];
2672         ncols    = jmax-jmin;
2673         i        = jmin - ui[prow];
2674         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2675         j        = *(uj_lvl_ptr[prow] + i - 1);
2676         cols_lvl = uj_lvl_ptr[prow]+i;
2677         ierr     = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2678         nzk     += nlnk;
2679 
2680         /* update il and jl for prow */
2681         if (jmin < jmax) {
2682           il[prow] = jmin;
2683           j        = *cols;
2684           jl[prow] = jl[j];
2685           jl[j]    = prow;
2686         }
2687         prow = nextprow;
2688       }
2689 
2690       /* if free space is not available, make more free space */
2691       if (current_space->local_remaining<nzk) {
2692         i    = am - k + 1; /* num of unfactored rows */
2693         i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2694         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2695         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2696         reallocs++;
2697       }
2698 
2699       /* copy data into free_space and free_space_lvl, then initialize lnk */
2700       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2701 
2702       /* add the k-th row into il and jl */
2703       if (nzk-1 > 0) {
2704         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2705         jl[k] = jl[i]; jl[i] = k;
2706         il[k] = ui[k] + 1;
2707       }
2708       uj_ptr[k]     = current_space->array;
2709       uj_lvl_ptr[k] = current_space_lvl->array;
2710 
2711       current_space->array               += nzk;
2712       current_space->local_used          += nzk;
2713       current_space->local_remaining     -= nzk;
2714       current_space_lvl->array           += nzk;
2715       current_space_lvl->local_used      += nzk;
2716       current_space_lvl->local_remaining -= nzk;
2717 
2718       ui[k+1] = ui[k] + nzk;
2719     }
2720 
2721     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2722     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2723 
2724     /* destroy list of free space and other temporary array(s) */
2725     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2726     ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
2727     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2728     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2729     if (ai[am] != 0) {
2730       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2731     } else {
2732       ratio_needed = 0.0;
2733     }
2734   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2735 
2736   /* put together the new matrix in MATSEQSBAIJ format */
2737   ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2738 
2739   b = (Mat_SeqSBAIJ*)(fact)->data;
2740 
2741   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2742 
2743   b->singlemalloc = PETSC_FALSE;
2744   b->free_a       = PETSC_TRUE;
2745   b->free_ij      = free_ij;
2746 
2747   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2748 
2749   b->j             = uj;
2750   b->i             = ui;
2751   b->diag          = NULL;
2752   b->ilen          = NULL;
2753   b->imax          = NULL;
2754   b->row           = perm;
2755   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2756 
2757   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2758   b->icol = perm;
2759   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2760   ierr    = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2761 
2762   b->maxnz = b->nz = ui[am];
2763 
2764   fact->info.factor_mallocs    = reallocs;
2765   fact->info.fill_ratio_given  = fill;
2766   fact->info.fill_ratio_needed = ratio_needed;
2767 #if defined(PETSC_USE_INFO)
2768   if (ai[am] != 0) {
2769     PetscReal af = fact->info.fill_ratio_needed;
2770     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2771     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2772     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2773   } else {
2774     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
2775   }
2776 #endif
2777   if (perm_identity) {
2778     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2779   } else {
2780     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2781   }
2782   PetscFunctionReturn(0);
2783 }
2784