xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 2205254efee3a00a594e5e2a3a70f74dcb40bc03)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include <../src/mat/impls/baij/seq/baij.h>
7 #include <../src/mat/blockinvert.h>
8 #include <petscbt.h>
9 #include <../src/mat/utils/freespace.h>
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
16   PetscErrorCode    ierr;
17   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
18   PetscInt          i,n = a->mbs,j;
19   PetscInt          nz;
20   PetscScalar       *x,*tmp,s1;
21   const MatScalar   *aa = a->a,*v;
22   const PetscScalar *b;
23 
24   PetscFunctionBegin;
25   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
26   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27   tmp  = a->solve_work;
28 
29 
30   /* copy the b into temp work space according to permutation */
31   for (i=0; i<n; i++) tmp[i] = b[i];
32 
33   /* forward solve the U^T */
34   for (i=0; i<n; i++) {
35     v   = aa + adiag[i+1] + 1;
36     vi  = aj + adiag[i+1] + 1;
37     nz  = adiag[i] - adiag[i+1] - 1;
38     s1  = tmp[i];
39     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
40     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
41     tmp[i] = s1;
42   }
43 
44   /* backward solve the L^T */
45   for (i=n-1; i>=0; i--) {
46     v   = aa + ai[i];
47     vi  = aj + ai[i];
48     nz  = ai[i+1] - ai[i];
49     s1  = tmp[i];
50     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
51   }
52 
53   /* copy tmp into x according to permutation */
54   for (i=0; i<n; i++) x[i] = tmp[i];
55 
56   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
57   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
58 
59   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
60   PetscFunctionReturn(0);
61 }
62 
63 #undef __FUNCT__
64 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
65 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66 {
67   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
68   PetscErrorCode    ierr;
69   PetscInt          i,nz;
70   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
71   const MatScalar   *aa=a->a,*v;
72   PetscScalar       s1,*x;
73 
74   PetscFunctionBegin;
75   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
76   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
77 
78   /* forward solve the U^T */
79   for (i=0; i<n; i++) {
80 
81     v     = aa + diag[i];
82     /* multiply by the inverse of the block diagonal */
83     s1    = (*v++)*x[i];
84     vi    = aj + diag[i] + 1;
85     nz    = ai[i+1] - diag[i] - 1;
86     while (nz--) {
87       x[*vi++]  -= (*v++)*s1;
88     }
89     x[i]   = s1;
90   }
91   /* backward solve the L^T */
92   for (i=n-1; i>=0; i--) {
93     v    = aa + diag[i] - 1;
94     vi   = aj + diag[i] - 1;
95     nz   = diag[i] - ai[i];
96     s1   = x[i];
97     while (nz--) {
98       x[*vi--]   -=  (*v--)*s1;
99     }
100   }
101   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
102   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
103   PetscFunctionReturn(0);
104 }
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
108 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109 {
110   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
111   PetscErrorCode    ierr;
112   PetscInt          i,nz,idx,idt,oidx;
113   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114   const MatScalar   *aa=a->a,*v;
115   PetscScalar       s1,s2,x1,x2,*x;
116 
117   PetscFunctionBegin;
118   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
119   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
120 
121   /* forward solve the U^T */
122   idx = 0;
123   for (i=0; i<n; i++) {
124 
125     v     = aa + 4*diag[i];
126     /* multiply by the inverse of the block diagonal */
127     x1 = x[idx];   x2 = x[1+idx];
128     s1 = v[0]*x1  +  v[1]*x2;
129     s2 = v[2]*x1  +  v[3]*x2;
130     v += 4;
131 
132     vi    = aj + diag[i] + 1;
133     nz    = ai[i+1] - diag[i] - 1;
134     while (nz--) {
135       oidx = 2*(*vi++);
136       x[oidx]   -= v[0]*s1  +  v[1]*s2;
137       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
138       v  += 4;
139     }
140     x[idx]   = s1;x[1+idx] = s2;
141     idx += 2;
142   }
143   /* backward solve the L^T */
144   for (i=n-1; i>=0; i--) {
145     v    = aa + 4*diag[i] - 4;
146     vi   = aj + diag[i] - 1;
147     nz   = diag[i] - ai[i];
148     idt  = 2*i;
149     s1   = x[idt];  s2 = x[1+idt];
150     while (nz--) {
151       idx   = 2*(*vi--);
152       x[idx]   -=  v[0]*s1 +  v[1]*s2;
153       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
154       v -= 4;
155     }
156   }
157   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
158   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
159   PetscFunctionReturn(0);
160 }
161 
162 #undef __FUNCT__
163 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
164 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
165 {
166   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
167   PetscErrorCode    ierr;
168   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
169   PetscInt          nz,idx,idt,j,i,oidx;
170   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
171   const MatScalar   *aa=a->a,*v;
172   PetscScalar       s1,s2,x1,x2,*x;
173 
174   PetscFunctionBegin;
175   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
176   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
177 
178   /* forward solve the U^T */
179   idx = 0;
180   for (i=0; i<n; i++) {
181     v     = aa + bs2*diag[i];
182     /* multiply by the inverse of the block diagonal */
183     x1 = x[idx];   x2 = x[1+idx];
184     s1 = v[0]*x1  +  v[1]*x2;
185     s2 = v[2]*x1  +  v[3]*x2;
186     v -= bs2;
187 
188     vi    = aj + diag[i] - 1;
189     nz    = diag[i] - diag[i+1] - 1;
190     for (j=0;j>-nz;j--) {
191       oidx = bs*vi[j];
192       x[oidx]   -= v[0]*s1  +  v[1]*s2;
193       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
194       v  -= bs2;
195     }
196     x[idx]   = s1;x[1+idx] = s2;
197     idx += bs;
198   }
199   /* backward solve the L^T */
200   for (i=n-1; i>=0; i--) {
201     v    = aa + bs2*ai[i];
202     vi   = aj + ai[i];
203     nz   = ai[i+1] - ai[i];
204     idt  = bs*i;
205     s1   = x[idt];  s2 = x[1+idt];
206     for (j=0;j<nz;j++) {
207       idx   = bs*vi[j];
208       x[idx]   -=  v[0]*s1 +  v[1]*s2;
209       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
210       v += bs2;
211     }
212   }
213   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
214   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
215   PetscFunctionReturn(0);
216 }
217 
218 #undef __FUNCT__
219 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
220 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221 {
222   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
223   PetscErrorCode    ierr;
224   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225   PetscInt          i,nz,idx,idt,oidx;
226   const MatScalar   *aa=a->a,*v;
227   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
228 
229   PetscFunctionBegin;
230   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
231   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
232 
233   /* forward solve the U^T */
234   idx = 0;
235   for (i=0; i<n; i++) {
236 
237     v     = aa + 9*diag[i];
238     /* multiply by the inverse of the block diagonal */
239     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
240     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
241     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
242     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
243     v += 9;
244 
245     vi    = aj + diag[i] + 1;
246     nz    = ai[i+1] - diag[i] - 1;
247     while (nz--) {
248       oidx = 3*(*vi++);
249       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
250       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
251       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252       v  += 9;
253     }
254     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
255     idx += 3;
256   }
257   /* backward solve the L^T */
258   for (i=n-1; i>=0; i--) {
259     v    = aa + 9*diag[i] - 9;
260     vi   = aj + diag[i] - 1;
261     nz   = diag[i] - ai[i];
262     idt  = 3*i;
263     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
264     while (nz--) {
265       idx   = 3*(*vi--);
266       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
267       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
268       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269       v -= 9;
270     }
271   }
272   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
273   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
279 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
280 {
281   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
282   PetscErrorCode    ierr;
283   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
284   PetscInt          nz,idx,idt,j,i,oidx;
285   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
286   const MatScalar   *aa=a->a,*v;
287   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
288 
289   PetscFunctionBegin;
290   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
291   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
292 
293   /* forward solve the U^T */
294   idx = 0;
295   for (i=0; i<n; i++) {
296     v     = aa + bs2*diag[i];
297     /* multiply by the inverse of the block diagonal */
298     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
299     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
300     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
301     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
302     v -= bs2;
303 
304     vi    = aj + diag[i] - 1;
305     nz    = diag[i] - diag[i+1] - 1;
306     for (j=0;j>-nz;j--) {
307       oidx = bs*vi[j];
308       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
309       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
310       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
311       v  -= bs2;
312     }
313     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
314     idx += bs;
315   }
316   /* backward solve the L^T */
317   for (i=n-1; i>=0; i--) {
318     v    = aa + bs2*ai[i];
319     vi   = aj + ai[i];
320     nz   = ai[i+1] - ai[i];
321     idt  = bs*i;
322     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
323     for (j=0;j<nz;j++) {
324       idx   = bs*vi[j];
325       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
326       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
327       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
328       v += bs2;
329     }
330   }
331   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
332   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
333   PetscFunctionReturn(0);
334 }
335 
336 #undef __FUNCT__
337 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
338 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339 {
340   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
341   PetscErrorCode    ierr;
342   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343   PetscInt          i,nz,idx,idt,oidx;
344   const MatScalar   *aa=a->a,*v;
345   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
346 
347   PetscFunctionBegin;
348   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
349   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
350 
351   /* forward solve the U^T */
352   idx = 0;
353   for (i=0; i<n; i++) {
354 
355     v     = aa + 16*diag[i];
356     /* multiply by the inverse of the block diagonal */
357     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
358     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
359     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
360     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
361     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362     v += 16;
363 
364     vi    = aj + diag[i] + 1;
365     nz    = ai[i+1] - diag[i] - 1;
366     while (nz--) {
367       oidx = 4*(*vi++);
368       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
369       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
370       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372       v  += 16;
373     }
374     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375     idx += 4;
376   }
377   /* backward solve the L^T */
378   for (i=n-1; i>=0; i--) {
379     v    = aa + 16*diag[i] - 16;
380     vi   = aj + diag[i] - 1;
381     nz   = diag[i] - ai[i];
382     idt  = 4*i;
383     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384     while (nz--) {
385       idx   = 4*(*vi--);
386       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390       v -= 16;
391     }
392   }
393   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
394   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
395   PetscFunctionReturn(0);
396 }
397 
398 #undef __FUNCT__
399 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
400 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
401 {
402   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
403   PetscErrorCode    ierr;
404   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
405   PetscInt          nz,idx,idt,j,i,oidx;
406   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
407   const MatScalar   *aa=a->a,*v;
408   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
409 
410   PetscFunctionBegin;
411   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
412   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
413 
414   /* forward solve the U^T */
415   idx = 0;
416   for (i=0; i<n; i++) {
417     v     = aa + bs2*diag[i];
418     /* multiply by the inverse of the block diagonal */
419     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
420     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
421     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
422     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
423     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
424     v -= bs2;
425 
426     vi    = aj + diag[i] - 1;
427     nz    = diag[i] - diag[i+1] - 1;
428     for (j=0;j>-nz;j--) {
429       oidx = bs*vi[j];
430       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
431       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
432       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
433       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
434       v  -= bs2;
435     }
436     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
437     idx += bs;
438   }
439   /* backward solve the L^T */
440   for (i=n-1; i>=0; i--) {
441     v    = aa + bs2*ai[i];
442     vi   = aj + ai[i];
443     nz   = ai[i+1] - ai[i];
444     idt  = bs*i;
445     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
446     for (j=0;j<nz;j++) {
447       idx   = bs*vi[j];
448       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
449       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
450       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
451       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
452       v += bs2;
453     }
454   }
455   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
456   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
457   PetscFunctionReturn(0);
458 }
459 
460 #undef __FUNCT__
461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
462 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463 {
464   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
465   PetscErrorCode    ierr;
466   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467   PetscInt          i,nz,idx,idt,oidx;
468   const MatScalar   *aa=a->a,*v;
469   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
470 
471   PetscFunctionBegin;
472   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
473   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474 
475   /* forward solve the U^T */
476   idx = 0;
477   for (i=0; i<n; i++) {
478 
479     v     = aa + 25*diag[i];
480     /* multiply by the inverse of the block diagonal */
481     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
483     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
484     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487     v += 25;
488 
489     vi    = aj + diag[i] + 1;
490     nz    = ai[i+1] - diag[i] - 1;
491     while (nz--) {
492       oidx = 5*(*vi++);
493       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
494       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
495       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498       v  += 25;
499     }
500     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501     idx += 5;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--) {
505     v    = aa + 25*diag[i] - 25;
506     vi   = aj + diag[i] - 1;
507     nz   = diag[i] - ai[i];
508     idt  = 5*i;
509     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510     while (nz--) {
511       idx   = 5*(*vi--);
512       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
513       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
514       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517       v -= 25;
518     }
519   }
520   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
522   PetscFunctionReturn(0);
523 }
524 
525 #undef __FUNCT__
526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
527 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
528 {
529   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
530   PetscErrorCode ierr;
531   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
532   PetscInt       nz,idx,idt,j,i,oidx;
533   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
534   const MatScalar      *aa=a->a,*v;
535   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
536 
537   PetscFunctionBegin;
538   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
539   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
540 
541   /* forward solve the U^T */
542   idx = 0;
543   for (i=0; i<n; i++) {
544     v     = aa + bs2*diag[i];
545     /* multiply by the inverse of the block diagonal */
546     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
547     x5 = x[4+idx];
548     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
549     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
550     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
551     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
552     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
553     v -= bs2;
554 
555     vi    = aj + diag[i] - 1;
556     nz    = diag[i] - diag[i+1] - 1;
557     for (j=0;j>-nz;j--) {
558       oidx = bs*vi[j];
559       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
560       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
561       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
562       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
563       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
564       v  -= bs2;
565     }
566     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
567     idx += bs;
568   }
569   /* backward solve the L^T */
570   for (i=n-1; i>=0; i--) {
571     v    = aa + bs2*ai[i];
572     vi   = aj + ai[i];
573     nz   = ai[i+1] - ai[i];
574     idt  = bs*i;
575     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
576     for (j=0;j<nz;j++) {
577       idx   = bs*vi[j];
578       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
579       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
580       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
581       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
582       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
583       v += bs2;
584     }
585   }
586   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
587   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
588   PetscFunctionReturn(0);
589 }
590 
591 #undef __FUNCT__
592 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
593 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594 {
595   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
596   PetscErrorCode    ierr;
597   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598   PetscInt          i,nz,idx,idt,oidx;
599   const MatScalar   *aa=a->a,*v;
600   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
601 
602   PetscFunctionBegin;
603   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
604   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
605 
606   /* forward solve the U^T */
607   idx = 0;
608   for (i=0; i<n; i++) {
609 
610     v     = aa + 36*diag[i];
611     /* multiply by the inverse of the block diagonal */
612     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613     x6    = x[5+idx];
614     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
615     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
616     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
620     v += 36;
621 
622     vi    = aj + diag[i] + 1;
623     nz    = ai[i+1] - diag[i] - 1;
624     while (nz--) {
625       oidx = 6*(*vi++);
626       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v  += 36;
633     }
634     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635     x[5+idx] = s6;
636     idx += 6;
637   }
638   /* backward solve the L^T */
639   for (i=n-1; i>=0; i--) {
640     v    = aa + 36*diag[i] - 36;
641     vi   = aj + diag[i] - 1;
642     nz   = diag[i] - ai[i];
643     idt  = 6*i;
644     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645     s6 = x[5+idt];
646     while (nz--) {
647       idx   = 6*(*vi--);
648       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
649       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
650       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654       v -= 36;
655     }
656   }
657   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
658   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
659   PetscFunctionReturn(0);
660 }
661 
662 #undef __FUNCT__
663 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
664 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
665 {
666   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
667   PetscErrorCode    ierr;
668   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
669   PetscInt          nz,idx,idt,j,i,oidx;
670   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
671   const MatScalar   *aa=a->a,*v;
672   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
673 
674   PetscFunctionBegin;
675   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
676   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
677 
678   /* forward solve the U^T */
679   idx = 0;
680   for (i=0; i<n; i++) {
681     v     = aa + bs2*diag[i];
682     /* multiply by the inverse of the block diagonal */
683     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
684     x5 = x[4+idx]; x6 = x[5+idx];
685     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
686     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
687     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
688     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
689     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
690     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
691     v -= bs2;
692 
693     vi    = aj + diag[i] - 1;
694     nz    = diag[i] - diag[i+1] - 1;
695     for (j=0;j>-nz;j--) {
696       oidx = bs*vi[j];
697       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
698       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
699       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
700       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
701       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
702       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
703       v  -= bs2;
704     }
705     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
706     x[5+idx] = s6;
707     idx += bs;
708   }
709   /* backward solve the L^T */
710   for (i=n-1; i>=0; i--) {
711     v    = aa + bs2*ai[i];
712     vi   = aj + ai[i];
713     nz   = ai[i+1] - ai[i];
714     idt  = bs*i;
715     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
716     s6   = x[5+idt];
717     for (j=0;j<nz;j++) {
718       idx   = bs*vi[j];
719       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
720       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
721       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
722       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
723       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
724       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
725       v += bs2;
726     }
727   }
728   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
729   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
730   PetscFunctionReturn(0);
731 }
732 
733 #undef __FUNCT__
734 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
735 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736 {
737   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
738   PetscErrorCode    ierr;
739   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740   PetscInt          i,nz,idx,idt,oidx;
741   const MatScalar   *aa=a->a,*v;
742   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
743 
744   PetscFunctionBegin;
745   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
746   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
747 
748   /* forward solve the U^T */
749   idx = 0;
750   for (i=0; i<n; i++) {
751 
752     v     = aa + 49*diag[i];
753     /* multiply by the inverse of the block diagonal */
754     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755     x6    = x[5+idx]; x7 = x[6+idx];
756     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
757     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
763     v += 49;
764 
765     vi    = aj + diag[i] + 1;
766     nz    = ai[i+1] - diag[i] - 1;
767     while (nz--) {
768       oidx = 7*(*vi++);
769       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
770       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776       v  += 49;
777     }
778     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779     x[5+idx] = s6;x[6+idx] = s7;
780     idx += 7;
781   }
782   /* backward solve the L^T */
783   for (i=n-1; i>=0; i--) {
784     v    = aa + 49*diag[i] - 49;
785     vi   = aj + diag[i] - 1;
786     nz   = diag[i] - ai[i];
787     idt  = 7*i;
788     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789     s6 = x[5+idt];s7 = x[6+idt];
790     while (nz--) {
791       idx   = 7*(*vi--);
792       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
793       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799       v -= 49;
800     }
801   }
802   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
803   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
804   PetscFunctionReturn(0);
805 }
806 #undef __FUNCT__
807 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
808 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
809 {
810   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
811   PetscErrorCode    ierr;
812   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
813   PetscInt          nz,idx,idt,j,i,oidx;
814   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
815   const MatScalar   *aa=a->a,*v;
816   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
817 
818   PetscFunctionBegin;
819   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
820   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
821 
822   /* forward solve the U^T */
823   idx = 0;
824   for (i=0; i<n; i++) {
825     v     = aa + bs2*diag[i];
826     /* multiply by the inverse of the block diagonal */
827     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
828     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
829     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
830     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
831     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
832     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
833     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
834     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
835     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
836     v -= bs2;
837     vi    = aj + diag[i] - 1;
838     nz    = diag[i] - diag[i+1] - 1;
839     for (j=0;j>-nz;j--) {
840       oidx = bs*vi[j];
841       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
842       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
843       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
844       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
845       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
846       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
847       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
848       v  -= bs2;
849     }
850     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
851     x[5+idx] = s6;  x[6+idx] = s7;
852     idx += bs;
853   }
854   /* backward solve the L^T */
855   for (i=n-1; i>=0; i--) {
856     v    = aa + bs2*ai[i];
857     vi   = aj + ai[i];
858     nz   = ai[i+1] - ai[i];
859     idt  = bs*i;
860     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
861     s6   = x[5+idt];  s7 = x[6+idt];
862     for (j=0;j<nz;j++) {
863       idx   = bs*vi[j];
864       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
865       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
866       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
867       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
868       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
869       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
870       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
871       v += bs2;
872     }
873   }
874   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
875   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
876   PetscFunctionReturn(0);
877 }
878 
879 /*---------------------------------------------------------------------------------------------*/
880 #undef __FUNCT__
881 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
882 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
883 {
884   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
885   IS                iscol = a->col,isrow = a->row;
886   PetscErrorCode    ierr;
887   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
888   PetscInt          i,n = a->mbs,j;
889   PetscInt          nz;
890   PetscScalar       *x,*tmp,s1;
891   const MatScalar   *aa = a->a,*v;
892   const PetscScalar *b;
893 
894   PetscFunctionBegin;
895   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
896   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
897   tmp  = a->solve_work;
898 
899   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
900   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
901 
902   /* copy the b into temp work space according to permutation */
903   for (i=0; i<n; i++) tmp[i] = b[c[i]];
904 
905   /* forward solve the U^T */
906   for (i=0; i<n; i++) {
907     v   = aa + adiag[i+1] + 1;
908     vi  = aj + adiag[i+1] + 1;
909     nz  = adiag[i] - adiag[i+1] - 1;
910     s1  = tmp[i];
911     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
912     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
913     tmp[i] = s1;
914   }
915 
916   /* backward solve the L^T */
917   for (i=n-1; i>=0; i--) {
918     v   = aa + ai[i];
919     vi  = aj + ai[i];
920     nz  = ai[i+1] - ai[i];
921     s1  = tmp[i];
922     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
923   }
924 
925   /* copy tmp into x according to permutation */
926   for (i=0; i<n; i++) x[r[i]] = tmp[i];
927 
928   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
929   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
930   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
931   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
932 
933   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
934   PetscFunctionReturn(0);
935 }
936 
937 #undef __FUNCT__
938 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
939 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940 {
941   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
942   IS                iscol=a->col,isrow=a->row;
943   PetscErrorCode    ierr;
944   const PetscInt    *r,*c,*rout,*cout;
945   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946   PetscInt          i,nz;
947   const MatScalar   *aa=a->a,*v;
948   PetscScalar       s1,*x,*t;
949   const PetscScalar *b;
950 
951   PetscFunctionBegin;
952   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
953   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
954   t  = a->solve_work;
955 
956   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
957   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
958 
959   /* copy the b into temp work space according to permutation */
960   for (i=0; i<n; i++) {
961     t[i] = b[c[i]];
962   }
963 
964   /* forward solve the U^T */
965   for (i=0; i<n; i++) {
966 
967     v     = aa + diag[i];
968     /* multiply by the inverse of the block diagonal */
969     s1    = (*v++)*t[i];
970     vi    = aj + diag[i] + 1;
971     nz    = ai[i+1] - diag[i] - 1;
972     while (nz--) {
973       t[*vi++]  -= (*v++)*s1;
974     }
975     t[i]   = s1;
976   }
977   /* backward solve the L^T */
978   for (i=n-1; i>=0; i--) {
979     v    = aa + diag[i] - 1;
980     vi   = aj + diag[i] - 1;
981     nz   = diag[i] - ai[i];
982     s1   = t[i];
983     while (nz--) {
984       t[*vi--]   -=  (*v--)*s1;
985     }
986   }
987 
988   /* copy t into x according to permutation */
989   for (i=0; i<n; i++) {
990     x[r[i]]   = t[i];
991   }
992 
993   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
994   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
995   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
996   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
997   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
998   PetscFunctionReturn(0);
999 }
1000 
1001 #undef __FUNCT__
1002 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
1003 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1004 {
1005   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1006   IS                iscol=a->col,isrow=a->row;
1007   PetscErrorCode    ierr;
1008   const PetscInt    *r,*c,*rout,*cout;
1009   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1010   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1011   const MatScalar   *aa=a->a,*v;
1012   PetscScalar       s1,s2,x1,x2,*x,*t;
1013   const PetscScalar *b;
1014 
1015   PetscFunctionBegin;
1016   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1017   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1018   t  = a->solve_work;
1019 
1020   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1021   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1022 
1023   /* copy the b into temp work space according to permutation */
1024   ii = 0;
1025   for (i=0; i<n; i++) {
1026     ic      = 2*c[i];
1027     t[ii]   = b[ic];
1028     t[ii+1] = b[ic+1];
1029     ii += 2;
1030   }
1031 
1032   /* forward solve the U^T */
1033   idx = 0;
1034   for (i=0; i<n; i++) {
1035 
1036     v     = aa + 4*diag[i];
1037     /* multiply by the inverse of the block diagonal */
1038     x1    = t[idx];   x2 = t[1+idx];
1039     s1 = v[0]*x1  +  v[1]*x2;
1040     s2 = v[2]*x1  +  v[3]*x2;
1041     v += 4;
1042 
1043     vi    = aj + diag[i] + 1;
1044     nz    = ai[i+1] - diag[i] - 1;
1045     while (nz--) {
1046       oidx = 2*(*vi++);
1047       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1048       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1049       v  += 4;
1050     }
1051     t[idx]   = s1;t[1+idx] = s2;
1052     idx += 2;
1053   }
1054   /* backward solve the L^T */
1055   for (i=n-1; i>=0; i--) {
1056     v    = aa + 4*diag[i] - 4;
1057     vi   = aj + diag[i] - 1;
1058     nz   = diag[i] - ai[i];
1059     idt  = 2*i;
1060     s1 = t[idt];  s2 = t[1+idt];
1061     while (nz--) {
1062       idx   = 2*(*vi--);
1063       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1064       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1065       v -= 4;
1066     }
1067   }
1068 
1069   /* copy t into x according to permutation */
1070   ii = 0;
1071   for (i=0; i<n; i++) {
1072     ir      = 2*r[i];
1073     x[ir]   = t[ii];
1074     x[ir+1] = t[ii+1];
1075     ii += 2;
1076   }
1077 
1078   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1079   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1080   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1081   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1082   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1083   PetscFunctionReturn(0);
1084 }
1085 
1086 #undef __FUNCT__
1087 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1088 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1089 {
1090   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1091   PetscErrorCode    ierr;
1092   IS                iscol=a->col,isrow=a->row;
1093   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1094   const PetscInt    *r,*c,*rout,*cout;
1095   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1096   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1097   const MatScalar   *aa=a->a,*v;
1098   PetscScalar       s1,s2,x1,x2,*x,*t;
1099   const PetscScalar *b;
1100 
1101   PetscFunctionBegin;
1102   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1103   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1104   t = a->solve_work;
1105 
1106   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1107   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1108 
1109   /* copy b into temp work space according to permutation */
1110   for (i=0;i<n;i++) {
1111     ii = bs*i; ic = bs*c[i];
1112     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1113   }
1114 
1115   /* forward solve the U^T */
1116   idx = 0;
1117   for (i=0; i<n; i++) {
1118     v     = aa + bs2*diag[i];
1119     /* multiply by the inverse of the block diagonal */
1120     x1 = t[idx];   x2 = t[1+idx];
1121     s1 = v[0]*x1  +  v[1]*x2;
1122     s2 = v[2]*x1  +  v[3]*x2;
1123     v -= bs2;
1124 
1125     vi    = aj + diag[i] - 1;
1126     nz    = diag[i] - diag[i+1] - 1;
1127     for (j=0;j>-nz;j--) {
1128       oidx = bs*vi[j];
1129       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1130       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1131       v  -= bs2;
1132     }
1133     t[idx]   = s1;t[1+idx] = s2;
1134     idx += bs;
1135   }
1136   /* backward solve the L^T */
1137   for (i=n-1; i>=0; i--) {
1138     v    = aa + bs2*ai[i];
1139     vi   = aj + ai[i];
1140     nz   = ai[i+1] - ai[i];
1141     idt  = bs*i;
1142     s1   = t[idt];  s2 = t[1+idt];
1143     for (j=0;j<nz;j++) {
1144       idx   = bs*vi[j];
1145       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1146       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1147       v += bs2;
1148     }
1149   }
1150 
1151   /* copy t into x according to permutation */
1152   for (i=0;i<n;i++) {
1153     ii = bs*i;  ir = bs*r[i];
1154     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1155   }
1156 
1157   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1158   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1159   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1160   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1161   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1162   PetscFunctionReturn(0);
1163 }
1164 
1165 #undef __FUNCT__
1166 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1167 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1168 {
1169   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1170   IS                iscol=a->col,isrow=a->row;
1171   PetscErrorCode    ierr;
1172   const PetscInt    *r,*c,*rout,*cout;
1173   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1174   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1175   const MatScalar   *aa=a->a,*v;
1176   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1177   const PetscScalar *b;
1178 
1179   PetscFunctionBegin;
1180   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1181   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182   t  = a->solve_work;
1183 
1184   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1185   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1186 
1187   /* copy the b into temp work space according to permutation */
1188   ii = 0;
1189   for (i=0; i<n; i++) {
1190     ic      = 3*c[i];
1191     t[ii]   = b[ic];
1192     t[ii+1] = b[ic+1];
1193     t[ii+2] = b[ic+2];
1194     ii += 3;
1195   }
1196 
1197   /* forward solve the U^T */
1198   idx = 0;
1199   for (i=0; i<n; i++) {
1200 
1201     v     = aa + 9*diag[i];
1202     /* multiply by the inverse of the block diagonal */
1203     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1204     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1205     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1206     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1207     v += 9;
1208 
1209     vi    = aj + diag[i] + 1;
1210     nz    = ai[i+1] - diag[i] - 1;
1211     while (nz--) {
1212       oidx = 3*(*vi++);
1213       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1214       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1215       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1216       v  += 9;
1217     }
1218     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1219     idx += 3;
1220   }
1221   /* backward solve the L^T */
1222   for (i=n-1; i>=0; i--) {
1223     v    = aa + 9*diag[i] - 9;
1224     vi   = aj + diag[i] - 1;
1225     nz   = diag[i] - ai[i];
1226     idt  = 3*i;
1227     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1228     while (nz--) {
1229       idx   = 3*(*vi--);
1230       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1231       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1232       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233       v -= 9;
1234     }
1235   }
1236 
1237   /* copy t into x according to permutation */
1238   ii = 0;
1239   for (i=0; i<n; i++) {
1240     ir      = 3*r[i];
1241     x[ir]   = t[ii];
1242     x[ir+1] = t[ii+1];
1243     x[ir+2] = t[ii+2];
1244     ii += 3;
1245   }
1246 
1247   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1248   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1249   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1250   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1251   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1252   PetscFunctionReturn(0);
1253 }
1254 
1255 #undef __FUNCT__
1256 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1257 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1258 {
1259   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1260   PetscErrorCode    ierr;
1261   IS                iscol=a->col,isrow=a->row;
1262   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1263   const PetscInt    *r,*c,*rout,*cout;
1264   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1265   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1266   const MatScalar   *aa=a->a,*v;
1267   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1268   const PetscScalar *b;
1269 
1270   PetscFunctionBegin;
1271   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1272   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1273   t = a->solve_work;
1274 
1275   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1276   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1277 
1278   /* copy b into temp work space according to permutation */
1279   for (i=0;i<n;i++) {
1280     ii = bs*i; ic = bs*c[i];
1281     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1282   }
1283 
1284   /* forward solve the U^T */
1285   idx = 0;
1286   for (i=0; i<n; i++) {
1287     v     = aa + bs2*diag[i];
1288     /* multiply by the inverse of the block diagonal */
1289     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1290     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1291     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1292     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1293     v -= bs2;
1294 
1295     vi    = aj + diag[i] - 1;
1296     nz    = diag[i] - diag[i+1] - 1;
1297     for (j=0;j>-nz;j--) {
1298       oidx = bs*vi[j];
1299       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1300       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1301       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1302       v  -= bs2;
1303     }
1304     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1305     idx += bs;
1306   }
1307   /* backward solve the L^T */
1308   for (i=n-1; i>=0; i--) {
1309     v    = aa + bs2*ai[i];
1310     vi   = aj + ai[i];
1311     nz   = ai[i+1] - ai[i];
1312     idt  = bs*i;
1313     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1314     for (j=0;j<nz;j++) {
1315       idx   = bs*vi[j];
1316       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1317       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1318       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1319       v += bs2;
1320     }
1321   }
1322 
1323   /* copy t into x according to permutation */
1324   for (i=0;i<n;i++) {
1325     ii = bs*i;  ir = bs*r[i];
1326     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1327   }
1328 
1329   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1330   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1331   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1332   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1333   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1334   PetscFunctionReturn(0);
1335 }
1336 
1337 #undef __FUNCT__
1338 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1339 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1340 {
1341   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1342   IS                iscol=a->col,isrow=a->row;
1343   PetscErrorCode    ierr;
1344   const PetscInt    *r,*c,*rout,*cout;
1345   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1346   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1347   const MatScalar   *aa=a->a,*v;
1348   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1349   const PetscScalar *b;
1350 
1351   PetscFunctionBegin;
1352   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354   t  = a->solve_work;
1355 
1356   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1358 
1359   /* copy the b into temp work space according to permutation */
1360   ii = 0;
1361   for (i=0; i<n; i++) {
1362     ic      = 4*c[i];
1363     t[ii]   = b[ic];
1364     t[ii+1] = b[ic+1];
1365     t[ii+2] = b[ic+2];
1366     t[ii+3] = b[ic+3];
1367     ii += 4;
1368   }
1369 
1370   /* forward solve the U^T */
1371   idx = 0;
1372   for (i=0; i<n; i++) {
1373 
1374     v     = aa + 16*diag[i];
1375     /* multiply by the inverse of the block diagonal */
1376     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1377     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1378     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1379     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1380     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1381     v += 16;
1382 
1383     vi    = aj + diag[i] + 1;
1384     nz    = ai[i+1] - diag[i] - 1;
1385     while (nz--) {
1386       oidx = 4*(*vi++);
1387       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1388       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1389       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1390       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1391       v  += 16;
1392     }
1393     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1394     idx += 4;
1395   }
1396   /* backward solve the L^T */
1397   for (i=n-1; i>=0; i--) {
1398     v    = aa + 16*diag[i] - 16;
1399     vi   = aj + diag[i] - 1;
1400     nz   = diag[i] - ai[i];
1401     idt  = 4*i;
1402     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1403     while (nz--) {
1404       idx   = 4*(*vi--);
1405       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1406       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1407       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1408       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1409       v -= 16;
1410     }
1411   }
1412 
1413   /* copy t into x according to permutation */
1414   ii = 0;
1415   for (i=0; i<n; i++) {
1416     ir      = 4*r[i];
1417     x[ir]   = t[ii];
1418     x[ir+1] = t[ii+1];
1419     x[ir+2] = t[ii+2];
1420     x[ir+3] = t[ii+3];
1421     ii += 4;
1422   }
1423 
1424   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1425   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1426   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1427   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1428   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1429   PetscFunctionReturn(0);
1430 }
1431 
1432 #undef __FUNCT__
1433 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1434 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1435 {
1436   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1437   PetscErrorCode    ierr;
1438   IS                iscol=a->col,isrow=a->row;
1439   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1440   const PetscInt    *r,*c,*rout,*cout;
1441   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1442   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1443   const MatScalar   *aa=a->a,*v;
1444   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1445   const PetscScalar *b;
1446 
1447   PetscFunctionBegin;
1448   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1449   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1450   t = a->solve_work;
1451 
1452   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1453   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1454 
1455   /* copy b into temp work space according to permutation */
1456   for (i=0;i<n;i++) {
1457     ii = bs*i; ic = bs*c[i];
1458     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1459   }
1460 
1461   /* forward solve the U^T */
1462   idx = 0;
1463   for (i=0; i<n; i++) {
1464     v     = aa + bs2*diag[i];
1465     /* multiply by the inverse of the block diagonal */
1466     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1467     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1468     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1469     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1470     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1471     v -= bs2;
1472 
1473     vi    = aj + diag[i] - 1;
1474     nz    = diag[i] - diag[i+1] - 1;
1475     for (j=0;j>-nz;j--) {
1476       oidx = bs*vi[j];
1477       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1478       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1479       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1480       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1481       v  -= bs2;
1482     }
1483     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1484     idx += bs;
1485   }
1486   /* backward solve the L^T */
1487   for (i=n-1; i>=0; i--) {
1488     v    = aa + bs2*ai[i];
1489     vi   = aj + ai[i];
1490     nz   = ai[i+1] - ai[i];
1491     idt  = bs*i;
1492     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1493     for (j=0;j<nz;j++) {
1494       idx   = bs*vi[j];
1495       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1496       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1497       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1498       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1499       v += bs2;
1500     }
1501   }
1502 
1503   /* copy t into x according to permutation */
1504   for (i=0;i<n;i++) {
1505     ii = bs*i;  ir = bs*r[i];
1506     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1507   }
1508 
1509   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1510   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1511   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1512   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1513   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1514   PetscFunctionReturn(0);
1515 }
1516 
1517 #undef __FUNCT__
1518 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1519 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1520 {
1521   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1522   IS                iscol=a->col,isrow=a->row;
1523   PetscErrorCode    ierr;
1524   const PetscInt    *r,*c,*rout,*cout;
1525   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1526   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1527   const MatScalar   *aa=a->a,*v;
1528   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1529   const PetscScalar *b;
1530 
1531   PetscFunctionBegin;
1532   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1533   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534   t  = a->solve_work;
1535 
1536   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1537   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1538 
1539   /* copy the b into temp work space according to permutation */
1540   ii = 0;
1541   for (i=0; i<n; i++) {
1542     ic      = 5*c[i];
1543     t[ii]   = b[ic];
1544     t[ii+1] = b[ic+1];
1545     t[ii+2] = b[ic+2];
1546     t[ii+3] = b[ic+3];
1547     t[ii+4] = b[ic+4];
1548     ii += 5;
1549   }
1550 
1551   /* forward solve the U^T */
1552   idx = 0;
1553   for (i=0; i<n; i++) {
1554 
1555     v     = aa + 25*diag[i];
1556     /* multiply by the inverse of the block diagonal */
1557     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1558     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1559     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1560     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1561     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1562     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1563     v += 25;
1564 
1565     vi    = aj + diag[i] + 1;
1566     nz    = ai[i+1] - diag[i] - 1;
1567     while (nz--) {
1568       oidx = 5*(*vi++);
1569       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1570       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1571       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1572       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1573       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1574       v  += 25;
1575     }
1576     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1577     idx += 5;
1578   }
1579   /* backward solve the L^T */
1580   for (i=n-1; i>=0; i--) {
1581     v    = aa + 25*diag[i] - 25;
1582     vi   = aj + diag[i] - 1;
1583     nz   = diag[i] - ai[i];
1584     idt  = 5*i;
1585     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1586     while (nz--) {
1587       idx   = 5*(*vi--);
1588       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1589       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1590       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1591       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1592       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1593       v -= 25;
1594     }
1595   }
1596 
1597   /* copy t into x according to permutation */
1598   ii = 0;
1599   for (i=0; i<n; i++) {
1600     ir      = 5*r[i];
1601     x[ir]   = t[ii];
1602     x[ir+1] = t[ii+1];
1603     x[ir+2] = t[ii+2];
1604     x[ir+3] = t[ii+3];
1605     x[ir+4] = t[ii+4];
1606     ii += 5;
1607   }
1608 
1609   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1610   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1611   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1612   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1613   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1614   PetscFunctionReturn(0);
1615 }
1616 
1617 #undef __FUNCT__
1618 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1619 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1620 {
1621   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1622   PetscErrorCode    ierr;
1623   IS                iscol=a->col,isrow=a->row;
1624   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1625   const PetscInt    *r,*c,*rout,*cout;
1626   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1627   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1628   const MatScalar   *aa=a->a,*v;
1629   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1630   const PetscScalar *b;
1631 
1632   PetscFunctionBegin;
1633   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1634   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1635   t = a->solve_work;
1636 
1637   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1638   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1639 
1640   /* copy b into temp work space according to permutation */
1641   for (i=0;i<n;i++) {
1642     ii = bs*i; ic = bs*c[i];
1643     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1644     t[ii+4] = b[ic+4];
1645   }
1646 
1647   /* forward solve the U^T */
1648   idx = 0;
1649   for (i=0; i<n; i++) {
1650     v     = aa + bs2*diag[i];
1651     /* multiply by the inverse of the block diagonal */
1652     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1653     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1654     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1655     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1656     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1657     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1658     v -= bs2;
1659 
1660     vi    = aj + diag[i] - 1;
1661     nz    = diag[i] - diag[i+1] - 1;
1662     for (j=0;j>-nz;j--) {
1663       oidx = bs*vi[j];
1664       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1665       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1666       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1667       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1668       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1669       v  -= bs2;
1670     }
1671     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1672     idx += bs;
1673   }
1674   /* backward solve the L^T */
1675   for (i=n-1; i>=0; i--) {
1676     v    = aa + bs2*ai[i];
1677     vi   = aj + ai[i];
1678     nz   = ai[i+1] - ai[i];
1679     idt  = bs*i;
1680     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1681     for (j=0;j<nz;j++) {
1682       idx   = bs*vi[j];
1683       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1684       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1685       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1686       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1687       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1688       v += bs2;
1689     }
1690   }
1691 
1692   /* copy t into x according to permutation */
1693   for (i=0;i<n;i++) {
1694     ii = bs*i;  ir = bs*r[i];
1695     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1696     x[ir+4] = t[ii+4];
1697   }
1698 
1699   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1700   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1701   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1703   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1704   PetscFunctionReturn(0);
1705 }
1706 
1707 #undef __FUNCT__
1708 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1709 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1710 {
1711   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1712   IS                iscol=a->col,isrow=a->row;
1713   PetscErrorCode    ierr;
1714   const PetscInt    *r,*c,*rout,*cout;
1715   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1716   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1717   const MatScalar   *aa=a->a,*v;
1718   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1719   const PetscScalar *b;
1720 
1721   PetscFunctionBegin;
1722   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1723   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1724   t  = a->solve_work;
1725 
1726   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1727   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1728 
1729   /* copy the b into temp work space according to permutation */
1730   ii = 0;
1731   for (i=0; i<n; i++) {
1732     ic      = 6*c[i];
1733     t[ii]   = b[ic];
1734     t[ii+1] = b[ic+1];
1735     t[ii+2] = b[ic+2];
1736     t[ii+3] = b[ic+3];
1737     t[ii+4] = b[ic+4];
1738     t[ii+5] = b[ic+5];
1739     ii += 6;
1740   }
1741 
1742   /* forward solve the U^T */
1743   idx = 0;
1744   for (i=0; i<n; i++) {
1745 
1746     v     = aa + 36*diag[i];
1747     /* multiply by the inverse of the block diagonal */
1748     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1749     x6    = t[5+idx];
1750     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1751     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1752     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1753     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1754     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1755     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1756     v += 36;
1757 
1758     vi    = aj + diag[i] + 1;
1759     nz    = ai[i+1] - diag[i] - 1;
1760     while (nz--) {
1761       oidx = 6*(*vi++);
1762       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1763       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1764       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1765       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1766       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1767       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1768       v  += 36;
1769     }
1770     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1771     t[5+idx] = s6;
1772     idx += 6;
1773   }
1774   /* backward solve the L^T */
1775   for (i=n-1; i>=0; i--) {
1776     v    = aa + 36*diag[i] - 36;
1777     vi   = aj + diag[i] - 1;
1778     nz   = diag[i] - ai[i];
1779     idt  = 6*i;
1780     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1781     s6 = t[5+idt];
1782     while (nz--) {
1783       idx   = 6*(*vi--);
1784       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1785       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1786       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1787       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1788       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1789       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1790       v -= 36;
1791     }
1792   }
1793 
1794   /* copy t into x according to permutation */
1795   ii = 0;
1796   for (i=0; i<n; i++) {
1797     ir      = 6*r[i];
1798     x[ir]   = t[ii];
1799     x[ir+1] = t[ii+1];
1800     x[ir+2] = t[ii+2];
1801     x[ir+3] = t[ii+3];
1802     x[ir+4] = t[ii+4];
1803     x[ir+5] = t[ii+5];
1804     ii += 6;
1805   }
1806 
1807   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1808   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1809   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1810   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1811   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1812   PetscFunctionReturn(0);
1813 }
1814 
1815 #undef __FUNCT__
1816 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1817 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1818 {
1819   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1820   PetscErrorCode    ierr;
1821   IS                iscol=a->col,isrow=a->row;
1822   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1823   const PetscInt    *r,*c,*rout,*cout;
1824   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1825   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1826   const MatScalar   *aa=a->a,*v;
1827   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1828   const PetscScalar *b;
1829 
1830   PetscFunctionBegin;
1831   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1832   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1833   t = a->solve_work;
1834 
1835   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1836   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1837 
1838   /* copy b into temp work space according to permutation */
1839   for (i=0;i<n;i++) {
1840     ii = bs*i; ic = bs*c[i];
1841     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1842     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1843   }
1844 
1845   /* forward solve the U^T */
1846   idx = 0;
1847   for (i=0; i<n; i++) {
1848     v     = aa + bs2*diag[i];
1849     /* multiply by the inverse of the block diagonal */
1850     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1851     x6    = t[5+idx];
1852     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1853     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1854     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1855     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1856     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1857     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1858     v -= bs2;
1859 
1860     vi    = aj + diag[i] - 1;
1861     nz    = diag[i] - diag[i+1] - 1;
1862     for (j=0;j>-nz;j--) {
1863       oidx = bs*vi[j];
1864       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1865       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1866       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1867       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1868       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1869       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1870       v  -= bs2;
1871     }
1872     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1873     t[5+idx] = s6;
1874     idx += bs;
1875   }
1876   /* backward solve the L^T */
1877   for (i=n-1; i>=0; i--) {
1878     v    = aa + bs2*ai[i];
1879     vi   = aj + ai[i];
1880     nz   = ai[i+1] - ai[i];
1881     idt  = bs*i;
1882     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1883     s6   = t[5+idt];
1884    for (j=0;j<nz;j++) {
1885       idx   = bs*vi[j];
1886       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1887       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1888       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1889       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1890       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1891       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1892       v += bs2;
1893     }
1894   }
1895 
1896   /* copy t into x according to permutation */
1897   for (i=0;i<n;i++) {
1898     ii = bs*i;  ir = bs*r[i];
1899     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1900     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1901   }
1902 
1903   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1904   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1905   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1906   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1907   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1908   PetscFunctionReturn(0);
1909 }
1910 
1911 #undef __FUNCT__
1912 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1913 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1914 {
1915   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1916   IS                iscol=a->col,isrow=a->row;
1917   PetscErrorCode    ierr;
1918   const PetscInt    *r,*c,*rout,*cout;
1919   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1920   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1921   const MatScalar   *aa=a->a,*v;
1922   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1923   const PetscScalar *b;
1924 
1925   PetscFunctionBegin;
1926   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1927   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1928   t  = a->solve_work;
1929 
1930   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1931   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1932 
1933   /* copy the b into temp work space according to permutation */
1934   ii = 0;
1935   for (i=0; i<n; i++) {
1936     ic      = 7*c[i];
1937     t[ii]   = b[ic];
1938     t[ii+1] = b[ic+1];
1939     t[ii+2] = b[ic+2];
1940     t[ii+3] = b[ic+3];
1941     t[ii+4] = b[ic+4];
1942     t[ii+5] = b[ic+5];
1943     t[ii+6] = b[ic+6];
1944     ii += 7;
1945   }
1946 
1947   /* forward solve the U^T */
1948   idx = 0;
1949   for (i=0; i<n; i++) {
1950 
1951     v     = aa + 49*diag[i];
1952     /* multiply by the inverse of the block diagonal */
1953     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1954     x6    = t[5+idx]; x7 = t[6+idx];
1955     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1956     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1957     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1958     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1959     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1960     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1961     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1962     v += 49;
1963 
1964     vi    = aj + diag[i] + 1;
1965     nz    = ai[i+1] - diag[i] - 1;
1966     while (nz--) {
1967       oidx = 7*(*vi++);
1968       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1969       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1970       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1971       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1972       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1973       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1974       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1975       v  += 49;
1976     }
1977     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1978     t[5+idx] = s6;t[6+idx] = s7;
1979     idx += 7;
1980   }
1981   /* backward solve the L^T */
1982   for (i=n-1; i>=0; i--) {
1983     v    = aa + 49*diag[i] - 49;
1984     vi   = aj + diag[i] - 1;
1985     nz   = diag[i] - ai[i];
1986     idt  = 7*i;
1987     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1988     s6 = t[5+idt];s7 = t[6+idt];
1989     while (nz--) {
1990       idx   = 7*(*vi--);
1991       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1992       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1993       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1994       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1995       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1996       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1997       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1998       v -= 49;
1999     }
2000   }
2001 
2002   /* copy t into x according to permutation */
2003   ii = 0;
2004   for (i=0; i<n; i++) {
2005     ir      = 7*r[i];
2006     x[ir]   = t[ii];
2007     x[ir+1] = t[ii+1];
2008     x[ir+2] = t[ii+2];
2009     x[ir+3] = t[ii+3];
2010     x[ir+4] = t[ii+4];
2011     x[ir+5] = t[ii+5];
2012     x[ir+6] = t[ii+6];
2013     ii += 7;
2014   }
2015 
2016   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2017   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2018   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2019   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2020   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2021   PetscFunctionReturn(0);
2022 }
2023 #undef __FUNCT__
2024 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
2025 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2026 {
2027   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2028   PetscErrorCode    ierr;
2029   IS                iscol=a->col,isrow=a->row;
2030   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2031   const PetscInt    *r,*c,*rout,*cout;
2032   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2033   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2034   const MatScalar   *aa=a->a,*v;
2035   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2036   const PetscScalar *b;
2037 
2038   PetscFunctionBegin;
2039   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2040   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2041   t = a->solve_work;
2042 
2043   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2044   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2045 
2046   /* copy b into temp work space according to permutation */
2047   for (i=0;i<n;i++) {
2048     ii = bs*i; ic = bs*c[i];
2049     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2050     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
2051   }
2052 
2053   /* forward solve the U^T */
2054   idx = 0;
2055   for (i=0; i<n; i++) {
2056     v     = aa + bs2*diag[i];
2057     /* multiply by the inverse of the block diagonal */
2058     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2059     x6    = t[5+idx]; x7 = t[6+idx];
2060     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
2061     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
2062     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
2063     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
2064     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2065     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2066     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2067     v -= bs2;
2068 
2069     vi    = aj + diag[i] - 1;
2070     nz    = diag[i] - diag[i+1] - 1;
2071     for (j=0;j>-nz;j--) {
2072       oidx = bs*vi[j];
2073       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2074       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2075       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2076       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2077       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2078       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2079       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2080       v  -= bs2;
2081     }
2082     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2083     t[5+idx] = s6;  t[6+idx] = s7;
2084     idx += bs;
2085   }
2086   /* backward solve the L^T */
2087   for (i=n-1; i>=0; i--) {
2088     v    = aa + bs2*ai[i];
2089     vi   = aj + ai[i];
2090     nz   = ai[i+1] - ai[i];
2091     idt  = bs*i;
2092     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2093     s6   = t[5+idt];  s7 = t[6+idt];
2094    for (j=0;j<nz;j++) {
2095       idx   = bs*vi[j];
2096       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2097       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2098       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2099       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2100       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2101       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2102       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2103       v += bs2;
2104     }
2105   }
2106 
2107   /* copy t into x according to permutation */
2108   for (i=0;i<n;i++) {
2109     ii = bs*i;  ir = bs*r[i];
2110     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2111     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2112   }
2113 
2114   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2115   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2116   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2117   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2118   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2119   PetscFunctionReturn(0);
2120 }
2121 
2122 /* ----------------------------------------------------------- */
2123 #undef __FUNCT__
2124 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2125 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2126 {
2127   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2128   IS                iscol=a->col,isrow=a->row;
2129   PetscErrorCode    ierr;
2130   const PetscInt    *r,*c,*rout,*cout;
2131   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2132   PetscInt          i,nz;
2133   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2134   const MatScalar   *aa=a->a,*v;
2135   PetscScalar       *x,*s,*t,*ls;
2136   const PetscScalar *b;
2137 
2138   PetscFunctionBegin;
2139   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2140   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2141   t  = a->solve_work;
2142 
2143   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2144   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2145 
2146   /* forward solve the lower triangular */
2147   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2148   for (i=1; i<n; i++) {
2149     v   = aa + bs2*ai[i];
2150     vi  = aj + ai[i];
2151     nz  = a->diag[i] - ai[i];
2152     s = t + bs*i;
2153     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2154     while (nz--) {
2155       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2156       v += bs2;
2157     }
2158   }
2159   /* backward solve the upper triangular */
2160   ls = a->solve_work + A->cmap->n;
2161   for (i=n-1; i>=0; i--) {
2162     v   = aa + bs2*(a->diag[i] + 1);
2163     vi  = aj + a->diag[i] + 1;
2164     nz  = ai[i+1] - a->diag[i] - 1;
2165     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2166     while (nz--) {
2167       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2168       v += bs2;
2169     }
2170     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2171     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2172   }
2173 
2174   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2175   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2176   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2177   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2178   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2179   PetscFunctionReturn(0);
2180 }
2181 
2182 /* ----------------------------------------------------------- */
2183 #undef __FUNCT__
2184 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2185 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2186 {
2187   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2188   IS                iscol=a->col,isrow=a->row;
2189   PetscErrorCode    ierr;
2190   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2191   PetscInt          i,nz,j;
2192   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2193   const MatScalar   *aa=a->a,*v;
2194   PetscScalar       *x,*t,*ls;
2195   const PetscScalar *b;
2196 
2197   PetscFunctionBegin;
2198   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2199   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2200   t    = a->solve_work;
2201 
2202   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2203   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2204 
2205   /* copy the b into temp work space according to permutation */
2206   for (i=0; i<n; i++) {
2207     for (j=0; j<bs; j++) {
2208       t[i*bs+j] = b[c[i]*bs+j];
2209     }
2210   }
2211 
2212 
2213   /* forward solve the upper triangular transpose */
2214   ls = a->solve_work + A->cmap->n;
2215   for (i=0; i<n; i++) {
2216     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2217     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2218     v   = aa + bs2*(a->diag[i] + 1);
2219     vi  = aj + a->diag[i] + 1;
2220     nz  = ai[i+1] - a->diag[i] - 1;
2221     while (nz--) {
2222       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2223       v += bs2;
2224     }
2225   }
2226 
2227   /* backward solve the lower triangular transpose */
2228   for (i=n-1; i>=0; i--) {
2229     v   = aa + bs2*ai[i];
2230     vi  = aj + ai[i];
2231     nz  = a->diag[i] - ai[i];
2232     while (nz--) {
2233       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2234       v += bs2;
2235     }
2236   }
2237 
2238   /* copy t into x according to permutation */
2239   for (i=0; i<n; i++) {
2240     for (j=0; j<bs; j++) {
2241       x[bs*r[i]+j]   = t[bs*i+j];
2242     }
2243   }
2244 
2245   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2246   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2247   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2248   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2249   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2250   PetscFunctionReturn(0);
2251 }
2252 
2253 #undef __FUNCT__
2254 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2255 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2256 {
2257   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2258   IS                iscol=a->col,isrow=a->row;
2259   PetscErrorCode    ierr;
2260   const PetscInt    *r,*c,*rout,*cout;
2261   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2262   PetscInt          i,j,nz;
2263   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2264   const MatScalar   *aa=a->a,*v;
2265   PetscScalar       *x,*t,*ls;
2266   const PetscScalar *b;
2267 
2268   PetscFunctionBegin;
2269   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2270   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2271   t    = a->solve_work;
2272 
2273   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2274   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2275 
2276   /* copy the b into temp work space according to permutation */
2277   for (i=0; i<n; i++) {
2278     for (j=0; j<bs; j++) {
2279       t[i*bs+j] = b[c[i]*bs+j];
2280     }
2281   }
2282 
2283 
2284   /* forward solve the upper triangular transpose */
2285   ls = a->solve_work + A->cmap->n;
2286   for (i=0; i<n; i++) {
2287     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2288     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2289     v   = aa + bs2*(diag[i] - 1);
2290     vi  = aj + diag[i] - 1;
2291     nz  = diag[i] - diag[i+1] - 1;
2292     for (j=0;j>-nz;j--) {
2293       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2294       v -= bs2;
2295     }
2296   }
2297 
2298   /* backward solve the lower triangular transpose */
2299   for (i=n-1; i>=0; i--) {
2300     v   = aa + bs2*ai[i];
2301     vi  = aj + ai[i];
2302     nz  = ai[i+1] - ai[i];
2303     for (j=0;j<nz;j++) {
2304       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2305       v += bs2;
2306     }
2307   }
2308 
2309   /* copy t into x according to permutation */
2310   for (i=0; i<n; i++) {
2311     for (j=0; j<bs; j++) {
2312       x[bs*r[i]+j]   = t[bs*i+j];
2313     }
2314   }
2315 
2316   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2317   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2318   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2319   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2320   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2321   PetscFunctionReturn(0);
2322 }
2323 
2324 /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */
2325 
2326 #undef __FUNCT__
2327 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2328 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2329 {
2330   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2331   PetscErrorCode    ierr;
2332   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2333   PetscInt          i,nz,idx,idt,m;
2334   const MatScalar   *aa=a->a,*v;
2335   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2336   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2337   PetscScalar       *x;
2338   const PetscScalar *b;
2339 
2340   PetscFunctionBegin;
2341   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2342   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2343 
2344   /* forward solve the lower triangular */
2345   idx    = 0;
2346   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2347   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2348   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2349 
2350   for (i=1; i<n; i++) {
2351     v     = aa + bs2*ai[i];
2352     vi    = aj + ai[i];
2353     nz    = ai[i+1] - ai[i];
2354     idt   = bs*i;
2355     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
2356     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
2357     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2358     for (m=0;m<nz;m++) {
2359       idx   = bs*vi[m];
2360       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2361       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2362       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2363 
2364 
2365       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2366       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2367       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2368       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2369       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2370       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2371       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2372       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2373       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2374       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2375       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2376       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2377       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2378       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2379       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2380 
2381       v += bs2;
2382     }
2383     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
2384     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
2385     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2386 
2387   }
2388   /* backward solve the upper triangular */
2389   for (i=n-1; i>=0; i--) {
2390     v    = aa + bs2*(adiag[i+1]+1);
2391     vi   = aj + adiag[i+1]+1;
2392     nz   = adiag[i] - adiag[i+1] - 1;
2393     idt  = bs*i;
2394     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
2395     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
2396     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2397 
2398     for (m=0;m<nz;m++) {
2399       idx   = bs*vi[m];
2400       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2401       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2402       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2403 
2404       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2405       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2406       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2407       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2408       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2409       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2410       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2411       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2412       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2413       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2414       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2415       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2416       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2417       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2418       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2419 
2420       v += bs2;
2421     }
2422 
2423     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2424     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2425     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2426     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2427     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2428     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2429     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2430     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2431     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2432     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2433     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2434     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2435     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2436     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2437     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2438 
2439   }
2440 
2441   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2443   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2444   PetscFunctionReturn(0);
2445 }
2446 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2448 /* Default MatSolve for block size 15 */
2449 
2450 #undef __FUNCT__
2451 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2452 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2453 {
2454   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2455   PetscErrorCode    ierr;
2456   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2457   PetscInt          i,k,nz,idx,idt,m;
2458   const MatScalar   *aa=a->a,*v;
2459   PetscScalar       s[15];
2460   PetscScalar       *x,xv;
2461   const PetscScalar *b;
2462 
2463   PetscFunctionBegin;
2464   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2465   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2466 
2467   /* forward solve the lower triangular */
2468   for (i=0; i<n; i++) {
2469     v     = aa + bs2*ai[i];
2470     vi    = aj + ai[i];
2471     nz    = ai[i+1] - ai[i];
2472     idt   = bs*i;
2473     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2474     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2475     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2476     for (m=0;m<nz;m++) {
2477       idx   = bs*vi[m];
2478       for (k=0;k<15;k++) {
2479         xv        = x[k + idx];
2480         x[idt]    -= v[0]*xv;
2481         x[1+idt]  -= v[1]*xv;
2482         x[2+idt]  -= v[2]*xv;
2483         x[3+idt]  -= v[3]*xv;
2484         x[4+idt]  -= v[4]*xv;
2485         x[5+idt]  -= v[5]*xv;
2486         x[6+idt]  -= v[6]*xv;
2487         x[7+idt]  -= v[7]*xv;
2488         x[8+idt]  -= v[8]*xv;
2489         x[9+idt]  -= v[9]*xv;
2490         x[10+idt] -= v[10]*xv;
2491         x[11+idt] -= v[11]*xv;
2492         x[12+idt] -= v[12]*xv;
2493         x[13+idt] -= v[13]*xv;
2494         x[14+idt] -= v[14]*xv;
2495         v += 15;
2496       }
2497     }
2498   }
2499   /* backward solve the upper triangular */
2500   for (i=n-1; i>=0; i--) {
2501     v    = aa + bs2*(adiag[i+1]+1);
2502     vi   = aj + adiag[i+1]+1;
2503     nz   = adiag[i] - adiag[i+1] - 1;
2504     idt  = bs*i;
2505     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2506     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2507     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2508 
2509     for (m=0;m<nz;m++) {
2510       idx   = bs*vi[m];
2511       for (k=0;k<15;k++) {
2512         xv = x[k + idx];
2513         s[0]  -= v[0]*xv;
2514         s[1]  -= v[1]*xv;
2515         s[2]  -= v[2]*xv;
2516         s[3]  -= v[3]*xv;
2517         s[4]  -= v[4]*xv;
2518         s[5]  -= v[5]*xv;
2519         s[6]  -= v[6]*xv;
2520         s[7]  -= v[7]*xv;
2521         s[8]  -= v[8]*xv;
2522         s[9]  -= v[9]*xv;
2523         s[10] -= v[10]*xv;
2524         s[11] -= v[11]*xv;
2525         s[12] -= v[12]*xv;
2526         s[13] -= v[13]*xv;
2527         s[14] -= v[14]*xv;
2528         v += 15;
2529       }
2530     }
2531     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
2532     for (k=0;k<15;k++) {
2533       x[idt]    += v[0]*s[k];
2534       x[1+idt]  += v[1]*s[k];
2535       x[2+idt]  += v[2]*s[k];
2536       x[3+idt]  += v[3]*s[k];
2537       x[4+idt]  += v[4]*s[k];
2538       x[5+idt]  += v[5]*s[k];
2539       x[6+idt]  += v[6]*s[k];
2540       x[7+idt]  += v[7]*s[k];
2541       x[8+idt]  += v[8]*s[k];
2542       x[9+idt]  += v[9]*s[k];
2543       x[10+idt] += v[10]*s[k];
2544       x[11+idt] += v[11]*s[k];
2545       x[12+idt] += v[12]*s[k];
2546       x[13+idt] += v[13]*s[k];
2547       x[14+idt] += v[14]*s[k];
2548       v += 15;
2549     }
2550   }
2551   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2552   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2553   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2554   PetscFunctionReturn(0);
2555 }
2556 
2557 
2558 #undef __FUNCT__
2559 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2560 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2561 {
2562   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2563   IS                iscol=a->col,isrow=a->row;
2564   PetscErrorCode    ierr;
2565   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2566   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2567   PetscInt          i,nz,idx,idt,idc;
2568   const MatScalar   *aa=a->a,*v;
2569   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2570   const PetscScalar *b;
2571 
2572   PetscFunctionBegin;
2573   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2574   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2575   t  = a->solve_work;
2576 
2577   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2578   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2579 
2580   /* forward solve the lower triangular */
2581   idx    = 7*(*r++);
2582   t[0] = b[idx];   t[1] = b[1+idx];
2583   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2584   t[5] = b[5+idx]; t[6] = b[6+idx];
2585 
2586   for (i=1; i<n; i++) {
2587     v     = aa + 49*ai[i];
2588     vi    = aj + ai[i];
2589     nz    = diag[i] - ai[i];
2590     idx   = 7*(*r++);
2591     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2592     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2593     while (nz--) {
2594       idx   = 7*(*vi++);
2595       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2596       x4    = t[3+idx];x5 = t[4+idx];
2597       x6    = t[5+idx];x7 = t[6+idx];
2598       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2599       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2600       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2601       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2602       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2603       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2604       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2605       v += 49;
2606     }
2607     idx = 7*i;
2608     t[idx]   = s1;t[1+idx] = s2;
2609     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2610     t[5+idx] = s6;t[6+idx] = s7;
2611   }
2612   /* backward solve the upper triangular */
2613   for (i=n-1; i>=0; i--) {
2614     v    = aa + 49*diag[i] + 49;
2615     vi   = aj + diag[i] + 1;
2616     nz   = ai[i+1] - diag[i] - 1;
2617     idt  = 7*i;
2618     s1 = t[idt];  s2 = t[1+idt];
2619     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2620     s6 = t[5+idt];s7 = t[6+idt];
2621     while (nz--) {
2622       idx   = 7*(*vi++);
2623       x1    = t[idx];   x2 = t[1+idx];
2624       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2625       x6    = t[5+idx]; x7 = t[6+idx];
2626       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2627       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2628       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2629       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2630       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2631       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2632       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2633       v += 49;
2634     }
2635     idc = 7*(*c--);
2636     v   = aa + 49*diag[i];
2637     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2638                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2639     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2640                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2641     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2642                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2643     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2644                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2645     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2646                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2647     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2648                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2649     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2650                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2651   }
2652 
2653   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2654   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2655   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2656   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2657   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2658   PetscFunctionReturn(0);
2659 }
2660 
2661 #undef __FUNCT__
2662 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2663 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2664 {
2665   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2666   IS                iscol=a->col,isrow=a->row;
2667   PetscErrorCode    ierr;
2668   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2669   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2670   PetscInt          i,nz,idx,idt,idc,m;
2671   const MatScalar   *aa=a->a,*v;
2672   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2673   const PetscScalar *b;
2674 
2675   PetscFunctionBegin;
2676   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2677   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2678   t  = a->solve_work;
2679 
2680   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2681   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2682 
2683   /* forward solve the lower triangular */
2684   idx    = 7*r[0];
2685   t[0] = b[idx];   t[1] = b[1+idx];
2686   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2687   t[5] = b[5+idx]; t[6] = b[6+idx];
2688 
2689   for (i=1; i<n; i++) {
2690     v     = aa + 49*ai[i];
2691     vi    = aj + ai[i];
2692     nz    = ai[i+1] - ai[i];
2693     idx   = 7*r[i];
2694     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2695     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2696     for (m=0;m<nz;m++) {
2697       idx   = 7*vi[m];
2698       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2699       x4    = t[3+idx];x5 = t[4+idx];
2700       x6    = t[5+idx];x7 = t[6+idx];
2701       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2702       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2703       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2704       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2705       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2706       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2707       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2708       v += 49;
2709     }
2710     idx = 7*i;
2711     t[idx]   = s1;t[1+idx] = s2;
2712     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2713     t[5+idx] = s6;t[6+idx] = s7;
2714   }
2715   /* backward solve the upper triangular */
2716   for (i=n-1; i>=0; i--) {
2717     v    = aa + 49*(adiag[i+1]+1);
2718     vi   = aj + adiag[i+1]+1;
2719     nz   = adiag[i] - adiag[i+1] - 1;
2720     idt  = 7*i;
2721     s1 = t[idt];  s2 = t[1+idt];
2722     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2723     s6 = t[5+idt];s7 = t[6+idt];
2724     for (m=0;m<nz;m++) {
2725       idx   = 7*vi[m];
2726       x1    = t[idx];   x2 = t[1+idx];
2727       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2728       x6    = t[5+idx]; x7 = t[6+idx];
2729       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2730       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2731       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2732       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2733       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2734       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2735       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2736       v += 49;
2737     }
2738     idc = 7*c[i];
2739     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2740                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2741     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2742                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2743     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2744                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2745     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2746                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2747     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2748                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2749     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2750                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2751     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2752                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2753   }
2754 
2755   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2756   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2757   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2758   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2759   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2760   PetscFunctionReturn(0);
2761 }
2762 
2763 #undef __FUNCT__
2764 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2765 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2766 {
2767   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2768   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2769   PetscErrorCode    ierr;
2770   PetscInt          i,nz,idx,idt,jdx;
2771   const MatScalar   *aa=a->a,*v;
2772   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2773   const PetscScalar *b;
2774 
2775   PetscFunctionBegin;
2776   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2777   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2778   /* forward solve the lower triangular */
2779   idx    = 0;
2780   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2781   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2782   x[6] = b[6+idx];
2783   for (i=1; i<n; i++) {
2784     v     =  aa + 49*ai[i];
2785     vi    =  aj + ai[i];
2786     nz    =  diag[i] - ai[i];
2787     idx   =  7*i;
2788     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2789     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2790     s7  =  b[6+idx];
2791     while (nz--) {
2792       jdx   = 7*(*vi++);
2793       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2794       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2795       x7    = x[6+jdx];
2796       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2797       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2798       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2799       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2800       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2801       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2802       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2803       v += 49;
2804      }
2805     x[idx]   = s1;
2806     x[1+idx] = s2;
2807     x[2+idx] = s3;
2808     x[3+idx] = s4;
2809     x[4+idx] = s5;
2810     x[5+idx] = s6;
2811     x[6+idx] = s7;
2812   }
2813   /* backward solve the upper triangular */
2814   for (i=n-1; i>=0; i--) {
2815     v    = aa + 49*diag[i] + 49;
2816     vi   = aj + diag[i] + 1;
2817     nz   = ai[i+1] - diag[i] - 1;
2818     idt  = 7*i;
2819     s1 = x[idt];   s2 = x[1+idt];
2820     s3 = x[2+idt]; s4 = x[3+idt];
2821     s5 = x[4+idt]; s6 = x[5+idt];
2822     s7 = x[6+idt];
2823     while (nz--) {
2824       idx   = 7*(*vi++);
2825       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2826       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2827       x7    = x[6+idx];
2828       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2829       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2830       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2831       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2832       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2833       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2834       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2835       v += 49;
2836     }
2837     v        = aa + 49*diag[i];
2838     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2839                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2840     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2841                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2842     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2843                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2844     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2845                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2846     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2847                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2848     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2849                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2850     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2851                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2852   }
2853 
2854   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2855   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2856   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2857   PetscFunctionReturn(0);
2858 }
2859 
2860 #undef __FUNCT__
2861 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2862 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2863 {
2864     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2865     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2866     PetscErrorCode    ierr;
2867     PetscInt          i,k,nz,idx,jdx,idt;
2868     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2869     const MatScalar   *aa=a->a,*v;
2870     PetscScalar       *x;
2871     const PetscScalar *b;
2872     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2873 
2874     PetscFunctionBegin;
2875     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2876     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2877     /* forward solve the lower triangular */
2878     idx    = 0;
2879     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2880     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2881     for (i=1; i<n; i++) {
2882        v    = aa + bs2*ai[i];
2883        vi   = aj + ai[i];
2884        nz   = ai[i+1] - ai[i];
2885       idx   = bs*i;
2886        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2887        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2888        for (k=0;k<nz;k++) {
2889           jdx   = bs*vi[k];
2890           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2891           x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2892           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2893           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2894           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2895           s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2896           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2897           s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2898           s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2899           v   +=  bs2;
2900         }
2901 
2902        x[idx]   = s1;
2903        x[1+idx] = s2;
2904        x[2+idx] = s3;
2905        x[3+idx] = s4;
2906        x[4+idx] = s5;
2907        x[5+idx] = s6;
2908        x[6+idx] = s7;
2909     }
2910 
2911    /* backward solve the upper triangular */
2912   for (i=n-1; i>=0; i--) {
2913     v   = aa + bs2*(adiag[i+1]+1);
2914      vi  = aj + adiag[i+1]+1;
2915      nz  = adiag[i] - adiag[i+1]-1;
2916      idt = bs*i;
2917      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2918      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2919     for (k=0;k<nz;k++) {
2920       idx   = bs*vi[k];
2921        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2922        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2923        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2924        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2925        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2926        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2927        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2928        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2929        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2930         v   +=  bs2;
2931     }
2932     /* x = inv_diagonal*x */
2933     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2934     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2935     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2936     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2937     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2938     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2939     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2940   }
2941 
2942   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2943   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2944   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2945   PetscFunctionReturn(0);
2946 }
2947 
2948 #undef __FUNCT__
2949 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2950 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2951 {
2952   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2953   IS                iscol=a->col,isrow=a->row;
2954   PetscErrorCode    ierr;
2955   const PetscInt    *r,*c,*rout,*cout;
2956   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2957   PetscInt          i,nz,idx,idt,idc;
2958   const MatScalar   *aa=a->a,*v;
2959   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2960   const PetscScalar *b;
2961 
2962   PetscFunctionBegin;
2963   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2964   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2965   t  = a->solve_work;
2966 
2967   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2968   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2969 
2970   /* forward solve the lower triangular */
2971   idx    = 6*(*r++);
2972   t[0] = b[idx];   t[1] = b[1+idx];
2973   t[2] = b[2+idx]; t[3] = b[3+idx];
2974   t[4] = b[4+idx]; t[5] = b[5+idx];
2975   for (i=1; i<n; i++) {
2976     v     = aa + 36*ai[i];
2977     vi    = aj + ai[i];
2978     nz    = diag[i] - ai[i];
2979     idx   = 6*(*r++);
2980     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2981     s5  = b[4+idx]; s6 = b[5+idx];
2982     while (nz--) {
2983       idx   = 6*(*vi++);
2984       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2985       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2986       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2987       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2988       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2989       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2990       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2991       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2992       v += 36;
2993     }
2994     idx = 6*i;
2995     t[idx]   = s1;t[1+idx] = s2;
2996     t[2+idx] = s3;t[3+idx] = s4;
2997     t[4+idx] = s5;t[5+idx] = s6;
2998   }
2999   /* backward solve the upper triangular */
3000   for (i=n-1; i>=0; i--) {
3001     v    = aa + 36*diag[i] + 36;
3002     vi   = aj + diag[i] + 1;
3003     nz   = ai[i+1] - diag[i] - 1;
3004     idt  = 6*i;
3005     s1 = t[idt];  s2 = t[1+idt];
3006     s3 = t[2+idt];s4 = t[3+idt];
3007     s5 = t[4+idt];s6 = t[5+idt];
3008     while (nz--) {
3009       idx   = 6*(*vi++);
3010       x1    = t[idx];   x2 = t[1+idx];
3011       x3    = t[2+idx]; x4 = t[3+idx];
3012       x5    = t[4+idx]; x6 = t[5+idx];
3013       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3014       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3015       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3016       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3017       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3018       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3019       v += 36;
3020     }
3021     idc = 6*(*c--);
3022     v   = aa + 36*diag[i];
3023     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3024                                  v[18]*s4+v[24]*s5+v[30]*s6;
3025     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3026                                  v[19]*s4+v[25]*s5+v[31]*s6;
3027     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3028                                  v[20]*s4+v[26]*s5+v[32]*s6;
3029     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3030                                  v[21]*s4+v[27]*s5+v[33]*s6;
3031     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3032                                  v[22]*s4+v[28]*s5+v[34]*s6;
3033     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3034                                  v[23]*s4+v[29]*s5+v[35]*s6;
3035   }
3036 
3037   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3038   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3039   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3040   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3041   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3042   PetscFunctionReturn(0);
3043 }
3044 
3045 #undef __FUNCT__
3046 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
3047 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
3048 {
3049   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3050   IS                iscol=a->col,isrow=a->row;
3051   PetscErrorCode    ierr;
3052   const PetscInt    *r,*c,*rout,*cout;
3053   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3054   PetscInt          i,nz,idx,idt,idc,m;
3055   const MatScalar   *aa=a->a,*v;
3056   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3057   const PetscScalar *b;
3058 
3059   PetscFunctionBegin;
3060   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3061   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3062   t  = a->solve_work;
3063 
3064   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3065   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3066 
3067   /* forward solve the lower triangular */
3068   idx    = 6*r[0];
3069   t[0] = b[idx];   t[1] = b[1+idx];
3070   t[2] = b[2+idx]; t[3] = b[3+idx];
3071   t[4] = b[4+idx]; t[5] = b[5+idx];
3072   for (i=1; i<n; i++) {
3073     v     = aa + 36*ai[i];
3074     vi    = aj + ai[i];
3075     nz    = ai[i+1] - ai[i];
3076     idx   = 6*r[i];
3077     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3078     s5  = b[4+idx]; s6 = b[5+idx];
3079     for (m=0;m<nz;m++) {
3080       idx   = 6*vi[m];
3081       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3082       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3083       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3084       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3085       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3086       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3087       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3088       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3089       v += 36;
3090     }
3091     idx = 6*i;
3092     t[idx]   = s1;t[1+idx] = s2;
3093     t[2+idx] = s3;t[3+idx] = s4;
3094     t[4+idx] = s5;t[5+idx] = s6;
3095   }
3096   /* backward solve the upper triangular */
3097   for (i=n-1; i>=0; i--) {
3098     v    = aa + 36*(adiag[i+1]+1);
3099     vi   = aj + adiag[i+1]+1;
3100     nz   = adiag[i] - adiag[i+1] - 1;
3101     idt  = 6*i;
3102     s1 = t[idt];  s2 = t[1+idt];
3103     s3 = t[2+idt];s4 = t[3+idt];
3104     s5 = t[4+idt];s6 = t[5+idt];
3105     for (m=0;m<nz;m++) {
3106       idx   = 6*vi[m];
3107       x1    = t[idx];   x2 = t[1+idx];
3108       x3    = t[2+idx]; x4 = t[3+idx];
3109       x5    = t[4+idx]; x6 = t[5+idx];
3110       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3111       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3112       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3113       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3114       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3115       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3116       v += 36;
3117     }
3118     idc = 6*c[i];
3119     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3120                                  v[18]*s4+v[24]*s5+v[30]*s6;
3121     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3122                                  v[19]*s4+v[25]*s5+v[31]*s6;
3123     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3124                                  v[20]*s4+v[26]*s5+v[32]*s6;
3125     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3126                                  v[21]*s4+v[27]*s5+v[33]*s6;
3127     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3128                                  v[22]*s4+v[28]*s5+v[34]*s6;
3129     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3130                                  v[23]*s4+v[29]*s5+v[35]*s6;
3131   }
3132 
3133   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3134   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3135   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3136   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3137   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3138   PetscFunctionReturn(0);
3139 }
3140 
3141 #undef __FUNCT__
3142 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
3143 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3144 {
3145   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3146   PetscInt          i,nz,idx,idt,jdx;
3147   PetscErrorCode    ierr;
3148   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3149   const MatScalar   *aa=a->a,*v;
3150   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3151   const PetscScalar *b;
3152 
3153   PetscFunctionBegin;
3154   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3155   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3156   /* forward solve the lower triangular */
3157   idx    = 0;
3158   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3159   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3160   for (i=1; i<n; i++) {
3161     v     =  aa + 36*ai[i];
3162     vi    =  aj + ai[i];
3163     nz    =  diag[i] - ai[i];
3164     idx   =  6*i;
3165     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3166     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3167     while (nz--) {
3168       jdx   = 6*(*vi++);
3169       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3170       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3171       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3172       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3173       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3174       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3175       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3176       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3177       v += 36;
3178      }
3179     x[idx]   = s1;
3180     x[1+idx] = s2;
3181     x[2+idx] = s3;
3182     x[3+idx] = s4;
3183     x[4+idx] = s5;
3184     x[5+idx] = s6;
3185   }
3186   /* backward solve the upper triangular */
3187   for (i=n-1; i>=0; i--) {
3188     v    = aa + 36*diag[i] + 36;
3189     vi   = aj + diag[i] + 1;
3190     nz   = ai[i+1] - diag[i] - 1;
3191     idt  = 6*i;
3192     s1 = x[idt];   s2 = x[1+idt];
3193     s3 = x[2+idt]; s4 = x[3+idt];
3194     s5 = x[4+idt]; s6 = x[5+idt];
3195     while (nz--) {
3196       idx   = 6*(*vi++);
3197       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3198       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3199       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3200       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3201       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3202       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3203       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3204       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3205       v += 36;
3206     }
3207     v        = aa + 36*diag[i];
3208     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3209     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3210     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3211     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3212     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3213     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3214   }
3215 
3216   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3217   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3218   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3219   PetscFunctionReturn(0);
3220 }
3221 
3222 #undef __FUNCT__
3223 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3224 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3225 {
3226     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3227     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3228     PetscErrorCode    ierr;
3229     PetscInt          i,k,nz,idx,jdx,idt;
3230     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3231     const MatScalar   *aa=a->a,*v;
3232     PetscScalar       *x;
3233     const PetscScalar *b;
3234     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3235 
3236     PetscFunctionBegin;
3237     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3238     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3239     /* forward solve the lower triangular */
3240     idx    = 0;
3241     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3242     x[4] = b[4+idx];x[5] = b[5+idx];
3243     for (i=1; i<n; i++) {
3244        v    = aa + bs2*ai[i];
3245        vi   = aj + ai[i];
3246        nz   = ai[i+1] - ai[i];
3247       idx   = bs*i;
3248        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3249        s5   = b[4+idx];s6 = b[5+idx];
3250        for (k=0;k<nz;k++) {
3251           jdx   = bs*vi[k];
3252           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3253           x5    = x[4+jdx]; x6 = x[5+jdx];
3254           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3255           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3256           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3257           s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3258           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3259           s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3260           v   +=  bs2;
3261         }
3262 
3263        x[idx]   = s1;
3264        x[1+idx] = s2;
3265        x[2+idx] = s3;
3266        x[3+idx] = s4;
3267        x[4+idx] = s5;
3268        x[5+idx] = s6;
3269     }
3270 
3271    /* backward solve the upper triangular */
3272   for (i=n-1; i>=0; i--) {
3273     v   = aa + bs2*(adiag[i+1]+1);
3274      vi  = aj + adiag[i+1]+1;
3275      nz  = adiag[i] - adiag[i+1]-1;
3276      idt = bs*i;
3277      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3278      s5 = x[4+idt];s6 = x[5+idt];
3279      for (k=0;k<nz;k++) {
3280       idx   = bs*vi[k];
3281        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3282        x5    = x[4+idx];x6 = x[5+idx];
3283        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3284        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3285        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3286        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3287        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3288        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3289         v   +=  bs2;
3290     }
3291     /* x = inv_diagonal*x */
3292    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3293    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3294    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3295    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3296    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3297    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3298   }
3299 
3300   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3301   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3302   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3303   PetscFunctionReturn(0);
3304 }
3305 
3306 #undef __FUNCT__
3307 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3308 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3309 {
3310   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3311   IS                iscol=a->col,isrow=a->row;
3312   PetscErrorCode    ierr;
3313   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3314   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3315   PetscInt          i,nz,idx,idt,idc;
3316   const MatScalar   *aa=a->a,*v;
3317   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3318   const PetscScalar *b;
3319 
3320   PetscFunctionBegin;
3321   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3322   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3323   t  = a->solve_work;
3324 
3325   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3326   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3327 
3328   /* forward solve the lower triangular */
3329   idx    = 5*(*r++);
3330   t[0] = b[idx];   t[1] = b[1+idx];
3331   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3332   for (i=1; i<n; i++) {
3333     v     = aa + 25*ai[i];
3334     vi    = aj + ai[i];
3335     nz    = diag[i] - ai[i];
3336     idx   = 5*(*r++);
3337     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3338     s5  = b[4+idx];
3339     while (nz--) {
3340       idx   = 5*(*vi++);
3341       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3342       x4    = t[3+idx];x5 = t[4+idx];
3343       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3344       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3345       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3346       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3347       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3348       v += 25;
3349     }
3350     idx = 5*i;
3351     t[idx]   = s1;t[1+idx] = s2;
3352     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3353   }
3354   /* backward solve the upper triangular */
3355   for (i=n-1; i>=0; i--) {
3356     v    = aa + 25*diag[i] + 25;
3357     vi   = aj + diag[i] + 1;
3358     nz   = ai[i+1] - diag[i] - 1;
3359     idt  = 5*i;
3360     s1 = t[idt];  s2 = t[1+idt];
3361     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3362     while (nz--) {
3363       idx   = 5*(*vi++);
3364       x1    = t[idx];   x2 = t[1+idx];
3365       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3366       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3367       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3368       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3369       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3370       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3371       v += 25;
3372     }
3373     idc = 5*(*c--);
3374     v   = aa + 25*diag[i];
3375     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3376                                  v[15]*s4+v[20]*s5;
3377     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3378                                  v[16]*s4+v[21]*s5;
3379     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3380                                  v[17]*s4+v[22]*s5;
3381     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3382                                  v[18]*s4+v[23]*s5;
3383     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3384                                  v[19]*s4+v[24]*s5;
3385   }
3386 
3387   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3388   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3389   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3390   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3391   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3392   PetscFunctionReturn(0);
3393 }
3394 
3395 #undef __FUNCT__
3396 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3397 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3398 {
3399   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3400   IS                iscol=a->col,isrow=a->row;
3401   PetscErrorCode    ierr;
3402   const PetscInt    *r,*c,*rout,*cout;
3403   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3404   PetscInt          i,nz,idx,idt,idc,m;
3405   const MatScalar   *aa=a->a,*v;
3406   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3407   const PetscScalar *b;
3408 
3409   PetscFunctionBegin;
3410   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3411   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3412   t  = a->solve_work;
3413 
3414   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3415   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3416 
3417   /* forward solve the lower triangular */
3418   idx    = 5*r[0];
3419   t[0] = b[idx];   t[1] = b[1+idx];
3420   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3421   for (i=1; i<n; i++) {
3422     v     = aa + 25*ai[i];
3423     vi    = aj + ai[i];
3424     nz    = ai[i+1] - ai[i];
3425     idx   = 5*r[i];
3426     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3427     s5  = b[4+idx];
3428     for (m=0;m<nz;m++) {
3429       idx   = 5*vi[m];
3430       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3431       x4    = t[3+idx];x5 = t[4+idx];
3432       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3433       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3434       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3435       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3436       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3437       v += 25;
3438     }
3439     idx = 5*i;
3440     t[idx]   = s1;t[1+idx] = s2;
3441     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3442   }
3443   /* backward solve the upper triangular */
3444   for (i=n-1; i>=0; i--) {
3445     v    = aa + 25*(adiag[i+1]+1);
3446     vi   = aj + adiag[i+1]+1;
3447     nz   = adiag[i] - adiag[i+1] - 1;
3448     idt  = 5*i;
3449     s1 = t[idt];  s2 = t[1+idt];
3450     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3451     for (m=0;m<nz;m++) {
3452       idx   = 5*vi[m];
3453       x1    = t[idx];   x2 = t[1+idx];
3454       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3455       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3456       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3457       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3458       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3459       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3460       v += 25;
3461     }
3462     idc = 5*c[i];
3463     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3464                                  v[15]*s4+v[20]*s5;
3465     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3466                                  v[16]*s4+v[21]*s5;
3467     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3468                                  v[17]*s4+v[22]*s5;
3469     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3470                                  v[18]*s4+v[23]*s5;
3471     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3472                                  v[19]*s4+v[24]*s5;
3473   }
3474 
3475   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3476   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3477   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3478   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3479   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3480   PetscFunctionReturn(0);
3481 }
3482 
3483 #undef __FUNCT__
3484 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3485 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3486 {
3487   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3488   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3489   PetscInt          i,nz,idx,idt,jdx;
3490   PetscErrorCode    ierr;
3491   const MatScalar   *aa=a->a,*v;
3492   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3493   const PetscScalar *b;
3494 
3495   PetscFunctionBegin;
3496   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3497   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3498   /* forward solve the lower triangular */
3499   idx    = 0;
3500   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3501   for (i=1; i<n; i++) {
3502     v     =  aa + 25*ai[i];
3503     vi    =  aj + ai[i];
3504     nz    =  diag[i] - ai[i];
3505     idx   =  5*i;
3506     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3507     while (nz--) {
3508       jdx   = 5*(*vi++);
3509       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3510       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3511       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3512       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3513       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3514       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3515       v    += 25;
3516     }
3517     x[idx]   = s1;
3518     x[1+idx] = s2;
3519     x[2+idx] = s3;
3520     x[3+idx] = s4;
3521     x[4+idx] = s5;
3522   }
3523   /* backward solve the upper triangular */
3524   for (i=n-1; i>=0; i--) {
3525     v    = aa + 25*diag[i] + 25;
3526     vi   = aj + diag[i] + 1;
3527     nz   = ai[i+1] - diag[i] - 1;
3528     idt  = 5*i;
3529     s1 = x[idt];  s2 = x[1+idt];
3530     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3531     while (nz--) {
3532       idx   = 5*(*vi++);
3533       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3534       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3535       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3536       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3537       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3538       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3539       v    += 25;
3540     }
3541     v        = aa + 25*diag[i];
3542     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3543     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3544     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3545     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3546     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3547   }
3548 
3549   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3550   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3551   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3552   PetscFunctionReturn(0);
3553 }
3554 
3555 #undef __FUNCT__
3556 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3557 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3558 {
3559   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3560   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3561   PetscInt          i,k,nz,idx,idt,jdx;
3562   PetscErrorCode    ierr;
3563   const MatScalar   *aa=a->a,*v;
3564   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3565   const PetscScalar *b;
3566 
3567   PetscFunctionBegin;
3568   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3569   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3570   /* forward solve the lower triangular */
3571   idx    = 0;
3572   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3573   for (i=1; i<n; i++) {
3574     v   = aa + 25*ai[i];
3575     vi  = aj + ai[i];
3576     nz  = ai[i+1] - ai[i];
3577     idx = 5*i;
3578     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3579     for (k=0;k<nz;k++) {
3580       jdx   = 5*vi[k];
3581       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3582       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3583       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3584       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3585       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3586       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3587       v    += 25;
3588     }
3589     x[idx]   = s1;
3590     x[1+idx] = s2;
3591     x[2+idx] = s3;
3592     x[3+idx] = s4;
3593     x[4+idx] = s5;
3594   }
3595 
3596   /* backward solve the upper triangular */
3597   for (i=n-1; i>=0; i--) {
3598     v   = aa + 25*(adiag[i+1]+1);
3599     vi  = aj + adiag[i+1]+1;
3600     nz  = adiag[i] - adiag[i+1]-1;
3601     idt = 5*i;
3602     s1 = x[idt];  s2 = x[1+idt];
3603     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3604     for (k=0;k<nz;k++) {
3605       idx   = 5*vi[k];
3606       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3607       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3608       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3609       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3610       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3611       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3612       v    += 25;
3613     }
3614     /* x = inv_diagonal*x */
3615     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3616     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3617     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3618     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3619     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3620   }
3621 
3622   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3623   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3624   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3625   PetscFunctionReturn(0);
3626 }
3627 
3628 #undef __FUNCT__
3629 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3630 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3631 {
3632   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3633   IS                iscol=a->col,isrow=a->row;
3634   PetscErrorCode    ierr;
3635   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3636   PetscInt          i,nz,idx,idt,idc;
3637   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3638   const MatScalar   *aa=a->a,*v;
3639   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3640   const PetscScalar *b;
3641 
3642   PetscFunctionBegin;
3643   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3644   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3645   t  = a->solve_work;
3646 
3647   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3648   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3649 
3650   /* forward solve the lower triangular */
3651   idx    = 4*(*r++);
3652   t[0] = b[idx];   t[1] = b[1+idx];
3653   t[2] = b[2+idx]; t[3] = b[3+idx];
3654   for (i=1; i<n; i++) {
3655     v     = aa + 16*ai[i];
3656     vi    = aj + ai[i];
3657     nz    = diag[i] - ai[i];
3658     idx   = 4*(*r++);
3659     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3660     while (nz--) {
3661       idx   = 4*(*vi++);
3662       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3663       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3664       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3665       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3666       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3667       v    += 16;
3668     }
3669     idx        = 4*i;
3670     t[idx]   = s1;t[1+idx] = s2;
3671     t[2+idx] = s3;t[3+idx] = s4;
3672   }
3673   /* backward solve the upper triangular */
3674   for (i=n-1; i>=0; i--) {
3675     v    = aa + 16*diag[i] + 16;
3676     vi   = aj + diag[i] + 1;
3677     nz   = ai[i+1] - diag[i] - 1;
3678     idt  = 4*i;
3679     s1 = t[idt];  s2 = t[1+idt];
3680     s3 = t[2+idt];s4 = t[3+idt];
3681     while (nz--) {
3682       idx   = 4*(*vi++);
3683       x1    = t[idx];   x2 = t[1+idx];
3684       x3    = t[2+idx]; x4 = t[3+idx];
3685       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3686       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3687       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3688       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3689       v += 16;
3690     }
3691     idc      = 4*(*c--);
3692     v        = aa + 16*diag[i];
3693     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3694     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3695     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3696     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3697   }
3698 
3699   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3700   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3701   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3703   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3704   PetscFunctionReturn(0);
3705 }
3706 
3707 #undef __FUNCT__
3708 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3709 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3710 {
3711   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3712   IS                iscol=a->col,isrow=a->row;
3713   PetscErrorCode    ierr;
3714   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3715   PetscInt          i,nz,idx,idt,idc,m;
3716   const PetscInt    *r,*c,*rout,*cout;
3717   const MatScalar   *aa=a->a,*v;
3718   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3719   const PetscScalar *b;
3720 
3721   PetscFunctionBegin;
3722   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3723   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3724   t  = a->solve_work;
3725 
3726   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3727   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3728 
3729   /* forward solve the lower triangular */
3730   idx    = 4*r[0];
3731   t[0] = b[idx];   t[1] = b[1+idx];
3732   t[2] = b[2+idx]; t[3] = b[3+idx];
3733   for (i=1; i<n; i++) {
3734     v     = aa + 16*ai[i];
3735     vi    = aj + ai[i];
3736     nz    = ai[i+1] - ai[i];
3737     idx   = 4*r[i];
3738     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3739     for (m=0;m<nz;m++) {
3740       idx   = 4*vi[m];
3741       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3742       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3743       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3744       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3745       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3746       v    += 16;
3747     }
3748     idx        = 4*i;
3749     t[idx]   = s1;t[1+idx] = s2;
3750     t[2+idx] = s3;t[3+idx] = s4;
3751   }
3752   /* backward solve the upper triangular */
3753   for (i=n-1; i>=0; i--) {
3754     v    = aa + 16*(adiag[i+1]+1);
3755     vi   = aj + adiag[i+1]+1;
3756     nz   = adiag[i] - adiag[i+1] - 1;
3757     idt  = 4*i;
3758     s1 = t[idt];  s2 = t[1+idt];
3759     s3 = t[2+idt];s4 = t[3+idt];
3760     for (m=0;m<nz;m++) {
3761       idx   = 4*vi[m];
3762       x1    = t[idx];   x2 = t[1+idx];
3763       x3    = t[2+idx]; x4 = t[3+idx];
3764       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3765       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3766       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3767       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3768       v += 16;
3769     }
3770     idc      = 4*c[i];
3771     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3772     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3773     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3774     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3775   }
3776 
3777   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3778   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3779   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3780   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3781   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3782   PetscFunctionReturn(0);
3783 }
3784 
3785 #undef __FUNCT__
3786 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3787 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3788 {
3789   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3790   IS                iscol=a->col,isrow=a->row;
3791   PetscErrorCode    ierr;
3792   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3793   PetscInt          i,nz,idx,idt,idc;
3794   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3795   const MatScalar   *aa=a->a,*v;
3796   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3797   PetscScalar       *x;
3798   const PetscScalar *b;
3799 
3800   PetscFunctionBegin;
3801   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3802   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3803   t  = (MatScalar *)a->solve_work;
3804 
3805   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3806   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3807 
3808   /* forward solve the lower triangular */
3809   idx    = 4*(*r++);
3810   t[0] = (MatScalar)b[idx];
3811   t[1] = (MatScalar)b[1+idx];
3812   t[2] = (MatScalar)b[2+idx];
3813   t[3] = (MatScalar)b[3+idx];
3814   for (i=1; i<n; i++) {
3815     v     = aa + 16*ai[i];
3816     vi    = aj + ai[i];
3817     nz    = diag[i] - ai[i];
3818     idx   = 4*(*r++);
3819     s1 = (MatScalar)b[idx];
3820     s2 = (MatScalar)b[1+idx];
3821     s3 = (MatScalar)b[2+idx];
3822     s4 = (MatScalar)b[3+idx];
3823     while (nz--) {
3824       idx   = 4*(*vi++);
3825       x1  = t[idx];
3826       x2  = t[1+idx];
3827       x3  = t[2+idx];
3828       x4  = t[3+idx];
3829       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3830       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3831       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3832       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3833       v    += 16;
3834     }
3835     idx        = 4*i;
3836     t[idx]   = s1;
3837     t[1+idx] = s2;
3838     t[2+idx] = s3;
3839     t[3+idx] = s4;
3840   }
3841   /* backward solve the upper triangular */
3842   for (i=n-1; i>=0; i--) {
3843     v    = aa + 16*diag[i] + 16;
3844     vi   = aj + diag[i] + 1;
3845     nz   = ai[i+1] - diag[i] - 1;
3846     idt  = 4*i;
3847     s1 = t[idt];
3848     s2 = t[1+idt];
3849     s3 = t[2+idt];
3850     s4 = t[3+idt];
3851     while (nz--) {
3852       idx   = 4*(*vi++);
3853       x1  = t[idx];
3854       x2  = t[1+idx];
3855       x3  = t[2+idx];
3856       x4  = t[3+idx];
3857       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3858       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3859       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3860       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3861       v += 16;
3862     }
3863     idc      = 4*(*c--);
3864     v        = aa + 16*diag[i];
3865     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3866     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3867     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3868     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3869     x[idc]   = (PetscScalar)t[idt];
3870     x[1+idc] = (PetscScalar)t[1+idt];
3871     x[2+idc] = (PetscScalar)t[2+idt];
3872     x[3+idc] = (PetscScalar)t[3+idt];
3873  }
3874 
3875   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3876   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3877   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3878   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3879   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3880   PetscFunctionReturn(0);
3881 }
3882 
3883 #if defined (PETSC_HAVE_SSE)
3884 
3885 #include PETSC_HAVE_SSE
3886 
3887 #undef __FUNCT__
3888 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3889 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3890 {
3891   /*
3892      Note: This code uses demotion of double
3893      to float when performing the mixed-mode computation.
3894      This may not be numerically reasonable for all applications.
3895   */
3896   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3897   IS             iscol=a->col,isrow=a->row;
3898   PetscErrorCode ierr;
3899   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3900   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3901   MatScalar      *aa=a->a,*v;
3902   PetscScalar    *x,*b,*t;
3903 
3904   /* Make space in temp stack for 16 Byte Aligned arrays */
3905   float           ssealignedspace[11],*tmps,*tmpx;
3906   unsigned long   offset;
3907 
3908   PetscFunctionBegin;
3909   SSE_SCOPE_BEGIN;
3910 
3911     offset = (unsigned long)ssealignedspace % 16;
3912     if (offset) offset = (16 - offset)/4;
3913     tmps = &ssealignedspace[offset];
3914     tmpx = &ssealignedspace[offset+4];
3915     PREFETCH_NTA(aa+16*ai[1]);
3916 
3917     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3918     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3919     t  = a->solve_work;
3920 
3921     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3922     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3923 
3924     /* forward solve the lower triangular */
3925     idx  = 4*(*r++);
3926     t[0] = b[idx];   t[1] = b[1+idx];
3927     t[2] = b[2+idx]; t[3] = b[3+idx];
3928     v    =  aa + 16*ai[1];
3929 
3930     for (i=1; i<n;) {
3931       PREFETCH_NTA(&v[8]);
3932       vi   =  aj      + ai[i];
3933       nz   =  diag[i] - ai[i];
3934       idx  =  4*(*r++);
3935 
3936       /* Demote sum from double to float */
3937       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3938       LOAD_PS(tmps,XMM7);
3939 
3940       while (nz--) {
3941         PREFETCH_NTA(&v[16]);
3942         idx = 4*(*vi++);
3943 
3944         /* Demote solution (so far) from double to float */
3945         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3946 
3947         /* 4x4 Matrix-Vector product with negative accumulation: */
3948         SSE_INLINE_BEGIN_2(tmpx,v)
3949           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3950 
3951           /* First Column */
3952           SSE_COPY_PS(XMM0,XMM6)
3953           SSE_SHUFFLE(XMM0,XMM0,0x00)
3954           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3955           SSE_SUB_PS(XMM7,XMM0)
3956 
3957           /* Second Column */
3958           SSE_COPY_PS(XMM1,XMM6)
3959           SSE_SHUFFLE(XMM1,XMM1,0x55)
3960           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3961           SSE_SUB_PS(XMM7,XMM1)
3962 
3963           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3964 
3965           /* Third Column */
3966           SSE_COPY_PS(XMM2,XMM6)
3967           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3968           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3969           SSE_SUB_PS(XMM7,XMM2)
3970 
3971           /* Fourth Column */
3972           SSE_COPY_PS(XMM3,XMM6)
3973           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3974           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3975           SSE_SUB_PS(XMM7,XMM3)
3976         SSE_INLINE_END_2
3977 
3978         v  += 16;
3979       }
3980       idx = 4*i;
3981       v   = aa + 16*ai[++i];
3982       PREFETCH_NTA(v);
3983       STORE_PS(tmps,XMM7);
3984 
3985       /* Promote result from float to double */
3986       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3987     }
3988     /* backward solve the upper triangular */
3989     idt  = 4*(n-1);
3990     ai16 = 16*diag[n-1];
3991     v    = aa + ai16 + 16;
3992     for (i=n-1; i>=0;) {
3993       PREFETCH_NTA(&v[8]);
3994       vi = aj + diag[i] + 1;
3995       nz = ai[i+1] - diag[i] - 1;
3996 
3997       /* Demote accumulator from double to float */
3998       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3999       LOAD_PS(tmps,XMM7);
4000 
4001       while (nz--) {
4002         PREFETCH_NTA(&v[16]);
4003         idx = 4*(*vi++);
4004 
4005         /* Demote solution (so far) from double to float */
4006         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
4007 
4008         /* 4x4 Matrix-Vector Product with negative accumulation: */
4009         SSE_INLINE_BEGIN_2(tmpx,v)
4010           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4011 
4012           /* First Column */
4013           SSE_COPY_PS(XMM0,XMM6)
4014           SSE_SHUFFLE(XMM0,XMM0,0x00)
4015           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4016           SSE_SUB_PS(XMM7,XMM0)
4017 
4018           /* Second Column */
4019           SSE_COPY_PS(XMM1,XMM6)
4020           SSE_SHUFFLE(XMM1,XMM1,0x55)
4021           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4022           SSE_SUB_PS(XMM7,XMM1)
4023 
4024           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4025 
4026           /* Third Column */
4027           SSE_COPY_PS(XMM2,XMM6)
4028           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4029           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4030           SSE_SUB_PS(XMM7,XMM2)
4031 
4032           /* Fourth Column */
4033           SSE_COPY_PS(XMM3,XMM6)
4034           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4035           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4036           SSE_SUB_PS(XMM7,XMM3)
4037         SSE_INLINE_END_2
4038         v  += 16;
4039       }
4040       v    = aa + ai16;
4041       ai16 = 16*diag[--i];
4042       PREFETCH_NTA(aa+ai16+16);
4043       /*
4044          Scale the result by the diagonal 4x4 block,
4045          which was inverted as part of the factorization
4046       */
4047       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
4048         /* First Column */
4049         SSE_COPY_PS(XMM0,XMM7)
4050         SSE_SHUFFLE(XMM0,XMM0,0x00)
4051         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4052 
4053         /* Second Column */
4054         SSE_COPY_PS(XMM1,XMM7)
4055         SSE_SHUFFLE(XMM1,XMM1,0x55)
4056         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4057         SSE_ADD_PS(XMM0,XMM1)
4058 
4059         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4060 
4061         /* Third Column */
4062         SSE_COPY_PS(XMM2,XMM7)
4063         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4064         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4065         SSE_ADD_PS(XMM0,XMM2)
4066 
4067         /* Fourth Column */
4068         SSE_COPY_PS(XMM3,XMM7)
4069         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4070         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4071         SSE_ADD_PS(XMM0,XMM3)
4072 
4073         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4074       SSE_INLINE_END_3
4075 
4076       /* Promote solution from float to double */
4077       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4078 
4079       /* Apply reordering to t and stream into x.    */
4080       /* This way, x doesn't pollute the cache.      */
4081       /* Be careful with size: 2 doubles = 4 floats! */
4082       idc  = 4*(*c--);
4083       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4084         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4085         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4086         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4087         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4088         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4089         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4090       SSE_INLINE_END_2
4091       v    = aa + ai16 + 16;
4092       idt -= 4;
4093     }
4094 
4095     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4096     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4097     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4098     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4099     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4100   SSE_SCOPE_END;
4101   PetscFunctionReturn(0);
4102 }
4103 
4104 #endif
4105 
4106 
4107 /*
4108       Special case where the matrix was ILU(0) factored in the natural
4109    ordering. This eliminates the need for the column and row permutation.
4110 */
4111 #undef __FUNCT__
4112 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
4113 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4114 {
4115   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4116   PetscInt          n=a->mbs;
4117   const PetscInt    *ai=a->i,*aj=a->j;
4118   PetscErrorCode    ierr;
4119   const PetscInt    *diag = a->diag;
4120   const MatScalar   *aa=a->a;
4121   PetscScalar       *x;
4122   const PetscScalar *b;
4123 
4124   PetscFunctionBegin;
4125   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4126   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4127 
4128 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4129   {
4130     static PetscScalar w[2000]; /* very BAD need to fix */
4131     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4132   }
4133 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4134   {
4135     static PetscScalar w[2000]; /* very BAD need to fix */
4136     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4137   }
4138 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4139   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4140 #else
4141   {
4142     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4143     const MatScalar *v;
4144     PetscInt        jdx,idt,idx,nz,i,ai16;
4145     const PetscInt  *vi;
4146 
4147   /* forward solve the lower triangular */
4148   idx    = 0;
4149   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4150   for (i=1; i<n; i++) {
4151     v     =  aa      + 16*ai[i];
4152     vi    =  aj      + ai[i];
4153     nz    =  diag[i] - ai[i];
4154     idx   +=  4;
4155     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4156     while (nz--) {
4157       jdx   = 4*(*vi++);
4158       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4159       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4160       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4161       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4162       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4163       v    += 16;
4164     }
4165     x[idx]   = s1;
4166     x[1+idx] = s2;
4167     x[2+idx] = s3;
4168     x[3+idx] = s4;
4169   }
4170   /* backward solve the upper triangular */
4171   idt = 4*(n-1);
4172   for (i=n-1; i>=0; i--) {
4173     ai16 = 16*diag[i];
4174     v    = aa + ai16 + 16;
4175     vi   = aj + diag[i] + 1;
4176     nz   = ai[i+1] - diag[i] - 1;
4177     s1 = x[idt];  s2 = x[1+idt];
4178     s3 = x[2+idt];s4 = x[3+idt];
4179     while (nz--) {
4180       idx   = 4*(*vi++);
4181       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4182       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4183       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4184       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4185       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4186       v    += 16;
4187     }
4188     v        = aa + ai16;
4189     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4190     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4191     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4192     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4193     idt -= 4;
4194   }
4195   }
4196 #endif
4197 
4198   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4199   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4200   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4201   PetscFunctionReturn(0);
4202 }
4203 
4204 #undef __FUNCT__
4205 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4206 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4207 {
4208     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4209     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4210     PetscInt          i,k,nz,idx,jdx,idt;
4211     PetscErrorCode    ierr;
4212     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4213     const MatScalar   *aa=a->a,*v;
4214     PetscScalar       *x;
4215     const PetscScalar *b;
4216     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4217 
4218     PetscFunctionBegin;
4219     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4220     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4221     /* forward solve the lower triangular */
4222     idx    = 0;
4223     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4224     for (i=1; i<n; i++) {
4225        v    = aa + bs2*ai[i];
4226        vi   = aj + ai[i];
4227        nz   = ai[i+1] - ai[i];
4228       idx   = bs*i;
4229        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4230       for (k=0;k<nz;k++) {
4231           jdx   = bs*vi[k];
4232           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4233           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4234           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4235           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4236           s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4237 
4238           v   +=  bs2;
4239         }
4240 
4241        x[idx]   = s1;
4242        x[1+idx] = s2;
4243        x[2+idx] = s3;
4244        x[3+idx] = s4;
4245     }
4246 
4247    /* backward solve the upper triangular */
4248   for (i=n-1; i>=0; i--) {
4249     v   = aa + bs2*(adiag[i+1]+1);
4250      vi  = aj + adiag[i+1]+1;
4251      nz  = adiag[i] - adiag[i+1]-1;
4252      idt = bs*i;
4253      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4254 
4255     for (k=0;k<nz;k++) {
4256       idx   = bs*vi[k];
4257        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4258        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4259        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4260        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4261        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4262 
4263         v   +=  bs2;
4264     }
4265     /* x = inv_diagonal*x */
4266    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4267    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4268    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4269    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4270 
4271   }
4272 
4273   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4274   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4275   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4276   PetscFunctionReturn(0);
4277 }
4278 
4279 #undef __FUNCT__
4280 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4281 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4282 {
4283   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4284   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4285   PetscErrorCode    ierr;
4286   const MatScalar   *aa=a->a;
4287   const PetscScalar *b;
4288   PetscScalar       *x;
4289 
4290   PetscFunctionBegin;
4291   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4292   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4293 
4294   {
4295     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4296     const MatScalar  *v;
4297     MatScalar        *t=(MatScalar *)x;
4298     PetscInt         jdx,idt,idx,nz,i,ai16;
4299     const PetscInt   *vi;
4300 
4301     /* forward solve the lower triangular */
4302     idx  = 0;
4303     t[0] = (MatScalar)b[0];
4304     t[1] = (MatScalar)b[1];
4305     t[2] = (MatScalar)b[2];
4306     t[3] = (MatScalar)b[3];
4307     for (i=1; i<n; i++) {
4308       v     =  aa      + 16*ai[i];
4309       vi    =  aj      + ai[i];
4310       nz    =  diag[i] - ai[i];
4311       idx   +=  4;
4312       s1 = (MatScalar)b[idx];
4313       s2 = (MatScalar)b[1+idx];
4314       s3 = (MatScalar)b[2+idx];
4315       s4 = (MatScalar)b[3+idx];
4316       while (nz--) {
4317         jdx = 4*(*vi++);
4318         x1  = t[jdx];
4319         x2  = t[1+jdx];
4320         x3  = t[2+jdx];
4321         x4  = t[3+jdx];
4322         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4323         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4324         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4325         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4326         v    += 16;
4327       }
4328       t[idx]   = s1;
4329       t[1+idx] = s2;
4330       t[2+idx] = s3;
4331       t[3+idx] = s4;
4332     }
4333     /* backward solve the upper triangular */
4334     idt = 4*(n-1);
4335     for (i=n-1; i>=0; i--) {
4336       ai16 = 16*diag[i];
4337       v    = aa + ai16 + 16;
4338       vi   = aj + diag[i] + 1;
4339       nz   = ai[i+1] - diag[i] - 1;
4340       s1   = t[idt];
4341       s2   = t[1+idt];
4342       s3   = t[2+idt];
4343       s4   = t[3+idt];
4344       while (nz--) {
4345         idx = 4*(*vi++);
4346         x1  = (MatScalar)x[idx];
4347         x2  = (MatScalar)x[1+idx];
4348         x3  = (MatScalar)x[2+idx];
4349         x4  = (MatScalar)x[3+idx];
4350         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4351         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4352         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4353         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4354         v    += 16;
4355       }
4356       v        = aa + ai16;
4357       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4358       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4359       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4360       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4361       idt -= 4;
4362     }
4363   }
4364 
4365   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4366   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4367   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4368   PetscFunctionReturn(0);
4369 }
4370 
4371 #if defined (PETSC_HAVE_SSE)
4372 
4373 #include PETSC_HAVE_SSE
4374 #undef __FUNCT__
4375 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4376 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4377 {
4378   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4379   unsigned short *aj=(unsigned short *)a->j;
4380   PetscErrorCode ierr;
4381   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4382   MatScalar      *aa=a->a;
4383   PetscScalar    *x,*b;
4384 
4385   PetscFunctionBegin;
4386   SSE_SCOPE_BEGIN;
4387   /*
4388      Note: This code currently uses demotion of double
4389      to float when performing the mixed-mode computation.
4390      This may not be numerically reasonable for all applications.
4391   */
4392   PREFETCH_NTA(aa+16*ai[1]);
4393 
4394   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4395   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4396   {
4397     /* x will first be computed in single precision then promoted inplace to double */
4398     MatScalar      *v,*t=(MatScalar *)x;
4399     int            nz,i,idt,ai16;
4400     unsigned int   jdx,idx;
4401     unsigned short *vi;
4402     /* Forward solve the lower triangular factor. */
4403 
4404     /* First block is the identity. */
4405     idx  = 0;
4406     CONVERT_DOUBLE4_FLOAT4(t,b);
4407     v    =  aa + 16*((unsigned int)ai[1]);
4408 
4409     for (i=1; i<n;) {
4410       PREFETCH_NTA(&v[8]);
4411       vi   =  aj      + ai[i];
4412       nz   =  diag[i] - ai[i];
4413       idx +=  4;
4414 
4415       /* Demote RHS from double to float. */
4416       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4417       LOAD_PS(&t[idx],XMM7);
4418 
4419       while (nz--) {
4420         PREFETCH_NTA(&v[16]);
4421         jdx = 4*((unsigned int)(*vi++));
4422 
4423         /* 4x4 Matrix-Vector product with negative accumulation: */
4424         SSE_INLINE_BEGIN_2(&t[jdx],v)
4425           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4426 
4427           /* First Column */
4428           SSE_COPY_PS(XMM0,XMM6)
4429           SSE_SHUFFLE(XMM0,XMM0,0x00)
4430           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4431           SSE_SUB_PS(XMM7,XMM0)
4432 
4433           /* Second Column */
4434           SSE_COPY_PS(XMM1,XMM6)
4435           SSE_SHUFFLE(XMM1,XMM1,0x55)
4436           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4437           SSE_SUB_PS(XMM7,XMM1)
4438 
4439           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4440 
4441           /* Third Column */
4442           SSE_COPY_PS(XMM2,XMM6)
4443           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4444           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4445           SSE_SUB_PS(XMM7,XMM2)
4446 
4447           /* Fourth Column */
4448           SSE_COPY_PS(XMM3,XMM6)
4449           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4450           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4451           SSE_SUB_PS(XMM7,XMM3)
4452         SSE_INLINE_END_2
4453 
4454         v  += 16;
4455       }
4456       v    =  aa + 16*ai[++i];
4457       PREFETCH_NTA(v);
4458       STORE_PS(&t[idx],XMM7);
4459     }
4460 
4461     /* Backward solve the upper triangular factor.*/
4462 
4463     idt  = 4*(n-1);
4464     ai16 = 16*diag[n-1];
4465     v    = aa + ai16 + 16;
4466     for (i=n-1; i>=0;) {
4467       PREFETCH_NTA(&v[8]);
4468       vi = aj + diag[i] + 1;
4469       nz = ai[i+1] - diag[i] - 1;
4470 
4471       LOAD_PS(&t[idt],XMM7);
4472 
4473       while (nz--) {
4474         PREFETCH_NTA(&v[16]);
4475         idx = 4*((unsigned int)(*vi++));
4476 
4477         /* 4x4 Matrix-Vector Product with negative accumulation: */
4478         SSE_INLINE_BEGIN_2(&t[idx],v)
4479           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4480 
4481           /* First Column */
4482           SSE_COPY_PS(XMM0,XMM6)
4483           SSE_SHUFFLE(XMM0,XMM0,0x00)
4484           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4485           SSE_SUB_PS(XMM7,XMM0)
4486 
4487           /* Second Column */
4488           SSE_COPY_PS(XMM1,XMM6)
4489           SSE_SHUFFLE(XMM1,XMM1,0x55)
4490           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4491           SSE_SUB_PS(XMM7,XMM1)
4492 
4493           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4494 
4495           /* Third Column */
4496           SSE_COPY_PS(XMM2,XMM6)
4497           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4498           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4499           SSE_SUB_PS(XMM7,XMM2)
4500 
4501           /* Fourth Column */
4502           SSE_COPY_PS(XMM3,XMM6)
4503           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4504           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4505           SSE_SUB_PS(XMM7,XMM3)
4506         SSE_INLINE_END_2
4507         v  += 16;
4508       }
4509       v    = aa + ai16;
4510       ai16 = 16*diag[--i];
4511       PREFETCH_NTA(aa+ai16+16);
4512       /*
4513          Scale the result by the diagonal 4x4 block,
4514          which was inverted as part of the factorization
4515       */
4516       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4517         /* First Column */
4518         SSE_COPY_PS(XMM0,XMM7)
4519         SSE_SHUFFLE(XMM0,XMM0,0x00)
4520         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4521 
4522         /* Second Column */
4523         SSE_COPY_PS(XMM1,XMM7)
4524         SSE_SHUFFLE(XMM1,XMM1,0x55)
4525         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4526         SSE_ADD_PS(XMM0,XMM1)
4527 
4528         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4529 
4530         /* Third Column */
4531         SSE_COPY_PS(XMM2,XMM7)
4532         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4533         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4534         SSE_ADD_PS(XMM0,XMM2)
4535 
4536         /* Fourth Column */
4537         SSE_COPY_PS(XMM3,XMM7)
4538         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4539         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4540         SSE_ADD_PS(XMM0,XMM3)
4541 
4542         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4543       SSE_INLINE_END_3
4544 
4545       v    = aa + ai16 + 16;
4546       idt -= 4;
4547     }
4548 
4549     /* Convert t from single precision back to double precision (inplace)*/
4550     idt = 4*(n-1);
4551     for (i=n-1;i>=0;i--) {
4552       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4553       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4554       PetscScalar *xtemp=&x[idt];
4555       MatScalar   *ttemp=&t[idt];
4556       xtemp[3] = (PetscScalar)ttemp[3];
4557       xtemp[2] = (PetscScalar)ttemp[2];
4558       xtemp[1] = (PetscScalar)ttemp[1];
4559       xtemp[0] = (PetscScalar)ttemp[0];
4560       idt -= 4;
4561     }
4562 
4563   } /* End of artificial scope. */
4564   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4565   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4566   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4567   SSE_SCOPE_END;
4568   PetscFunctionReturn(0);
4569 }
4570 
4571 #undef __FUNCT__
4572 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - forward/backward triangular
   solve with a factored SeqBAIJ matrix of 4x4 blocks, carried out with the SSE
   macros on single-precision data and without row/column permutations.

   Input:
     A  - factored matrix; a->i, a->j, a->diag, a->a and a->mbs are read
     bb - right-hand side
   Output:
     xx - solution

   The right-hand side is demoted to MatScalar four entries at a time into t,
   which aliases the storage of x.  Both triangular sweeps operate on t, and the
   result is promoted back to PetscScalar in place at the very end.

   NOTE(review): ai, aj and diag are held as plain int pointers here while the
   other solves in this file use PetscInt - presumably this path is only built
   when the two types coincide; confirm.
*/
4573 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4574 {
4575   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4576   int            *aj=a->j;
4577   PetscErrorCode ierr;
4578   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4579   MatScalar      *aa=a->a;
4580   PetscScalar    *x,*b;
4581 
4582   PetscFunctionBegin;
4583   SSE_SCOPE_BEGIN;
4584   /*
4585      Note: This code currently uses demotion of double
4586      to float when performing the mixed-mode computation.
4587      This may not be numerically reasonable for all applications.
4588   */
4589   PREFETCH_NTA(aa+16*ai[1]);
4590 
4591   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4592   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4593   {
4594     /* x will first be computed in single precision then promoted inplace to double */
4595     MatScalar *v,*t=(MatScalar *)x;
4596     int       nz,i,idt,ai16;
4597     int       jdx,idx;
4598     int       *vi;
4599     /* Forward solve the lower triangular factor. */
4600 
4601     /* First block is the identity. */
4602     idx  = 0;
4603     CONVERT_DOUBLE4_FLOAT4(t,b);
4604     v    =  aa + 16*ai[1];
4605 
    /* The loop header has no increment: i advances through the ++i in the
       update of v at the bottom of the body. */
4606     for (i=1; i<n;) {
4607       PREFETCH_NTA(&v[8]);
4608       vi   =  aj      + ai[i];
4609       nz   =  diag[i] - ai[i];
4610       idx +=  4;
4611 
4612       /* Demote RHS from double to float. */
4613       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      /* XMM7 holds the running right-hand side of this block row; the inner
         loop subtracts from it and it is written back by STORE_PS below. */
4614       LOAD_PS(&t[idx],XMM7);
4615 
4616       while (nz--) {
4617         PREFETCH_NTA(&v[16]);
4618         jdx = 4*(*vi++);
4619 /*          jdx = *vi++; */
4620 
4621         /* 4x4 Matrix-Vector product with negative accumulation: */
4622         SSE_INLINE_BEGIN_2(&t[jdx],v)
4623           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4624 
4625           /* First Column */
4626           SSE_COPY_PS(XMM0,XMM6)
4627           SSE_SHUFFLE(XMM0,XMM0,0x00)
4628           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4629           SSE_SUB_PS(XMM7,XMM0)
4630 
4631           /* Second Column */
4632           SSE_COPY_PS(XMM1,XMM6)
4633           SSE_SHUFFLE(XMM1,XMM1,0x55)
4634           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4635           SSE_SUB_PS(XMM7,XMM1)
4636 
4637           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4638 
4639           /* Third Column */
4640           SSE_COPY_PS(XMM2,XMM6)
4641           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4642           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4643           SSE_SUB_PS(XMM7,XMM2)
4644 
4645           /* Fourth Column */
4646           SSE_COPY_PS(XMM3,XMM6)
4647           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4648           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4649           SSE_SUB_PS(XMM7,XMM3)
4650         SSE_INLINE_END_2
4651 
4652         v  += 16;
4653       }
4654       v    =  aa + 16*ai[++i];
4655       PREFETCH_NTA(v);
4656       STORE_PS(&t[idx],XMM7);
4657     }
4658 
4659     /* Backward solve the upper triangular factor.*/
4660 
4661     idt  = 4*(n-1);
4662     ai16 = 16*diag[n-1];
4663     v    = aa + ai16 + 16;
    /* As in the forward sweep, i is stepped inside the body (the --i below). */
4664     for (i=n-1; i>=0;) {
4665       PREFETCH_NTA(&v[8]);
4666       vi = aj + diag[i] + 1;
4667       nz = ai[i+1] - diag[i] - 1;
4668 
4669       LOAD_PS(&t[idt],XMM7);
4670 
4671       while (nz--) {
4672         PREFETCH_NTA(&v[16]);
4673         idx = 4*(*vi++);
4674 /*          idx = *vi++; */
4675 
4676         /* 4x4 Matrix-Vector Product with negative accumulation: */
4677         SSE_INLINE_BEGIN_2(&t[idx],v)
4678           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4679 
4680           /* First Column */
4681           SSE_COPY_PS(XMM0,XMM6)
4682           SSE_SHUFFLE(XMM0,XMM0,0x00)
4683           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4684           SSE_SUB_PS(XMM7,XMM0)
4685 
4686           /* Second Column */
4687           SSE_COPY_PS(XMM1,XMM6)
4688           SSE_SHUFFLE(XMM1,XMM1,0x55)
4689           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4690           SSE_SUB_PS(XMM7,XMM1)
4691 
4692           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4693 
4694           /* Third Column */
4695           SSE_COPY_PS(XMM2,XMM6)
4696           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4697           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4698           SSE_SUB_PS(XMM7,XMM2)
4699 
4700           /* Fourth Column */
4701           SSE_COPY_PS(XMM3,XMM6)
4702           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4703           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4704           SSE_SUB_PS(XMM7,XMM3)
4705         SSE_INLINE_END_2
4706         v  += 16;
4707       }
      /* v is pointed at the diagonal block of the current row (ai16 still holds
         16*diag[i] for this row); ai16 is then moved on to the previous row. */
4708       v    = aa + ai16;
      /* NOTE(review): on the final pass i goes from 0 to -1, so this reads
         diag[-1].  In the visible code the value then only feeds prefetch
         addresses and the last, unused assignment to v - confirm and guard. */
4709       ai16 = 16*diag[--i];
4710       PREFETCH_NTA(aa+ai16+16);
4711       /*
4712          Scale the result by the diagonal 4x4 block,
4713          which was inverted as part of the factorization
4714       */
4715       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4716         /* First Column */
4717         SSE_COPY_PS(XMM0,XMM7)
4718         SSE_SHUFFLE(XMM0,XMM0,0x00)
4719         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4720 
4721         /* Second Column */
4722         SSE_COPY_PS(XMM1,XMM7)
4723         SSE_SHUFFLE(XMM1,XMM1,0x55)
4724         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4725         SSE_ADD_PS(XMM0,XMM1)
4726 
4727         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4728 
4729         /* Third Column */
4730         SSE_COPY_PS(XMM2,XMM7)
4731         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4732         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4733         SSE_ADD_PS(XMM0,XMM2)
4734 
4735         /* Fourth Column */
4736         SSE_COPY_PS(XMM3,XMM7)
4737         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4738         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4739         SSE_ADD_PS(XMM0,XMM3)
4740 
4741         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4742       SSE_INLINE_END_3
4743 
4744       v    = aa + ai16 + 16;
4745       idt -= 4;
4746     }
4747 
4748     /* Convert t from single precision back to double precision (inplace)*/
    /* t and x share storage, so the promotion walks from the last entry to the
       first - presumably so that a wider x entry never overwrites a t entry
       that has not been read yet (assumes MatScalar is narrower than
       PetscScalar here; confirm). */
4749     idt = 4*(n-1);
4750     for (i=n-1;i>=0;i--) {
4751       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4752       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4753       PetscScalar *xtemp=&x[idt];
4754       MatScalar   *ttemp=&t[idt];
4755       xtemp[3] = (PetscScalar)ttemp[3];
4756       xtemp[2] = (PetscScalar)ttemp[2];
4757       xtemp[1] = (PetscScalar)ttemp[1];
4758       xtemp[0] = (PetscScalar)ttemp[0];
4759       idt -= 4;
4760     }
4761 
4762   } /* End of artificial scope. */
4763   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4764   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4765   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4766   SSE_SCOPE_END;
4767   PetscFunctionReturn(0);
4768 }
4769 
4770 #endif
4771 
4772 #undef __FUNCT__
4773 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4774 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4775 {
4776   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4777   IS                iscol=a->col,isrow=a->row;
4778   PetscErrorCode    ierr;
4779   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4780   PetscInt          i,nz,idx,idt,idc;
4781   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4782   const MatScalar   *aa=a->a,*v;
4783   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4784   const PetscScalar *b;
4785 
4786   PetscFunctionBegin;
4787   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4788   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4789   t  = a->solve_work;
4790 
4791   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4792   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4793 
4794   /* forward solve the lower triangular */
4795   idx    = 3*(*r++);
4796   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4797   for (i=1; i<n; i++) {
4798     v     = aa + 9*ai[i];
4799     vi    = aj + ai[i];
4800     nz    = diag[i] - ai[i];
4801     idx   = 3*(*r++);
4802     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4803     while (nz--) {
4804       idx   = 3*(*vi++);
4805       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4806       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4807       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4808       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4809       v += 9;
4810     }
4811     idx = 3*i;
4812     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4813   }
4814   /* backward solve the upper triangular */
4815   for (i=n-1; i>=0; i--) {
4816     v    = aa + 9*diag[i] + 9;
4817     vi   = aj + diag[i] + 1;
4818     nz   = ai[i+1] - diag[i] - 1;
4819     idt  = 3*i;
4820     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4821     while (nz--) {
4822       idx   = 3*(*vi++);
4823       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4824       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4825       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4826       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4827       v += 9;
4828     }
4829     idc = 3*(*c--);
4830     v   = aa + 9*diag[i];
4831     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4832     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4833     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4834   }
4835   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4836   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4837   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4838   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4839   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4840   PetscFunctionReturn(0);
4841 }
4842 
4843 #undef __FUNCT__
4844 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4845 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4846 {
4847   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4848   IS                iscol=a->col,isrow=a->row;
4849   PetscErrorCode    ierr;
4850   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4851   PetscInt          i,nz,idx,idt,idc,m;
4852   const PetscInt    *r,*c,*rout,*cout;
4853   const MatScalar   *aa=a->a,*v;
4854   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4855   const PetscScalar *b;
4856 
4857   PetscFunctionBegin;
4858   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4859   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4860   t  = a->solve_work;
4861 
4862   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4863   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4864 
4865   /* forward solve the lower triangular */
4866   idx    = 3*r[0];
4867   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4868   for (i=1; i<n; i++) {
4869     v     = aa + 9*ai[i];
4870     vi    = aj + ai[i];
4871     nz    = ai[i+1] - ai[i];
4872     idx   = 3*r[i];
4873     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4874     for (m=0;m<nz;m++) {
4875       idx   = 3*vi[m];
4876       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4877       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4878       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4879       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4880       v += 9;
4881     }
4882     idx = 3*i;
4883     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4884   }
4885   /* backward solve the upper triangular */
4886   for (i=n-1; i>=0; i--) {
4887     v    = aa + 9*(adiag[i+1]+1);
4888     vi   = aj + adiag[i+1]+1;
4889     nz   = adiag[i] - adiag[i+1] - 1;
4890     idt  = 3*i;
4891     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4892     for (m=0;m<nz;m++) {
4893       idx   = 3*vi[m];
4894       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4895       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4896       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4897       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4898       v += 9;
4899     }
4900     idc = 3*c[i];
4901     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4902     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4903     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4904   }
4905   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4906   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4907   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4908   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4909   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4910   PetscFunctionReturn(0);
4911 }
4912 
4913 /*
4914       Special case where the matrix was ILU(0) factored in the natural
4915    ordering. This eliminates the need for the column and row permutation.
4916 */
4917 #undef __FUNCT__
4918 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4919 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4920 {
4921   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4922   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4923   PetscErrorCode    ierr;
4924   const PetscInt    *diag = a->diag,*vi;
4925   const MatScalar   *aa=a->a,*v;
4926   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4927   const PetscScalar *b;
4928   PetscInt          jdx,idt,idx,nz,i;
4929 
4930   PetscFunctionBegin;
4931   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4932   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4933 
4934   /* forward solve the lower triangular */
4935   idx    = 0;
4936   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4937   for (i=1; i<n; i++) {
4938     v     =  aa      + 9*ai[i];
4939     vi    =  aj      + ai[i];
4940     nz    =  diag[i] - ai[i];
4941     idx   +=  3;
4942     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4943     while (nz--) {
4944       jdx   = 3*(*vi++);
4945       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4946       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4947       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4948       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4949       v    += 9;
4950     }
4951     x[idx]   = s1;
4952     x[1+idx] = s2;
4953     x[2+idx] = s3;
4954   }
4955   /* backward solve the upper triangular */
4956   for (i=n-1; i>=0; i--) {
4957     v    = aa + 9*diag[i] + 9;
4958     vi   = aj + diag[i] + 1;
4959     nz   = ai[i+1] - diag[i] - 1;
4960     idt  = 3*i;
4961     s1 = x[idt];  s2 = x[1+idt];
4962     s3 = x[2+idt];
4963     while (nz--) {
4964       idx   = 3*(*vi++);
4965       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4966       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4967       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4968       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4969       v    += 9;
4970     }
4971     v        = aa +  9*diag[i];
4972     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4973     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4974     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4975   }
4976 
4977   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4978   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4979   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4980   PetscFunctionReturn(0);
4981 }
4982 
4983 #undef __FUNCT__
4984 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4985 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4986 {
4987     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4988     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4989     PetscErrorCode    ierr;
4990     PetscInt          i,k,nz,idx,jdx,idt;
4991     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4992     const MatScalar   *aa=a->a,*v;
4993     PetscScalar       *x;
4994     const PetscScalar *b;
4995     PetscScalar        s1,s2,s3,x1,x2,x3;
4996 
4997     PetscFunctionBegin;
4998     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4999     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5000     /* forward solve the lower triangular */
5001     idx    = 0;
5002     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5003     for (i=1; i<n; i++) {
5004        v    = aa + bs2*ai[i];
5005        vi   = aj + ai[i];
5006        nz   = ai[i+1] - ai[i];
5007       idx   = bs*i;
5008        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5009       for (k=0;k<nz;k++) {
5010          jdx   = bs*vi[k];
5011           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5012           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5013           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5014           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5015 
5016           v   +=  bs2;
5017         }
5018 
5019        x[idx]   = s1;
5020        x[1+idx] = s2;
5021        x[2+idx] = s3;
5022     }
5023 
5024    /* backward solve the upper triangular */
5025   for (i=n-1; i>=0; i--) {
5026     v   = aa + bs2*(adiag[i+1]+1);
5027      vi  = aj + adiag[i+1]+1;
5028      nz  = adiag[i] - adiag[i+1]-1;
5029      idt = bs*i;
5030      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5031 
5032      for (k=0;k<nz;k++) {
5033        idx   = bs*vi[k];
5034        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5035        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5036        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5037        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5038 
5039         v   +=  bs2;
5040     }
5041     /* x = inv_diagonal*x */
5042    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5043    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5044    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5045 
5046   }
5047 
5048   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5049   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5050   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5051   PetscFunctionReturn(0);
5052 }
5053 
5054 #undef __FUNCT__
5055 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
5056 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5057 {
5058   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5059   IS                iscol=a->col,isrow=a->row;
5060   PetscErrorCode    ierr;
5061   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5062   PetscInt          i,nz,idx,idt,idc;
5063   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5064   const MatScalar   *aa=a->a,*v;
5065   PetscScalar       *x,s1,s2,x1,x2,*t;
5066   const PetscScalar *b;
5067 
5068   PetscFunctionBegin;
5069   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5070   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5071   t  = a->solve_work;
5072 
5073   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5074   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5075 
5076   /* forward solve the lower triangular */
5077   idx    = 2*(*r++);
5078   t[0] = b[idx]; t[1] = b[1+idx];
5079   for (i=1; i<n; i++) {
5080     v     = aa + 4*ai[i];
5081     vi    = aj + ai[i];
5082     nz    = diag[i] - ai[i];
5083     idx   = 2*(*r++);
5084     s1  = b[idx]; s2 = b[1+idx];
5085     while (nz--) {
5086       idx   = 2*(*vi++);
5087       x1    = t[idx]; x2 = t[1+idx];
5088       s1 -= v[0]*x1 + v[2]*x2;
5089       s2 -= v[1]*x1 + v[3]*x2;
5090       v += 4;
5091     }
5092     idx = 2*i;
5093     t[idx] = s1; t[1+idx] = s2;
5094   }
5095   /* backward solve the upper triangular */
5096   for (i=n-1; i>=0; i--) {
5097     v    = aa + 4*diag[i] + 4;
5098     vi   = aj + diag[i] + 1;
5099     nz   = ai[i+1] - diag[i] - 1;
5100     idt  = 2*i;
5101     s1 = t[idt]; s2 = t[1+idt];
5102     while (nz--) {
5103       idx   = 2*(*vi++);
5104       x1    = t[idx]; x2 = t[1+idx];
5105       s1 -= v[0]*x1 + v[2]*x2;
5106       s2 -= v[1]*x1 + v[3]*x2;
5107       v += 4;
5108     }
5109     idc = 2*(*c--);
5110     v   = aa + 4*diag[i];
5111     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5112     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5113   }
5114   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5115   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5116   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5117   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5118   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5119   PetscFunctionReturn(0);
5120 }
5121 
5122 #undef __FUNCT__
5123 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
5124 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5125 {
5126   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5127   IS                iscol=a->col,isrow=a->row;
5128   PetscErrorCode    ierr;
5129   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5130   PetscInt          i,nz,idx,jdx,idt,idc,m;
5131   const PetscInt    *r,*c,*rout,*cout;
5132   const MatScalar   *aa=a->a,*v;
5133   PetscScalar       *x,s1,s2,x1,x2,*t;
5134   const PetscScalar *b;
5135 
5136   PetscFunctionBegin;
5137   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5138   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5139   t  = a->solve_work;
5140 
5141   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5142   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5143 
5144   /* forward solve the lower triangular */
5145   idx    = 2*r[0];
5146   t[0] = b[idx]; t[1] = b[1+idx];
5147   for (i=1; i<n; i++) {
5148     v     = aa + 4*ai[i];
5149     vi    = aj + ai[i];
5150     nz    = ai[i+1] - ai[i];
5151     idx   = 2*r[i];
5152     s1  = b[idx]; s2 = b[1+idx];
5153     for (m=0;m<nz;m++) {
5154       jdx   = 2*vi[m];
5155       x1    = t[jdx]; x2 = t[1+jdx];
5156       s1 -= v[0]*x1 + v[2]*x2;
5157       s2 -= v[1]*x1 + v[3]*x2;
5158       v += 4;
5159     }
5160     idx = 2*i;
5161     t[idx] = s1; t[1+idx] = s2;
5162   }
5163   /* backward solve the upper triangular */
5164   for (i=n-1; i>=0; i--) {
5165     v    = aa + 4*(adiag[i+1]+1);
5166     vi   = aj + adiag[i+1]+1;
5167     nz   = adiag[i] - adiag[i+1] - 1;
5168     idt  = 2*i;
5169     s1 = t[idt]; s2 = t[1+idt];
5170     for (m=0;m<nz;m++) {
5171       idx   = 2*vi[m];
5172       x1    = t[idx]; x2 = t[1+idx];
5173       s1 -= v[0]*x1 + v[2]*x2;
5174       s2 -= v[1]*x1 + v[3]*x2;
5175       v += 4;
5176     }
5177     idc = 2*c[i];
5178     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5179     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5180   }
5181   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5182   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5183   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5184   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5185   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5186   PetscFunctionReturn(0);
5187 }
5188 
5189 /*
5190       Special case where the matrix was ILU(0) factored in the natural
5191    ordering. This eliminates the need for the column and row permutation.
5192 */
5193 #undef __FUNCT__
5194 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
5195 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5196 {
5197   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5198   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5199   PetscErrorCode    ierr;
5200   const MatScalar   *aa=a->a,*v;
5201   PetscScalar       *x,s1,s2,x1,x2;
5202   const PetscScalar *b;
5203   PetscInt          jdx,idt,idx,nz,i;
5204 
5205   PetscFunctionBegin;
5206   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5207   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5208 
5209   /* forward solve the lower triangular */
5210   idx    = 0;
5211   x[0]   = b[0]; x[1] = b[1];
5212   for (i=1; i<n; i++) {
5213     v     =  aa      + 4*ai[i];
5214     vi    =  aj      + ai[i];
5215     nz    =  diag[i] - ai[i];
5216     idx   +=  2;
5217     s1  =  b[idx];s2 = b[1+idx];
5218     while (nz--) {
5219       jdx   = 2*(*vi++);
5220       x1    = x[jdx];x2 = x[1+jdx];
5221       s1 -= v[0]*x1 + v[2]*x2;
5222       s2 -= v[1]*x1 + v[3]*x2;
5223       v    += 4;
5224     }
5225     x[idx]   = s1;
5226     x[1+idx] = s2;
5227   }
5228   /* backward solve the upper triangular */
5229   for (i=n-1; i>=0; i--) {
5230     v    = aa + 4*diag[i] + 4;
5231     vi   = aj + diag[i] + 1;
5232     nz   = ai[i+1] - diag[i] - 1;
5233     idt  = 2*i;
5234     s1 = x[idt];  s2 = x[1+idt];
5235     while (nz--) {
5236       idx   = 2*(*vi++);
5237       x1    = x[idx];   x2 = x[1+idx];
5238       s1 -= v[0]*x1 + v[2]*x2;
5239       s2 -= v[1]*x1 + v[3]*x2;
5240       v    += 4;
5241     }
5242     v        = aa +  4*diag[i];
5243     x[idt]   = v[0]*s1 + v[2]*s2;
5244     x[1+idt] = v[1]*s1 + v[3]*s2;
5245   }
5246 
5247   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5248   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5249   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5250   PetscFunctionReturn(0);
5251 }
5252 
5253 #undef __FUNCT__
5254 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5255 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5256 {
5257     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5258     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5259     PetscInt          i,k,nz,idx,idt,jdx;
5260     PetscErrorCode    ierr;
5261     const MatScalar   *aa=a->a,*v;
5262     PetscScalar       *x,s1,s2,x1,x2;
5263     const PetscScalar *b;
5264 
5265     PetscFunctionBegin;
5266     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5267     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5268     /* forward solve the lower triangular */
5269     idx    = 0;
5270     x[0] = b[idx]; x[1] = b[1+idx];
5271     for (i=1; i<n; i++) {
5272         v   = aa + 4*ai[i];
5273        vi   = aj + ai[i];
5274        nz   = ai[i+1] - ai[i];
5275        idx  = 2*i;
5276        s1   = b[idx];s2 = b[1+idx];
5277        PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5278        PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5279       for (k=0;k<nz;k++) {
5280          jdx   = 2*vi[k];
5281           x1    = x[jdx];x2 = x[1+jdx];
5282           s1   -= v[0]*x1 + v[2]*x2;
5283           s2   -= v[1]*x1 + v[3]*x2;
5284            v   +=  4;
5285         }
5286        x[idx]   = s1;
5287        x[1+idx] = s2;
5288     }
5289 
5290    /* backward solve the upper triangular */
5291   for (i=n-1; i>=0; i--) {
5292      v   = aa + 4*(adiag[i+1]+1);
5293      vi  = aj + adiag[i+1]+1;
5294      nz  = adiag[i] - adiag[i+1]-1;
5295      idt = 2*i;
5296      s1 = x[idt];  s2 = x[1+idt];
5297      PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5298      PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5299      for (k=0;k<nz;k++) {
5300       idx   = 2*vi[k];
5301        x1    = x[idx];   x2 = x[1+idx];
5302        s1 -= v[0]*x1 + v[2]*x2;
5303        s2 -= v[1]*x1 + v[3]*x2;
5304          v    += 4;
5305     }
5306     /* x = inv_diagonal*x */
5307    x[idt]   = v[0]*s1 + v[2]*s2;
5308    x[1+idt] = v[1]*s1 + v[3]*s2;
5309   }
5310 
5311   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5312   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5313   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5314   PetscFunctionReturn(0);
5315 }
5316 
5317 #undef __FUNCT__
5318 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5319 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5320 {
5321   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5322   IS                iscol=a->col,isrow=a->row;
5323   PetscErrorCode    ierr;
5324   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5325   PetscInt          i,nz;
5326   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5327   const MatScalar   *aa=a->a,*v;
5328   PetscScalar       *x,s1,*t;
5329   const PetscScalar *b;
5330 
5331   PetscFunctionBegin;
5332   if (!n) PetscFunctionReturn(0);
5333 
5334   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5335   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5336   t  = a->solve_work;
5337 
5338   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5339   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5340 
5341   /* forward solve the lower triangular */
5342   t[0] = b[*r++];
5343   for (i=1; i<n; i++) {
5344     v     = aa + ai[i];
5345     vi    = aj + ai[i];
5346     nz    = diag[i] - ai[i];
5347     s1  = b[*r++];
5348     while (nz--) {
5349       s1 -= (*v++)*t[*vi++];
5350     }
5351     t[i] = s1;
5352   }
5353   /* backward solve the upper triangular */
5354   for (i=n-1; i>=0; i--) {
5355     v    = aa + diag[i] + 1;
5356     vi   = aj + diag[i] + 1;
5357     nz   = ai[i+1] - diag[i] - 1;
5358     s1 = t[i];
5359     while (nz--) {
5360       s1 -= (*v++)*t[*vi++];
5361     }
5362     x[*c--] = t[i] = aa[diag[i]]*s1;
5363   }
5364 
5365   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5366   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5367   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5369   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5370   PetscFunctionReturn(0);
5371 }
5372 
5373 #undef __FUNCT__
5374 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5375 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5376 {
5377   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5378   IS                iscol = a->col,isrow = a->row;
5379   PetscErrorCode    ierr;
5380   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5381   const PetscInt    *rout,*cout,*r,*c;
5382   PetscScalar       *x,*tmp,sum;
5383   const PetscScalar *b;
5384   const MatScalar   *aa = a->a,*v;
5385 
5386   PetscFunctionBegin;
5387   if (!n) PetscFunctionReturn(0);
5388 
5389   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5390   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5391   tmp  = a->solve_work;
5392 
5393   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5394   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5395 
5396   /* forward solve the lower triangular */
5397   tmp[0] = b[r[0]];
5398   v      = aa;
5399   vi     = aj;
5400   for (i=1; i<n; i++) {
5401     nz  = ai[i+1] - ai[i];
5402     sum = b[r[i]];
5403     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5404     tmp[i] = sum;
5405     v += nz; vi += nz;
5406   }
5407 
5408   /* backward solve the upper triangular */
5409   for (i=n-1; i>=0; i--) {
5410     v   = aa + adiag[i+1]+1;
5411     vi  = aj + adiag[i+1]+1;
5412     nz  = adiag[i]-adiag[i+1]-1;
5413     sum = tmp[i];
5414     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5415     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5416   }
5417 
5418   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5419   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5420   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5421   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5422   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5423   PetscFunctionReturn(0);
5424 }
5425 
5426 /*
5427       Special case where the matrix was ILU(0) factored in the natural
5428    ordering. This eliminates the need for the column and row permutation.
5429 */
5430 #undef __FUNCT__
5431 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5432 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5433 {
5434   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5435   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5436   PetscErrorCode    ierr;
5437   const MatScalar   *aa=a->a,*v;
5438   PetscScalar       *x;
5439   const PetscScalar *b;
5440   PetscScalar       s1,x1;
5441   PetscInt          jdx,idt,idx,nz,i;
5442 
5443   PetscFunctionBegin;
5444   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5446 
5447   /* forward solve the lower triangular */
5448   idx    = 0;
5449   x[0]   = b[0];
5450   for (i=1; i<n; i++) {
5451     v     =  aa      + ai[i];
5452     vi    =  aj      + ai[i];
5453     nz    =  diag[i] - ai[i];
5454     idx   +=  1;
5455     s1  =  b[idx];
5456     while (nz--) {
5457       jdx   = *vi++;
5458       x1    = x[jdx];
5459       s1 -= v[0]*x1;
5460       v    += 1;
5461     }
5462     x[idx]   = s1;
5463   }
5464   /* backward solve the upper triangular */
5465   for (i=n-1; i>=0; i--) {
5466     v    = aa + diag[i] + 1;
5467     vi   = aj + diag[i] + 1;
5468     nz   = ai[i+1] - diag[i] - 1;
5469     idt  = i;
5470     s1 = x[idt];
5471     while (nz--) {
5472       idx   = *vi++;
5473       x1    = x[idx];
5474       s1 -= v[0]*x1;
5475       v    += 1;
5476     }
5477     v        = aa +  diag[i];
5478     x[idt]   = v[0]*s1;
5479   }
5480   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5481   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5482   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5483   PetscFunctionReturn(0);
5484 }
5485 
5486 
5487 #undef __FUNCT__
5488 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5489 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5490 {
5491   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5492   PetscErrorCode    ierr;
5493   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5494   PetscScalar       *x,sum;
5495   const PetscScalar *b;
5496   const MatScalar   *aa = a->a,*v;
5497   PetscInt          i,nz;
5498 
5499   PetscFunctionBegin;
5500   if (!n) PetscFunctionReturn(0);
5501 
5502   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5503   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5504 
5505   /* forward solve the lower triangular */
5506   x[0] = b[0];
5507   v    = aa;
5508   vi   = aj;
5509   for (i=1; i<n; i++) {
5510     nz  = ai[i+1] - ai[i];
5511     sum = b[i];
5512     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5513     v  += nz;
5514     vi += nz;
5515     x[i] = sum;
5516   }
5517 
5518   /* backward solve the upper triangular */
5519   for (i=n-1; i>=0; i--) {
5520     v   = aa + adiag[i+1] + 1;
5521     vi  = aj + adiag[i+1] + 1;
5522     nz = adiag[i] - adiag[i+1]-1;
5523     sum = x[i];
5524     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5525     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5526   }
5527 
5528   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
5529   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5530   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5531   PetscFunctionReturn(0);
5532 }
5533 
5534 /* ----------------------------------------------------------------*/
5535 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool);
5536 
5537 #undef __FUNCT__
5538 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5539 /*
5540    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5541 */
5542 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5543 {
5544   Mat             C=B;
5545   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5546   PetscErrorCode  ierr;
5547   PetscInt        i,j,k,ipvt[15];
5548   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5549   PetscInt        nz,nzL,row;
5550   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5551   const MatScalar *v,*aa=a->a;
5552   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5553   PetscInt        sol_ver;
5554 
5555   PetscFunctionBegin;
5556   ierr = PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
5557 
5558   /* generate work space needed by the factorization */
5559   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5560   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5561 
5562   for (i=0; i<n; i++) {
5563     /* zero rtmp */
5564     /* L part */
5565     nz    = bi[i+1] - bi[i];
5566     bjtmp = bj + bi[i];
5567     for  (j=0; j<nz; j++) {
5568       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5569     }
5570 
5571     /* U part */
5572     nz = bdiag[i] - bdiag[i+1];
5573     bjtmp = bj + bdiag[i+1]+1;
5574     for  (j=0; j<nz; j++) {
5575       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5576     }
5577 
5578     /* load in initial (unfactored row) */
5579     nz    = ai[i+1] - ai[i];
5580     ajtmp = aj + ai[i];
5581     v     = aa + bs2*ai[i];
5582     for (j=0; j<nz; j++) {
5583       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5584     }
5585 
5586     /* elimination */
5587     bjtmp = bj + bi[i];
5588     nzL   = bi[i+1] - bi[i];
5589     for (k=0;k < nzL;k++) {
5590       row = bjtmp[k];
5591       pc = rtmp + bs2*row;
5592       for (flg=0,j=0; j<bs2; j++) {
5593         if (pc[j]!=0.0) {
5594           flg = 1;
5595           break;
5596         }
5597       }
5598       if (flg) {
5599         pv = b->a + bs2*bdiag[row];
5600         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
5601         /*ierr = PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5602         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5603         pv = b->a + bs2*(bdiag[row+1]+1);
5604         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5605         for (j=0; j<nz; j++) {
5606           vv   = rtmp + bs2*pj[j];
5607           PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5608           /* ierr = PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5609           pv  += bs2;
5610         }
5611         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5612       }
5613     }
5614 
5615     /* finished row so stick it into b->a */
5616     /* L part */
5617     pv   = b->a + bs2*bi[i] ;
5618     pj   = b->j + bi[i] ;
5619     nz   = bi[i+1] - bi[i];
5620     for (j=0; j<nz; j++) {
5621       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5622     }
5623 
5624     /* Mark diagonal and invert diagonal for simplier triangular solves */
5625     pv   = b->a + bs2*bdiag[i];
5626     pj   = b->j + bdiag[i];
5627     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5628     /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */
5629     ierr = PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
5630 
5631     /* U part */
5632     pv = b->a + bs2*(bdiag[i+1]+1);
5633     pj = b->j + bdiag[i+1]+1;
5634     nz = bdiag[i] - bdiag[i+1] - 1;
5635     for (j=0; j<nz; j++) {
5636       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5637     }
5638   }
5639 
5640   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5641   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5642   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5643   C->assembled = PETSC_TRUE;
5644   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5645   PetscFunctionReturn(0);
5646 }
5647 
5648 #undef __FUNCT__
5649 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5650 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5651 {
5652   Mat            C=B;
5653   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5654   IS             isrow = b->row,isicol = b->icol;
5655   PetscErrorCode ierr;
5656   const PetscInt *r,*ic;
5657   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5658   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5659   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5660   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5661   MatScalar      *v_work;
5662   PetscBool      col_identity,row_identity,both_identity;
5663 
5664   PetscFunctionBegin;
5665   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5666   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5667 
5668   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5669   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5670 
5671   /* generate work space needed by dense LU factorization */
5672   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5673 
5674   for (i=0; i<n; i++) {
5675     /* zero rtmp */
5676     /* L part */
5677     nz    = bi[i+1] - bi[i];
5678     bjtmp = bj + bi[i];
5679     for  (j=0; j<nz; j++) {
5680       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5681     }
5682 
5683     /* U part */
5684     nz = bdiag[i] - bdiag[i+1];
5685     bjtmp = bj + bdiag[i+1]+1;
5686     for  (j=0; j<nz; j++) {
5687       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5688     }
5689 
5690     /* load in initial (unfactored row) */
5691     nz    = ai[r[i]+1] - ai[r[i]];
5692     ajtmp = aj + ai[r[i]];
5693     v     = aa + bs2*ai[r[i]];
5694     for (j=0; j<nz; j++) {
5695       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5696     }
5697 
5698     /* elimination */
5699     bjtmp = bj + bi[i];
5700     nzL   = bi[i+1] - bi[i];
5701     for (k=0;k < nzL;k++) {
5702       row = bjtmp[k];
5703       pc = rtmp + bs2*row;
5704       for (flg=0,j=0; j<bs2; j++) {
5705         if (pc[j]!=0.0) {
5706           flg = 1;
5707           break;
5708         }
5709       }
5710       if (flg) {
5711         pv         = b->a + bs2*bdiag[row];
5712         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5713         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5714         pv         = b->a + bs2*(bdiag[row+1]+1);
5715         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5716         for (j=0; j<nz; j++) {
5717           PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5718         }
5719         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5720       }
5721     }
5722 
5723     /* finished row so stick it into b->a */
5724     /* L part */
5725     pv   = b->a + bs2*bi[i] ;
5726     pj   = b->j + bi[i] ;
5727     nz   = bi[i+1] - bi[i];
5728     for (j=0; j<nz; j++) {
5729       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5730     }
5731 
5732     /* Mark diagonal and invert diagonal for simplier triangular solves */
5733     pv  = b->a + bs2*bdiag[i];
5734     pj  = b->j + bdiag[i];
5735     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5736     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5737     ierr = PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5738 
5739     /* U part */
5740     pv = b->a + bs2*(bdiag[i+1]+1);
5741     pj = b->j + bdiag[i+1]+1;
5742     nz = bdiag[i] - bdiag[i+1] - 1;
5743     for (j=0; j<nz; j++) {
5744       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5745     }
5746   }
5747 
5748   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5749   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5750   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5751   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5752 
5753   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5754   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5755   both_identity = (PetscBool) (row_identity && col_identity);
5756   if (both_identity) {
5757     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5758   } else {
5759     C->ops->solve = MatSolve_SeqBAIJ_N;
5760   }
5761   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5762 
5763   C->assembled = PETSC_TRUE;
5764   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5765   PetscFunctionReturn(0);
5766 }
5767 
5768 /*
5769    ilu(0) with natural ordering under new data structure.
5770    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5771    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5772 */
5773 
5774 #undef __FUNCT__
5775 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5776 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5777 {
5778 
5779   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5780   PetscErrorCode     ierr;
5781   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5782   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5783 
5784   PetscFunctionBegin;
5785   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5786   b    = (Mat_SeqBAIJ*)(fact)->data;
5787 
5788   /* allocate matrix arrays for new data structure */
5789   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5790   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5791   b->singlemalloc    = PETSC_TRUE;
5792   b->free_a          = PETSC_TRUE;
5793   b->free_ij         = PETSC_TRUE;
5794   fact->preallocated = PETSC_TRUE;
5795   fact->assembled    = PETSC_TRUE;
5796   if (!b->diag) {
5797     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5798     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5799   }
5800   bdiag = b->diag;
5801 
5802   if (n > 0) {
5803     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5804   }
5805 
5806   /* set bi and bj with new data structure */
5807   bi = b->i;
5808   bj = b->j;
5809 
5810   /* L part */
5811   bi[0] = 0;
5812   for (i=0; i<n; i++) {
5813     nz = adiag[i] - ai[i];
5814     bi[i+1] = bi[i] + nz;
5815     aj = a->j + ai[i];
5816     for (j=0; j<nz; j++) {
5817       *bj = aj[j]; bj++;
5818     }
5819   }
5820 
5821   /* U part */
5822   bi_temp = bi[n];
5823   bdiag[n] = bi[n]-1;
5824   for (i=n-1; i>=0; i--) {
5825     nz = ai[i+1] - adiag[i] - 1;
5826     bi_temp = bi_temp + nz + 1;
5827     aj = a->j + adiag[i] + 1;
5828     for (j=0; j<nz; j++) {
5829       *bj = aj[j]; bj++;
5830     }
5831     /* diag[i] */
5832     *bj = i; bj++;
5833     bdiag[i] = bi_temp - 1;
5834   }
5835   PetscFunctionReturn(0);
5836 }
5837 
5838 #undef __FUNCT__
5839 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5840 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5841 {
5842   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5843   IS                 isicol;
5844   PetscErrorCode     ierr;
5845   const PetscInt     *r,*ic;
5846   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5847   PetscInt           *bi,*cols,nnz,*cols_lvl;
5848   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5849   PetscInt           i,levels,diagonal_fill;
5850   PetscBool          col_identity,row_identity,both_identity;
5851   PetscReal          f;
5852   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5853   PetscBT            lnkbt;
5854   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5855   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5856   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5857   PetscBool          missing;
5858   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5859 
5860   PetscFunctionBegin;
5861   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5862   if (bs>1) {  /* check shifttype */
5863     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5864       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5865   }
5866 
5867   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5868   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5869 
5870   f             = info->fill;
5871   levels        = (PetscInt)info->levels;
5872   diagonal_fill = (PetscInt)info->diagonal_fill;
5873   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5874 
5875   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5876   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5877   both_identity = (PetscBool) (row_identity && col_identity);
5878 
5879   if (!levels && both_identity) {
5880     /* special case: ilu(0) with natural ordering */
5881     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5882     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5883 
5884     fact->factortype               = MAT_FACTOR_ILU;
5885     (fact)->info.factor_mallocs    = 0;
5886     (fact)->info.fill_ratio_given  = info->fill;
5887     (fact)->info.fill_ratio_needed = 1.0;
5888     b                = (Mat_SeqBAIJ*)(fact)->data;
5889     b->row           = isrow;
5890     b->col           = iscol;
5891     b->icol          = isicol;
5892     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5893     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5894     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5895     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5896     PetscFunctionReturn(0);
5897   }
5898 
5899   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5900   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5901 
5902   /* get new row pointers */
5903   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5904   bi[0] = 0;
5905   /* bdiag is location of diagonal in factor */
5906   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5907   bdiag[0]  = 0;
5908 
5909   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5910 
5911   /* create a linked list for storing column indices of the active row */
5912   nlnk = n + 1;
5913   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5914 
5915   /* initial FreeSpace size is f*(ai[n]+1) */
5916   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5917   current_space = free_space;
5918   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5919   current_space_lvl = free_space_lvl;
5920 
5921   for (i=0; i<n; i++) {
5922     nzi = 0;
5923     /* copy current row into linked list */
5924     nnz  = ai[r[i]+1] - ai[r[i]];
5925     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5926     cols = aj + ai[r[i]];
5927     lnk[i] = -1; /* marker to indicate if diagonal exists */
5928     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5929     nzi += nlnk;
5930 
5931     /* make sure diagonal entry is included */
5932     if (diagonal_fill && lnk[i] == -1) {
5933       fm = n;
5934       while (lnk[fm] < i) fm = lnk[fm];
5935       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5936       lnk[fm]    = i;
5937       lnk_lvl[i] = 0;
5938       nzi++; dcount++;
5939     }
5940 
5941     /* add pivot rows into the active row */
5942     nzbd = 0;
5943     prow = lnk[n];
5944     while (prow < i) {
5945       nnz      = bdiag[prow];
5946       cols     = bj_ptr[prow] + nnz + 1;
5947       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5948       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5949       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5950       nzi += nlnk;
5951       prow = lnk[prow];
5952       nzbd++;
5953     }
5954     bdiag[i] = nzbd;
5955     bi[i+1]  = bi[i] + nzi;
5956 
5957     /* if free space is not available, make more free space */
5958     if (current_space->local_remaining<nzi) {
5959       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5960       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5961       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5962       reallocs++;
5963     }
5964 
5965     /* copy data into free_space and free_space_lvl, then initialize lnk */
5966     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5967     bj_ptr[i]    = current_space->array;
5968     bjlvl_ptr[i] = current_space_lvl->array;
5969 
5970     /* make sure the active row i has diagonal entry */
5971     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5972 
5973     current_space->array           += nzi;
5974     current_space->local_used      += nzi;
5975     current_space->local_remaining -= nzi;
5976     current_space_lvl->array           += nzi;
5977     current_space_lvl->local_used      += nzi;
5978     current_space_lvl->local_remaining -= nzi;
5979   }
5980 
5981   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5982   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5983 
5984   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5985   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5986   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5987 
5988   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5989   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5990   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5991 
5992 #if defined(PETSC_USE_INFO)
5993   {
5994     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5995     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5996     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5997     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5998     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5999     if (diagonal_fill) {
6000       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
6001     }
6002   }
6003 #endif
6004 
6005   /* put together the new matrix */
6006   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6007   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6008   b = (Mat_SeqBAIJ*)(fact)->data;
6009   b->free_a       = PETSC_TRUE;
6010   b->free_ij      = PETSC_TRUE;
6011   b->singlemalloc = PETSC_FALSE;
6012   ierr = PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6013   b->j          = bj;
6014   b->i          = bi;
6015   b->diag       = bdiag;
6016   b->free_diag  = PETSC_TRUE;
6017   b->ilen       = 0;
6018   b->imax       = 0;
6019   b->row        = isrow;
6020   b->col        = iscol;
6021   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6022   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6023   b->icol       = isicol;
6024   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6025   /* In b structure:  Free imax, ilen, old a, old j.
6026      Allocate bdiag, solve_work, new a, new j */
6027   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
6028   b->maxnz = b->nz = bdiag[0]+1;
6029   fact->info.factor_mallocs    = reallocs;
6030   fact->info.fill_ratio_given  = f;
6031   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6032   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
6033   PetscFunctionReturn(0);
6034 }
6035 
6036 /*
6037      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6038    except that the data structure of Mat_SeqAIJ is slightly different.
6039    Not a good example of code reuse.
6040 */
6041 #undef __FUNCT__
6042 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
6043 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
6044 {
6045   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
6046   IS             isicol;
6047   PetscErrorCode ierr;
6048   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
6049   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6050   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6051   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6052   PetscBool      col_identity,row_identity,both_identity,flg;
6053   PetscReal      f;
6054 
6055   PetscFunctionBegin;
6056   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6057   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
6058 
6059   f             = info->fill;
6060   levels        = (PetscInt)info->levels;
6061   diagonal_fill = (PetscInt)info->diagonal_fill;
6062   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
6063 
6064   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6065   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6066   both_identity = (PetscBool) (row_identity && col_identity);
6067 
6068   if (!levels && both_identity) {  /* special case copy the nonzero structure */
6069     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
6070     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6071 
6072     fact->factortype = MAT_FACTOR_ILU;
6073     b            = (Mat_SeqBAIJ*)fact->data;
6074     b->row       = isrow;
6075     b->col       = iscol;
6076     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6077     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6078     b->icol      = isicol;
6079     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6080     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6081     PetscFunctionReturn(0);
6082   }
6083 
6084   /* general case perform the symbolic factorization */
6085     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
6086     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
6087 
6088     /* get new row pointers */
6089     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
6090     ainew[0] = 0;
6091     /* don't know how many column pointers are needed so estimate */
6092     jmax = (PetscInt)(f*ai[n] + 1);
6093     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
6094     /* ajfill is level of fill for each fill entry */
6095     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
6096     /* fill is a linked list of nonzeros in active row */
6097     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
6098     /* im is level for each filled value */
6099     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
6100     /* dloc is location of diagonal in factor */
6101     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
6102     dloc[0]  = 0;
6103     for (prow=0; prow<n; prow++) {
6104 
6105       /* copy prow into linked list */
6106       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6107       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
6108       xi         = aj + ai[r[prow]];
6109       fill[n]    = n;
6110       fill[prow] = -1; /* marker for diagonal entry */
6111       while (nz--) {
6112         fm  = n;
6113         idx = ic[*xi++];
6114         do {
6115           m  = fm;
6116           fm = fill[m];
6117         } while (fm < idx);
6118         fill[m]   = idx;
6119         fill[idx] = fm;
6120         im[idx]   = 0;
6121       }
6122 
6123       /* make sure diagonal entry is included */
6124       if (diagonal_fill && fill[prow] == -1) {
6125         fm = n;
6126         while (fill[fm] < prow) fm = fill[fm];
6127         fill[prow] = fill[fm];  /* insert diagonal into linked list */
6128         fill[fm]   = prow;
6129         im[prow]   = 0;
6130         nzf++;
6131         dcount++;
6132       }
6133 
6134       nzi = 0;
6135       row = fill[n];
6136       while (row < prow) {
6137         incrlev = im[row] + 1;
6138         nz      = dloc[row];
6139         xi      = ajnew  + ainew[row] + nz + 1;
6140         flev    = ajfill + ainew[row] + nz + 1;
6141         nnz     = ainew[row+1] - ainew[row] - nz - 1;
6142         fm      = row;
6143         while (nnz-- > 0) {
6144           idx = *xi++;
6145           if (*flev + incrlev > levels) {
6146             flev++;
6147             continue;
6148           }
6149           do {
6150             m  = fm;
6151             fm = fill[m];
6152           } while (fm < idx);
6153           if (fm != idx) {
6154             im[idx]   = *flev + incrlev;
6155             fill[m]   = idx;
6156             fill[idx] = fm;
6157             fm        = idx;
6158             nzf++;
6159           } else {
6160             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
6161           }
6162           flev++;
6163         }
6164         row = fill[row];
6165         nzi++;
6166       }
6167       /* copy new filled row into permanent storage */
6168       ainew[prow+1] = ainew[prow] + nzf;
6169       if (ainew[prow+1] > jmax) {
6170 
6171         /* estimate how much additional space we will need */
6172         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6173         /* just double the memory each time */
6174         PetscInt maxadd = jmax;
6175         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
6176         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6177         jmax += maxadd;
6178 
6179         /* allocate a longer ajnew and ajfill */
6180         ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6181         ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6182         ierr = PetscFree(ajnew);CHKERRQ(ierr);
6183         ajnew = xitmp;
6184         ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6185         ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6186         ierr = PetscFree(ajfill);CHKERRQ(ierr);
6187         ajfill = xitmp;
6188         reallocate++; /* count how many reallocations are needed */
6189       }
6190       xitmp       = ajnew + ainew[prow];
6191       flev        = ajfill + ainew[prow];
6192       dloc[prow]  = nzi;
6193       fm          = fill[n];
6194       while (nzf--) {
6195         *xitmp++ = fm;
6196         *flev++ = im[fm];
6197         fm      = fill[fm];
6198       }
6199       /* make sure row has diagonal entry */
6200       if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6201                                                           try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6202     }
6203     ierr = PetscFree(ajfill);CHKERRQ(ierr);
6204     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6205     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6206     ierr = PetscFree(fill);CHKERRQ(ierr);
6207     ierr = PetscFree(im);CHKERRQ(ierr);
6208 
6209 #if defined(PETSC_USE_INFO)
6210     {
6211       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6212       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6213       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6214       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6215       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6216       if (diagonal_fill) {
6217         ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6218       }
6219     }
6220 #endif
6221 
6222     /* put together the new matrix */
6223     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6224     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6225     b    = (Mat_SeqBAIJ*)fact->data;
6226     b->free_a       = PETSC_TRUE;
6227     b->free_ij      = PETSC_TRUE;
6228     b->singlemalloc = PETSC_FALSE;
6229     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6230     b->j          = ajnew;
6231     b->i          = ainew;
6232     for (i=0; i<n; i++) dloc[i] += ainew[i];
6233     b->diag       = dloc;
6234     b->free_diag  = PETSC_TRUE;
6235     b->ilen       = 0;
6236     b->imax       = 0;
6237     b->row        = isrow;
6238     b->col        = iscol;
6239     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6240     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6241     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6242     b->icol       = isicol;
6243     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6244     /* In b structure:  Free imax, ilen, old a, old j.
6245        Allocate dloc, solve_work, new a, new j */
6246     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
6247     b->maxnz          = b->nz = ainew[n];
6248 
6249     fact->info.factor_mallocs    = reallocate;
6250     fact->info.fill_ratio_given  = f;
6251     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6252 
6253   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6254   PetscFunctionReturn(0);
6255 }
6256 
6257 #undef __FUNCT__
6258 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6259 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6260 {
6261   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6262   /* int i,*AJ=a->j,nz=a->nz; */
6263 
6264   PetscFunctionBegin;
6265   /* Undo Column scaling */
6266 /*    while (nz--) { */
6267 /*      AJ[i] = AJ[i]/4; */
6268 /*    } */
6269   /* This should really invoke a push/pop logic, but we don't have that yet. */
6270   A->ops->setunfactored = PETSC_NULL;
6271   PetscFunctionReturn(0);
6272 }
6273 
6274 #undef __FUNCT__
6275 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6276 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6277 {
6278   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6279   PetscInt       *AJ=a->j,nz=a->nz;
6280   unsigned short *aj=(unsigned short *)AJ;
6281 
6282   PetscFunctionBegin;
6283   /* Is this really necessary? */
6284   while (nz--) {
6285     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6286   }
6287   A->ops->setunfactored = PETSC_NULL;
6288   PetscFunctionReturn(0);
6289 }
6290 
6291 
6292