xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision d736bfeb4d37a01fcbdf00fe73fb60d6f0ba2142)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "../src/mat/impls/baij/seq/baij.h"
8 #include "../src/mat/blockinvert.h"
9 #include "petscbt.h"
10 #include "../src/mat/utils/freespace.h"
11 
12 #undef __FUNCT__
13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17   PetscErrorCode    ierr;
18   PetscInt          i,nz;
19   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
20   const MatScalar   *aa=a->a,*v;
21   PetscScalar       s1,*x;
22   const PetscScalar *b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode    ierr;
64   PetscInt          i,nz,idx,idt,oidx;
65   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
66   const MatScalar   *aa=a->a,*v;
67   PetscScalar       s1,s2,x1,x2,*x;
68   const PetscScalar *b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode    ierr;
123   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124   PetscInt          nz,idx,idt,j,i,oidx;
125   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
126   const MatScalar   *aa=a->a,*v;
127   PetscScalar       s1,s2,x1,x2,*x;
128   const PetscScalar *b;
129 
130   PetscFunctionBegin;
131   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134 
135   /* forward solve the U^T */
136   idx = 0;
137   for (i=0; i<n; i++) {
138     v     = aa + bs2*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx];
141     s1 = v[0]*x1  +  v[1]*x2;
142     s2 = v[2]*x1  +  v[3]*x2;
143     v -= bs2;
144 
145     vi    = aj + diag[i] - 1;
146     nz    = diag[i] - diag[i+1] - 1;
147     for(j=0;j>-nz;j--){
148       oidx = bs*vi[j];
149       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151       v  -= bs2;
152     }
153     x[idx]   = s1;x[1+idx] = s2;
154     idx += bs;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + bs2*ai[i];
159     vi   = aj + ai[i];
160     nz   = ai[i+1] - ai[i];
161     idt  = bs*i;
162     s1   = x[idt];  s2 = x[1+idt];
163     for(j=0;j<nz;j++){
164       idx   = bs*vi[j];
165       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167       v += bs2;
168     }
169   }
170   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode    ierr;
182   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
183   PetscInt          i,nz,idx,idt,oidx;
184   const MatScalar   *aa=a->a,*v;
185   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
186   const PetscScalar *b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 9*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203     v += 9;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 3*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212       v  += 9;
213     }
214     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215     idx += 3;
216   }
217   /* backward solve the L^T */
218   for (i=n-1; i>=0; i--){
219     v    = aa + 9*diag[i] - 9;
220     vi   = aj + diag[i] - 1;
221     nz   = diag[i] - ai[i];
222     idt  = 3*i;
223     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224     while (nz--) {
225       idx   = 3*(*vi--);
226       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229       v -= 9;
230     }
231   }
232   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
243   PetscErrorCode    ierr;
244   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245   PetscInt          nz,idx,idt,j,i,oidx;
246   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
247   const MatScalar   *aa=a->a,*v;
248   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
249   const PetscScalar *b;
250 
251   PetscFunctionBegin;
252   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255 
256   /* forward solve the U^T */
257   idx = 0;
258   for (i=0; i<n; i++) {
259     v     = aa + bs2*diag[i];
260     /* multiply by the inverse of the block diagonal */
261     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265     v -= bs2;
266 
267     vi    = aj + diag[i] - 1;
268     nz    = diag[i] - diag[i+1] - 1;
269     for(j=0;j>-nz;j--){
270       oidx = bs*vi[j];
271       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274       v  -= bs2;
275     }
276     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277     idx += bs;
278   }
279   /* backward solve the L^T */
280   for (i=n-1; i>=0; i--){
281     v    = aa + bs2*ai[i];
282     vi   = aj + ai[i];
283     nz   = ai[i+1] - ai[i];
284     idt  = bs*i;
285     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286     for(j=0;j<nz;j++){
287       idx   = bs*vi[j];
288       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291       v += bs2;
292     }
293   }
294   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
305   PetscErrorCode    ierr;
306   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
307   PetscInt          i,nz,idx,idt,oidx;
308   const MatScalar   *aa=a->a,*v;
309   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
310   const PetscScalar *b;
311 
312   PetscFunctionBegin;
313   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316 
317   /* forward solve the U^T */
318   idx = 0;
319   for (i=0; i<n; i++) {
320 
321     v     = aa + 16*diag[i];
322     /* multiply by the inverse of the block diagonal */
323     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328     v += 16;
329 
330     vi    = aj + diag[i] + 1;
331     nz    = ai[i+1] - diag[i] - 1;
332     while (nz--) {
333       oidx = 4*(*vi++);
334       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338       v  += 16;
339     }
340     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341     idx += 4;
342   }
343   /* backward solve the L^T */
344   for (i=n-1; i>=0; i--){
345     v    = aa + 16*diag[i] - 16;
346     vi   = aj + diag[i] - 1;
347     nz   = diag[i] - ai[i];
348     idt  = 4*i;
349     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350     while (nz--) {
351       idx   = 4*(*vi--);
352       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356       v -= 16;
357     }
358   }
359   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNCT__
366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
368 {
369   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
370   PetscErrorCode    ierr;
371   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372   PetscInt          nz,idx,idt,j,i,oidx;
373   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
374   const MatScalar   *aa=a->a,*v;
375   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
376   const PetscScalar *b;
377 
378   PetscFunctionBegin;
379   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382 
383   /* forward solve the U^T */
384   idx = 0;
385   for (i=0; i<n; i++) {
386     v     = aa + bs2*diag[i];
387     /* multiply by the inverse of the block diagonal */
388     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393     v -= bs2;
394 
395     vi    = aj + diag[i] - 1;
396     nz    = diag[i] - diag[i+1] - 1;
397     for(j=0;j>-nz;j--){
398       oidx = bs*vi[j];
399       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403       v  -= bs2;
404     }
405     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406     idx += bs;
407   }
408   /* backward solve the L^T */
409   for (i=n-1; i>=0; i--){
410     v    = aa + bs2*ai[i];
411     vi   = aj + ai[i];
412     nz   = ai[i+1] - ai[i];
413     idt  = bs*i;
414     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415     for(j=0;j<nz;j++){
416       idx   = bs*vi[j];
417       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421       v += bs2;
422     }
423   }
424   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
425   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433 {
434   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
435   PetscErrorCode    ierr;
436   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
437   PetscInt          i,nz,idx,idt,oidx;
438   const MatScalar   *aa=a->a,*v;
439   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
440   const PetscScalar *b;
441 
442   PetscFunctionBegin;
443   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
444   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446 
447   /* forward solve the U^T */
448   idx = 0;
449   for (i=0; i<n; i++) {
450 
451     v     = aa + 25*diag[i];
452     /* multiply by the inverse of the block diagonal */
453     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459     v += 25;
460 
461     vi    = aj + diag[i] + 1;
462     nz    = ai[i+1] - diag[i] - 1;
463     while (nz--) {
464       oidx = 5*(*vi++);
465       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470       v  += 25;
471     }
472     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473     idx += 5;
474   }
475   /* backward solve the L^T */
476   for (i=n-1; i>=0; i--){
477     v    = aa + 25*diag[i] - 25;
478     vi   = aj + diag[i] - 1;
479     nz   = diag[i] - ai[i];
480     idt  = 5*i;
481     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482     while (nz--) {
483       idx   = 5*(*vi--);
484       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489       v -= 25;
490     }
491   }
492   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
493   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #undef __FUNCT__
499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
501 {
502   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
503   PetscErrorCode ierr;
504   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505   PetscInt       nz,idx,idt,j,i,oidx;
506   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507   const MatScalar      *aa=a->a,*v;
508   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
509   const PetscScalar    *b;
510 
511   PetscFunctionBegin;
512   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515 
516   /* forward solve the U^T */
517   idx = 0;
518   for (i=0; i<n; i++) {
519     v     = aa + bs2*diag[i];
520     /* multiply by the inverse of the block diagonal */
521     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522     x5 = x[4+idx];
523     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528     v -= bs2;
529 
530     vi    = aj + diag[i] - 1;
531     nz    = diag[i] - diag[i+1] - 1;
532     for(j=0;j>-nz;j--){
533       oidx = bs*vi[j];
534       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539       v  -= bs2;
540     }
541     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542     idx += bs;
543   }
544   /* backward solve the L^T */
545   for (i=n-1; i>=0; i--){
546     v    = aa + bs2*ai[i];
547     vi   = aj + ai[i];
548     nz   = ai[i+1] - ai[i];
549     idt  = bs*i;
550     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551     for(j=0;j<nz;j++){
552       idx   = bs*vi[j];
553       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558       v += bs2;
559     }
560   }
561   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
562   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
563   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
564   PetscFunctionReturn(0);
565 }
566 
567 #undef __FUNCT__
568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570 {
571   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
572   PetscErrorCode    ierr;
573   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
574   PetscInt          i,nz,idx,idt,oidx;
575   const MatScalar   *aa=a->a,*v;
576   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
577   const PetscScalar *b;
578 
579   PetscFunctionBegin;
580   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
581   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583 
584   /* forward solve the U^T */
585   idx = 0;
586   for (i=0; i<n; i++) {
587 
588     v     = aa + 36*diag[i];
589     /* multiply by the inverse of the block diagonal */
590     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591     x6    = x[5+idx];
592     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598     v += 36;
599 
600     vi    = aj + diag[i] + 1;
601     nz    = ai[i+1] - diag[i] - 1;
602     while (nz--) {
603       oidx = 6*(*vi++);
604       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610       v  += 36;
611     }
612     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613     x[5+idx] = s6;
614     idx += 6;
615   }
616   /* backward solve the L^T */
617   for (i=n-1; i>=0; i--){
618     v    = aa + 36*diag[i] - 36;
619     vi   = aj + diag[i] - 1;
620     nz   = diag[i] - ai[i];
621     idt  = 6*i;
622     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623     s6 = x[5+idt];
624     while (nz--) {
625       idx   = 6*(*vi--);
626       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v -= 36;
633     }
634   }
635   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
636   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 #undef __FUNCT__
642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
644 {
645   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
646   PetscErrorCode    ierr;
647   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648   PetscInt          nz,idx,idt,j,i,oidx;
649   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
650   const MatScalar   *aa=a->a,*v;
651   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
652   const PetscScalar *b;
653 
654   PetscFunctionBegin;
655   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658 
659   /* forward solve the U^T */
660   idx = 0;
661   for (i=0; i<n; i++) {
662     v     = aa + bs2*diag[i];
663     /* multiply by the inverse of the block diagonal */
664     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665     x5 = x[4+idx]; x6 = x[5+idx];
666     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672     v -= bs2;
673 
674     vi    = aj + diag[i] - 1;
675     nz    = diag[i] - diag[i+1] - 1;
676     for(j=0;j>-nz;j--){
677       oidx = bs*vi[j];
678       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684       v  -= bs2;
685     }
686     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687     x[5+idx] = s6;
688     idx += bs;
689   }
690   /* backward solve the L^T */
691   for (i=n-1; i>=0; i--){
692     v    = aa + bs2*ai[i];
693     vi   = aj + ai[i];
694     nz   = ai[i+1] - ai[i];
695     idt  = bs*i;
696     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697     s6   = x[5+idt];
698     for(j=0;j<nz;j++){
699       idx   = bs*vi[j];
700       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706       v += bs2;
707     }
708   }
709   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
711   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718 {
719   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
720   PetscErrorCode    ierr;
721   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
722   PetscInt          i,nz,idx,idt,oidx;
723   const MatScalar   *aa=a->a,*v;
724   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
725   const PetscScalar *b;
726 
727   PetscFunctionBegin;
728   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
729   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731 
732   /* forward solve the U^T */
733   idx = 0;
734   for (i=0; i<n; i++) {
735 
736     v     = aa + 49*diag[i];
737     /* multiply by the inverse of the block diagonal */
738     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739     x6    = x[5+idx]; x7 = x[6+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747     v += 49;
748 
749     vi    = aj + diag[i] + 1;
750     nz    = ai[i+1] - diag[i] - 1;
751     while (nz--) {
752       oidx = 7*(*vi++);
753       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760       v  += 49;
761     }
762     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763     x[5+idx] = s6;x[6+idx] = s7;
764     idx += 7;
765   }
766   /* backward solve the L^T */
767   for (i=n-1; i>=0; i--){
768     v    = aa + 49*diag[i] - 49;
769     vi   = aj + diag[i] - 1;
770     nz   = diag[i] - ai[i];
771     idt  = 7*i;
772     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773     s6 = x[5+idt];s7 = x[6+idt];
774     while (nz--) {
775       idx   = 7*(*vi--);
776       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783       v -= 49;
784     }
785   }
786   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
787   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
796   PetscErrorCode    ierr;
797   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798   PetscInt          nz,idx,idt,j,i,oidx;
799   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
800   const MatScalar   *aa=a->a,*v;
801   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
802   const PetscScalar *b;
803 
804   PetscFunctionBegin;
805   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
807   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
808 
809   /* forward solve the U^T */
810   idx = 0;
811   for (i=0; i<n; i++) {
812     v     = aa + bs2*diag[i];
813     /* multiply by the inverse of the block diagonal */
814     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823     v -= bs2;
824     vi    = aj + diag[i] - 1;
825     nz    = diag[i] - diag[i+1] - 1;
826     for(j=0;j>-nz;j--){
827       oidx = bs*vi[j];
828       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835       v  -= bs2;
836     }
837     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838     x[5+idx] = s6;  x[6+idx] = s7;
839     idx += bs;
840   }
841   /* backward solve the L^T */
842   for (i=n-1; i>=0; i--){
843     v    = aa + bs2*ai[i];
844     vi   = aj + ai[i];
845     nz   = ai[i+1] - ai[i];
846     idt  = bs*i;
847     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848     s6   = x[5+idt];  s7 = x[6+idt];
849     for(j=0;j<nz;j++){
850       idx   = bs*vi[j];
851       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858       v += bs2;
859     }
860   }
861   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
862   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
863   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 /*---------------------------------------------------------------------------------------------*/
868 #undef __FUNCT__
869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871 {
872   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
873   IS                iscol=a->col,isrow=a->row;
874   PetscErrorCode    ierr;
875   const PetscInt    *r,*c,*rout,*cout;
876   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
877   PetscInt          i,nz;
878   const MatScalar   *aa=a->a,*v;
879   PetscScalar       s1,*x,*t;
880   const PetscScalar *b;
881 
882   PetscFunctionBegin;
883   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
884   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
885   t  = a->solve_work;
886 
887   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
888   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
889 
890   /* copy the b into temp work space according to permutation */
891   for (i=0; i<n; i++) {
892     t[i] = b[c[i]];
893   }
894 
895   /* forward solve the U^T */
896   for (i=0; i<n; i++) {
897 
898     v     = aa + diag[i];
899     /* multiply by the inverse of the block diagonal */
900     s1    = (*v++)*t[i];
901     vi    = aj + diag[i] + 1;
902     nz    = ai[i+1] - diag[i] - 1;
903     while (nz--) {
904       t[*vi++]  -= (*v++)*s1;
905     }
906     t[i]   = s1;
907   }
908   /* backward solve the L^T */
909   for (i=n-1; i>=0; i--){
910     v    = aa + diag[i] - 1;
911     vi   = aj + diag[i] - 1;
912     nz   = diag[i] - ai[i];
913     s1   = t[i];
914     while (nz--) {
915       t[*vi--]   -=  (*v--)*s1;
916     }
917   }
918 
919   /* copy t into x according to permutation */
920   for (i=0; i<n; i++) {
921     x[r[i]]   = t[i];
922   }
923 
924   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
925   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
926   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
927   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
928   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
929   PetscFunctionReturn(0);
930 }
931 
932 #undef __FUNCT__
933 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
934 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
935 {
936   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
937   IS                iscol=a->col,isrow=a->row;
938   PetscErrorCode    ierr;
939   const PetscInt    *r,*c,*rout,*cout;
940   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
941   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
942   const MatScalar   *aa=a->a,*v;
943   PetscScalar       s1,s2,x1,x2,*x,*t;
944   const PetscScalar *b;
945 
946   PetscFunctionBegin;
947   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
948   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
949   t  = a->solve_work;
950 
951   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
952   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
953 
954   /* copy the b into temp work space according to permutation */
955   ii = 0;
956   for (i=0; i<n; i++) {
957     ic      = 2*c[i];
958     t[ii]   = b[ic];
959     t[ii+1] = b[ic+1];
960     ii += 2;
961   }
962 
963   /* forward solve the U^T */
964   idx = 0;
965   for (i=0; i<n; i++) {
966 
967     v     = aa + 4*diag[i];
968     /* multiply by the inverse of the block diagonal */
969     x1    = t[idx];   x2 = t[1+idx];
970     s1 = v[0]*x1  +  v[1]*x2;
971     s2 = v[2]*x1  +  v[3]*x2;
972     v += 4;
973 
974     vi    = aj + diag[i] + 1;
975     nz    = ai[i+1] - diag[i] - 1;
976     while (nz--) {
977       oidx = 2*(*vi++);
978       t[oidx]   -= v[0]*s1  +  v[1]*s2;
979       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
980       v  += 4;
981     }
982     t[idx]   = s1;t[1+idx] = s2;
983     idx += 2;
984   }
985   /* backward solve the L^T */
986   for (i=n-1; i>=0; i--){
987     v    = aa + 4*diag[i] - 4;
988     vi   = aj + diag[i] - 1;
989     nz   = diag[i] - ai[i];
990     idt  = 2*i;
991     s1 = t[idt];  s2 = t[1+idt];
992     while (nz--) {
993       idx   = 2*(*vi--);
994       t[idx]   -=  v[0]*s1 +  v[1]*s2;
995       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
996       v -= 4;
997     }
998   }
999 
1000   /* copy t into x according to permutation */
1001   ii = 0;
1002   for (i=0; i<n; i++) {
1003     ir      = 2*r[i];
1004     x[ir]   = t[ii];
1005     x[ir+1] = t[ii+1];
1006     ii += 2;
1007   }
1008 
1009   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1010   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1011   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1012   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1013   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 #undef __FUNCT__
1018 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1019 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1020 {
1021   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1022   PetscErrorCode    ierr;
1023   IS                iscol=a->col,isrow=a->row;
1024   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1025   const PetscInt    *r,*c,*rout,*cout;
1026   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1027   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1028   const MatScalar   *aa=a->a,*v;
1029   PetscScalar       s1,s2,x1,x2,*x,*t;
1030   const PetscScalar *b;
1031 
1032   PetscFunctionBegin;
1033   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1034   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1035   t = a->solve_work;
1036 
1037   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1038   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1039 
1040   /* copy b into temp work space according to permutation */
1041   for(i=0;i<n;i++){
1042     ii = bs*i; ic = bs*c[i];
1043     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1044   }
1045 
1046   /* forward solve the U^T */
1047   idx = 0;
1048   for (i=0; i<n; i++) {
1049     v     = aa + bs2*diag[i];
1050     /* multiply by the inverse of the block diagonal */
1051     x1 = t[idx];   x2 = t[1+idx];
1052     s1 = v[0]*x1  +  v[1]*x2;
1053     s2 = v[2]*x1  +  v[3]*x2;
1054     v -= bs2;
1055 
1056     vi    = aj + diag[i] - 1;
1057     nz    = diag[i] - diag[i+1] - 1;
1058     for(j=0;j>-nz;j--){
1059       oidx = bs*vi[j];
1060       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1061       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1062       v  -= bs2;
1063     }
1064     t[idx]   = s1;t[1+idx] = s2;
1065     idx += bs;
1066   }
1067   /* backward solve the L^T */
1068   for (i=n-1; i>=0; i--){
1069     v    = aa + bs2*ai[i];
1070     vi   = aj + ai[i];
1071     nz   = ai[i+1] - ai[i];
1072     idt  = bs*i;
1073     s1   = t[idt];  s2 = t[1+idt];
1074     for(j=0;j<nz;j++){
1075       idx   = bs*vi[j];
1076       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1077       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1078       v += bs2;
1079     }
1080   }
1081 
1082   /* copy t into x according to permutation */
1083   for(i=0;i<n;i++){
1084     ii = bs*i;  ir = bs*r[i];
1085     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1086   }
1087 
1088   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1089   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1091   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1092   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1093   PetscFunctionReturn(0);
1094 }
1095 
1096 #undef __FUNCT__
1097 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1098 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1099 {
1100   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1101   IS                iscol=a->col,isrow=a->row;
1102   PetscErrorCode    ierr;
1103   const PetscInt    *r,*c,*rout,*cout;
1104   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1105   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1106   const MatScalar   *aa=a->a,*v;
1107   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1108   const PetscScalar *b;
1109 
1110   PetscFunctionBegin;
1111   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1112   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1113   t  = a->solve_work;
1114 
1115   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1116   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1117 
1118   /* copy the b into temp work space according to permutation */
1119   ii = 0;
1120   for (i=0; i<n; i++) {
1121     ic      = 3*c[i];
1122     t[ii]   = b[ic];
1123     t[ii+1] = b[ic+1];
1124     t[ii+2] = b[ic+2];
1125     ii += 3;
1126   }
1127 
1128   /* forward solve the U^T */
1129   idx = 0;
1130   for (i=0; i<n; i++) {
1131 
1132     v     = aa + 9*diag[i];
1133     /* multiply by the inverse of the block diagonal */
1134     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1135     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1136     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1137     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1138     v += 9;
1139 
1140     vi    = aj + diag[i] + 1;
1141     nz    = ai[i+1] - diag[i] - 1;
1142     while (nz--) {
1143       oidx = 3*(*vi++);
1144       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1145       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1146       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1147       v  += 9;
1148     }
1149     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1150     idx += 3;
1151   }
1152   /* backward solve the L^T */
1153   for (i=n-1; i>=0; i--){
1154     v    = aa + 9*diag[i] - 9;
1155     vi   = aj + diag[i] - 1;
1156     nz   = diag[i] - ai[i];
1157     idt  = 3*i;
1158     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1159     while (nz--) {
1160       idx   = 3*(*vi--);
1161       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1162       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1163       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1164       v -= 9;
1165     }
1166   }
1167 
1168   /* copy t into x according to permutation */
1169   ii = 0;
1170   for (i=0; i<n; i++) {
1171     ir      = 3*r[i];
1172     x[ir]   = t[ii];
1173     x[ir+1] = t[ii+1];
1174     x[ir+2] = t[ii+2];
1175     ii += 3;
1176   }
1177 
1178   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1179   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1180   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1181   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1182   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1183   PetscFunctionReturn(0);
1184 }
1185 
1186 #undef __FUNCT__
1187 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1188 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1189 {
1190   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1191   PetscErrorCode    ierr;
1192   IS                iscol=a->col,isrow=a->row;
1193   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1194   const PetscInt    *r,*c,*rout,*cout;
1195   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1196   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1197   const MatScalar   *aa=a->a,*v;
1198   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1199   const PetscScalar *b;
1200 
1201   PetscFunctionBegin;
1202   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1203   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1204   t = a->solve_work;
1205 
1206   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1207   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1208 
1209   /* copy b into temp work space according to permutation */
1210   for(i=0;i<n;i++){
1211     ii = bs*i; ic = bs*c[i];
1212     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1213   }
1214 
1215   /* forward solve the U^T */
1216   idx = 0;
1217   for (i=0; i<n; i++) {
1218     v     = aa + bs2*diag[i];
1219     /* multiply by the inverse of the block diagonal */
1220     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1221     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1222     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1223     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1224     v -= bs2;
1225 
1226     vi    = aj + diag[i] - 1;
1227     nz    = diag[i] - diag[i+1] - 1;
1228     for(j=0;j>-nz;j--){
1229       oidx = bs*vi[j];
1230       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1231       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1232       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233       v  -= bs2;
1234     }
1235     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1236     idx += bs;
1237   }
1238   /* backward solve the L^T */
1239   for (i=n-1; i>=0; i--){
1240     v    = aa + bs2*ai[i];
1241     vi   = aj + ai[i];
1242     nz   = ai[i+1] - ai[i];
1243     idt  = bs*i;
1244     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1245     for(j=0;j<nz;j++){
1246       idx   = bs*vi[j];
1247       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1248       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1249       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1250       v += bs2;
1251     }
1252   }
1253 
1254   /* copy t into x according to permutation */
1255   for(i=0;i<n;i++){
1256     ii = bs*i;  ir = bs*r[i];
1257     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1258   }
1259 
1260   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1261   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1263   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1264   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #undef __FUNCT__
1269 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1270 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1271 {
1272   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1273   IS                iscol=a->col,isrow=a->row;
1274   PetscErrorCode    ierr;
1275   const PetscInt    *r,*c,*rout,*cout;
1276   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1277   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1278   const MatScalar   *aa=a->a,*v;
1279   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1280   const PetscScalar *b;
1281 
1282   PetscFunctionBegin;
1283   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1284   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1285   t  = a->solve_work;
1286 
1287   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1288   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1289 
1290   /* copy the b into temp work space according to permutation */
1291   ii = 0;
1292   for (i=0; i<n; i++) {
1293     ic      = 4*c[i];
1294     t[ii]   = b[ic];
1295     t[ii+1] = b[ic+1];
1296     t[ii+2] = b[ic+2];
1297     t[ii+3] = b[ic+3];
1298     ii += 4;
1299   }
1300 
1301   /* forward solve the U^T */
1302   idx = 0;
1303   for (i=0; i<n; i++) {
1304 
1305     v     = aa + 16*diag[i];
1306     /* multiply by the inverse of the block diagonal */
1307     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1308     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1309     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1310     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1311     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1312     v += 16;
1313 
1314     vi    = aj + diag[i] + 1;
1315     nz    = ai[i+1] - diag[i] - 1;
1316     while (nz--) {
1317       oidx = 4*(*vi++);
1318       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1319       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1320       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1321       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1322       v  += 16;
1323     }
1324     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1325     idx += 4;
1326   }
1327   /* backward solve the L^T */
1328   for (i=n-1; i>=0; i--){
1329     v    = aa + 16*diag[i] - 16;
1330     vi   = aj + diag[i] - 1;
1331     nz   = diag[i] - ai[i];
1332     idt  = 4*i;
1333     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1334     while (nz--) {
1335       idx   = 4*(*vi--);
1336       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1337       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1338       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1339       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1340       v -= 16;
1341     }
1342   }
1343 
1344   /* copy t into x according to permutation */
1345   ii = 0;
1346   for (i=0; i<n; i++) {
1347     ir      = 4*r[i];
1348     x[ir]   = t[ii];
1349     x[ir+1] = t[ii+1];
1350     x[ir+2] = t[ii+2];
1351     x[ir+3] = t[ii+3];
1352     ii += 4;
1353   }
1354 
1355   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1356   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1357   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1358   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1359   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 #undef __FUNCT__
1364 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1365 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1366 {
1367   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1368   PetscErrorCode    ierr;
1369   IS                iscol=a->col,isrow=a->row;
1370   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1371   const PetscInt    *r,*c,*rout,*cout;
1372   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1373   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1374   const MatScalar   *aa=a->a,*v;
1375   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1376   const PetscScalar *b;
1377 
1378   PetscFunctionBegin;
1379   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1380   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1381   t = a->solve_work;
1382 
1383   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1384   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1385 
1386   /* copy b into temp work space according to permutation */
1387   for(i=0;i<n;i++){
1388     ii = bs*i; ic = bs*c[i];
1389     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1390   }
1391 
1392   /* forward solve the U^T */
1393   idx = 0;
1394   for (i=0; i<n; i++) {
1395     v     = aa + bs2*diag[i];
1396     /* multiply by the inverse of the block diagonal */
1397     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1398     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1399     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1400     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1401     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1402     v -= bs2;
1403 
1404     vi    = aj + diag[i] - 1;
1405     nz    = diag[i] - diag[i+1] - 1;
1406     for(j=0;j>-nz;j--){
1407       oidx = bs*vi[j];
1408       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1409       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1410       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1411       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1412       v  -= bs2;
1413     }
1414     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1415     idx += bs;
1416   }
1417   /* backward solve the L^T */
1418   for (i=n-1; i>=0; i--){
1419     v    = aa + bs2*ai[i];
1420     vi   = aj + ai[i];
1421     nz   = ai[i+1] - ai[i];
1422     idt  = bs*i;
1423     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1424     for(j=0;j<nz;j++){
1425       idx   = bs*vi[j];
1426       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1427       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1428       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1429       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1430       v += bs2;
1431     }
1432   }
1433 
1434   /* copy t into x according to permutation */
1435   for(i=0;i<n;i++){
1436     ii = bs*i;  ir = bs*r[i];
1437     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1438   }
1439 
1440   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1441   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1442   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1444   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 #undef __FUNCT__
1449 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1450 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1451 {
1452   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1453   IS                iscol=a->col,isrow=a->row;
1454   PetscErrorCode    ierr;
1455   const PetscInt    *r,*c,*rout,*cout;
1456   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1457   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1458   const MatScalar   *aa=a->a,*v;
1459   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1460   const PetscScalar *b;
1461 
1462   PetscFunctionBegin;
1463   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1464   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1465   t  = a->solve_work;
1466 
1467   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1468   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1469 
1470   /* copy the b into temp work space according to permutation */
1471   ii = 0;
1472   for (i=0; i<n; i++) {
1473     ic      = 5*c[i];
1474     t[ii]   = b[ic];
1475     t[ii+1] = b[ic+1];
1476     t[ii+2] = b[ic+2];
1477     t[ii+3] = b[ic+3];
1478     t[ii+4] = b[ic+4];
1479     ii += 5;
1480   }
1481 
1482   /* forward solve the U^T */
1483   idx = 0;
1484   for (i=0; i<n; i++) {
1485 
1486     v     = aa + 25*diag[i];
1487     /* multiply by the inverse of the block diagonal */
1488     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1489     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1490     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1491     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1492     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1493     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1494     v += 25;
1495 
1496     vi    = aj + diag[i] + 1;
1497     nz    = ai[i+1] - diag[i] - 1;
1498     while (nz--) {
1499       oidx = 5*(*vi++);
1500       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1501       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1502       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1503       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1504       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1505       v  += 25;
1506     }
1507     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1508     idx += 5;
1509   }
1510   /* backward solve the L^T */
1511   for (i=n-1; i>=0; i--){
1512     v    = aa + 25*diag[i] - 25;
1513     vi   = aj + diag[i] - 1;
1514     nz   = diag[i] - ai[i];
1515     idt  = 5*i;
1516     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1517     while (nz--) {
1518       idx   = 5*(*vi--);
1519       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1520       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1521       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1522       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1523       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1524       v -= 25;
1525     }
1526   }
1527 
1528   /* copy t into x according to permutation */
1529   ii = 0;
1530   for (i=0; i<n; i++) {
1531     ir      = 5*r[i];
1532     x[ir]   = t[ii];
1533     x[ir+1] = t[ii+1];
1534     x[ir+2] = t[ii+2];
1535     x[ir+3] = t[ii+3];
1536     x[ir+4] = t[ii+4];
1537     ii += 5;
1538   }
1539 
1540   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1541   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1542   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1543   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1544   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 #undef __FUNCT__
1549 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1550 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1551 {
1552   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1553   PetscErrorCode    ierr;
1554   IS                iscol=a->col,isrow=a->row;
1555   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1556   const PetscInt    *r,*c,*rout,*cout;
1557   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1558   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1559   const MatScalar   *aa=a->a,*v;
1560   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1561   const PetscScalar *b;
1562 
1563   PetscFunctionBegin;
1564   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1565   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1566   t = a->solve_work;
1567 
1568   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1569   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1570 
1571   /* copy b into temp work space according to permutation */
1572   for(i=0;i<n;i++){
1573     ii = bs*i; ic = bs*c[i];
1574     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1575     t[ii+4] = b[ic+4];
1576   }
1577 
1578   /* forward solve the U^T */
1579   idx = 0;
1580   for (i=0; i<n; i++) {
1581     v     = aa + bs2*diag[i];
1582     /* multiply by the inverse of the block diagonal */
1583     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1584     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1585     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1586     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1587     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1588     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1589     v -= bs2;
1590 
1591     vi    = aj + diag[i] - 1;
1592     nz    = diag[i] - diag[i+1] - 1;
1593     for(j=0;j>-nz;j--){
1594       oidx = bs*vi[j];
1595       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1596       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1597       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1598       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1599       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1600       v  -= bs2;
1601     }
1602     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1603     idx += bs;
1604   }
1605   /* backward solve the L^T */
1606   for (i=n-1; i>=0; i--){
1607     v    = aa + bs2*ai[i];
1608     vi   = aj + ai[i];
1609     nz   = ai[i+1] - ai[i];
1610     idt  = bs*i;
1611     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1612     for(j=0;j<nz;j++){
1613       idx   = bs*vi[j];
1614       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1615       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1616       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1617       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1618       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1619       v += bs2;
1620     }
1621   }
1622 
1623   /* copy t into x according to permutation */
1624   for(i=0;i<n;i++){
1625     ii = bs*i;  ir = bs*r[i];
1626     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1627     x[ir+4] = t[ii+4];
1628   }
1629 
1630   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1631   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1633   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1634   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 #undef __FUNCT__
1639 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1640 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1641 {
1642   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1643   IS                iscol=a->col,isrow=a->row;
1644   PetscErrorCode    ierr;
1645   const PetscInt    *r,*c,*rout,*cout;
1646   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1647   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1648   const MatScalar   *aa=a->a,*v;
1649   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1650   const PetscScalar *b;
1651 
1652   PetscFunctionBegin;
1653   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1654   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1655   t  = a->solve_work;
1656 
1657   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1658   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1659 
1660   /* copy the b into temp work space according to permutation */
1661   ii = 0;
1662   for (i=0; i<n; i++) {
1663     ic      = 6*c[i];
1664     t[ii]   = b[ic];
1665     t[ii+1] = b[ic+1];
1666     t[ii+2] = b[ic+2];
1667     t[ii+3] = b[ic+3];
1668     t[ii+4] = b[ic+4];
1669     t[ii+5] = b[ic+5];
1670     ii += 6;
1671   }
1672 
1673   /* forward solve the U^T */
1674   idx = 0;
1675   for (i=0; i<n; i++) {
1676 
1677     v     = aa + 36*diag[i];
1678     /* multiply by the inverse of the block diagonal */
1679     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1680     x6    = t[5+idx];
1681     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1682     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1683     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1684     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1685     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1686     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1687     v += 36;
1688 
1689     vi    = aj + diag[i] + 1;
1690     nz    = ai[i+1] - diag[i] - 1;
1691     while (nz--) {
1692       oidx = 6*(*vi++);
1693       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1694       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1695       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1696       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1697       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1698       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1699       v  += 36;
1700     }
1701     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1702     t[5+idx] = s6;
1703     idx += 6;
1704   }
1705   /* backward solve the L^T */
1706   for (i=n-1; i>=0; i--){
1707     v    = aa + 36*diag[i] - 36;
1708     vi   = aj + diag[i] - 1;
1709     nz   = diag[i] - ai[i];
1710     idt  = 6*i;
1711     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1712     s6 = t[5+idt];
1713     while (nz--) {
1714       idx   = 6*(*vi--);
1715       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1716       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1717       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1718       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1719       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1720       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1721       v -= 36;
1722     }
1723   }
1724 
1725   /* copy t into x according to permutation */
1726   ii = 0;
1727   for (i=0; i<n; i++) {
1728     ir      = 6*r[i];
1729     x[ir]   = t[ii];
1730     x[ir+1] = t[ii+1];
1731     x[ir+2] = t[ii+2];
1732     x[ir+3] = t[ii+3];
1733     x[ir+4] = t[ii+4];
1734     x[ir+5] = t[ii+5];
1735     ii += 6;
1736   }
1737 
1738   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1739   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1742   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1743   PetscFunctionReturn(0);
1744 }
1745 
1746 #undef __FUNCT__
1747 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1748 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1749 {
1750   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1751   PetscErrorCode    ierr;
1752   IS                iscol=a->col,isrow=a->row;
1753   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1754   const PetscInt    *r,*c,*rout,*cout;
1755   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1756   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1757   const MatScalar   *aa=a->a,*v;
1758   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1759   const PetscScalar *b;
1760 
1761   PetscFunctionBegin;
1762   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1764   t = a->solve_work;
1765 
1766   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1767   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1768 
1769   /* copy b into temp work space according to permutation */
1770   for(i=0;i<n;i++){
1771     ii = bs*i; ic = bs*c[i];
1772     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1773     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1774   }
1775 
1776   /* forward solve the U^T */
1777   idx = 0;
1778   for (i=0; i<n; i++) {
1779     v     = aa + bs2*diag[i];
1780     /* multiply by the inverse of the block diagonal */
1781     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1782     x6    = t[5+idx];
1783     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1784     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1785     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1786     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1787     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1788     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1789     v -= bs2;
1790 
1791     vi    = aj + diag[i] - 1;
1792     nz    = diag[i] - diag[i+1] - 1;
1793     for(j=0;j>-nz;j--){
1794       oidx = bs*vi[j];
1795       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1796       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1797       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1798       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1799       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1800       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1801       v  -= bs2;
1802     }
1803     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1804     t[5+idx] = s6;
1805     idx += bs;
1806   }
1807   /* backward solve the L^T */
1808   for (i=n-1; i>=0; i--){
1809     v    = aa + bs2*ai[i];
1810     vi   = aj + ai[i];
1811     nz   = ai[i+1] - ai[i];
1812     idt  = bs*i;
1813     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1814     s6   = t[5+idt];
1815    for(j=0;j<nz;j++){
1816       idx   = bs*vi[j];
1817       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1818       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1819       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1820       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1821       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1822       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1823       v += bs2;
1824     }
1825   }
1826 
1827   /* copy t into x according to permutation */
1828   for(i=0;i<n;i++){
1829     ii = bs*i;  ir = bs*r[i];
1830     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1831     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1832   }
1833 
1834   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1835   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1836   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1837   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1838   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1839   PetscFunctionReturn(0);
1840 }
1841 
1842 #undef __FUNCT__
1843 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1844 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1845 {
1846   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1847   IS                iscol=a->col,isrow=a->row;
1848   PetscErrorCode    ierr;
1849   const PetscInt    *r,*c,*rout,*cout;
1850   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1851   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1852   const MatScalar   *aa=a->a,*v;
1853   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1854   const PetscScalar *b;
1855 
1856   PetscFunctionBegin;
1857   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1858   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1859   t  = a->solve_work;
1860 
1861   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1862   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1863 
1864   /* copy the b into temp work space according to permutation */
1865   ii = 0;
1866   for (i=0; i<n; i++) {
1867     ic      = 7*c[i];
1868     t[ii]   = b[ic];
1869     t[ii+1] = b[ic+1];
1870     t[ii+2] = b[ic+2];
1871     t[ii+3] = b[ic+3];
1872     t[ii+4] = b[ic+4];
1873     t[ii+5] = b[ic+5];
1874     t[ii+6] = b[ic+6];
1875     ii += 7;
1876   }
1877 
1878   /* forward solve the U^T */
1879   idx = 0;
1880   for (i=0; i<n; i++) {
1881 
1882     v     = aa + 49*diag[i];
1883     /* multiply by the inverse of the block diagonal */
1884     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1885     x6    = t[5+idx]; x7 = t[6+idx];
1886     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1887     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1888     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1889     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1890     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1891     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1892     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1893     v += 49;
1894 
1895     vi    = aj + diag[i] + 1;
1896     nz    = ai[i+1] - diag[i] - 1;
1897     while (nz--) {
1898       oidx = 7*(*vi++);
1899       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1900       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1901       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1902       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1903       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1904       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1905       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1906       v  += 49;
1907     }
1908     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1909     t[5+idx] = s6;t[6+idx] = s7;
1910     idx += 7;
1911   }
1912   /* backward solve the L^T */
1913   for (i=n-1; i>=0; i--){
1914     v    = aa + 49*diag[i] - 49;
1915     vi   = aj + diag[i] - 1;
1916     nz   = diag[i] - ai[i];
1917     idt  = 7*i;
1918     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1919     s6 = t[5+idt];s7 = t[6+idt];
1920     while (nz--) {
1921       idx   = 7*(*vi--);
1922       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1923       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1924       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1925       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1926       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1927       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1928       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1929       v -= 49;
1930     }
1931   }
1932 
1933   /* copy t into x according to permutation */
1934   ii = 0;
1935   for (i=0; i<n; i++) {
1936     ir      = 7*r[i];
1937     x[ir]   = t[ii];
1938     x[ir+1] = t[ii+1];
1939     x[ir+2] = t[ii+2];
1940     x[ir+3] = t[ii+3];
1941     x[ir+4] = t[ii+4];
1942     x[ir+5] = t[ii+5];
1943     x[ir+6] = t[ii+6];
1944     ii += 7;
1945   }
1946 
1947   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1948   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1949   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1950   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1951   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 #undef __FUNCT__
1955 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1956 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1957 {
1958   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1959   PetscErrorCode    ierr;
1960   IS                iscol=a->col,isrow=a->row;
1961   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1962   const PetscInt    *r,*c,*rout,*cout;
1963   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1964   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1965   const MatScalar   *aa=a->a,*v;
1966   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1967   const PetscScalar *b;
1968 
1969   PetscFunctionBegin;
1970   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1971   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1972   t = a->solve_work;
1973 
1974   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1975   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1976 
1977   /* copy b into temp work space according to permutation */
1978   for(i=0;i<n;i++){
1979     ii = bs*i; ic = bs*c[i];
1980     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1981     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1982   }
1983 
1984   /* forward solve the U^T */
1985   idx = 0;
1986   for (i=0; i<n; i++) {
1987     v     = aa + bs2*diag[i];
1988     /* multiply by the inverse of the block diagonal */
1989     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1990     x6    = t[5+idx]; x7 = t[6+idx];
1991     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1992     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1993     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1994     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1995     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1996     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1997     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1998     v -= bs2;
1999 
2000     vi    = aj + diag[i] - 1;
2001     nz    = diag[i] - diag[i+1] - 1;
2002     for(j=0;j>-nz;j--){
2003       oidx = bs*vi[j];
2004       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2005       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2006       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2007       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2008       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2009       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2010       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2011       v  -= bs2;
2012     }
2013     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2014     t[5+idx] = s6;  t[6+idx] = s7;
2015     idx += bs;
2016   }
2017   /* backward solve the L^T */
2018   for (i=n-1; i>=0; i--){
2019     v    = aa + bs2*ai[i];
2020     vi   = aj + ai[i];
2021     nz   = ai[i+1] - ai[i];
2022     idt  = bs*i;
2023     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2024     s6   = t[5+idt];  s7 = t[6+idt];
2025    for(j=0;j<nz;j++){
2026       idx   = bs*vi[j];
2027       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2028       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2029       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2030       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2031       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2032       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2033       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2034       v += bs2;
2035     }
2036   }
2037 
2038   /* copy t into x according to permutation */
2039   for(i=0;i<n;i++){
2040     ii = bs*i;  ir = bs*r[i];
2041     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2042     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2043   }
2044 
2045   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2046   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2047   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2048   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2049   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2050   PetscFunctionReturn(0);
2051 }
2052 
2053 /* ----------------------------------------------------------- */
2054 #undef __FUNCT__
2055 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2056 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2057 {
2058   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2059   IS                iscol=a->col,isrow=a->row;
2060   PetscErrorCode    ierr;
2061   const PetscInt    *r,*c,*rout,*cout;
2062   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2063   PetscInt          i,nz;
2064   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2065   const MatScalar   *aa=a->a,*v;
2066   PetscScalar       *x,*s,*t,*ls;
2067   const PetscScalar *b;
2068 
2069   PetscFunctionBegin;
2070   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2071   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2072   t  = a->solve_work;
2073 
2074   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2075   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2076 
2077   /* forward solve the lower triangular */
2078   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2079   for (i=1; i<n; i++) {
2080     v   = aa + bs2*ai[i];
2081     vi  = aj + ai[i];
2082     nz  = a->diag[i] - ai[i];
2083     s = t + bs*i;
2084     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2085     while (nz--) {
2086       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2087       v += bs2;
2088     }
2089   }
2090   /* backward solve the upper triangular */
2091   ls = a->solve_work + A->cmap->n;
2092   for (i=n-1; i>=0; i--){
2093     v   = aa + bs2*(a->diag[i] + 1);
2094     vi  = aj + a->diag[i] + 1;
2095     nz  = ai[i+1] - a->diag[i] - 1;
2096     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2097     while (nz--) {
2098       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2099       v += bs2;
2100     }
2101     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2102     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2103   }
2104 
2105   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2106   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2107   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2109   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2110   PetscFunctionReturn(0);
2111 }
2112 
2113 /* ----------------------------------------------------------- */
2114 #undef __FUNCT__
2115 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2116 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2117 {
2118   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2119   IS                iscol=a->col,isrow=a->row;
2120   PetscErrorCode    ierr;
2121   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2122   PetscInt          i,nz,j;
2123   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2124   const MatScalar   *aa=a->a,*v;
2125   PetscScalar       *x,*t,*ls;
2126   const PetscScalar *b;
2127   PetscFunctionBegin;
2128   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2130   t    = a->solve_work;
2131 
2132   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2133   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2134 
2135   /* copy the b into temp work space according to permutation */
2136   for (i=0; i<n; i++) {
2137     for (j=0; j<bs; j++) {
2138       t[i*bs+j] = b[c[i]*bs+j];
2139     }
2140   }
2141 
2142 
2143   /* forward solve the upper triangular transpose */
2144   ls = a->solve_work + A->cmap->n;
2145   for (i=0; i<n; i++){
2146     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2147     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2148     v   = aa + bs2*(a->diag[i] + 1);
2149     vi  = aj + a->diag[i] + 1;
2150     nz  = ai[i+1] - a->diag[i] - 1;
2151     while (nz--) {
2152       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2153       v += bs2;
2154     }
2155   }
2156 
2157   /* backward solve the lower triangular transpose */
2158   for (i=n-1; i>=0; i--) {
2159     v   = aa + bs2*ai[i];
2160     vi  = aj + ai[i];
2161     nz  = a->diag[i] - ai[i];
2162     while (nz--) {
2163       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2164       v += bs2;
2165     }
2166   }
2167 
2168   /* copy t into x according to permutation */
2169   for (i=0; i<n; i++) {
2170     for (j=0; j<bs; j++) {
2171       x[bs*r[i]+j]   = t[bs*i+j];
2172     }
2173   }
2174 
2175   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2176   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2177   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2178   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2179   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2180   PetscFunctionReturn(0);
2181 }
2182 
2183 #undef __FUNCT__
2184 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2185 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2186 {
2187   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2188   IS                iscol=a->col,isrow=a->row;
2189   PetscErrorCode    ierr;
2190   const PetscInt    *r,*c,*rout,*cout;
2191   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2192   PetscInt          i,j,nz;
2193   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2194   const MatScalar   *aa=a->a,*v;
2195   PetscScalar       *x,*t,*ls;
2196   const PetscScalar *b;
2197 
2198   PetscFunctionBegin;
2199   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2200   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2201   t    = a->solve_work;
2202 
2203   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2204   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2205 
2206   /* copy the b into temp work space according to permutation */
2207   for (i=0; i<n; i++) {
2208     for (j=0; j<bs; j++) {
2209       t[i*bs+j] = b[c[i]*bs+j];
2210     }
2211   }
2212 
2213 
2214   /* forward solve the upper triangular transpose */
2215   ls = a->solve_work + A->cmap->n;
2216   for (i=0; i<n; i++){
2217     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2218     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2219     v   = aa + bs2*(diag[i] - 1);
2220     vi  = aj + diag[i] - 1;
2221     nz  = diag[i] - diag[i+1] - 1;
2222     for(j=0;j>-nz;j--){
2223       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2224       v -= bs2;
2225     }
2226   }
2227 
2228   /* backward solve the lower triangular transpose */
2229   for (i=n-1; i>=0; i--) {
2230     v   = aa + bs2*ai[i];
2231     vi  = aj + ai[i];
2232     nz  = ai[i+1] - ai[i];
2233     for(j=0;j<nz;j++){
2234       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2235       v += bs2;
2236     }
2237   }
2238 
2239   /* copy t into x according to permutation */
2240   for (i=0; i<n; i++) {
2241     for (j=0; j<bs; j++) {
2242       x[bs*r[i]+j]   = t[bs*i+j];
2243     }
2244   }
2245 
2246   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2247   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2248   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2249   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2250   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2251   PetscFunctionReturn(0);
2252 }
2253 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once. */
2255 
2256 #undef __FUNCT__
2257 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2258 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2259 {
2260   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2261   PetscErrorCode    ierr;
2262   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2263   PetscInt          i,nz,idx,idt,m;
2264   const MatScalar   *aa=a->a,*v;
2265   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2266   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2267   PetscScalar       *x;
2268   const PetscScalar *b;
2269 
2270   PetscFunctionBegin;
2271   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2272   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2273 
2274   /* forward solve the lower triangular */
2275   idx    = 0;
2276   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2277   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2278   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2279 
2280   for (i=1; i<n; i++) {
2281     v     = aa + bs2*ai[i];
2282     vi    = aj + ai[i];
2283     nz    = ai[i+1] - ai[i];
2284     idt   = bs*i;
2285     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
2286     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
2287     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2288     for(m=0;m<nz;m++){
2289       idx   = bs*vi[m];
2290       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2291       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2292       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2293 
2294 
2295       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2296       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2297       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2298       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2299       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2300       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2301       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2302       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2303       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2304       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2305       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2306       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2307       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2308       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2309       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2310 
2311       v += bs2;
2312     }
2313     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
2314     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
2315     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2316 
2317   }
2318   /* backward solve the upper triangular */
2319   for (i=n-1; i>=0; i--){
2320     v    = aa + bs2*(adiag[i+1]+1);
2321     vi   = aj + adiag[i+1]+1;
2322     nz   = adiag[i] - adiag[i+1] - 1;
2323     idt  = bs*i;
2324     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
2325     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
2326     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2327 
2328     for(m=0;m<nz;m++){
2329       idx   = bs*vi[m];
2330       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2331       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2332       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2333 
2334       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2335       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2336       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2337       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2338       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2339       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2340       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2341       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2342       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2343       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2344       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2345       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2346       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2347       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2348       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2349 
2350       v += bs2;
2351     }
2352 
2353     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2354     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2355     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2356     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2357     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2358     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2359     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2360     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2361     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2362     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2363     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2364     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2365     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2366     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2367     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2368 
2369   }
2370 
2371   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2372   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2373   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2374   PetscFunctionReturn(0);
2375 }
2376 
2377 /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */
2378 /* Default MatSolve for block size 15 */
2379 
2380 #undef __FUNCT__
2381 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2382 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2383 {
2384   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2385   PetscErrorCode    ierr;
2386   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2387   PetscInt          i,k,nz,kdx,idx,idt,m;
2388   const MatScalar   *aa=a->a,*v;
2389   PetscScalar       s[15];
2390   PetscScalar       *x;
2391   const PetscScalar *b;
2392 
2393   PetscFunctionBegin;
2394   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2395   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2396 
2397   /* forward solve the lower triangular */
2398   for (i=0; i<n; i++) {
2399     v     = aa + bs2*ai[i];
2400     vi    = aj + ai[i];
2401     nz    = ai[i+1] - ai[i];
2402     idt   = bs*i;
2403     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2404     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2405     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2406     for(m=0;m<nz;m++){
2407       idx   = bs*vi[m];
2408       for(k=0;k<15;k++){
2409 	kdx = k + idx;
2410 	x[idt]    -= v[0]*x[kdx];
2411 	x[1+idt]  -= v[1]*x[kdx];
2412 	x[2+idt]  -= v[2]*x[kdx];
2413         x[3+idt]  -= v[3]*x[kdx];
2414 	x[4+idt]  -= v[4]*x[kdx];
2415 	x[5+idt]  -= v[5]*x[kdx];
2416 	x[6+idt]  -= v[6]*x[kdx];
2417         x[7+idt]  -= v[7]*x[kdx];
2418 	x[8+idt]  -= v[8]*x[kdx];
2419 	x[9+idt]  -= v[9]*x[kdx];
2420 	x[10+idt] -= v[10]*x[kdx];
2421         x[11+idt] -= v[11]*x[kdx];
2422 	x[12+idt] -= v[12]*x[kdx];
2423 	x[13+idt] -= v[13]*x[kdx];
2424 	x[14+idt] -= v[14]*x[kdx];
2425 	v += 15;
2426       }
2427     }
2428   }
2429   /* backward solve the upper triangular */
2430   for (i=n-1; i>=0; i--){
2431     v    = aa + bs2*(adiag[i+1]+1);
2432     vi   = aj + adiag[i+1]+1;
2433     nz   = adiag[i] - adiag[i+1] - 1;
2434     idt  = bs*i;
2435     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2436     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2437     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2438 
2439     for(m=0;m<nz;m++){
2440       idx   = bs*vi[m];
2441       for(k=0;k<15;k++){
2442 	kdx = k + idx;
2443 	s[0]  -= v[0]*x[kdx];
2444 	s[1]  -= v[1]*x[kdx];
2445 	s[2]  -= v[2]*x[kdx];
2446         s[3]  -= v[3]*x[kdx];
2447 	s[4]  -= v[4]*x[kdx];
2448 	s[5]  -= v[5]*x[kdx];
2449 	s[6]  -= v[6]*x[kdx];
2450         s[7]  -= v[7]*x[kdx];
2451 	s[8]  -= v[8]*x[kdx];
2452 	s[9]  -= v[9]*x[kdx];
2453 	s[10] -= v[10]*x[kdx];
2454         s[11] -= v[11]*x[kdx];
2455 	s[12] -= v[12]*x[kdx];
2456 	s[13] -= v[13]*x[kdx];
2457 	s[14] -= v[14]*x[kdx];
2458 	v += 15;
2459       }
2460     }
2461     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
2462     for(k=0;k<15;k++){
2463       x[idt]    += v[0]*s[k];
2464       x[1+idt]  += v[1]*s[k];
2465       x[2+idt]  += v[2]*s[k];
2466       x[3+idt]  += v[3]*s[k];
2467       x[4+idt]  += v[4]*s[k];
2468       x[5+idt]  += v[5]*s[k];
2469       x[6+idt]  += v[6]*s[k];
2470       x[7+idt]  += v[7]*s[k];
2471       x[8+idt]  += v[8]*s[k];
2472       x[9+idt]  += v[9]*s[k];
2473       x[10+idt] += v[10]*s[k];
2474       x[11+idt] += v[11]*s[k];
2475       x[12+idt] += v[12]*s[k];
2476       x[13+idt] += v[13]*s[k];
2477       x[14+idt] += v[14]*s[k];
2478       v += 15;
2479     }
2480   }
2481   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2482   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2483   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2484   PetscFunctionReturn(0);
2485 }
2486 
2487 
2488 #undef __FUNCT__
2489 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2490 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2491 {
2492   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2493   IS                iscol=a->col,isrow=a->row;
2494   PetscErrorCode    ierr;
2495   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2496   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2497   PetscInt          i,nz,idx,idt,idc;
2498   const MatScalar   *aa=a->a,*v;
2499   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2500   const PetscScalar *b;
2501 
2502   PetscFunctionBegin;
2503   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2504   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2505   t  = a->solve_work;
2506 
2507   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2508   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2509 
2510   /* forward solve the lower triangular */
2511   idx    = 7*(*r++);
2512   t[0] = b[idx];   t[1] = b[1+idx];
2513   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2514   t[5] = b[5+idx]; t[6] = b[6+idx];
2515 
2516   for (i=1; i<n; i++) {
2517     v     = aa + 49*ai[i];
2518     vi    = aj + ai[i];
2519     nz    = diag[i] - ai[i];
2520     idx   = 7*(*r++);
2521     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2522     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2523     while (nz--) {
2524       idx   = 7*(*vi++);
2525       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2526       x4    = t[3+idx];x5 = t[4+idx];
2527       x6    = t[5+idx];x7 = t[6+idx];
2528       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2529       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2530       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2531       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2532       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2533       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2534       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2535       v += 49;
2536     }
2537     idx = 7*i;
2538     t[idx]   = s1;t[1+idx] = s2;
2539     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2540     t[5+idx] = s6;t[6+idx] = s7;
2541   }
2542   /* backward solve the upper triangular */
2543   for (i=n-1; i>=0; i--){
2544     v    = aa + 49*diag[i] + 49;
2545     vi   = aj + diag[i] + 1;
2546     nz   = ai[i+1] - diag[i] - 1;
2547     idt  = 7*i;
2548     s1 = t[idt];  s2 = t[1+idt];
2549     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2550     s6 = t[5+idt];s7 = t[6+idt];
2551     while (nz--) {
2552       idx   = 7*(*vi++);
2553       x1    = t[idx];   x2 = t[1+idx];
2554       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2555       x6    = t[5+idx]; x7 = t[6+idx];
2556       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2557       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2558       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2559       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2560       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2561       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2562       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2563       v += 49;
2564     }
2565     idc = 7*(*c--);
2566     v   = aa + 49*diag[i];
2567     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2568                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2569     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2570                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2571     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2572                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2573     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2574                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2575     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2576                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2577     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2578                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2579     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2580                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2581   }
2582 
2583   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2584   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2585   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2586   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2587   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2588   PetscFunctionReturn(0);
2589 }
2590 
2591 #undef __FUNCT__
2592 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2593 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2594 {
2595   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2596   IS                iscol=a->col,isrow=a->row;
2597   PetscErrorCode    ierr;
2598   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2599   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2600   PetscInt          i,nz,idx,idt,idc,m;
2601   const MatScalar   *aa=a->a,*v;
2602   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2603   const PetscScalar *b;
2604 
2605   PetscFunctionBegin;
2606   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2607   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2608   t  = a->solve_work;
2609 
2610   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2611   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2612 
2613   /* forward solve the lower triangular */
2614   idx    = 7*r[0];
2615   t[0] = b[idx];   t[1] = b[1+idx];
2616   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2617   t[5] = b[5+idx]; t[6] = b[6+idx];
2618 
2619   for (i=1; i<n; i++) {
2620     v     = aa + 49*ai[i];
2621     vi    = aj + ai[i];
2622     nz    = ai[i+1] - ai[i];
2623     idx   = 7*r[i];
2624     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2625     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2626     for(m=0;m<nz;m++){
2627       idx   = 7*vi[m];
2628       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2629       x4    = t[3+idx];x5 = t[4+idx];
2630       x6    = t[5+idx];x7 = t[6+idx];
2631       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2632       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2633       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2634       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2635       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2636       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2637       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2638       v += 49;
2639     }
2640     idx = 7*i;
2641     t[idx]   = s1;t[1+idx] = s2;
2642     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2643     t[5+idx] = s6;t[6+idx] = s7;
2644   }
2645   /* backward solve the upper triangular */
2646   for (i=n-1; i>=0; i--){
2647     v    = aa + 49*(adiag[i+1]+1);
2648     vi   = aj + adiag[i+1]+1;
2649     nz   = adiag[i] - adiag[i+1] - 1;
2650     idt  = 7*i;
2651     s1 = t[idt];  s2 = t[1+idt];
2652     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2653     s6 = t[5+idt];s7 = t[6+idt];
2654     for(m=0;m<nz;m++){
2655       idx   = 7*vi[m];
2656       x1    = t[idx];   x2 = t[1+idx];
2657       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2658       x6    = t[5+idx]; x7 = t[6+idx];
2659       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2660       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2661       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2662       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2663       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2664       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2665       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2666       v += 49;
2667     }
2668     idc = 7*c[i];
2669     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2670                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2671     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2672                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2673     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2674                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2675     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2676                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2677     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2678                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2679     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2680                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2681     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2682                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2683   }
2684 
2685   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2686   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2687   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2688   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2689   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2690   PetscFunctionReturn(0);
2691 }
2692 
2693 #undef __FUNCT__
2694 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2695 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2696 {
2697   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2698   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2699   PetscErrorCode    ierr;
2700   PetscInt          i,nz,idx,idt,jdx;
2701   const MatScalar   *aa=a->a,*v;
2702   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2703   const PetscScalar *b;
2704 
2705   PetscFunctionBegin;
2706   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2707   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2708   /* forward solve the lower triangular */
2709   idx    = 0;
2710   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2711   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2712   x[6] = b[6+idx];
2713   for (i=1; i<n; i++) {
2714     v     =  aa + 49*ai[i];
2715     vi    =  aj + ai[i];
2716     nz    =  diag[i] - ai[i];
2717     idx   =  7*i;
2718     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2719     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2720     s7  =  b[6+idx];
2721     while (nz--) {
2722       jdx   = 7*(*vi++);
2723       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2724       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2725       x7    = x[6+jdx];
2726       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2727       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2728       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2729       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2730       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2731       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2732       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2733       v += 49;
2734      }
2735     x[idx]   = s1;
2736     x[1+idx] = s2;
2737     x[2+idx] = s3;
2738     x[3+idx] = s4;
2739     x[4+idx] = s5;
2740     x[5+idx] = s6;
2741     x[6+idx] = s7;
2742   }
2743   /* backward solve the upper triangular */
2744   for (i=n-1; i>=0; i--){
2745     v    = aa + 49*diag[i] + 49;
2746     vi   = aj + diag[i] + 1;
2747     nz   = ai[i+1] - diag[i] - 1;
2748     idt  = 7*i;
2749     s1 = x[idt];   s2 = x[1+idt];
2750     s3 = x[2+idt]; s4 = x[3+idt];
2751     s5 = x[4+idt]; s6 = x[5+idt];
2752     s7 = x[6+idt];
2753     while (nz--) {
2754       idx   = 7*(*vi++);
2755       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2756       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2757       x7    = x[6+idx];
2758       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2759       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2760       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2761       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2762       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2763       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2764       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2765       v += 49;
2766     }
2767     v        = aa + 49*diag[i];
2768     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2769                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2770     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2771                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2772     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2773                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2774     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2775                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2776     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2777                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2778     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2779                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2780     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2781                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2782   }
2783 
2784   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2785   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2786   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2787   PetscFunctionReturn(0);
2788 }
2789 
2790 #undef __FUNCT__
2791 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2792 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2793 {
2794     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2795     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2796     PetscErrorCode    ierr;
2797     PetscInt          i,k,nz,idx,jdx,idt;
2798     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2799     const MatScalar   *aa=a->a,*v;
2800     PetscScalar       *x;
2801     const PetscScalar *b;
2802     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2803 
2804     PetscFunctionBegin;
2805     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2806     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2807     /* forward solve the lower triangular */
2808     idx    = 0;
2809     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2810     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2811     for (i=1; i<n; i++) {
2812        v    = aa + bs2*ai[i];
2813        vi   = aj + ai[i];
2814        nz   = ai[i+1] - ai[i];
2815       idx   = bs*i;
2816        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2817        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2818        for(k=0;k<nz;k++) {
2819           jdx   = bs*vi[k];
2820           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2821 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2822           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2823           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2824           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2825 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2826           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2827 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2828 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2829           v   +=  bs2;
2830         }
2831 
2832        x[idx]   = s1;
2833        x[1+idx] = s2;
2834        x[2+idx] = s3;
2835        x[3+idx] = s4;
2836        x[4+idx] = s5;
2837        x[5+idx] = s6;
2838        x[6+idx] = s7;
2839     }
2840 
2841    /* backward solve the upper triangular */
2842   for (i=n-1; i>=0; i--){
2843     v   = aa + bs2*(adiag[i+1]+1);
2844      vi  = aj + adiag[i+1]+1;
2845      nz  = adiag[i] - adiag[i+1]-1;
2846      idt = bs*i;
2847      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2848      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2849     for(k=0;k<nz;k++) {
2850       idx   = bs*vi[k];
2851        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2852        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2853        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2854        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2855        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2856        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2857        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2858        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2859        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2860         v   +=  bs2;
2861     }
2862     /* x = inv_diagonal*x */
2863     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2864     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2865     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2866     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2867     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2868     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2869     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2870   }
2871 
2872   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2873   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2874   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2875   PetscFunctionReturn(0);
2876 }
2877 
2878 #undef __FUNCT__
2879 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2880 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2881 {
2882   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2883   IS                iscol=a->col,isrow=a->row;
2884   PetscErrorCode    ierr;
2885   const PetscInt    *r,*c,*rout,*cout;
2886   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2887   PetscInt          i,nz,idx,idt,idc;
2888   const MatScalar   *aa=a->a,*v;
2889   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2890   const PetscScalar *b;
2891 
2892   PetscFunctionBegin;
2893   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2894   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2895   t  = a->solve_work;
2896 
2897   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2898   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2899 
2900   /* forward solve the lower triangular */
2901   idx    = 6*(*r++);
2902   t[0] = b[idx];   t[1] = b[1+idx];
2903   t[2] = b[2+idx]; t[3] = b[3+idx];
2904   t[4] = b[4+idx]; t[5] = b[5+idx];
2905   for (i=1; i<n; i++) {
2906     v     = aa + 36*ai[i];
2907     vi    = aj + ai[i];
2908     nz    = diag[i] - ai[i];
2909     idx   = 6*(*r++);
2910     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2911     s5  = b[4+idx]; s6 = b[5+idx];
2912     while (nz--) {
2913       idx   = 6*(*vi++);
2914       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2915       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2916       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2917       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2918       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2919       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2920       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2921       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2922       v += 36;
2923     }
2924     idx = 6*i;
2925     t[idx]   = s1;t[1+idx] = s2;
2926     t[2+idx] = s3;t[3+idx] = s4;
2927     t[4+idx] = s5;t[5+idx] = s6;
2928   }
2929   /* backward solve the upper triangular */
2930   for (i=n-1; i>=0; i--){
2931     v    = aa + 36*diag[i] + 36;
2932     vi   = aj + diag[i] + 1;
2933     nz   = ai[i+1] - diag[i] - 1;
2934     idt  = 6*i;
2935     s1 = t[idt];  s2 = t[1+idt];
2936     s3 = t[2+idt];s4 = t[3+idt];
2937     s5 = t[4+idt];s6 = t[5+idt];
2938     while (nz--) {
2939       idx   = 6*(*vi++);
2940       x1    = t[idx];   x2 = t[1+idx];
2941       x3    = t[2+idx]; x4 = t[3+idx];
2942       x5    = t[4+idx]; x6 = t[5+idx];
2943       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2944       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2945       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2946       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2947       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2948       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2949       v += 36;
2950     }
2951     idc = 6*(*c--);
2952     v   = aa + 36*diag[i];
2953     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2954                                  v[18]*s4+v[24]*s5+v[30]*s6;
2955     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2956                                  v[19]*s4+v[25]*s5+v[31]*s6;
2957     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2958                                  v[20]*s4+v[26]*s5+v[32]*s6;
2959     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2960                                  v[21]*s4+v[27]*s5+v[33]*s6;
2961     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2962                                  v[22]*s4+v[28]*s5+v[34]*s6;
2963     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2964                                  v[23]*s4+v[29]*s5+v[35]*s6;
2965   }
2966 
2967   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2968   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2969   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2970   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2971   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2972   PetscFunctionReturn(0);
2973 }
2974 
2975 #undef __FUNCT__
2976 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
2977 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2978 {
2979   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2980   IS                iscol=a->col,isrow=a->row;
2981   PetscErrorCode    ierr;
2982   const PetscInt    *r,*c,*rout,*cout;
2983   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2984   PetscInt          i,nz,idx,idt,idc,m;
2985   const MatScalar   *aa=a->a,*v;
2986   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2987   const PetscScalar *b;
2988 
2989   PetscFunctionBegin;
2990   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2991   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2992   t  = a->solve_work;
2993 
2994   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2995   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2996 
2997   /* forward solve the lower triangular */
2998   idx    = 6*r[0];
2999   t[0] = b[idx];   t[1] = b[1+idx];
3000   t[2] = b[2+idx]; t[3] = b[3+idx];
3001   t[4] = b[4+idx]; t[5] = b[5+idx];
3002   for (i=1; i<n; i++) {
3003     v     = aa + 36*ai[i];
3004     vi    = aj + ai[i];
3005     nz    = ai[i+1] - ai[i];
3006     idx   = 6*r[i];
3007     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3008     s5  = b[4+idx]; s6 = b[5+idx];
3009     for(m=0;m<nz;m++){
3010       idx   = 6*vi[m];
3011       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3012       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3013       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3014       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3015       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3016       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3017       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3018       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3019       v += 36;
3020     }
3021     idx = 6*i;
3022     t[idx]   = s1;t[1+idx] = s2;
3023     t[2+idx] = s3;t[3+idx] = s4;
3024     t[4+idx] = s5;t[5+idx] = s6;
3025   }
3026   /* backward solve the upper triangular */
3027   for (i=n-1; i>=0; i--){
3028     v    = aa + 36*(adiag[i+1]+1);
3029     vi   = aj + adiag[i+1]+1;
3030     nz   = adiag[i] - adiag[i+1] - 1;
3031     idt  = 6*i;
3032     s1 = t[idt];  s2 = t[1+idt];
3033     s3 = t[2+idt];s4 = t[3+idt];
3034     s5 = t[4+idt];s6 = t[5+idt];
3035     for(m=0;m<nz;m++){
3036       idx   = 6*vi[m];
3037       x1    = t[idx];   x2 = t[1+idx];
3038       x3    = t[2+idx]; x4 = t[3+idx];
3039       x5    = t[4+idx]; x6 = t[5+idx];
3040       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3041       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3042       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3043       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3044       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3045       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3046       v += 36;
3047     }
3048     idc = 6*c[i];
3049     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3050                                  v[18]*s4+v[24]*s5+v[30]*s6;
3051     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3052                                  v[19]*s4+v[25]*s5+v[31]*s6;
3053     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3054                                  v[20]*s4+v[26]*s5+v[32]*s6;
3055     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3056                                  v[21]*s4+v[27]*s5+v[33]*s6;
3057     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3058                                  v[22]*s4+v[28]*s5+v[34]*s6;
3059     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3060                                  v[23]*s4+v[29]*s5+v[35]*s6;
3061   }
3062 
3063   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3064   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3065   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3066   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3067   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3068   PetscFunctionReturn(0);
3069 }
3070 
3071 #undef __FUNCT__
3072 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
3073 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3074 {
3075   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3076   PetscInt          i,nz,idx,idt,jdx;
3077   PetscErrorCode    ierr;
3078   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3079   const MatScalar   *aa=a->a,*v;
3080   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3081   const PetscScalar *b;
3082 
3083   PetscFunctionBegin;
3084   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3085   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3086   /* forward solve the lower triangular */
3087   idx    = 0;
3088   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3089   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3090   for (i=1; i<n; i++) {
3091     v     =  aa + 36*ai[i];
3092     vi    =  aj + ai[i];
3093     nz    =  diag[i] - ai[i];
3094     idx   =  6*i;
3095     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3096     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3097     while (nz--) {
3098       jdx   = 6*(*vi++);
3099       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3100       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3101       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3102       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3103       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3104       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3105       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3106       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3107       v += 36;
3108      }
3109     x[idx]   = s1;
3110     x[1+idx] = s2;
3111     x[2+idx] = s3;
3112     x[3+idx] = s4;
3113     x[4+idx] = s5;
3114     x[5+idx] = s6;
3115   }
3116   /* backward solve the upper triangular */
3117   for (i=n-1; i>=0; i--){
3118     v    = aa + 36*diag[i] + 36;
3119     vi   = aj + diag[i] + 1;
3120     nz   = ai[i+1] - diag[i] - 1;
3121     idt  = 6*i;
3122     s1 = x[idt];   s2 = x[1+idt];
3123     s3 = x[2+idt]; s4 = x[3+idt];
3124     s5 = x[4+idt]; s6 = x[5+idt];
3125     while (nz--) {
3126       idx   = 6*(*vi++);
3127       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3128       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3129       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3130       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3131       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3132       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3133       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3134       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3135       v += 36;
3136     }
3137     v        = aa + 36*diag[i];
3138     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3139     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3140     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3141     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3142     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3143     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3144   }
3145 
3146   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3147   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3148   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3149   PetscFunctionReturn(0);
3150 }
3151 
3152 #undef __FUNCT__
3153 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3154 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3155 {
3156     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3157     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3158     PetscErrorCode    ierr;
3159     PetscInt          i,k,nz,idx,jdx,idt;
3160     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3161     const MatScalar   *aa=a->a,*v;
3162     PetscScalar       *x;
3163     const PetscScalar *b;
3164     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3165 
3166     PetscFunctionBegin;
3167     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3168     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3169     /* forward solve the lower triangular */
3170     idx    = 0;
3171     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3172     x[4] = b[4+idx];x[5] = b[5+idx];
3173     for (i=1; i<n; i++) {
3174        v    = aa + bs2*ai[i];
3175        vi   = aj + ai[i];
3176        nz   = ai[i+1] - ai[i];
3177       idx   = bs*i;
3178        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3179        s5   = b[4+idx];s6 = b[5+idx];
3180        for(k=0;k<nz;k++){
3181           jdx   = bs*vi[k];
3182           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3183 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3184           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3185           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3186           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3187 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3188           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3189 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3190           v   +=  bs2;
3191         }
3192 
3193        x[idx]   = s1;
3194        x[1+idx] = s2;
3195        x[2+idx] = s3;
3196        x[3+idx] = s4;
3197        x[4+idx] = s5;
3198        x[5+idx] = s6;
3199     }
3200 
3201    /* backward solve the upper triangular */
3202   for (i=n-1; i>=0; i--){
3203     v   = aa + bs2*(adiag[i+1]+1);
3204      vi  = aj + adiag[i+1]+1;
3205      nz  = adiag[i] - adiag[i+1]-1;
3206      idt = bs*i;
3207      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3208      s5 = x[4+idt];s6 = x[5+idt];
3209      for(k=0;k<nz;k++){
3210       idx   = bs*vi[k];
3211        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3212        x5    = x[4+idx];x6 = x[5+idx];
3213        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3214        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3215        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3216        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3217        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3218        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3219         v   +=  bs2;
3220     }
3221     /* x = inv_diagonal*x */
3222    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3223    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3224    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3225    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3226    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3227    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3228   }
3229 
3230   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3231   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3232   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3233   PetscFunctionReturn(0);
3234 }
3235 
3236 #undef __FUNCT__
3237 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3238 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3239 {
3240   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3241   IS                iscol=a->col,isrow=a->row;
3242   PetscErrorCode    ierr;
3243   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3244   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3245   PetscInt          i,nz,idx,idt,idc;
3246   const MatScalar   *aa=a->a,*v;
3247   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3248   const PetscScalar *b;
3249 
3250   PetscFunctionBegin;
3251   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3252   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3253   t  = a->solve_work;
3254 
3255   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3256   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3257 
3258   /* forward solve the lower triangular */
3259   idx    = 5*(*r++);
3260   t[0] = b[idx];   t[1] = b[1+idx];
3261   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3262   for (i=1; i<n; i++) {
3263     v     = aa + 25*ai[i];
3264     vi    = aj + ai[i];
3265     nz    = diag[i] - ai[i];
3266     idx   = 5*(*r++);
3267     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3268     s5  = b[4+idx];
3269     while (nz--) {
3270       idx   = 5*(*vi++);
3271       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3272       x4    = t[3+idx];x5 = t[4+idx];
3273       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3274       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3275       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3276       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3277       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3278       v += 25;
3279     }
3280     idx = 5*i;
3281     t[idx]   = s1;t[1+idx] = s2;
3282     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3283   }
3284   /* backward solve the upper triangular */
3285   for (i=n-1; i>=0; i--){
3286     v    = aa + 25*diag[i] + 25;
3287     vi   = aj + diag[i] + 1;
3288     nz   = ai[i+1] - diag[i] - 1;
3289     idt  = 5*i;
3290     s1 = t[idt];  s2 = t[1+idt];
3291     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3292     while (nz--) {
3293       idx   = 5*(*vi++);
3294       x1    = t[idx];   x2 = t[1+idx];
3295       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3296       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3297       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3298       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3299       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3300       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3301       v += 25;
3302     }
3303     idc = 5*(*c--);
3304     v   = aa + 25*diag[i];
3305     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3306                                  v[15]*s4+v[20]*s5;
3307     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3308                                  v[16]*s4+v[21]*s5;
3309     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3310                                  v[17]*s4+v[22]*s5;
3311     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3312                                  v[18]*s4+v[23]*s5;
3313     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3314                                  v[19]*s4+v[24]*s5;
3315   }
3316 
3317   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3318   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3319   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3320   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3321   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3322   PetscFunctionReturn(0);
3323 }
3324 
3325 #undef __FUNCT__
3326 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3327 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3328 {
3329   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3330   IS                iscol=a->col,isrow=a->row;
3331   PetscErrorCode    ierr;
3332   const PetscInt    *r,*c,*rout,*cout;
3333   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3334   PetscInt          i,nz,idx,idt,idc,m;
3335   const MatScalar   *aa=a->a,*v;
3336   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3337   const PetscScalar *b;
3338 
3339   PetscFunctionBegin;
3340   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3341   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3342   t  = a->solve_work;
3343 
3344   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3345   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3346 
3347   /* forward solve the lower triangular */
3348   idx    = 5*r[0];
3349   t[0] = b[idx];   t[1] = b[1+idx];
3350   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3351   for (i=1; i<n; i++) {
3352     v     = aa + 25*ai[i];
3353     vi    = aj + ai[i];
3354     nz    = ai[i+1] - ai[i];
3355     idx   = 5*r[i];
3356     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3357     s5  = b[4+idx];
3358     for(m=0;m<nz;m++){
3359       idx   = 5*vi[m];
3360       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3361       x4    = t[3+idx];x5 = t[4+idx];
3362       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3363       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3364       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3365       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3366       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3367       v += 25;
3368     }
3369     idx = 5*i;
3370     t[idx]   = s1;t[1+idx] = s2;
3371     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3372   }
3373   /* backward solve the upper triangular */
3374   for (i=n-1; i>=0; i--){
3375     v    = aa + 25*(adiag[i+1]+1);
3376     vi   = aj + adiag[i+1]+1;
3377     nz   = adiag[i] - adiag[i+1] - 1;
3378     idt  = 5*i;
3379     s1 = t[idt];  s2 = t[1+idt];
3380     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3381     for(m=0;m<nz;m++){
3382       idx   = 5*vi[m];
3383       x1    = t[idx];   x2 = t[1+idx];
3384       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3385       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3386       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3387       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3388       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3389       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3390       v += 25;
3391     }
3392     idc = 5*c[i];
3393     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3394                                  v[15]*s4+v[20]*s5;
3395     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3396                                  v[16]*s4+v[21]*s5;
3397     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3398                                  v[17]*s4+v[22]*s5;
3399     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3400                                  v[18]*s4+v[23]*s5;
3401     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3402                                  v[19]*s4+v[24]*s5;
3403   }
3404 
3405   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3406   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3407   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3408   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3409   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3410   PetscFunctionReturn(0);
3411 }
3412 
3413 #undef __FUNCT__
3414 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3415 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3416 {
3417   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3418   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3419   PetscInt          i,nz,idx,idt,jdx;
3420   PetscErrorCode    ierr;
3421   const MatScalar   *aa=a->a,*v;
3422   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3423   const PetscScalar *b;
3424 
3425   PetscFunctionBegin;
3426   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3427   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3428   /* forward solve the lower triangular */
3429   idx    = 0;
3430   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3431   for (i=1; i<n; i++) {
3432     v     =  aa + 25*ai[i];
3433     vi    =  aj + ai[i];
3434     nz    =  diag[i] - ai[i];
3435     idx   =  5*i;
3436     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3437     while (nz--) {
3438       jdx   = 5*(*vi++);
3439       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3440       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3441       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3442       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3443       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3444       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3445       v    += 25;
3446     }
3447     x[idx]   = s1;
3448     x[1+idx] = s2;
3449     x[2+idx] = s3;
3450     x[3+idx] = s4;
3451     x[4+idx] = s5;
3452   }
3453   /* backward solve the upper triangular */
3454   for (i=n-1; i>=0; i--){
3455     v    = aa + 25*diag[i] + 25;
3456     vi   = aj + diag[i] + 1;
3457     nz   = ai[i+1] - diag[i] - 1;
3458     idt  = 5*i;
3459     s1 = x[idt];  s2 = x[1+idt];
3460     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3461     while (nz--) {
3462       idx   = 5*(*vi++);
3463       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3464       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3465       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3466       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3467       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3468       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3469       v    += 25;
3470     }
3471     v        = aa + 25*diag[i];
3472     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3473     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3474     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3475     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3476     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3477   }
3478 
3479   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3480   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3481   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3482   PetscFunctionReturn(0);
3483 }
3484 
3485 #undef __FUNCT__
3486 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3487 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3488 {
3489   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3490   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3491   PetscInt          i,k,nz,idx,idt,jdx;
3492   PetscErrorCode    ierr;
3493   const MatScalar   *aa=a->a,*v;
3494   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3495   const PetscScalar *b;
3496 
3497   PetscFunctionBegin;
3498   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3499   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3500   /* forward solve the lower triangular */
3501   idx    = 0;
3502   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3503   for (i=1; i<n; i++) {
3504     v   = aa + 25*ai[i];
3505     vi  = aj + ai[i];
3506     nz  = ai[i+1] - ai[i];
3507     idx = 5*i;
3508     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3509     for(k=0;k<nz;k++) {
3510       jdx   = 5*vi[k];
3511       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3512       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3513       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3514       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3515       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3516       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3517       v    += 25;
3518     }
3519     x[idx]   = s1;
3520     x[1+idx] = s2;
3521     x[2+idx] = s3;
3522     x[3+idx] = s4;
3523     x[4+idx] = s5;
3524   }
3525 
3526   /* backward solve the upper triangular */
3527   for (i=n-1; i>=0; i--){
3528     v   = aa + 25*(adiag[i+1]+1);
3529     vi  = aj + adiag[i+1]+1;
3530     nz  = adiag[i] - adiag[i+1]-1;
3531     idt = 5*i;
3532     s1 = x[idt];  s2 = x[1+idt];
3533     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3534     for(k=0;k<nz;k++){
3535       idx   = 5*vi[k];
3536       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3537       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3538       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3539       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3540       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3541       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3542       v    += 25;
3543     }
3544     /* x = inv_diagonal*x */
3545     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3546     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3547     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3548     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3549     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3550   }
3551 
3552   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3553   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3554   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3555   PetscFunctionReturn(0);
3556 }
3557 
3558 #undef __FUNCT__
3559 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3560 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3561 {
3562   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3563   IS                iscol=a->col,isrow=a->row;
3564   PetscErrorCode    ierr;
3565   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3566   PetscInt          i,nz,idx,idt,idc;
3567   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3568   const MatScalar   *aa=a->a,*v;
3569   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3570   const PetscScalar *b;
3571 
3572   PetscFunctionBegin;
3573   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3574   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3575   t  = a->solve_work;
3576 
3577   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3578   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3579 
3580   /* forward solve the lower triangular */
3581   idx    = 4*(*r++);
3582   t[0] = b[idx];   t[1] = b[1+idx];
3583   t[2] = b[2+idx]; t[3] = b[3+idx];
3584   for (i=1; i<n; i++) {
3585     v     = aa + 16*ai[i];
3586     vi    = aj + ai[i];
3587     nz    = diag[i] - ai[i];
3588     idx   = 4*(*r++);
3589     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3590     while (nz--) {
3591       idx   = 4*(*vi++);
3592       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3593       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3594       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3595       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3596       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3597       v    += 16;
3598     }
3599     idx        = 4*i;
3600     t[idx]   = s1;t[1+idx] = s2;
3601     t[2+idx] = s3;t[3+idx] = s4;
3602   }
3603   /* backward solve the upper triangular */
3604   for (i=n-1; i>=0; i--){
3605     v    = aa + 16*diag[i] + 16;
3606     vi   = aj + diag[i] + 1;
3607     nz   = ai[i+1] - diag[i] - 1;
3608     idt  = 4*i;
3609     s1 = t[idt];  s2 = t[1+idt];
3610     s3 = t[2+idt];s4 = t[3+idt];
3611     while (nz--) {
3612       idx   = 4*(*vi++);
3613       x1    = t[idx];   x2 = t[1+idx];
3614       x3    = t[2+idx]; x4 = t[3+idx];
3615       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3616       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3617       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3618       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3619       v += 16;
3620     }
3621     idc      = 4*(*c--);
3622     v        = aa + 16*diag[i];
3623     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3624     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3625     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3626     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3627   }
3628 
3629   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3630   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3631   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3632   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3633   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3634   PetscFunctionReturn(0);
3635 }
3636 
3637 #undef __FUNCT__
3638 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3639 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3640 {
3641   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3642   IS                iscol=a->col,isrow=a->row;
3643   PetscErrorCode    ierr;
3644   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3645   PetscInt          i,nz,idx,idt,idc,m;
3646   const PetscInt    *r,*c,*rout,*cout;
3647   const MatScalar   *aa=a->a,*v;
3648   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3649   const PetscScalar *b;
3650 
3651   PetscFunctionBegin;
3652   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3653   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3654   t  = a->solve_work;
3655 
3656   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3657   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3658 
3659   /* forward solve the lower triangular */
3660   idx    = 4*r[0];
3661   t[0] = b[idx];   t[1] = b[1+idx];
3662   t[2] = b[2+idx]; t[3] = b[3+idx];
3663   for (i=1; i<n; i++) {
3664     v     = aa + 16*ai[i];
3665     vi    = aj + ai[i];
3666     nz    = ai[i+1] - ai[i];
3667     idx   = 4*r[i];
3668     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3669     for(m=0;m<nz;m++){
3670       idx   = 4*vi[m];
3671       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3672       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3673       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3674       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3675       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3676       v    += 16;
3677     }
3678     idx        = 4*i;
3679     t[idx]   = s1;t[1+idx] = s2;
3680     t[2+idx] = s3;t[3+idx] = s4;
3681   }
3682   /* backward solve the upper triangular */
3683   for (i=n-1; i>=0; i--){
3684     v    = aa + 16*(adiag[i+1]+1);
3685     vi   = aj + adiag[i+1]+1;
3686     nz   = adiag[i] - adiag[i+1] - 1;
3687     idt  = 4*i;
3688     s1 = t[idt];  s2 = t[1+idt];
3689     s3 = t[2+idt];s4 = t[3+idt];
3690     for(m=0;m<nz;m++){
3691       idx   = 4*vi[m];
3692       x1    = t[idx];   x2 = t[1+idx];
3693       x3    = t[2+idx]; x4 = t[3+idx];
3694       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3695       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3696       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3697       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3698       v += 16;
3699     }
3700     idc      = 4*c[i];
3701     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3702     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3703     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3704     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3705   }
3706 
3707   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3708   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3709   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3711   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3712   PetscFunctionReturn(0);
3713 }
3714 
3715 #undef __FUNCT__
3716 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3717 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3718 {
3719   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3720   IS                iscol=a->col,isrow=a->row;
3721   PetscErrorCode    ierr;
3722   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3723   PetscInt          i,nz,idx,idt,idc;
3724   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3725   const MatScalar   *aa=a->a,*v;
3726   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3727   PetscScalar       *x;
3728   const PetscScalar *b;
3729 
3730   PetscFunctionBegin;
3731   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3732   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3733   t  = (MatScalar *)a->solve_work;
3734 
3735   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3736   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3737 
3738   /* forward solve the lower triangular */
3739   idx    = 4*(*r++);
3740   t[0] = (MatScalar)b[idx];
3741   t[1] = (MatScalar)b[1+idx];
3742   t[2] = (MatScalar)b[2+idx];
3743   t[3] = (MatScalar)b[3+idx];
3744   for (i=1; i<n; i++) {
3745     v     = aa + 16*ai[i];
3746     vi    = aj + ai[i];
3747     nz    = diag[i] - ai[i];
3748     idx   = 4*(*r++);
3749     s1 = (MatScalar)b[idx];
3750     s2 = (MatScalar)b[1+idx];
3751     s3 = (MatScalar)b[2+idx];
3752     s4 = (MatScalar)b[3+idx];
3753     while (nz--) {
3754       idx   = 4*(*vi++);
3755       x1  = t[idx];
3756       x2  = t[1+idx];
3757       x3  = t[2+idx];
3758       x4  = t[3+idx];
3759       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3760       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3761       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3762       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3763       v    += 16;
3764     }
3765     idx        = 4*i;
3766     t[idx]   = s1;
3767     t[1+idx] = s2;
3768     t[2+idx] = s3;
3769     t[3+idx] = s4;
3770   }
3771   /* backward solve the upper triangular */
3772   for (i=n-1; i>=0; i--){
3773     v    = aa + 16*diag[i] + 16;
3774     vi   = aj + diag[i] + 1;
3775     nz   = ai[i+1] - diag[i] - 1;
3776     idt  = 4*i;
3777     s1 = t[idt];
3778     s2 = t[1+idt];
3779     s3 = t[2+idt];
3780     s4 = t[3+idt];
3781     while (nz--) {
3782       idx   = 4*(*vi++);
3783       x1  = t[idx];
3784       x2  = t[1+idx];
3785       x3  = t[2+idx];
3786       x4  = t[3+idx];
3787       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3788       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3789       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3790       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3791       v += 16;
3792     }
3793     idc      = 4*(*c--);
3794     v        = aa + 16*diag[i];
3795     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3796     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3797     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3798     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3799     x[idc]   = (PetscScalar)t[idt];
3800     x[1+idc] = (PetscScalar)t[1+idt];
3801     x[2+idc] = (PetscScalar)t[2+idt];
3802     x[3+idc] = (PetscScalar)t[3+idt];
3803  }
3804 
3805   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3806   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3807   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3808   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3809   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3810   PetscFunctionReturn(0);
3811 }
3812 
3813 #if defined (PETSC_HAVE_SSE)
3814 
3815 #include PETSC_HAVE_SSE
3816 
3817 #undef __FUNCT__
3818 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3819 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3820 {
3821   /*
3822      Note: This code uses demotion of double
3823      to float when performing the mixed-mode computation.
3824      This may not be numerically reasonable for all applications.
3825   */
3826   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3827   IS             iscol=a->col,isrow=a->row;
3828   PetscErrorCode ierr;
3829   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3830   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3831   MatScalar      *aa=a->a,*v;
3832   PetscScalar    *x,*b,*t;
3833 
3834   /* Make space in temp stack for 16 Byte Aligned arrays */
3835   float           ssealignedspace[11],*tmps,*tmpx;
3836   unsigned long   offset;
3837 
3838   PetscFunctionBegin;
3839   SSE_SCOPE_BEGIN;
3840 
3841     offset = (unsigned long)ssealignedspace % 16;
3842     if (offset) offset = (16 - offset)/4;
3843     tmps = &ssealignedspace[offset];
3844     tmpx = &ssealignedspace[offset+4];
3845     PREFETCH_NTA(aa+16*ai[1]);
3846 
3847     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3848     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3849     t  = a->solve_work;
3850 
3851     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3852     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3853 
3854     /* forward solve the lower triangular */
3855     idx  = 4*(*r++);
3856     t[0] = b[idx];   t[1] = b[1+idx];
3857     t[2] = b[2+idx]; t[3] = b[3+idx];
3858     v    =  aa + 16*ai[1];
3859 
3860     for (i=1; i<n;) {
3861       PREFETCH_NTA(&v[8]);
3862       vi   =  aj      + ai[i];
3863       nz   =  diag[i] - ai[i];
3864       idx  =  4*(*r++);
3865 
3866       /* Demote sum from double to float */
3867       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3868       LOAD_PS(tmps,XMM7);
3869 
3870       while (nz--) {
3871         PREFETCH_NTA(&v[16]);
3872         idx = 4*(*vi++);
3873 
3874         /* Demote solution (so far) from double to float */
3875         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3876 
3877         /* 4x4 Matrix-Vector product with negative accumulation: */
3878         SSE_INLINE_BEGIN_2(tmpx,v)
3879           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3880 
3881           /* First Column */
3882           SSE_COPY_PS(XMM0,XMM6)
3883           SSE_SHUFFLE(XMM0,XMM0,0x00)
3884           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3885           SSE_SUB_PS(XMM7,XMM0)
3886 
3887           /* Second Column */
3888           SSE_COPY_PS(XMM1,XMM6)
3889           SSE_SHUFFLE(XMM1,XMM1,0x55)
3890           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3891           SSE_SUB_PS(XMM7,XMM1)
3892 
3893           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3894 
3895           /* Third Column */
3896           SSE_COPY_PS(XMM2,XMM6)
3897           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3898           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3899           SSE_SUB_PS(XMM7,XMM2)
3900 
3901           /* Fourth Column */
3902           SSE_COPY_PS(XMM3,XMM6)
3903           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3904           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3905           SSE_SUB_PS(XMM7,XMM3)
3906         SSE_INLINE_END_2
3907 
3908         v  += 16;
3909       }
3910       idx = 4*i;
3911       v   = aa + 16*ai[++i];
3912       PREFETCH_NTA(v);
3913       STORE_PS(tmps,XMM7);
3914 
3915       /* Promote result from float to double */
3916       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3917     }
3918     /* backward solve the upper triangular */
3919     idt  = 4*(n-1);
3920     ai16 = 16*diag[n-1];
3921     v    = aa + ai16 + 16;
3922     for (i=n-1; i>=0;){
3923       PREFETCH_NTA(&v[8]);
3924       vi = aj + diag[i] + 1;
3925       nz = ai[i+1] - diag[i] - 1;
3926 
3927       /* Demote accumulator from double to float */
3928       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3929       LOAD_PS(tmps,XMM7);
3930 
3931       while (nz--) {
3932         PREFETCH_NTA(&v[16]);
3933         idx = 4*(*vi++);
3934 
3935         /* Demote solution (so far) from double to float */
3936         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3937 
3938         /* 4x4 Matrix-Vector Product with negative accumulation: */
3939         SSE_INLINE_BEGIN_2(tmpx,v)
3940           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3941 
3942           /* First Column */
3943           SSE_COPY_PS(XMM0,XMM6)
3944           SSE_SHUFFLE(XMM0,XMM0,0x00)
3945           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3946           SSE_SUB_PS(XMM7,XMM0)
3947 
3948           /* Second Column */
3949           SSE_COPY_PS(XMM1,XMM6)
3950           SSE_SHUFFLE(XMM1,XMM1,0x55)
3951           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3952           SSE_SUB_PS(XMM7,XMM1)
3953 
3954           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3955 
3956           /* Third Column */
3957           SSE_COPY_PS(XMM2,XMM6)
3958           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3959           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3960           SSE_SUB_PS(XMM7,XMM2)
3961 
3962           /* Fourth Column */
3963           SSE_COPY_PS(XMM3,XMM6)
3964           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3965           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3966           SSE_SUB_PS(XMM7,XMM3)
3967         SSE_INLINE_END_2
3968         v  += 16;
3969       }
3970       v    = aa + ai16;
3971       ai16 = 16*diag[--i];
3972       PREFETCH_NTA(aa+ai16+16);
3973       /*
3974          Scale the result by the diagonal 4x4 block,
3975          which was inverted as part of the factorization
3976       */
3977       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3978         /* First Column */
3979         SSE_COPY_PS(XMM0,XMM7)
3980         SSE_SHUFFLE(XMM0,XMM0,0x00)
3981         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3982 
3983         /* Second Column */
3984         SSE_COPY_PS(XMM1,XMM7)
3985         SSE_SHUFFLE(XMM1,XMM1,0x55)
3986         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3987         SSE_ADD_PS(XMM0,XMM1)
3988 
3989         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3990 
3991         /* Third Column */
3992         SSE_COPY_PS(XMM2,XMM7)
3993         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3994         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3995         SSE_ADD_PS(XMM0,XMM2)
3996 
3997         /* Fourth Column */
3998         SSE_COPY_PS(XMM3,XMM7)
3999         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4000         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4001         SSE_ADD_PS(XMM0,XMM3)
4002 
4003         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4004       SSE_INLINE_END_3
4005 
4006       /* Promote solution from float to double */
4007       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4008 
4009       /* Apply reordering to t and stream into x.    */
4010       /* This way, x doesn't pollute the cache.      */
4011       /* Be careful with size: 2 doubles = 4 floats! */
4012       idc  = 4*(*c--);
4013       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4014         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4015         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4016         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4017         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4018         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4019         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4020       SSE_INLINE_END_2
4021       v    = aa + ai16 + 16;
4022       idt -= 4;
4023     }
4024 
4025     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4026     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4027     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4028     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4029     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4030   SSE_SCOPE_END;
4031   PetscFunctionReturn(0);
4032 }
4033 
4034 #endif
4035 
4036 
4037 /*
4038       Special case where the matrix was ILU(0) factored in the natural
4039    ordering. This eliminates the need for the column and row permutation.
4040 */
4041 #undef __FUNCT__
4042 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
4043 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4044 {
4045   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4046   PetscInt          n=a->mbs;
4047   const PetscInt    *ai=a->i,*aj=a->j;
4048   PetscErrorCode    ierr;
4049   const PetscInt    *diag = a->diag;
4050   const MatScalar   *aa=a->a;
4051   PetscScalar       *x;
4052   const PetscScalar *b;
4053 
4054   PetscFunctionBegin;
4055   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4056   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4057 
4058 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4059   {
4060     static PetscScalar w[2000]; /* very BAD need to fix */
4061     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4062   }
4063 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4064   {
4065     static PetscScalar w[2000]; /* very BAD need to fix */
4066     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4067   }
4068 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4069   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4070 #else
4071   {
4072     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4073     const MatScalar *v;
4074     PetscInt        jdx,idt,idx,nz,i,ai16;
4075     const PetscInt  *vi;
4076 
4077   /* forward solve the lower triangular */
4078   idx    = 0;
4079   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4080   for (i=1; i<n; i++) {
4081     v     =  aa      + 16*ai[i];
4082     vi    =  aj      + ai[i];
4083     nz    =  diag[i] - ai[i];
4084     idx   +=  4;
4085     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4086     while (nz--) {
4087       jdx   = 4*(*vi++);
4088       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4089       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4090       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4091       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4092       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4093       v    += 16;
4094     }
4095     x[idx]   = s1;
4096     x[1+idx] = s2;
4097     x[2+idx] = s3;
4098     x[3+idx] = s4;
4099   }
4100   /* backward solve the upper triangular */
4101   idt = 4*(n-1);
4102   for (i=n-1; i>=0; i--){
4103     ai16 = 16*diag[i];
4104     v    = aa + ai16 + 16;
4105     vi   = aj + diag[i] + 1;
4106     nz   = ai[i+1] - diag[i] - 1;
4107     s1 = x[idt];  s2 = x[1+idt];
4108     s3 = x[2+idt];s4 = x[3+idt];
4109     while (nz--) {
4110       idx   = 4*(*vi++);
4111       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4112       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4113       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4114       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4115       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4116       v    += 16;
4117     }
4118     v        = aa + ai16;
4119     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4120     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4121     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4122     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4123     idt -= 4;
4124   }
4125   }
4126 #endif
4127 
4128   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4129   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4130   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4131   PetscFunctionReturn(0);
4132 }
4133 
4134 #undef __FUNCT__
4135 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4136 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4137 {
4138     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4139     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4140     PetscInt          i,k,nz,idx,jdx,idt;
4141     PetscErrorCode    ierr;
4142     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4143     const MatScalar   *aa=a->a,*v;
4144     PetscScalar       *x;
4145     const PetscScalar *b;
4146     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4147 
4148     PetscFunctionBegin;
4149     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4150     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4151     /* forward solve the lower triangular */
4152     idx    = 0;
4153     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4154     for (i=1; i<n; i++) {
4155        v    = aa + bs2*ai[i];
4156        vi   = aj + ai[i];
4157        nz   = ai[i+1] - ai[i];
4158       idx   = bs*i;
4159        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4160       for(k=0;k<nz;k++) {
4161           jdx   = bs*vi[k];
4162           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4163           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4164           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4165           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4166 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4167 
4168           v   +=  bs2;
4169         }
4170 
4171        x[idx]   = s1;
4172        x[1+idx] = s2;
4173        x[2+idx] = s3;
4174        x[3+idx] = s4;
4175     }
4176 
4177    /* backward solve the upper triangular */
4178   for (i=n-1; i>=0; i--){
4179     v   = aa + bs2*(adiag[i+1]+1);
4180      vi  = aj + adiag[i+1]+1;
4181      nz  = adiag[i] - adiag[i+1]-1;
4182      idt = bs*i;
4183      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4184 
4185     for(k=0;k<nz;k++){
4186       idx   = bs*vi[k];
4187        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4188        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4189        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4190        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4191        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4192 
4193         v   +=  bs2;
4194     }
4195     /* x = inv_diagonal*x */
4196    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4197    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4198    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4199    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4200 
4201   }
4202 
4203   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4204   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4205   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4206   PetscFunctionReturn(0);
4207 }
4208 
4209 #undef __FUNCT__
4210 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4211 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4212 {
4213   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4214   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4215   PetscErrorCode    ierr;
4216   const MatScalar   *aa=a->a;
4217   const PetscScalar *b;
4218   PetscScalar       *x;
4219 
4220   PetscFunctionBegin;
4221   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4222   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4223 
4224   {
4225     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4226     const MatScalar  *v;
4227     MatScalar        *t=(MatScalar *)x;
4228     PetscInt         jdx,idt,idx,nz,i,ai16;
4229     const PetscInt   *vi;
4230 
4231     /* forward solve the lower triangular */
4232     idx  = 0;
4233     t[0] = (MatScalar)b[0];
4234     t[1] = (MatScalar)b[1];
4235     t[2] = (MatScalar)b[2];
4236     t[3] = (MatScalar)b[3];
4237     for (i=1; i<n; i++) {
4238       v     =  aa      + 16*ai[i];
4239       vi    =  aj      + ai[i];
4240       nz    =  diag[i] - ai[i];
4241       idx   +=  4;
4242       s1 = (MatScalar)b[idx];
4243       s2 = (MatScalar)b[1+idx];
4244       s3 = (MatScalar)b[2+idx];
4245       s4 = (MatScalar)b[3+idx];
4246       while (nz--) {
4247         jdx = 4*(*vi++);
4248         x1  = t[jdx];
4249         x2  = t[1+jdx];
4250         x3  = t[2+jdx];
4251         x4  = t[3+jdx];
4252         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4253         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4254         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4255         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4256         v    += 16;
4257       }
4258       t[idx]   = s1;
4259       t[1+idx] = s2;
4260       t[2+idx] = s3;
4261       t[3+idx] = s4;
4262     }
4263     /* backward solve the upper triangular */
4264     idt = 4*(n-1);
4265     for (i=n-1; i>=0; i--){
4266       ai16 = 16*diag[i];
4267       v    = aa + ai16 + 16;
4268       vi   = aj + diag[i] + 1;
4269       nz   = ai[i+1] - diag[i] - 1;
4270       s1   = t[idt];
4271       s2   = t[1+idt];
4272       s3   = t[2+idt];
4273       s4   = t[3+idt];
4274       while (nz--) {
4275         idx = 4*(*vi++);
4276         x1  = (MatScalar)x[idx];
4277         x2  = (MatScalar)x[1+idx];
4278         x3  = (MatScalar)x[2+idx];
4279         x4  = (MatScalar)x[3+idx];
4280         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4281         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4282         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4283         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4284         v    += 16;
4285       }
4286       v        = aa + ai16;
4287       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4288       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4289       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4290       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4291       idt -= 4;
4292     }
4293   }
4294 
4295   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4296   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4297   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4298   PetscFunctionReturn(0);
4299 }
4300 
4301 #if defined (PETSC_HAVE_SSE)
4302 
4303 #include PETSC_HAVE_SSE
4304 #undef __FUNCT__
4305 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4306 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4307 {
4308   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4309   unsigned short *aj=(unsigned short *)a->j;
4310   PetscErrorCode ierr;
4311   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4312   MatScalar      *aa=a->a;
4313   PetscScalar    *x,*b;
4314 
4315   PetscFunctionBegin;
4316   SSE_SCOPE_BEGIN;
4317   /*
4318      Note: This code currently uses demotion of double
4319      to float when performing the mixed-mode computation.
4320      This may not be numerically reasonable for all applications.
4321   */
4322   PREFETCH_NTA(aa+16*ai[1]);
4323 
4324   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4325   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4326   {
4327     /* x will first be computed in single precision then promoted inplace to double */
4328     MatScalar      *v,*t=(MatScalar *)x;
4329     int            nz,i,idt,ai16;
4330     unsigned int   jdx,idx;
4331     unsigned short *vi;
4332     /* Forward solve the lower triangular factor. */
4333 
4334     /* First block is the identity. */
4335     idx  = 0;
4336     CONVERT_DOUBLE4_FLOAT4(t,b);
4337     v    =  aa + 16*((unsigned int)ai[1]);
4338 
4339     for (i=1; i<n;) {
4340       PREFETCH_NTA(&v[8]);
4341       vi   =  aj      + ai[i];
4342       nz   =  diag[i] - ai[i];
4343       idx +=  4;
4344 
4345       /* Demote RHS from double to float. */
4346       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4347       LOAD_PS(&t[idx],XMM7);
4348 
4349       while (nz--) {
4350         PREFETCH_NTA(&v[16]);
4351         jdx = 4*((unsigned int)(*vi++));
4352 
4353         /* 4x4 Matrix-Vector product with negative accumulation: */
4354         SSE_INLINE_BEGIN_2(&t[jdx],v)
4355           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4356 
4357           /* First Column */
4358           SSE_COPY_PS(XMM0,XMM6)
4359           SSE_SHUFFLE(XMM0,XMM0,0x00)
4360           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4361           SSE_SUB_PS(XMM7,XMM0)
4362 
4363           /* Second Column */
4364           SSE_COPY_PS(XMM1,XMM6)
4365           SSE_SHUFFLE(XMM1,XMM1,0x55)
4366           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4367           SSE_SUB_PS(XMM7,XMM1)
4368 
4369           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4370 
4371           /* Third Column */
4372           SSE_COPY_PS(XMM2,XMM6)
4373           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4374           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4375           SSE_SUB_PS(XMM7,XMM2)
4376 
4377           /* Fourth Column */
4378           SSE_COPY_PS(XMM3,XMM6)
4379           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4380           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4381           SSE_SUB_PS(XMM7,XMM3)
4382         SSE_INLINE_END_2
4383 
4384         v  += 16;
4385       }
4386       v    =  aa + 16*ai[++i];
4387       PREFETCH_NTA(v);
4388       STORE_PS(&t[idx],XMM7);
4389     }
4390 
4391     /* Backward solve the upper triangular factor.*/
4392 
4393     idt  = 4*(n-1);
4394     ai16 = 16*diag[n-1];
4395     v    = aa + ai16 + 16;
4396     for (i=n-1; i>=0;){
4397       PREFETCH_NTA(&v[8]);
4398       vi = aj + diag[i] + 1;
4399       nz = ai[i+1] - diag[i] - 1;
4400 
4401       LOAD_PS(&t[idt],XMM7);
4402 
4403       while (nz--) {
4404         PREFETCH_NTA(&v[16]);
4405         idx = 4*((unsigned int)(*vi++));
4406 
4407         /* 4x4 Matrix-Vector Product with negative accumulation: */
4408         SSE_INLINE_BEGIN_2(&t[idx],v)
4409           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4410 
4411           /* First Column */
4412           SSE_COPY_PS(XMM0,XMM6)
4413           SSE_SHUFFLE(XMM0,XMM0,0x00)
4414           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4415           SSE_SUB_PS(XMM7,XMM0)
4416 
4417           /* Second Column */
4418           SSE_COPY_PS(XMM1,XMM6)
4419           SSE_SHUFFLE(XMM1,XMM1,0x55)
4420           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4421           SSE_SUB_PS(XMM7,XMM1)
4422 
4423           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4424 
4425           /* Third Column */
4426           SSE_COPY_PS(XMM2,XMM6)
4427           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4428           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4429           SSE_SUB_PS(XMM7,XMM2)
4430 
4431           /* Fourth Column */
4432           SSE_COPY_PS(XMM3,XMM6)
4433           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4434           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4435           SSE_SUB_PS(XMM7,XMM3)
4436         SSE_INLINE_END_2
4437         v  += 16;
4438       }
4439       v    = aa + ai16;
4440       ai16 = 16*diag[--i];
4441       PREFETCH_NTA(aa+ai16+16);
4442       /*
4443          Scale the result by the diagonal 4x4 block,
4444          which was inverted as part of the factorization
4445       */
4446       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4447         /* First Column */
4448         SSE_COPY_PS(XMM0,XMM7)
4449         SSE_SHUFFLE(XMM0,XMM0,0x00)
4450         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4451 
4452         /* Second Column */
4453         SSE_COPY_PS(XMM1,XMM7)
4454         SSE_SHUFFLE(XMM1,XMM1,0x55)
4455         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4456         SSE_ADD_PS(XMM0,XMM1)
4457 
4458         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4459 
4460         /* Third Column */
4461         SSE_COPY_PS(XMM2,XMM7)
4462         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4463         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4464         SSE_ADD_PS(XMM0,XMM2)
4465 
4466         /* Fourth Column */
4467         SSE_COPY_PS(XMM3,XMM7)
4468         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4469         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4470         SSE_ADD_PS(XMM0,XMM3)
4471 
4472         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4473       SSE_INLINE_END_3
4474 
4475       v    = aa + ai16 + 16;
4476       idt -= 4;
4477     }
4478 
4479     /* Convert t from single precision back to double precision (inplace)*/
4480     idt = 4*(n-1);
4481     for (i=n-1;i>=0;i--) {
4482       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4483       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4484       PetscScalar *xtemp=&x[idt];
4485       MatScalar   *ttemp=&t[idt];
4486       xtemp[3] = (PetscScalar)ttemp[3];
4487       xtemp[2] = (PetscScalar)ttemp[2];
4488       xtemp[1] = (PetscScalar)ttemp[1];
4489       xtemp[0] = (PetscScalar)ttemp[0];
4490       idt -= 4;
4491     }
4492 
4493   } /* End of artificial scope. */
4494   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4495   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4496   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4497   SSE_SCOPE_END;
4498   PetscFunctionReturn(0);
4499 }
4500 
4501 #undef __FUNCT__
4502 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4503 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4504 {
4505   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4506   int            *aj=a->j;
4507   PetscErrorCode ierr;
4508   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4509   MatScalar      *aa=a->a;
4510   PetscScalar    *x,*b;
4511 
4512   PetscFunctionBegin;
4513   SSE_SCOPE_BEGIN;
4514   /*
4515      Note: This code currently uses demotion of double
4516      to float when performing the mixed-mode computation.
4517      This may not be numerically reasonable for all applications.
4518   */
4519   PREFETCH_NTA(aa+16*ai[1]);
4520 
4521   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4522   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4523   {
4524     /* x will first be computed in single precision then promoted inplace to double */
4525     MatScalar *v,*t=(MatScalar *)x;
4526     int       nz,i,idt,ai16;
4527     int       jdx,idx;
4528     int       *vi;
4529     /* Forward solve the lower triangular factor. */
4530 
4531     /* First block is the identity. */
4532     idx  = 0;
4533     CONVERT_DOUBLE4_FLOAT4(t,b);
4534     v    =  aa + 16*ai[1];
4535 
4536     for (i=1; i<n;) {
4537       PREFETCH_NTA(&v[8]);
4538       vi   =  aj      + ai[i];
4539       nz   =  diag[i] - ai[i];
4540       idx +=  4;
4541 
4542       /* Demote RHS from double to float. */
4543       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4544       LOAD_PS(&t[idx],XMM7);
4545 
4546       while (nz--) {
4547         PREFETCH_NTA(&v[16]);
4548         jdx = 4*(*vi++);
4549 /*          jdx = *vi++; */
4550 
4551         /* 4x4 Matrix-Vector product with negative accumulation: */
4552         SSE_INLINE_BEGIN_2(&t[jdx],v)
4553           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4554 
4555           /* First Column */
4556           SSE_COPY_PS(XMM0,XMM6)
4557           SSE_SHUFFLE(XMM0,XMM0,0x00)
4558           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4559           SSE_SUB_PS(XMM7,XMM0)
4560 
4561           /* Second Column */
4562           SSE_COPY_PS(XMM1,XMM6)
4563           SSE_SHUFFLE(XMM1,XMM1,0x55)
4564           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4565           SSE_SUB_PS(XMM7,XMM1)
4566 
4567           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4568 
4569           /* Third Column */
4570           SSE_COPY_PS(XMM2,XMM6)
4571           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4572           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4573           SSE_SUB_PS(XMM7,XMM2)
4574 
4575           /* Fourth Column */
4576           SSE_COPY_PS(XMM3,XMM6)
4577           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4578           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4579           SSE_SUB_PS(XMM7,XMM3)
4580         SSE_INLINE_END_2
4581 
4582         v  += 16;
4583       }
4584       v    =  aa + 16*ai[++i];
4585       PREFETCH_NTA(v);
4586       STORE_PS(&t[idx],XMM7);
4587     }
4588 
4589     /* Backward solve the upper triangular factor.*/
4590 
4591     idt  = 4*(n-1);
4592     ai16 = 16*diag[n-1];
4593     v    = aa + ai16 + 16;
4594     for (i=n-1; i>=0;){
4595       PREFETCH_NTA(&v[8]);
4596       vi = aj + diag[i] + 1;
4597       nz = ai[i+1] - diag[i] - 1;
4598 
4599       LOAD_PS(&t[idt],XMM7);
4600 
4601       while (nz--) {
4602         PREFETCH_NTA(&v[16]);
4603         idx = 4*(*vi++);
4604 /*          idx = *vi++; */
4605 
4606         /* 4x4 Matrix-Vector Product with negative accumulation: */
4607         SSE_INLINE_BEGIN_2(&t[idx],v)
4608           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4609 
4610           /* First Column */
4611           SSE_COPY_PS(XMM0,XMM6)
4612           SSE_SHUFFLE(XMM0,XMM0,0x00)
4613           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4614           SSE_SUB_PS(XMM7,XMM0)
4615 
4616           /* Second Column */
4617           SSE_COPY_PS(XMM1,XMM6)
4618           SSE_SHUFFLE(XMM1,XMM1,0x55)
4619           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4620           SSE_SUB_PS(XMM7,XMM1)
4621 
4622           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4623 
4624           /* Third Column */
4625           SSE_COPY_PS(XMM2,XMM6)
4626           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4627           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4628           SSE_SUB_PS(XMM7,XMM2)
4629 
4630           /* Fourth Column */
4631           SSE_COPY_PS(XMM3,XMM6)
4632           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4633           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4634           SSE_SUB_PS(XMM7,XMM3)
4635         SSE_INLINE_END_2
4636         v  += 16;
4637       }
4638       v    = aa + ai16;
4639       ai16 = 16*diag[--i];
4640       PREFETCH_NTA(aa+ai16+16);
4641       /*
4642          Scale the result by the diagonal 4x4 block,
4643          which was inverted as part of the factorization
4644       */
4645       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4646         /* First Column */
4647         SSE_COPY_PS(XMM0,XMM7)
4648         SSE_SHUFFLE(XMM0,XMM0,0x00)
4649         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4650 
4651         /* Second Column */
4652         SSE_COPY_PS(XMM1,XMM7)
4653         SSE_SHUFFLE(XMM1,XMM1,0x55)
4654         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4655         SSE_ADD_PS(XMM0,XMM1)
4656 
4657         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4658 
4659         /* Third Column */
4660         SSE_COPY_PS(XMM2,XMM7)
4661         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4662         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4663         SSE_ADD_PS(XMM0,XMM2)
4664 
4665         /* Fourth Column */
4666         SSE_COPY_PS(XMM3,XMM7)
4667         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4668         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4669         SSE_ADD_PS(XMM0,XMM3)
4670 
4671         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4672       SSE_INLINE_END_3
4673 
4674       v    = aa + ai16 + 16;
4675       idt -= 4;
4676     }
4677 
4678     /* Convert t from single precision back to double precision (inplace)*/
4679     idt = 4*(n-1);
4680     for (i=n-1;i>=0;i--) {
4681       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4682       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4683       PetscScalar *xtemp=&x[idt];
4684       MatScalar   *ttemp=&t[idt];
4685       xtemp[3] = (PetscScalar)ttemp[3];
4686       xtemp[2] = (PetscScalar)ttemp[2];
4687       xtemp[1] = (PetscScalar)ttemp[1];
4688       xtemp[0] = (PetscScalar)ttemp[0];
4689       idt -= 4;
4690     }
4691 
4692   } /* End of artificial scope. */
4693   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4694   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4695   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4696   SSE_SCOPE_END;
4697   PetscFunctionReturn(0);
4698 }
4699 
4700 #endif
4701 
4702 #undef __FUNCT__
4703 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4704 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4705 {
4706   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4707   IS                iscol=a->col,isrow=a->row;
4708   PetscErrorCode    ierr;
4709   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4710   PetscInt          i,nz,idx,idt,idc;
4711   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4712   const MatScalar   *aa=a->a,*v;
4713   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4714   const PetscScalar *b;
4715 
4716   PetscFunctionBegin;
4717   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4719   t  = a->solve_work;
4720 
4721   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4722   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4723 
4724   /* forward solve the lower triangular */
4725   idx    = 3*(*r++);
4726   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4727   for (i=1; i<n; i++) {
4728     v     = aa + 9*ai[i];
4729     vi    = aj + ai[i];
4730     nz    = diag[i] - ai[i];
4731     idx   = 3*(*r++);
4732     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4733     while (nz--) {
4734       idx   = 3*(*vi++);
4735       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4736       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4737       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4738       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4739       v += 9;
4740     }
4741     idx = 3*i;
4742     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4743   }
4744   /* backward solve the upper triangular */
4745   for (i=n-1; i>=0; i--){
4746     v    = aa + 9*diag[i] + 9;
4747     vi   = aj + diag[i] + 1;
4748     nz   = ai[i+1] - diag[i] - 1;
4749     idt  = 3*i;
4750     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4751     while (nz--) {
4752       idx   = 3*(*vi++);
4753       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4754       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4755       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4756       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4757       v += 9;
4758     }
4759     idc = 3*(*c--);
4760     v   = aa + 9*diag[i];
4761     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4762     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4763     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4764   }
4765   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4766   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4767   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4769   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4770   PetscFunctionReturn(0);
4771 }
4772 
4773 #undef __FUNCT__
4774 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4775 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4776 {
4777   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4778   IS                iscol=a->col,isrow=a->row;
4779   PetscErrorCode    ierr;
4780   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4781   PetscInt          i,nz,idx,idt,idc,m;
4782   const PetscInt    *r,*c,*rout,*cout;
4783   const MatScalar   *aa=a->a,*v;
4784   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4785   const PetscScalar *b;
4786 
4787   PetscFunctionBegin;
4788   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4789   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4790   t  = a->solve_work;
4791 
4792   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4793   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4794 
4795   /* forward solve the lower triangular */
4796   idx    = 3*r[0];
4797   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4798   for (i=1; i<n; i++) {
4799     v     = aa + 9*ai[i];
4800     vi    = aj + ai[i];
4801     nz    = ai[i+1] - ai[i];
4802     idx   = 3*r[i];
4803     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4804     for(m=0;m<nz;m++){
4805       idx   = 3*vi[m];
4806       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4807       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4808       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4809       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4810       v += 9;
4811     }
4812     idx = 3*i;
4813     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4814   }
4815   /* backward solve the upper triangular */
4816   for (i=n-1; i>=0; i--){
4817     v    = aa + 9*(adiag[i+1]+1);
4818     vi   = aj + adiag[i+1]+1;
4819     nz   = adiag[i] - adiag[i+1] - 1;
4820     idt  = 3*i;
4821     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4822     for(m=0;m<nz;m++){
4823       idx   = 3*vi[m];
4824       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4825       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4826       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4827       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4828       v += 9;
4829     }
4830     idc = 3*c[i];
4831     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4832     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4833     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4834   }
4835   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4836   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4837   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4838   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4839   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4840   PetscFunctionReturn(0);
4841 }
4842 
4843 /*
4844       Special case where the matrix was ILU(0) factored in the natural
4845    ordering. This eliminates the need for the column and row permutation.
4846 */
4847 #undef __FUNCT__
4848 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4849 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4850 {
4851   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4852   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4853   PetscErrorCode    ierr;
4854   const PetscInt    *diag = a->diag,*vi;
4855   const MatScalar   *aa=a->a,*v;
4856   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4857   const PetscScalar *b;
4858   PetscInt          jdx,idt,idx,nz,i;
4859 
4860   PetscFunctionBegin;
4861   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4862   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4863 
4864   /* forward solve the lower triangular */
4865   idx    = 0;
4866   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4867   for (i=1; i<n; i++) {
4868     v     =  aa      + 9*ai[i];
4869     vi    =  aj      + ai[i];
4870     nz    =  diag[i] - ai[i];
4871     idx   +=  3;
4872     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4873     while (nz--) {
4874       jdx   = 3*(*vi++);
4875       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4876       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4877       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4878       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4879       v    += 9;
4880     }
4881     x[idx]   = s1;
4882     x[1+idx] = s2;
4883     x[2+idx] = s3;
4884   }
4885   /* backward solve the upper triangular */
4886   for (i=n-1; i>=0; i--){
4887     v    = aa + 9*diag[i] + 9;
4888     vi   = aj + diag[i] + 1;
4889     nz   = ai[i+1] - diag[i] - 1;
4890     idt  = 3*i;
4891     s1 = x[idt];  s2 = x[1+idt];
4892     s3 = x[2+idt];
4893     while (nz--) {
4894       idx   = 3*(*vi++);
4895       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4896       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4897       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4898       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4899       v    += 9;
4900     }
4901     v        = aa +  9*diag[i];
4902     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4903     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4904     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4905   }
4906 
4907   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4908   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4909   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4910   PetscFunctionReturn(0);
4911 }
4912 
4913 #undef __FUNCT__
4914 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4915 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4916 {
4917     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4918     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4919     PetscErrorCode    ierr;
4920     PetscInt          i,k,nz,idx,jdx,idt;
4921     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4922     const MatScalar   *aa=a->a,*v;
4923     PetscScalar       *x;
4924     const PetscScalar *b;
4925     PetscScalar        s1,s2,s3,x1,x2,x3;
4926 
4927     PetscFunctionBegin;
4928     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4929     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4930     /* forward solve the lower triangular */
4931     idx    = 0;
4932     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4933     for (i=1; i<n; i++) {
4934        v    = aa + bs2*ai[i];
4935        vi   = aj + ai[i];
4936        nz   = ai[i+1] - ai[i];
4937       idx   = bs*i;
4938        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4939       for(k=0;k<nz;k++){
4940          jdx   = bs*vi[k];
4941           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4942           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4943           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4944           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4945 
4946           v   +=  bs2;
4947         }
4948 
4949        x[idx]   = s1;
4950        x[1+idx] = s2;
4951        x[2+idx] = s3;
4952     }
4953 
4954    /* backward solve the upper triangular */
4955   for (i=n-1; i>=0; i--){
4956     v   = aa + bs2*(adiag[i+1]+1);
4957      vi  = aj + adiag[i+1]+1;
4958      nz  = adiag[i] - adiag[i+1]-1;
4959      idt = bs*i;
4960      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4961 
4962      for(k=0;k<nz;k++){
4963        idx   = bs*vi[k];
4964        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4965        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4966        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4967        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4968 
4969         v   +=  bs2;
4970     }
4971     /* x = inv_diagonal*x */
4972    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4973    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4974    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4975 
4976   }
4977 
4978   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4979   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4980   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4981   PetscFunctionReturn(0);
4982 }
4983 
4984 #undef __FUNCT__
4985 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
4986 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
4987 {
4988   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4989   IS                iscol=a->col,isrow=a->row;
4990   PetscErrorCode    ierr;
4991   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4992   PetscInt          i,nz,idx,idt,idc;
4993   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4994   const MatScalar   *aa=a->a,*v;
4995   PetscScalar       *x,s1,s2,x1,x2,*t;
4996   const PetscScalar *b;
4997 
4998   PetscFunctionBegin;
4999   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5000   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5001   t  = a->solve_work;
5002 
5003   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5004   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5005 
5006   /* forward solve the lower triangular */
5007   idx    = 2*(*r++);
5008   t[0] = b[idx]; t[1] = b[1+idx];
5009   for (i=1; i<n; i++) {
5010     v     = aa + 4*ai[i];
5011     vi    = aj + ai[i];
5012     nz    = diag[i] - ai[i];
5013     idx   = 2*(*r++);
5014     s1  = b[idx]; s2 = b[1+idx];
5015     while (nz--) {
5016       idx   = 2*(*vi++);
5017       x1    = t[idx]; x2 = t[1+idx];
5018       s1 -= v[0]*x1 + v[2]*x2;
5019       s2 -= v[1]*x1 + v[3]*x2;
5020       v += 4;
5021     }
5022     idx = 2*i;
5023     t[idx] = s1; t[1+idx] = s2;
5024   }
5025   /* backward solve the upper triangular */
5026   for (i=n-1; i>=0; i--){
5027     v    = aa + 4*diag[i] + 4;
5028     vi   = aj + diag[i] + 1;
5029     nz   = ai[i+1] - diag[i] - 1;
5030     idt  = 2*i;
5031     s1 = t[idt]; s2 = t[1+idt];
5032     while (nz--) {
5033       idx   = 2*(*vi++);
5034       x1    = t[idx]; x2 = t[1+idx];
5035       s1 -= v[0]*x1 + v[2]*x2;
5036       s2 -= v[1]*x1 + v[3]*x2;
5037       v += 4;
5038     }
5039     idc = 2*(*c--);
5040     v   = aa + 4*diag[i];
5041     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5042     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5043   }
5044   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5045   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5046   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5047   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5048   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5049   PetscFunctionReturn(0);
5050 }
5051 
5052 #undef __FUNCT__
5053 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
5054 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5055 {
5056   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5057   IS                iscol=a->col,isrow=a->row;
5058   PetscErrorCode    ierr;
5059   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5060   PetscInt          i,nz,idx,jdx,idt,idc,m;
5061   const PetscInt    *r,*c,*rout,*cout;
5062   const MatScalar   *aa=a->a,*v;
5063   PetscScalar       *x,s1,s2,x1,x2,*t;
5064   const PetscScalar *b;
5065 
5066   PetscFunctionBegin;
5067   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5068   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5069   t  = a->solve_work;
5070 
5071   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5072   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5073 
5074   /* forward solve the lower triangular */
5075   idx    = 2*r[0];
5076   t[0] = b[idx]; t[1] = b[1+idx];
5077   for (i=1; i<n; i++) {
5078     v     = aa + 4*ai[i];
5079     vi    = aj + ai[i];
5080     nz    = ai[i+1] - ai[i];
5081     idx   = 2*r[i];
5082     s1  = b[idx]; s2 = b[1+idx];
5083     for(m=0;m<nz;m++){
5084       jdx   = 2*vi[m];
5085       x1    = t[jdx]; x2 = t[1+jdx];
5086       s1 -= v[0]*x1 + v[2]*x2;
5087       s2 -= v[1]*x1 + v[3]*x2;
5088       v += 4;
5089     }
5090     idx = 2*i;
5091     t[idx] = s1; t[1+idx] = s2;
5092   }
5093   /* backward solve the upper triangular */
5094   for (i=n-1; i>=0; i--){
5095     v    = aa + 4*(adiag[i+1]+1);
5096     vi   = aj + adiag[i+1]+1;
5097     nz   = adiag[i] - adiag[i+1] - 1;
5098     idt  = 2*i;
5099     s1 = t[idt]; s2 = t[1+idt];
5100     for(m=0;m<nz;m++){
5101       idx   = 2*vi[m];
5102       x1    = t[idx]; x2 = t[1+idx];
5103       s1 -= v[0]*x1 + v[2]*x2;
5104       s2 -= v[1]*x1 + v[3]*x2;
5105       v += 4;
5106     }
5107     idc = 2*c[i];
5108     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5109     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5110   }
5111   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5112   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5113   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5114   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5115   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5116   PetscFunctionReturn(0);
5117 }
5118 
5119 /*
5120       Special case where the matrix was ILU(0) factored in the natural
5121    ordering. This eliminates the need for the column and row permutation.
5122 */
5123 #undef __FUNCT__
5124 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
5125 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5126 {
5127   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5128   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5129   PetscErrorCode    ierr;
5130   const MatScalar   *aa=a->a,*v;
5131   PetscScalar       *x,s1,s2,x1,x2;
5132   const PetscScalar *b;
5133   PetscInt          jdx,idt,idx,nz,i;
5134 
5135   PetscFunctionBegin;
5136   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5137   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5138 
5139   /* forward solve the lower triangular */
5140   idx    = 0;
5141   x[0]   = b[0]; x[1] = b[1];
5142   for (i=1; i<n; i++) {
5143     v     =  aa      + 4*ai[i];
5144     vi    =  aj      + ai[i];
5145     nz    =  diag[i] - ai[i];
5146     idx   +=  2;
5147     s1  =  b[idx];s2 = b[1+idx];
5148     while (nz--) {
5149       jdx   = 2*(*vi++);
5150       x1    = x[jdx];x2 = x[1+jdx];
5151       s1 -= v[0]*x1 + v[2]*x2;
5152       s2 -= v[1]*x1 + v[3]*x2;
5153       v    += 4;
5154     }
5155     x[idx]   = s1;
5156     x[1+idx] = s2;
5157   }
5158   /* backward solve the upper triangular */
5159   for (i=n-1; i>=0; i--){
5160     v    = aa + 4*diag[i] + 4;
5161     vi   = aj + diag[i] + 1;
5162     nz   = ai[i+1] - diag[i] - 1;
5163     idt  = 2*i;
5164     s1 = x[idt];  s2 = x[1+idt];
5165     while (nz--) {
5166       idx   = 2*(*vi++);
5167       x1    = x[idx];   x2 = x[1+idx];
5168       s1 -= v[0]*x1 + v[2]*x2;
5169       s2 -= v[1]*x1 + v[3]*x2;
5170       v    += 4;
5171     }
5172     v        = aa +  4*diag[i];
5173     x[idt]   = v[0]*s1 + v[2]*s2;
5174     x[1+idt] = v[1]*s1 + v[3]*s2;
5175   }
5176 
5177   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5178   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5179   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5180   PetscFunctionReturn(0);
5181 }
5182 
5183 #undef __FUNCT__
5184 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5185 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5186 {
5187     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5188     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5189     PetscInt          i,k,nz,idx,idt,jdx;
5190     PetscErrorCode    ierr;
5191     const MatScalar   *aa=a->a,*v;
5192     PetscScalar       *x,s1,s2,x1,x2;
5193     const PetscScalar *b;
5194 
5195     PetscFunctionBegin;
5196     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5197     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5198     /* forward solve the lower triangular */
5199     idx    = 0;
5200     x[0] = b[idx]; x[1] = b[1+idx];
5201     for (i=1; i<n; i++) {
5202         v   = aa + 4*ai[i];
5203        vi   = aj + ai[i];
5204        nz   = ai[i+1] - ai[i];
5205        idx  = 2*i;
5206        s1   = b[idx];s2 = b[1+idx];
5207       for(k=0;k<nz;k++){
5208          jdx   = 2*vi[k];
5209           x1    = x[jdx];x2 = x[1+jdx];
5210           s1   -= v[0]*x1 + v[2]*x2;
5211           s2   -= v[1]*x1 + v[3]*x2;
5212            v   +=  4;
5213         }
5214        x[idx]   = s1;
5215        x[1+idx] = s2;
5216     }
5217 
5218    /* backward solve the upper triangular */
5219   for (i=n-1; i>=0; i--){
5220      v   = aa + 4*(adiag[i+1]+1);
5221      vi  = aj + adiag[i+1]+1;
5222      nz  = adiag[i] - adiag[i+1]-1;
5223      idt = 2*i;
5224      s1 = x[idt];  s2 = x[1+idt];
5225      for(k=0;k<nz;k++){
5226       idx   = 2*vi[k];
5227        x1    = x[idx];   x2 = x[1+idx];
5228        s1 -= v[0]*x1 + v[2]*x2;
5229        s2 -= v[1]*x1 + v[3]*x2;
5230          v    += 4;
5231     }
5232     /* x = inv_diagonal*x */
5233    x[idt]   = v[0]*s1 + v[2]*s2;
5234    x[1+idt] = v[1]*s1 + v[3]*s2;
5235   }
5236 
5237   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5238   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5239   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5240   PetscFunctionReturn(0);
5241 }
5242 
5243 #undef __FUNCT__
5244 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5245 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5246 {
5247   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5248   IS                iscol=a->col,isrow=a->row;
5249   PetscErrorCode    ierr;
5250   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5251   PetscInt          i,nz;
5252   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5253   const MatScalar   *aa=a->a,*v;
5254   PetscScalar       *x,s1,*t;
5255   const PetscScalar *b;
5256 
5257   PetscFunctionBegin;
5258   if (!n) PetscFunctionReturn(0);
5259 
5260   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5261   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5262   t  = a->solve_work;
5263 
5264   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5265   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5266 
5267   /* forward solve the lower triangular */
5268   t[0] = b[*r++];
5269   for (i=1; i<n; i++) {
5270     v     = aa + ai[i];
5271     vi    = aj + ai[i];
5272     nz    = diag[i] - ai[i];
5273     s1  = b[*r++];
5274     while (nz--) {
5275       s1 -= (*v++)*t[*vi++];
5276     }
5277     t[i] = s1;
5278   }
5279   /* backward solve the upper triangular */
5280   for (i=n-1; i>=0; i--){
5281     v    = aa + diag[i] + 1;
5282     vi   = aj + diag[i] + 1;
5283     nz   = ai[i+1] - diag[i] - 1;
5284     s1 = t[i];
5285     while (nz--) {
5286       s1 -= (*v++)*t[*vi++];
5287     }
5288     x[*c--] = t[i] = aa[diag[i]]*s1;
5289   }
5290 
5291   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5292   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5293   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5294   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5295   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5296   PetscFunctionReturn(0);
5297 }
5298 /*
5299       Special case where the matrix was ILU(0) factored in the natural
5300    ordering. This eliminates the need for the column and row permutation.
5301 */
5302 #undef __FUNCT__
5303 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5304 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5305 {
5306   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5307   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5308   PetscErrorCode    ierr;
5309   const MatScalar   *aa=a->a,*v;
5310   PetscScalar       *x;
5311   const PetscScalar *b;
5312   PetscScalar       s1,x1;
5313   PetscInt          jdx,idt,idx,nz,i;
5314 
5315   PetscFunctionBegin;
5316   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5317   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5318 
5319   /* forward solve the lower triangular */
5320   idx    = 0;
5321   x[0]   = b[0];
5322   for (i=1; i<n; i++) {
5323     v     =  aa      + ai[i];
5324     vi    =  aj      + ai[i];
5325     nz    =  diag[i] - ai[i];
5326     idx   +=  1;
5327     s1  =  b[idx];
5328     while (nz--) {
5329       jdx   = *vi++;
5330       x1    = x[jdx];
5331       s1 -= v[0]*x1;
5332       v    += 1;
5333     }
5334     x[idx]   = s1;
5335   }
5336   /* backward solve the upper triangular */
5337   for (i=n-1; i>=0; i--){
5338     v    = aa + diag[i] + 1;
5339     vi   = aj + diag[i] + 1;
5340     nz   = ai[i+1] - diag[i] - 1;
5341     idt  = i;
5342     s1 = x[idt];
5343     while (nz--) {
5344       idx   = *vi++;
5345       x1    = x[idx];
5346       s1 -= v[0]*x1;
5347       v    += 1;
5348     }
5349     v        = aa +  diag[i];
5350     x[idt]   = v[0]*s1;
5351   }
5352   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5353   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5354   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5355   PetscFunctionReturn(0);
5356 }
5357 
5358 /* ----------------------------------------------------------------*/
5359 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5360 
5361 #undef __FUNCT__
5362 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5363 /*
5364    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5365 */
5366 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5367 {
5368   Mat             C=B;
5369   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5370   PetscErrorCode  ierr;
5371   PetscInt        i,j,k,ipvt[15];
5372   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5373   PetscInt        nz,nzL,row;
5374   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5375   const MatScalar *v,*aa=a->a;
5376   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5377   PetscInt        sol_ver;
5378 
5379   PetscFunctionBegin;
5380 
5381   ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
5382 
5383   /* generate work space needed by the factorization */
5384   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5385   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5386 
5387   for (i=0; i<n; i++){
5388     /* zero rtmp */
5389     /* L part */
5390     nz    = bi[i+1] - bi[i];
5391     bjtmp = bj + bi[i];
5392     for  (j=0; j<nz; j++){
5393       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5394     }
5395 
5396     /* U part */
5397     nz = bdiag[i] - bdiag[i+1];
5398     bjtmp = bj + bdiag[i+1]+1;
5399     for  (j=0; j<nz; j++){
5400       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5401     }
5402 
5403     /* load in initial (unfactored row) */
5404     nz    = ai[i+1] - ai[i];
5405     ajtmp = aj + ai[i];
5406     v     = aa + bs2*ai[i];
5407     for (j=0; j<nz; j++) {
5408       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5409     }
5410 
5411     /* elimination */
5412     bjtmp = bj + bi[i];
5413     nzL   = bi[i+1] - bi[i];
5414     for(k=0;k < nzL;k++) {
5415       row = bjtmp[k];
5416       pc = rtmp + bs2*row;
5417       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5418       if (flg) {
5419         pv = b->a + bs2*bdiag[row];
5420 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5421 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5422 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5423         pv = b->a + bs2*(bdiag[row+1]+1);
5424         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5425         for (j=0; j<nz; j++) {
5426           vv   = rtmp + bs2*pj[j];
5427           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5428 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5429 	  pv  += bs2;
5430         }
5431         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5432       }
5433     }
5434 
5435     /* finished row so stick it into b->a */
5436     /* L part */
5437     pv   = b->a + bs2*bi[i] ;
5438     pj   = b->j + bi[i] ;
5439     nz   = bi[i+1] - bi[i];
5440     for (j=0; j<nz; j++) {
5441       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5442     }
5443 
5444     /* Mark diagonal and invert diagonal for simplier triangular solves */
5445     pv   = b->a + bs2*bdiag[i];
5446     pj   = b->j + bdiag[i];
5447     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5448     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5449     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
5450 
5451     /* U part */
5452     pv = b->a + bs2*(bdiag[i+1]+1);
5453     pj = b->j + bdiag[i+1]+1;
5454     nz = bdiag[i] - bdiag[i+1] - 1;
5455     for (j=0; j<nz; j++){
5456       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5457     }
5458   }
5459 
5460   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5461   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5462   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5463   C->assembled = PETSC_TRUE;
5464   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5465   PetscFunctionReturn(0);
5466 }
5467 
5468 #undef __FUNCT__
5469 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
/*
   MatLUFactorNumeric_SeqBAIJ_N - numeric block LU factorization for a general
   block size bs, using the row permutation b->row and the inverse column
   permutation b->icol that are stored on the factor matrix.

   Input:
     B    - factor matrix; its index arrays b->i, b->j and b->diag are read and
            its value array b->a is overwritten with the factors
     A    - matrix whose values are factored
     info - not referenced in this routine

   Storage layout of the factor as it is used below, for block row i:
     L(i,:)      at positions bi[i] .. bi[i+1]-1
     diagonal    at position  bdiag[i]   (stored already inverted)
     U(i,:)      at positions bdiag[i+1]+1 .. bdiag[i]-1 (diagonal excluded)

   Before returning, the routine installs the solve and solvetranspose
   function pointers on the factor and marks it assembled.
*/
5470 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5471 {
5472   Mat            C=B;
5473   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5474   IS             isrow = b->row,isicol = b->icol;
5475   PetscErrorCode ierr;
5476   const PetscInt *r,*ic,*ics;
5477   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5478   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5479   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5480   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5481   MatScalar      *v_work;
5482   PetscTruth     col_identity,row_identity,both_identity;
5483 
5484   PetscFunctionBegin;
5485   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5486   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5487 
       /* rtmp is a dense work row: one bs2-sized block per block column (n blocks) */
5488   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5489   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
       /* NOTE(review): ics is assigned here but never read again in this routine */
5490   ics  = ic;
5491 
5492   /* generate work space needed by dense LU factorization */
5493   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5494 
5495   for (i=0; i<n; i++){
5496     /* zero rtmp */
         /* only the blocks that belong to the pattern of factor row i are cleared */
5497     /* L part */
5498     nz    = bi[i+1] - bi[i];
5499     bjtmp = bj + bi[i];
5500     for  (j=0; j<nz; j++){
5501       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5502     }
5503 
5504     /* U part */
         /* this count has no "- 1", so the diagonal position bdiag[i] is cleared too */
5505     nz = bdiag[i] - bdiag[i+1];
5506     bjtmp = bj + bdiag[i+1]+1;
5507     for  (j=0; j<nz; j++){
5508       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5509     }
5510 
5511     /* load in initial (unfactored row) */
         /* row r[i] of A is scattered into rtmp; each block goes to column ic[original column] */
5512     nz    = ai[r[i]+1] - ai[r[i]];
5513     ajtmp = aj + ai[r[i]];
5514     v     = aa + bs2*ai[r[i]];
5515     for (j=0; j<nz; j++) {
5516       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5517     }
5518 
5519     /* elimination */
         /* walk the L entries of row i; each one names a pivot row `row`
            (presumably row < i, so that row is already factored - TODO confirm) */
5520     bjtmp = bj + bi[i];
5521     nzL   = bi[i+1] - bi[i];
5522     for(k=0;k < nzL;k++) {
5523       row = bjtmp[k];
5524       pc = rtmp + bs2*row;
           /* flg becomes 1 as soon as one scalar of the block is nonzero; all-zero blocks are skipped */
5525       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5526       if (flg) {
             /* pv points at the diagonal block of the pivot row, which was stored inverted when that row was finished */
5527         pv         = b->a + bs2*bdiag[row];
5528         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5529         pj         = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
5530         pv         = b->a + bs2*(bdiag[row+1]+1);
5531         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
             /* rtmp(:,pj[j]) -= pc * U(row,pj[j]) for every off-diagonal block of the pivot row */
5532         for (j=0; j<nz; j++) {
5533           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5534         }
5535         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5536       }
5537     }
5538 
5539     /* finished row so stick it into b->a */
5540     /* L part */
5541     pv   = b->a + bs2*bi[i] ;
5542     pj   = b->j + bi[i] ;
5543     nz   = bi[i+1] - bi[i];
5544     for (j=0; j<nz; j++) {
5545       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5546     }
5547 
5548     /* Mark diagonal and invert diagonal for simpler triangular solves */
5549     pv  = b->a + bs2*bdiag[i];
5550     pj  = b->j + bdiag[i];
5551     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5552     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
         /* the diagonal block is inverted in place inside b->a */
5553     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5554 
5555     /* U part */
5556     pv = b->a + bs2*(bdiag[i+1]+1);
5557     pj = b->j + bdiag[i+1]+1;
5558     nz = bdiag[i] - bdiag[i+1] - 1;
5559     for (j=0; j<nz; j++){
5560       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5561     }
5562   }
5563 
5564   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5565   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5566   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5567   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5568 
       /* choose the solve routine: the natural-ordering variant is used only when both index sets are the identity */
5569   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5570   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5571   both_identity = (PetscTruth) (row_identity && col_identity);
5572   if (both_identity){
5573     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5574   } else {
5575     C->ops->solve = MatSolve_SeqBAIJ_N;
5576   }
5577   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5578 
5579   C->assembled = PETSC_TRUE;
5580   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5581   PetscFunctionReturn(0);
5582 }
5583 
5584 /*
5585    ilu(0) with natural ordering under new data structure.
5586    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5587    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5588 */
5589 
5590 #undef __FUNCT__
5591 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
/*
   MatILUFactorSymbolic_SeqBAIJ_ilu0 - builds the index arrays of a factor that
   has exactly the block nonzero pattern of A.

   Input:
     fact  - factor matrix to fill in; b->a, b->j and b->i are allocated here
             (and b->diag if it is not already set)
     A     - matrix supplying the pattern through a->i, a->j and a->diag
     isrow, iscol, info - not referenced in this routine

   Resulting layout (bi = b->i, bj = b->j, bdiag = b->diag):
     - L part: rows 0..n-1 stored first, row i holding the adiag[i]-ai[i]
       entries of A that precede position adiag[i]; bi[i]..bi[i+1]-1 index it.
     - U part: stored after the L part, rows written from n-1 down to 0.
       Each row holds its entries that follow adiag[i] and then the diagonal
       column i itself; bdiag[i] is the position of that diagonal entry and
       bdiag[n] = bi[n]-1.
   The value array b->a is zeroed; no numeric values are computed.
*/
5592 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5593 {
5594 
5595   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5596   PetscErrorCode     ierr;
5597   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5598   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5599 
5600   PetscFunctionBegin;
5601   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5602   b    = (Mat_SeqBAIJ*)(fact)->data;
5603 
5604   /* allocate matrix arrays for new data structure */
       /* NOTE(review): b->a is sized in PetscScalar units here but zeroed below with sizeof(MatScalar) */
5605   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5606   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5607   b->singlemalloc = PETSC_TRUE;
5608   if (!b->diag){
5609     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5610     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5611   }
5612   bdiag = b->diag;
5613 
5614   if (n > 0) {
5615     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5616   }
5617 
5618   /* set bi and bj with new data structure */
5619   bi = b->i;
5620   bj = b->j;
5621 
5622   /* L part */
       /* bj is advanced as a write cursor; it keeps running into the U part below */
5623   bi[0] = 0;
5624   for (i=0; i<n; i++){
5625     nz = adiag[i] - ai[i];
5626     bi[i+1] = bi[i] + nz;
5627     aj = a->j + ai[i];
5628     for (j=0; j<nz; j++){
5629       *bj = aj[j]; bj++;
5630     }
5631   }
5632 
5633   /* U part */
       /* bi_temp tracks the position just past the entries written so far */
5634   bi_temp = bi[n];
5635   bdiag[n] = bi[n]-1;
5636   for (i=n-1; i>=0; i--){
5637     nz = ai[i+1] - adiag[i] - 1;
5638     bi_temp = bi_temp + nz + 1;
5639     aj = a->j + adiag[i] + 1;
5640     for (j=0; j<nz; j++){
5641       *bj = aj[j]; bj++;
5642     }
5643     /* diag[i] */
         /* the diagonal column is written last in the row, at position bi_temp-1 */
5644     *bj = i; bj++;
5645     bdiag[i] = bi_temp - 1;
5646   }
5647   PetscFunctionReturn(0);
5648 }
5649 
5650 #undef __FUNCT__
5651 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
/*
   MatILUFactorSymbolic_SeqBAIJ - symbolic ILU(levels) factorization of a
   square SeqBAIJ matrix; fills in the index structure of `fact`.

   Input:
     fact  - factor matrix to set up
     A     - matrix to factor; must be square and have every diagonal block
             present, otherwise an error is raised
     isrow - row permutation
     iscol - column permutation (its inverse is computed here and stored on
             the factor as b->icol)
     info  - supplies fill, levels, diagonal_fill and (fast path only)
             pivotinblocks

   Two paths:
     - levels == 0 with identity row and column orderings: delegates the
       index setup to MatILUFactorSymbolic_SeqBAIJ_ilu0() and returns early.
     - otherwise: builds each factor row with the incomplete-factorization
       linked-list helpers, stores the rows in growable free-space lists and
       finally packs them into b->j / b->i / b->diag.

   Errors raised here: non-square matrix, missing diagonal in A, an empty row,
   or a factor row without a diagonal entry.
*/
5652 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5653 {
5654   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5655   IS                 isicol;
5656   PetscErrorCode     ierr;
5657   const PetscInt     *r,*ic;
5658   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5659   PetscInt           *bi,*cols,nnz,*cols_lvl;
5660   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5661   PetscInt           i,levels,diagonal_fill;
5662   PetscTruth         col_identity,row_identity,both_identity;
5663   PetscReal          f;
5664   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5665   PetscBT            lnkbt;
5666   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5667   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5668   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5669   PetscTruth         missing;
5670   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5671 
5672   PetscFunctionBegin;
5673   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5674   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5675   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5676 
5677   f             = info->fill;
5678   levels        = (PetscInt)info->levels;
5679   diagonal_fill = (PetscInt)info->diagonal_fill;
5680   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5681 
5682   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5683   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5684   both_identity = (PetscTruth) (row_identity && col_identity);
5685 
5686   if (!levels && both_identity) {
5687     /* special case: ilu(0) with natural ordering */
5688     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5689     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5690 
5691     fact->factortype               = MAT_FACTOR_ILU;
5692     (fact)->info.factor_mallocs    = 0;
5693     (fact)->info.fill_ratio_given  = info->fill;
5694     (fact)->info.fill_ratio_needed = 1.0;
5695     b                = (Mat_SeqBAIJ*)(fact)->data;
5696     b->row           = isrow;
5697     b->col           = iscol;
5698     b->icol          = isicol;
         /* the factor keeps isrow and iscol, so a reference is taken on each */
5699     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5700     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5701     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5702     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5703     PetscFunctionReturn(0);
5704   }
5705 
5706   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5707   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5708 
5709   /* get new row pointers */
5710   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5711   bi[0] = 0;
5712   /* bdiag is location of diagonal in factor */
       /* inside the row loop bdiag[i] is an offset within row i; PetscFreeSpaceContiguous_LU() receives it afterwards */
5713   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5714   bdiag[0]  = 0;
5715 
       /* bj_ptr[i] / bjlvl_ptr[i] remember where row i's columns and levels live inside the free-space lists */
5716   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5717 
5718   /* create a linked list for storing column indices of the active row */
5719   nlnk = n + 1;
5720   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5721 
5722   /* initial FreeSpace size is f*(ai[n]+1) */
5723   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5724   current_space = free_space;
5725   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5726   current_space_lvl = free_space_lvl;
5727 
5728   for (i=0; i<n; i++) {
5729     nzi = 0;
5730     /* copy current row into linked list */
5731     nnz  = ai[r[i]+1] - ai[r[i]];
5732     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5733     cols = aj + ai[r[i]];
5734     lnk[i] = -1; /* marker to indicate if diagonal exists */
         /* nlnk is read back after the call as the number of entries added (presumably set by the helper - TODO confirm) */
5735     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5736     nzi += nlnk;
5737 
5738     /* make sure diagonal entry is included */
5739     if (diagonal_fill && lnk[i] == -1) {
           /* walk the list from its head lnk[n] to the last entry below i and splice i in after it */
5740       fm = n;
5741       while (lnk[fm] < i) fm = lnk[fm];
5742       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5743       lnk[fm]    = i;
5744       lnk_lvl[i] = 0;
5745       nzi++; dcount++;
5746     }
5747 
5748     /* add pivot rows into the active row */
         /* nzbd counts the list entries with column < i, i.e. the pivot rows visited */
5749     nzbd = 0;
5750     prow = lnk[n];
5751     while (prow < i) {
           /* skip the bdiag[prow] leading entries of row prow and its diagonal; merge only what follows */
5752       nnz      = bdiag[prow];
5753       cols     = bj_ptr[prow] + nnz + 1;
5754       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5755       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5756       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5757       nzi += nlnk;
5758       prow = lnk[prow];
5759       nzbd++;
5760     }
5761     bdiag[i] = nzbd;
5762     bi[i+1]  = bi[i] + nzi;
5763 
5764     /* if free space is not available, make more free space */
5765     if (current_space->local_remaining<nzi) {
5766       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5767       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5768       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5769       reallocs++;
5770     }
5771 
5772     /* copy data into free_space and free_space_lvl, then initialize lnk */
5773     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5774     bj_ptr[i]    = current_space->array;
5775     bjlvl_ptr[i] = current_space_lvl->array;
5776 
5777     /* make sure the active row i has diagonal entry */
         /* the entry at offset bdiag[i] (after the nzbd lower entries) must be column i */
5778     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5779 
         /* consume nzi slots from both free-space lists */
5780     current_space->array           += nzi;
5781     current_space->local_used      += nzi;
5782     current_space->local_remaining -= nzi;
5783     current_space_lvl->array           += nzi;
5784     current_space_lvl->local_used      += nzi;
5785     current_space_lvl->local_remaining -= nzi;
5786   }
5787 
5788   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5789   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5790 
5791   /* allocate the final column index array, then destroy the free space lists and other temporary arrays */
5792   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5793 
5794   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5795   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5796 
5797   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5798   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5799   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5800 
5801 #if defined(PETSC_USE_INFO)
5802   {
5803     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5804     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5805     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5806     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5807     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5808     if (diagonal_fill) {
           /* NOTE(review): unlike the other info messages in this block, this format string has no trailing newline */
5809       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5810     }
5811   }
5812 #endif
5813 
5814   /* put together the new matrix */
5815   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5816   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5817   b = (Mat_SeqBAIJ*)(fact)->data;
5818   b->free_a       = PETSC_TRUE;
5819   b->free_ij      = PETSC_TRUE;
5820   b->singlemalloc = PETSC_FALSE;
       /* bdiag[0]+1 is used below as the total number of blocks in the factor (it is also assigned to b->nz) */
5821   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5822   b->j          = bj;
5823   b->i          = bi;
5824   b->diag       = bdiag;
5825   b->free_diag  = PETSC_TRUE;
5826   b->ilen       = 0;
5827   b->imax       = 0;
5828   b->row        = isrow;
5829   b->col        = iscol;
5830   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5831   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5832   b->icol       = isicol;
5833   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5834   /* In b structure:  Free imax, ilen, old a, old j.
5835      Allocate bdiag, solve_work, new a, new j */
5836   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5837   b->maxnz = b->nz = bdiag[0]+1;
5838   fact->info.factor_mallocs    = reallocs;
5839   fact->info.fill_ratio_given  = f;
5840   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5841   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5842   PetscFunctionReturn(0);
5843 }
5844 
5845 
5846 /*
5847      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5848    except that the data structure of Mat_SeqAIJ is slightly different.
5849    Not a good example of code reuse.
5850 */
5851 #undef __FUNCT__
5852 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
/*
   MatILUFactorSymbolic_SeqBAIJ_inplace - symbolic ILU(levels) factorization
   that produces the row-contiguous layout: row i of the factor occupies
   positions ainew[i] .. ainew[i+1]-1 and b->diag[i] is the absolute position
   of its diagonal entry.

   Input:
     fact  - factor matrix to set up
     A     - matrix to factor; an error is raised if a diagonal block is missing
     isrow - row permutation
     iscol - column permutation (its inverse is computed here and stored as b->icol)
     info  - supplies fill, levels, diagonal_fill and pivotinblocks

   Two paths:
     - levels == 0 with identity orderings: the nonzero structure of A is
       duplicated and the routine returns early.
     - otherwise: each factor row is assembled in a sorted linked list
       (fill[], head at fill[n], terminated by n) with a fill level per column
       in im[], then appended to ajnew/ajfill, which are grown on demand.

   Errors raised here: missing diagonal in A, an empty row, or a factor row
   without a diagonal entry.
*/
5853 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5854 {
5855   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5856   IS             isicol;
5857   PetscErrorCode ierr;
5858   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5859   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5860   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5861   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5862   PetscTruth     col_identity,row_identity,both_identity,flg;
5863   PetscReal      f;
5864 
5865   PetscFunctionBegin;
5866   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5867   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5868 
5869   f             = info->fill;
5870   levels        = (PetscInt)info->levels;
5871   diagonal_fill = (PetscInt)info->diagonal_fill;
5872   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5873 
5874   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5875   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5876   both_identity = (PetscTruth) (row_identity && col_identity);
5877 
5878   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5879     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5880     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5881 
5882     fact->factortype = MAT_FACTOR_ILU;
5883     b            = (Mat_SeqBAIJ*)fact->data;
5884     b->row       = isrow;
5885     b->col       = iscol;
         /* the factor keeps isrow and iscol, so a reference is taken on each */
5886     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5887     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5888     b->icol      = isicol;
5889     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5890     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5891     PetscFunctionReturn(0);
5892   }
5893 
5894   /* general case perform the symbolic factorization */
5895     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5896     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5897 
5898     /* get new row pointers */
5899     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5900     ainew[0] = 0;
5901     /* don't know how many column pointers are needed so estimate */
5902     jmax = (PetscInt)(f*ai[n] + 1);
5903     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5904     /* ajfill is level of fill for each fill entry */
5905     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5906     /* fill is a linked list of nonzeros in active row */
5907     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5908     /* im is level for each filled value */
5909     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5910     /* dloc is location of diagonal in factor */
         /* during the loop dloc[prow] is an offset within the row; it is made absolute further below */
5911     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5912     dloc[0]  = 0;
5913     for (prow=0; prow<n; prow++) {
5914 
5915       /* copy prow into linked list */
5916       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5917       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5918       xi         = aj + ai[r[prow]];
           /* empty list: the head slot fill[n] points at the terminator n */
5919       fill[n]    = n;
5920       fill[prow] = -1; /* marker for diagonal entry */
5921       while (nz--) {
             /* sorted insert of the permuted column idx, always searching from the head */
5922 	fm  = n;
5923 	idx = ic[*xi++];
5924 	do {
5925 	  m  = fm;
5926 	  fm = fill[m];
5927 	} while (fm < idx);
5928 	fill[m]   = idx;
5929 	fill[idx] = fm;
5930 	im[idx]   = 0;
5931       }
5932 
5933       /* make sure diagonal entry is included */
5934       if (diagonal_fill && fill[prow] == -1) {
5935 	fm = n;
5936 	while (fill[fm] < prow) fm = fill[fm];
5937 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5938 	fill[fm]   = prow;
5939 	im[prow]   = 0;
5940 	nzf++;
5941 	dcount++;
5942       }
5943 
           /* merge every earlier row that appears in the list (column < prow); nzi counts them */
5944       nzi = 0;
5945       row = fill[n];
5946       while (row < prow) {
5947 	incrlev = im[row] + 1;
             /* point at the entries of factor row `row` that follow its diagonal */
5948 	nz      = dloc[row];
5949 	xi      = ajnew  + ainew[row] + nz + 1;
5950 	flev    = ajfill + ainew[row] + nz + 1;
5951 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
             /* the search cursor fm is kept across columns of this row instead of restarting at the head */
5952 	fm      = row;
5953 	while (nnz-- > 0) {
5954 	  idx = *xi++;
               /* candidates whose level would exceed `levels` are dropped */
5955 	  if (*flev + incrlev > levels) {
5956 	    flev++;
5957 	    continue;
5958 	  }
5959 	  do {
5960 	    m  = fm;
5961 	    fm = fill[m];
5962 	  } while (fm < idx);
5963 	  if (fm != idx) {
                 /* new fill entry: record its level and link it in */
5964 	    im[idx]   = *flev + incrlev;
5965 	    fill[m]   = idx;
5966 	    fill[idx] = fm;
5967 	    fm        = idx;
5968 	    nzf++;
5969 	  } else {
                 /* already present: keep the smaller level */
5970 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5971 	  }
5972 	  flev++;
5973 	}
5974 	row = fill[row];
5975 	nzi++;
5976       }
5977       /* copy new filled row into permanent storage */
5978       ainew[prow+1] = ainew[prow] + nzf;
5979       if (ainew[prow+1] > jmax) {
5980 
5981 	/* estimate how much additional space we will need */
5982 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5983 	/* just double the memory each time */
5984 	PetscInt maxadd = jmax;
5985 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
             /* if doubling is still not enough for this row, size for all remaining rows instead */
5986 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5987 	jmax += maxadd;
5988 
5989 	/* allocate a longer ajnew and ajfill */
             /* only the ainew[prow] entries of the rows already stored are carried over */
5990 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5991 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5992 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5993 	ajnew = xitmp;
5994 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5995 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5996 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5997 	ajfill = xitmp;
5998 	reallocate++; /* count how many reallocations are needed */
5999       }
           /* flush the linked list, in sorted order, into ajnew (columns) and ajfill (levels) */
6000       xitmp       = ajnew + ainew[prow];
6001       flev        = ajfill + ainew[prow];
6002       dloc[prow]  = nzi;
6003       fm          = fill[n];
6004       while (nzf--) {
6005 	*xitmp++ = fm;
6006 	*flev++ = im[fm];
6007 	fm      = fill[fm];
6008       }
6009       /* make sure row has diagonal entry */
6010       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6011 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6012     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6013       }
6014     }
6015     ierr = PetscFree(ajfill);CHKERRQ(ierr);
6016     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6017     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6018     ierr = PetscFree(fill);CHKERRQ(ierr);
6019     ierr = PetscFree(im);CHKERRQ(ierr);
6020 
6021 #if defined(PETSC_USE_INFO)
6022     {
6023       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6024       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6025       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6026       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6027       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6028       if (diagonal_fill) {
6029 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6030       }
6031     }
6032 #endif
6033 
6034     /* put together the new matrix */
6035     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6036     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6037     b    = (Mat_SeqBAIJ*)fact->data;
6038     b->free_a       = PETSC_TRUE;
6039     b->free_ij      = PETSC_TRUE;
6040     b->singlemalloc = PETSC_FALSE;
6041     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6042     b->j          = ajnew;
6043     b->i          = ainew;
         /* turn the per-row diagonal offsets into absolute positions in b->j */
6044     for (i=0; i<n; i++) dloc[i] += ainew[i];
6045     b->diag       = dloc;
6046     b->free_diag  = PETSC_TRUE;
6047     b->ilen       = 0;
6048     b->imax       = 0;
6049     b->row        = isrow;
6050     b->col        = iscol;
6051     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6052     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6053     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6054     b->icol       = isicol;
6055     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6056     /* In b structure:  Free imax, ilen, old a, old j.
6057        Allocate dloc, solve_work, new a, new j */
6058     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
6059     b->maxnz          = b->nz = ainew[n];
6060 
6061     fact->info.factor_mallocs    = reallocate;
6062     fact->info.fill_ratio_given  = f;
         /* NOTE(review): prow equals n here only because the row loop above ran to completion, so ai[prow] is ai[n] */
6063     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6064 
6065   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6066   PetscFunctionReturn(0);
6067 }
6068 
6069 #undef __FUNCT__
6070 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
/*
   MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE - "set unfactored" hook for
   the SSE block-size-4 natural-ordering variant.

   As written, the only effect is to clear A->ops->setunfactored; the column
   index rescaling below is commented out and does not run.

   Input:
     A - the matrix whose hook is cleared
*/
6071 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6072 {
       /* disabled declarations that belonged to the disabled loop below */
6073   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6074   /* int i,*AJ=a->j,nz=a->nz; */
6075   PetscFunctionBegin;
6076   /* Undo Column scaling */
       /* disabled: would have divided each column index by 4 */
6077 /*    while (nz--) { */
6078 /*      AJ[i] = AJ[i]/4; */
6079 /*    } */
6080   /* This should really invoke a push/pop logic, but we don't have that yet. */
6081   A->ops->setunfactored = PETSC_NULL;
6082   PetscFunctionReturn(0);
6083 }
6084 
6085 #undef __FUNCT__
6086 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
/*
   MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj - "set unfactored" hook
   for the unsigned-short-index ("usj") SSE variant.

   Reads the first a->nz entries of the a->j storage as unsigned short values
   and rewrites them, in place, as PetscInt values with the same index; then
   clears A->ops->setunfactored.

   Input:
     A - the matrix whose column index array is widened

   Presumably the matching factorization packed the column indices as
   unsigned short into the front of a->j - TODO confirm against that routine.
*/
6087 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6088 {
6089   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6090   PetscInt       *AJ=a->j,nz=a->nz;
       /* aj is a second view of the very same memory as AJ, read as unsigned short */
6091   unsigned short *aj=(unsigned short *)AJ;
6092   PetscFunctionBegin;
6093   /* Is this really necessary? */
       /* The loop must run from the last entry down to the first: the PetscInt
          slot AJ[k] overlaps short slots with index >= k (PetscInt being at
          least as wide as unsigned short), so going backwards never overwrites
          a short that is still to be read. */
6094   while (nz--) {
6095     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6096   }
6097   A->ops->setunfactored = PETSC_NULL;
6098   PetscFunctionReturn(0);
6099 }
6100 
6101 
6102