xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 753ebd1d069d7a4e86ed47ccb83df941bd1fb5db)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124   PetscInt       *diag = a->diag,oidx;
125   MatScalar      *aa=a->a,*v;
126   PetscScalar    s1,s2,s3,x1,x2,x3;
127   PetscScalar    *x,*b;
128 
129   PetscFunctionBegin;
130   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133 
134   /* forward solve the U^T */
135   idx = 0;
136   for (i=0; i<n; i++) {
137 
138     v     = aa + 9*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144     v += 9;
145 
146     vi    = aj + diag[i] + 1;
147     nz    = ai[i+1] - diag[i] - 1;
148     while (nz--) {
149       oidx = 3*(*vi++);
150       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153       v  += 9;
154     }
155     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156     idx += 3;
157   }
158   /* backward solve the L^T */
159   for (i=n-1; i>=0; i--){
160     v    = aa + 9*diag[i] - 9;
161     vi   = aj + diag[i] - 1;
162     nz   = diag[i] - ai[i];
163     idt  = 3*i;
164     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165     while (nz--) {
166       idx   = 3*(*vi--);
167       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170       v -= 9;
171     }
172   }
173   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182 {
183   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184   PetscErrorCode ierr;
185   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186   PetscInt       *diag = a->diag,oidx;
187   MatScalar      *aa=a->a,*v;
188   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
189   PetscScalar    *x,*b;
190 
191   PetscFunctionBegin;
192   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
193   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195 
196   /* forward solve the U^T */
197   idx = 0;
198   for (i=0; i<n; i++) {
199 
200     v     = aa + 16*diag[i];
201     /* multiply by the inverse of the block diagonal */
202     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207     v += 16;
208 
209     vi    = aj + diag[i] + 1;
210     nz    = ai[i+1] - diag[i] - 1;
211     while (nz--) {
212       oidx = 4*(*vi++);
213       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217       v  += 16;
218     }
219     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220     idx += 4;
221   }
222   /* backward solve the L^T */
223   for (i=n-1; i>=0; i--){
224     v    = aa + 16*diag[i] - 16;
225     vi   = aj + diag[i] - 1;
226     nz   = diag[i] - ai[i];
227     idt  = 4*i;
228     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229     while (nz--) {
230       idx   = 4*(*vi--);
231       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235       v -= 16;
236     }
237   }
238   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
239   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247 {
248   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249   PetscErrorCode ierr;
250   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251   PetscInt       *diag = a->diag,oidx;
252   MatScalar      *aa=a->a,*v;
253   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
254   PetscScalar    *x,*b;
255 
256   PetscFunctionBegin;
257   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
258   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260 
261   /* forward solve the U^T */
262   idx = 0;
263   for (i=0; i<n; i++) {
264 
265     v     = aa + 25*diag[i];
266     /* multiply by the inverse of the block diagonal */
267     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273     v += 25;
274 
275     vi    = aj + diag[i] + 1;
276     nz    = ai[i+1] - diag[i] - 1;
277     while (nz--) {
278       oidx = 5*(*vi++);
279       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284       v  += 25;
285     }
286     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287     idx += 5;
288   }
289   /* backward solve the L^T */
290   for (i=n-1; i>=0; i--){
291     v    = aa + 25*diag[i] - 25;
292     vi   = aj + diag[i] - 1;
293     nz   = diag[i] - ai[i];
294     idt  = 5*i;
295     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296     while (nz--) {
297       idx   = 5*(*vi--);
298       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303       v -= 25;
304     }
305   }
306   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
307   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 #undef __FUNCT__
313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315 {
316   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317   PetscErrorCode ierr;
318   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319   PetscInt       *diag = a->diag,oidx;
320   MatScalar      *aa=a->a,*v;
321   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
322   PetscScalar    *x,*b;
323 
324   PetscFunctionBegin;
325   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
326   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
327   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328 
329   /* forward solve the U^T */
330   idx = 0;
331   for (i=0; i<n; i++) {
332 
333     v     = aa + 36*diag[i];
334     /* multiply by the inverse of the block diagonal */
335     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336     x6    = x[5+idx];
337     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343     v += 36;
344 
345     vi    = aj + diag[i] + 1;
346     nz    = ai[i+1] - diag[i] - 1;
347     while (nz--) {
348       oidx = 6*(*vi++);
349       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355       v  += 36;
356     }
357     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358     x[5+idx] = s6;
359     idx += 6;
360   }
361   /* backward solve the L^T */
362   for (i=n-1; i>=0; i--){
363     v    = aa + 36*diag[i] - 36;
364     vi   = aj + diag[i] - 1;
365     nz   = diag[i] - ai[i];
366     idt  = 6*i;
367     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368     s6 = x[5+idt];
369     while (nz--) {
370       idx   = 6*(*vi--);
371       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377       v -= 36;
378     }
379   }
380   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389 {
390   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391   PetscErrorCode ierr;
392   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393   PetscInt       *diag = a->diag,oidx;
394   MatScalar      *aa=a->a,*v;
395   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
396   PetscScalar    *x,*b;
397 
398   PetscFunctionBegin;
399   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
400   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
401   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402 
403   /* forward solve the U^T */
404   idx = 0;
405   for (i=0; i<n; i++) {
406 
407     v     = aa + 49*diag[i];
408     /* multiply by the inverse of the block diagonal */
409     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410     x6    = x[5+idx]; x7 = x[6+idx];
411     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418     v += 49;
419 
420     vi    = aj + diag[i] + 1;
421     nz    = ai[i+1] - diag[i] - 1;
422     while (nz--) {
423       oidx = 7*(*vi++);
424       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431       v  += 49;
432     }
433     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434     x[5+idx] = s6;x[6+idx] = s7;
435     idx += 7;
436   }
437   /* backward solve the L^T */
438   for (i=n-1; i>=0; i--){
439     v    = aa + 49*diag[i] - 49;
440     vi   = aj + diag[i] - 1;
441     nz   = diag[i] - ai[i];
442     idt  = 7*i;
443     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444     s6 = x[5+idt];s7 = x[6+idt];
445     while (nz--) {
446       idx   = 7*(*vi--);
447       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454       v -= 49;
455     }
456   }
457   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 /*---------------------------------------------------------------------------------------------*/
464 #undef __FUNCT__
465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467 {
468   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469   IS             iscol=a->col,isrow=a->row;
470   PetscErrorCode ierr;
471   const PetscInt *r,*c,*rout,*cout;
472   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473   PetscInt       *diag = a->diag;
474   MatScalar      *aa=a->a,*v;
475   PetscScalar    s1,*x,*b,*t;
476 
477   PetscFunctionBegin;
478   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480   t  = a->solve_work;
481 
482   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484 
485   /* copy the b into temp work space according to permutation */
486   for (i=0; i<n; i++) {
487     t[i] = b[c[i]];
488   }
489 
490   /* forward solve the U^T */
491   for (i=0; i<n; i++) {
492 
493     v     = aa + diag[i];
494     /* multiply by the inverse of the block diagonal */
495     s1    = (*v++)*t[i];
496     vi    = aj + diag[i] + 1;
497     nz    = ai[i+1] - diag[i] - 1;
498     while (nz--) {
499       t[*vi++]  -= (*v++)*s1;
500     }
501     t[i]   = s1;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--){
505     v    = aa + diag[i] - 1;
506     vi   = aj + diag[i] - 1;
507     nz   = diag[i] - ai[i];
508     s1   = t[i];
509     while (nz--) {
510       t[*vi--]   -=  (*v--)*s1;
511     }
512   }
513 
514   /* copy t into x according to permutation */
515   for (i=0; i<n; i++) {
516     x[r[i]]   = t[i];
517   }
518 
519   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
521   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524   PetscFunctionReturn(0);
525 }
526 
527 #undef __FUNCT__
528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530 {
531   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532   IS             iscol=a->col,isrow=a->row;
533   PetscErrorCode ierr;
534   const PetscInt *r,*c,*rout,*cout;
535   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537   MatScalar      *aa=a->a,*v;
538   PetscScalar    s1,s2,x1,x2;
539   PetscScalar    *x,*b,*t;
540 
541   PetscFunctionBegin;
542   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
543   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544   t  = a->solve_work;
545 
546   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548 
549   /* copy the b into temp work space according to permutation */
550   ii = 0;
551   for (i=0; i<n; i++) {
552     ic      = 2*c[i];
553     t[ii]   = b[ic];
554     t[ii+1] = b[ic+1];
555     ii += 2;
556   }
557 
558   /* forward solve the U^T */
559   idx = 0;
560   for (i=0; i<n; i++) {
561 
562     v     = aa + 4*diag[i];
563     /* multiply by the inverse of the block diagonal */
564     x1    = t[idx];   x2 = t[1+idx];
565     s1 = v[0]*x1  +  v[1]*x2;
566     s2 = v[2]*x1  +  v[3]*x2;
567     v += 4;
568 
569     vi    = aj + diag[i] + 1;
570     nz    = ai[i+1] - diag[i] - 1;
571     while (nz--) {
572       oidx = 2*(*vi++);
573       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575       v  += 4;
576     }
577     t[idx]   = s1;t[1+idx] = s2;
578     idx += 2;
579   }
580   /* backward solve the L^T */
581   for (i=n-1; i>=0; i--){
582     v    = aa + 4*diag[i] - 4;
583     vi   = aj + diag[i] - 1;
584     nz   = diag[i] - ai[i];
585     idt  = 2*i;
586     s1 = t[idt];  s2 = t[1+idt];
587     while (nz--) {
588       idx   = 2*(*vi--);
589       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591       v -= 4;
592     }
593   }
594 
595   /* copy t into x according to permutation */
596   ii = 0;
597   for (i=0; i<n; i++) {
598     ir      = 2*r[i];
599     x[ir]   = t[ii];
600     x[ir+1] = t[ii+1];
601     ii += 2;
602   }
603 
604   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
606   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615 {
616   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617   IS             iscol=a->col,isrow=a->row;
618   PetscErrorCode ierr;
619   const PetscInt *r,*c,*rout,*cout;
620   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622   MatScalar      *aa=a->a,*v;
623   PetscScalar    s1,s2,s3,x1,x2,x3;
624   PetscScalar    *x,*b,*t;
625 
626   PetscFunctionBegin;
627   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
628   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629   t  = a->solve_work;
630 
631   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633 
634   /* copy the b into temp work space according to permutation */
635   ii = 0;
636   for (i=0; i<n; i++) {
637     ic      = 3*c[i];
638     t[ii]   = b[ic];
639     t[ii+1] = b[ic+1];
640     t[ii+2] = b[ic+2];
641     ii += 3;
642   }
643 
644   /* forward solve the U^T */
645   idx = 0;
646   for (i=0; i<n; i++) {
647 
648     v     = aa + 9*diag[i];
649     /* multiply by the inverse of the block diagonal */
650     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654     v += 9;
655 
656     vi    = aj + diag[i] + 1;
657     nz    = ai[i+1] - diag[i] - 1;
658     while (nz--) {
659       oidx = 3*(*vi++);
660       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663       v  += 9;
664     }
665     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666     idx += 3;
667   }
668   /* backward solve the L^T */
669   for (i=n-1; i>=0; i--){
670     v    = aa + 9*diag[i] - 9;
671     vi   = aj + diag[i] - 1;
672     nz   = diag[i] - ai[i];
673     idt  = 3*i;
674     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675     while (nz--) {
676       idx   = 3*(*vi--);
677       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680       v -= 9;
681     }
682   }
683 
684   /* copy t into x according to permutation */
685   ii = 0;
686   for (i=0; i<n; i++) {
687     ir      = 3*r[i];
688     x[ir]   = t[ii];
689     x[ir+1] = t[ii+1];
690     x[ir+2] = t[ii+2];
691     ii += 3;
692   }
693 
694   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
696   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
697   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699   PetscFunctionReturn(0);
700 }
701 
702 #undef __FUNCT__
703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705 {
706   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707   IS             iscol=a->col,isrow=a->row;
708   PetscErrorCode ierr;
709   const PetscInt *r,*c,*rout,*cout;
710   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712   MatScalar      *aa=a->a,*v;
713   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
714   PetscScalar    *x,*b,*t;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719   t  = a->solve_work;
720 
721   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723 
724   /* copy the b into temp work space according to permutation */
725   ii = 0;
726   for (i=0; i<n; i++) {
727     ic      = 4*c[i];
728     t[ii]   = b[ic];
729     t[ii+1] = b[ic+1];
730     t[ii+2] = b[ic+2];
731     t[ii+3] = b[ic+3];
732     ii += 4;
733   }
734 
735   /* forward solve the U^T */
736   idx = 0;
737   for (i=0; i<n; i++) {
738 
739     v     = aa + 16*diag[i];
740     /* multiply by the inverse of the block diagonal */
741     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746     v += 16;
747 
748     vi    = aj + diag[i] + 1;
749     nz    = ai[i+1] - diag[i] - 1;
750     while (nz--) {
751       oidx = 4*(*vi++);
752       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756       v  += 16;
757     }
758     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759     idx += 4;
760   }
761   /* backward solve the L^T */
762   for (i=n-1; i>=0; i--){
763     v    = aa + 16*diag[i] - 16;
764     vi   = aj + diag[i] - 1;
765     nz   = diag[i] - ai[i];
766     idt  = 4*i;
767     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768     while (nz--) {
769       idx   = 4*(*vi--);
770       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774       v -= 16;
775     }
776   }
777 
778   /* copy t into x according to permutation */
779   ii = 0;
780   for (i=0; i<n; i++) {
781     ir      = 4*r[i];
782     x[ir]   = t[ii];
783     x[ir+1] = t[ii+1];
784     x[ir+2] = t[ii+2];
785     x[ir+3] = t[ii+3];
786     ii += 4;
787   }
788 
789   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
791   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794   PetscFunctionReturn(0);
795 }
796 
797 #undef __FUNCT__
798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800 {
801   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802   IS             iscol=a->col,isrow=a->row;
803   PetscErrorCode ierr;
804   const PetscInt *r,*c,*rout,*cout;
805   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807   MatScalar      *aa=a->a,*v;
808   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
809   PetscScalar    *x,*b,*t;
810 
811   PetscFunctionBegin;
812   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   t  = a->solve_work;
815 
816   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818 
819   /* copy the b into temp work space according to permutation */
820   ii = 0;
821   for (i=0; i<n; i++) {
822     ic      = 5*c[i];
823     t[ii]   = b[ic];
824     t[ii+1] = b[ic+1];
825     t[ii+2] = b[ic+2];
826     t[ii+3] = b[ic+3];
827     t[ii+4] = b[ic+4];
828     ii += 5;
829   }
830 
831   /* forward solve the U^T */
832   idx = 0;
833   for (i=0; i<n; i++) {
834 
835     v     = aa + 25*diag[i];
836     /* multiply by the inverse of the block diagonal */
837     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843     v += 25;
844 
845     vi    = aj + diag[i] + 1;
846     nz    = ai[i+1] - diag[i] - 1;
847     while (nz--) {
848       oidx = 5*(*vi++);
849       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v  += 25;
855     }
856     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857     idx += 5;
858   }
859   /* backward solve the L^T */
860   for (i=n-1; i>=0; i--){
861     v    = aa + 25*diag[i] - 25;
862     vi   = aj + diag[i] - 1;
863     nz   = diag[i] - ai[i];
864     idt  = 5*i;
865     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866     while (nz--) {
867       idx   = 5*(*vi--);
868       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873       v -= 25;
874     }
875   }
876 
877   /* copy t into x according to permutation */
878   ii = 0;
879   for (i=0; i<n; i++) {
880     ir      = 5*r[i];
881     x[ir]   = t[ii];
882     x[ir+1] = t[ii+1];
883     x[ir+2] = t[ii+2];
884     x[ir+3] = t[ii+3];
885     x[ir+4] = t[ii+4];
886     ii += 5;
887   }
888 
889   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
891   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900 {
901   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902   IS             iscol=a->col,isrow=a->row;
903   PetscErrorCode ierr;
904   const PetscInt *r,*c,*rout,*cout;
905   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907   MatScalar      *aa=a->a,*v;
908   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
909   PetscScalar    *x,*b,*t;
910 
911   PetscFunctionBegin;
912   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914   t  = a->solve_work;
915 
916   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918 
919   /* copy the b into temp work space according to permutation */
920   ii = 0;
921   for (i=0; i<n; i++) {
922     ic      = 6*c[i];
923     t[ii]   = b[ic];
924     t[ii+1] = b[ic+1];
925     t[ii+2] = b[ic+2];
926     t[ii+3] = b[ic+3];
927     t[ii+4] = b[ic+4];
928     t[ii+5] = b[ic+5];
929     ii += 6;
930   }
931 
932   /* forward solve the U^T */
933   idx = 0;
934   for (i=0; i<n; i++) {
935 
936     v     = aa + 36*diag[i];
937     /* multiply by the inverse of the block diagonal */
938     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939     x6    = t[5+idx];
940     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946     v += 36;
947 
948     vi    = aj + diag[i] + 1;
949     nz    = ai[i+1] - diag[i] - 1;
950     while (nz--) {
951       oidx = 6*(*vi++);
952       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958       v  += 36;
959     }
960     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961     t[5+idx] = s6;
962     idx += 6;
963   }
964   /* backward solve the L^T */
965   for (i=n-1; i>=0; i--){
966     v    = aa + 36*diag[i] - 36;
967     vi   = aj + diag[i] - 1;
968     nz   = diag[i] - ai[i];
969     idt  = 6*i;
970     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971     s6 = t[5+idt];
972     while (nz--) {
973       idx   = 6*(*vi--);
974       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980       v -= 36;
981     }
982   }
983 
984   /* copy t into x according to permutation */
985   ii = 0;
986   for (i=0; i<n; i++) {
987     ir      = 6*r[i];
988     x[ir]   = t[ii];
989     x[ir+1] = t[ii+1];
990     x[ir+2] = t[ii+2];
991     x[ir+3] = t[ii+3];
992     x[ir+4] = t[ii+4];
993     x[ir+5] = t[ii+5];
994     ii += 6;
995   }
996 
997   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
999   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1000   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008 {
1009   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010   IS             iscol=a->col,isrow=a->row;
1011   PetscErrorCode ierr;
1012   const PetscInt *r,*c,*rout,*cout;
1013   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015   MatScalar      *aa=a->a,*v;
1016   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1017   PetscScalar    *x,*b,*t;
1018 
1019   PetscFunctionBegin;
1020   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1021   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022   t  = a->solve_work;
1023 
1024   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026 
1027   /* copy the b into temp work space according to permutation */
1028   ii = 0;
1029   for (i=0; i<n; i++) {
1030     ic      = 7*c[i];
1031     t[ii]   = b[ic];
1032     t[ii+1] = b[ic+1];
1033     t[ii+2] = b[ic+2];
1034     t[ii+3] = b[ic+3];
1035     t[ii+4] = b[ic+4];
1036     t[ii+5] = b[ic+5];
1037     t[ii+6] = b[ic+6];
1038     ii += 7;
1039   }
1040 
1041   /* forward solve the U^T */
1042   idx = 0;
1043   for (i=0; i<n; i++) {
1044 
1045     v     = aa + 49*diag[i];
1046     /* multiply by the inverse of the block diagonal */
1047     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048     x6    = t[5+idx]; x7 = t[6+idx];
1049     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056     v += 49;
1057 
1058     vi    = aj + diag[i] + 1;
1059     nz    = ai[i+1] - diag[i] - 1;
1060     while (nz--) {
1061       oidx = 7*(*vi++);
1062       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v  += 49;
1070     }
1071     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072     t[5+idx] = s6;t[6+idx] = s7;
1073     idx += 7;
1074   }
1075   /* backward solve the L^T */
1076   for (i=n-1; i>=0; i--){
1077     v    = aa + 49*diag[i] - 49;
1078     vi   = aj + diag[i] - 1;
1079     nz   = diag[i] - ai[i];
1080     idt  = 7*i;
1081     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082     s6 = t[5+idt];s7 = t[6+idt];
1083     while (nz--) {
1084       idx   = 7*(*vi--);
1085       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092       v -= 49;
1093     }
1094   }
1095 
1096   /* copy t into x according to permutation */
1097   ii = 0;
1098   for (i=0; i<n; i++) {
1099     ir      = 7*r[i];
1100     x[ir]   = t[ii];
1101     x[ir+1] = t[ii+1];
1102     x[ir+2] = t[ii+2];
1103     x[ir+3] = t[ii+3];
1104     x[ir+4] = t[ii+4];
1105     x[ir+5] = t[ii+5];
1106     x[ir+6] = t[ii+6];
1107     ii += 7;
1108   }
1109 
1110   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1112   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1113   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115   PetscFunctionReturn(0);
1116 }
1117 
1118 /* ----------------------------------------------------------- */
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1122 {
1123   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1124   IS             iscol=a->col,isrow=a->row;
1125   PetscErrorCode ierr;
1126   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1127   PetscInt       i,n=a->mbs;
1128   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1129   MatScalar      *aa=a->a,*v;
1130   PetscScalar    *x,*b,*s,*t,*ls;
1131 
1132   PetscFunctionBegin;
1133   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135   t  = a->solve_work;
1136 
1137   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1138   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1139 
1140   /* forward solve the lower triangular */
1141   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1142   for (i=1; i<n; i++) {
1143     v   = aa + bs2*ai[i];
1144     vi  = aj + ai[i];
1145     nz  = a->diag[i] - ai[i];
1146     s = t + bs*i;
1147     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1148     while (nz--) {
1149       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1150       v += bs2;
1151     }
1152   }
1153   /* backward solve the upper triangular */
1154   ls = a->solve_work + A->cmap->n;
1155   for (i=n-1; i>=0; i--){
1156     v   = aa + bs2*(a->diag[i] + 1);
1157     vi  = aj + a->diag[i] + 1;
1158     nz  = ai[i+1] - a->diag[i] - 1;
1159     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1160     while (nz--) {
1161       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1162       v += bs2;
1163     }
1164     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1165     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1166   }
1167 
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNCT__
1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1179 {
1180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1181   IS             iscol=a->col,isrow=a->row;
1182   PetscErrorCode ierr;
1183   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1184   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1185   MatScalar      *aa=a->a,*v;
1186   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1187   PetscScalar    *x,*b,*t;
1188 
1189   PetscFunctionBegin;
1190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192   t  = a->solve_work;
1193 
1194   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1195   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1196 
1197   /* forward solve the lower triangular */
1198   idx    = 7*(*r++);
1199   t[0] = b[idx];   t[1] = b[1+idx];
1200   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201   t[5] = b[5+idx]; t[6] = b[6+idx];
1202 
1203   for (i=1; i<n; i++) {
1204     v     = aa + 49*ai[i];
1205     vi    = aj + ai[i];
1206     nz    = diag[i] - ai[i];
1207     idx   = 7*(*r++);
1208     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1210     while (nz--) {
1211       idx   = 7*(*vi++);
1212       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213       x4    = t[3+idx];x5 = t[4+idx];
1214       x6    = t[5+idx];x7 = t[6+idx];
1215       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1222       v += 49;
1223     }
1224     idx = 7*i;
1225     t[idx]   = s1;t[1+idx] = s2;
1226     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227     t[5+idx] = s6;t[6+idx] = s7;
1228   }
1229   /* backward solve the upper triangular */
1230   for (i=n-1; i>=0; i--){
1231     v    = aa + 49*diag[i] + 49;
1232     vi   = aj + diag[i] + 1;
1233     nz   = ai[i+1] - diag[i] - 1;
1234     idt  = 7*i;
1235     s1 = t[idt];  s2 = t[1+idt];
1236     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237     s6 = t[5+idt];s7 = t[6+idt];
1238     while (nz--) {
1239       idx   = 7*(*vi++);
1240       x1    = t[idx];   x2 = t[1+idx];
1241       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242       x6    = t[5+idx]; x7 = t[6+idx];
1243       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1250       v += 49;
1251     }
1252     idc = 7*(*c--);
1253     v   = aa + 49*diag[i];
1254     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1268   }
1269 
1270   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1271   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1272   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1273   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNCT__
1279 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1280 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
1281 {
1282   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1283   IS             iscol=a->col,isrow=a->row;
1284   PetscErrorCode ierr;
1285   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
1286   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
1287   MatScalar      *aa=a->a,*v;
1288   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1289   PetscScalar    *x,*b,*t;
1290 
1291   PetscFunctionBegin;
1292   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1293   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1294   t  = a->solve_work;
1295 
1296   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1297   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1298 
1299   /* forward solve the lower triangular */
1300   idx    = 7*r[0];
1301   t[0] = b[idx];   t[1] = b[1+idx];
1302   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1303   t[5] = b[5+idx]; t[6] = b[6+idx];
1304 
1305   for (i=1; i<n; i++) {
1306     v     = aa + 49*ai[i];
1307     vi    = aj + ai[i];
1308     nz    = ai[i+1] - ai[i];
1309     idx   = 7*r[i];
1310     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1311     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1312     for(m=0;m<nz;m++){
1313       idx   = 7*vi[m];
1314       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1315       x4    = t[3+idx];x5 = t[4+idx];
1316       x6    = t[5+idx];x7 = t[6+idx];
1317       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1318       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1319       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1320       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1321       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1322       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1323       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1324       v += 49;
1325     }
1326     idx = 7*i;
1327     t[idx]   = s1;t[1+idx] = s2;
1328     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1329     t[5+idx] = s6;t[6+idx] = s7;
1330   }
1331   /* backward solve the upper triangular */
1332   for (i=n-1; i>=0; i--){
1333     k    = 2*n-i;
1334     v    = aa + 49*ai[k];
1335     vi   = aj + ai[k];
1336     nz   = ai[k+1] - ai[k] - 1;
1337     idt  = 7*i;
1338     s1 = t[idt];  s2 = t[1+idt];
1339     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1340     s6 = t[5+idt];s7 = t[6+idt];
1341     for(m=0;m<nz;m++){
1342       idx   = 7*vi[m];
1343       x1    = t[idx];   x2 = t[1+idx];
1344       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1345       x6    = t[5+idx]; x7 = t[6+idx];
1346       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1347       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1348       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1349       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1350       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1351       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1352       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1353       v += 49;
1354     }
1355     idc = 7*c[i];
1356     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1357                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1358     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1359                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1360     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1361                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1362     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1363                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1364     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1365                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1366     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1367                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1368     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1369                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1370   }
1371 
1372   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1373   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1374   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1375   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1376   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 #undef __FUNCT__
1381 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct_v2"
1382 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1383 {
1384   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1385   IS             iscol=a->col,isrow=a->row;
1386   PetscErrorCode ierr;
1387   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
1388   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
1389   MatScalar      *aa=a->a,*v;
1390   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1391   PetscScalar    *x,*b,*t;
1392 
1393   PetscFunctionBegin;
1394   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1395   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1396   t  = a->solve_work;
1397 
1398   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1399   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1400 
1401   /* forward solve the lower triangular */
1402   idx    = 7*r[0];
1403   t[0] = b[idx];   t[1] = b[1+idx];
1404   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1405   t[5] = b[5+idx]; t[6] = b[6+idx];
1406 
1407   for (i=1; i<n; i++) {
1408     v     = aa + 49*ai[i];
1409     vi    = aj + ai[i];
1410     nz    = ai[i+1] - ai[i];
1411     idx   = 7*r[i];
1412     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1413     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1414     for(m=0;m<nz;m++){
1415       idx   = 7*vi[m];
1416       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1417       x4    = t[3+idx];x5 = t[4+idx];
1418       x6    = t[5+idx];x7 = t[6+idx];
1419       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1420       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1421       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1422       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1423       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1424       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1425       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1426       v += 49;
1427     }
1428     idx = 7*i;
1429     t[idx]   = s1;t[1+idx] = s2;
1430     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1431     t[5+idx] = s6;t[6+idx] = s7;
1432   }
1433   /* backward solve the upper triangular */
1434   for (i=n-1; i>=0; i--){
1435     v    = aa + 49*(adiag[i+1]+1);
1436     vi   = aj + adiag[i+1]+1;
1437     nz   = adiag[i] - adiag[i+1] - 1;
1438     idt  = 7*i;
1439     s1 = t[idt];  s2 = t[1+idt];
1440     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1441     s6 = t[5+idt];s7 = t[6+idt];
1442     for(m=0;m<nz;m++){
1443       idx   = 7*vi[m];
1444       x1    = t[idx];   x2 = t[1+idx];
1445       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1446       x6    = t[5+idx]; x7 = t[6+idx];
1447       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1448       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1449       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1450       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1451       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1452       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1453       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1454       v += 49;
1455     }
1456     idc = 7*c[i];
1457     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1458                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1459     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1460                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1461     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1462                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1463     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1464                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1465     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1466                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1467     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1468                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1469     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1470                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1471   }
1472 
1473   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1474   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1475   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1476   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1477   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1478   PetscFunctionReturn(0);
1479 }
1480 
1481 #undef __FUNCT__
1482 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1483 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1484 {
1485   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1486   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1487   PetscErrorCode    ierr;
1488   PetscInt          *diag = a->diag,jdx;
1489   const MatScalar   *aa=a->a,*v;
1490   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1491   const PetscScalar *b;
1492 
1493   PetscFunctionBegin;
1494   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1495   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1496   /* forward solve the lower triangular */
1497   idx    = 0;
1498   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1499   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1500   x[6] = b[6+idx];
1501   for (i=1; i<n; i++) {
1502     v     =  aa + 49*ai[i];
1503     vi    =  aj + ai[i];
1504     nz    =  diag[i] - ai[i];
1505     idx   =  7*i;
1506     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1507     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1508     s7  =  b[6+idx];
1509     while (nz--) {
1510       jdx   = 7*(*vi++);
1511       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1512       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1513       x7    = x[6+jdx];
1514       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1515       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1516       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1517       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1518       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1519       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1520       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1521       v += 49;
1522      }
1523     x[idx]   = s1;
1524     x[1+idx] = s2;
1525     x[2+idx] = s3;
1526     x[3+idx] = s4;
1527     x[4+idx] = s5;
1528     x[5+idx] = s6;
1529     x[6+idx] = s7;
1530   }
1531   /* backward solve the upper triangular */
1532   for (i=n-1; i>=0; i--){
1533     v    = aa + 49*diag[i] + 49;
1534     vi   = aj + diag[i] + 1;
1535     nz   = ai[i+1] - diag[i] - 1;
1536     idt  = 7*i;
1537     s1 = x[idt];   s2 = x[1+idt];
1538     s3 = x[2+idt]; s4 = x[3+idt];
1539     s5 = x[4+idt]; s6 = x[5+idt];
1540     s7 = x[6+idt];
1541     while (nz--) {
1542       idx   = 7*(*vi++);
1543       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1544       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1545       x7    = x[6+idx];
1546       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1547       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1548       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1549       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1550       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1551       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1552       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1553       v += 49;
1554     }
1555     v        = aa + 49*diag[i];
1556     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1557                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1558     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1559                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1560     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1561                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1562     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1563                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1564     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1565                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1566     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1567                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1568     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1569                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1570   }
1571 
1572   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1573   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1574   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 #undef __FUNCT__
1579 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1580 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1581 {
1582     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1583     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1584     PetscErrorCode    ierr;
1585     PetscInt          idx,jdx,idt;
1586     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1587     const MatScalar   *aa=a->a,*v;
1588     PetscScalar       *x;
1589     const PetscScalar *b;
1590     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1591 
1592     PetscFunctionBegin;
1593     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1594     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1595     /* forward solve the lower triangular */
1596     idx    = 0;
1597     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1598     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1599     for (i=1; i<n; i++) {
1600        v    = aa + bs2*ai[i];
1601        vi   = aj + ai[i];
1602        nz   = ai[i+1] - ai[i];
1603       idx   = bs*i;
1604        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1605        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1606        for(k=0;k<nz;k++) {
1607           jdx   = bs*vi[k];
1608           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1609 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1610           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1611           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1612           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1613 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1614           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1615 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1616 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1617           v   +=  bs2;
1618         }
1619 
1620        x[idx]   = s1;
1621        x[1+idx] = s2;
1622        x[2+idx] = s3;
1623        x[3+idx] = s4;
1624        x[4+idx] = s5;
1625        x[5+idx] = s6;
1626        x[6+idx] = s7;
1627     }
1628 
1629    /* backward solve the upper triangular */
1630   for (i=n-1; i>=0; i--){
1631      v   = aa + bs2*ai[2*n-i];
1632      vi  = aj + ai[2*n-i];
1633      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1634      idt = bs*i;
1635      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1636      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1637     for(k=0;k<nz;k++) {
1638       idx   = bs*vi[k];
1639        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1640        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1641        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1642        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1643        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1644        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1645        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1646        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1647        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1648         v   +=  bs2;
1649     }
1650     /* x = inv_diagonal*x */
1651     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1652     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1653     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1654     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1655     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1656     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1657     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1658   }
1659 
1660   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1661   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1662   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1663   PetscFunctionReturn(0);
1664 }
1665 
1666 #undef __FUNCT__
1667 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
1668 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1669 {
1670     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1671     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
1672     PetscErrorCode    ierr;
1673     PetscInt          idx,jdx,idt;
1674     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1675     const MatScalar   *aa=a->a,*v;
1676     PetscScalar       *x;
1677     const PetscScalar *b;
1678     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1679 
1680     PetscFunctionBegin;
1681     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1682     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1683     /* forward solve the lower triangular */
1684     idx    = 0;
1685     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1686     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1687     for (i=1; i<n; i++) {
1688        v    = aa + bs2*ai[i];
1689        vi   = aj + ai[i];
1690        nz   = ai[i+1] - ai[i];
1691       idx   = bs*i;
1692        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1693        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1694        for(k=0;k<nz;k++) {
1695           jdx   = bs*vi[k];
1696           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1697 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1698           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1699           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1700           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1701 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1702           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1703 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1704 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1705           v   +=  bs2;
1706         }
1707 
1708        x[idx]   = s1;
1709        x[1+idx] = s2;
1710        x[2+idx] = s3;
1711        x[3+idx] = s4;
1712        x[4+idx] = s5;
1713        x[5+idx] = s6;
1714        x[6+idx] = s7;
1715     }
1716 
1717    /* backward solve the upper triangular */
1718   for (i=n-1; i>=0; i--){
1719     v   = aa + bs2*(adiag[i+1]+1);
1720      vi  = aj + adiag[i+1]+1;
1721      nz  = adiag[i] - adiag[i+1]-1;
1722      idt = bs*i;
1723      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1724      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1725     for(k=0;k<nz;k++) {
1726       idx   = bs*vi[k];
1727        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1728        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1729        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1730        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1731        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1732        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1733        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1734        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1735        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1736         v   +=  bs2;
1737     }
1738     /* x = inv_diagonal*x */
1739     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1740     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1741     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1742     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1743     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1744     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1745     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1746   }
1747 
1748   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1749   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1750   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1751   PetscFunctionReturn(0);
1752 }
1753 
1754 #undef __FUNCT__
1755 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1756 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1757 {
1758   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1759   IS                iscol=a->col,isrow=a->row;
1760   PetscErrorCode    ierr;
1761   const PetscInt    *r,*c,*rout,*cout;
1762   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1763   const MatScalar   *aa=a->a,*v;
1764   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1765   const PetscScalar *b;
1766   PetscFunctionBegin;
1767   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1768   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1769   t  = a->solve_work;
1770 
1771   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1772   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1773 
1774   /* forward solve the lower triangular */
1775   idx    = 6*(*r++);
1776   t[0] = b[idx];   t[1] = b[1+idx];
1777   t[2] = b[2+idx]; t[3] = b[3+idx];
1778   t[4] = b[4+idx]; t[5] = b[5+idx];
1779   for (i=1; i<n; i++) {
1780     v     = aa + 36*ai[i];
1781     vi    = aj + ai[i];
1782     nz    = diag[i] - ai[i];
1783     idx   = 6*(*r++);
1784     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1785     s5  = b[4+idx]; s6 = b[5+idx];
1786     while (nz--) {
1787       idx   = 6*(*vi++);
1788       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1789       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1790       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1791       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1792       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1793       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1794       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1795       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1796       v += 36;
1797     }
1798     idx = 6*i;
1799     t[idx]   = s1;t[1+idx] = s2;
1800     t[2+idx] = s3;t[3+idx] = s4;
1801     t[4+idx] = s5;t[5+idx] = s6;
1802   }
1803   /* backward solve the upper triangular */
1804   for (i=n-1; i>=0; i--){
1805     v    = aa + 36*diag[i] + 36;
1806     vi   = aj + diag[i] + 1;
1807     nz   = ai[i+1] - diag[i] - 1;
1808     idt  = 6*i;
1809     s1 = t[idt];  s2 = t[1+idt];
1810     s3 = t[2+idt];s4 = t[3+idt];
1811     s5 = t[4+idt];s6 = t[5+idt];
1812     while (nz--) {
1813       idx   = 6*(*vi++);
1814       x1    = t[idx];   x2 = t[1+idx];
1815       x3    = t[2+idx]; x4 = t[3+idx];
1816       x5    = t[4+idx]; x6 = t[5+idx];
1817       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1818       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1819       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1820       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1821       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1822       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1823       v += 36;
1824     }
1825     idc = 6*(*c--);
1826     v   = aa + 36*diag[i];
1827     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1828                                  v[18]*s4+v[24]*s5+v[30]*s6;
1829     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1830                                  v[19]*s4+v[25]*s5+v[31]*s6;
1831     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1832                                  v[20]*s4+v[26]*s5+v[32]*s6;
1833     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1834                                  v[21]*s4+v[27]*s5+v[33]*s6;
1835     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1836                                  v[22]*s4+v[28]*s5+v[34]*s6;
1837     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1838                                  v[23]*s4+v[29]*s5+v[35]*s6;
1839   }
1840 
1841   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1842   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1843   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1844   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1845   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1846   PetscFunctionReturn(0);
1847 }
1848 
1849 #undef __FUNCT__
1850 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
1851 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
1852 {
1853   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1854   IS                iscol=a->col,isrow=a->row;
1855   PetscErrorCode    ierr;
1856   const PetscInt    *r,*c,*rout,*cout;
1857   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
1858   const MatScalar   *aa=a->a,*v;
1859   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1860   const PetscScalar *b;
1861   PetscFunctionBegin;
1862   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1863   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1864   t  = a->solve_work;
1865 
1866   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1867   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1868 
1869   /* forward solve the lower triangular */
1870   idx    = 6*r[0];
1871   t[0] = b[idx];   t[1] = b[1+idx];
1872   t[2] = b[2+idx]; t[3] = b[3+idx];
1873   t[4] = b[4+idx]; t[5] = b[5+idx];
1874   for (i=1; i<n; i++) {
1875     v     = aa + 36*ai[i];
1876     vi    = aj + ai[i];
1877     nz    = ai[i+1] - ai[i];
1878     idx   = 6*r[i];
1879     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1880     s5  = b[4+idx]; s6 = b[5+idx];
1881     for(m=0;m<nz;m++){
1882       idx   = 6*vi[m];
1883       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1884       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1885       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1886       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1887       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1888       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1889       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1890       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1891       v += 36;
1892     }
1893     idx = 6*i;
1894     t[idx]   = s1;t[1+idx] = s2;
1895     t[2+idx] = s3;t[3+idx] = s4;
1896     t[4+idx] = s5;t[5+idx] = s6;
1897   }
1898   /* backward solve the upper triangular */
1899   for (i=n-1; i>=0; i--){
1900     k    = 2*n-i;
1901     v    = aa + 36*ai[k];
1902     vi   = aj + ai[k];
1903     nz   = ai[k+1] - ai[k] - 1;
1904     idt  = 6*i;
1905     s1 = t[idt];  s2 = t[1+idt];
1906     s3 = t[2+idt];s4 = t[3+idt];
1907     s5 = t[4+idt];s6 = t[5+idt];
1908     for(m=0;m<nz;m++){
1909       idx   = 6*vi[m];
1910       x1    = t[idx];   x2 = t[1+idx];
1911       x3    = t[2+idx]; x4 = t[3+idx];
1912       x5    = t[4+idx]; x6 = t[5+idx];
1913       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1914       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1915       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1916       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1917       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1918       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1919       v += 36;
1920     }
1921     idc = 6*c[i];
1922     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1923                                  v[18]*s4+v[24]*s5+v[30]*s6;
1924     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1925                                  v[19]*s4+v[25]*s5+v[31]*s6;
1926     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1927                                  v[20]*s4+v[26]*s5+v[32]*s6;
1928     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1929                                  v[21]*s4+v[27]*s5+v[33]*s6;
1930     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1931                                  v[22]*s4+v[28]*s5+v[34]*s6;
1932     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1933                                  v[23]*s4+v[29]*s5+v[35]*s6;
1934   }
1935 
1936   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1937   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1938   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1939   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1940   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1941   PetscFunctionReturn(0);
1942 }
1943 
1944 #undef __FUNCT__
1945 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2"
1946 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1947 {
1948   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1949   IS                iscol=a->col,isrow=a->row;
1950   PetscErrorCode    ierr;
1951   const PetscInt    *r,*c,*rout,*cout;
1952   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
1953   const MatScalar   *aa=a->a,*v;
1954   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1955   const PetscScalar *b;
1956   PetscFunctionBegin;
1957   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1958   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1959   t  = a->solve_work;
1960 
1961   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1962   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1963 
1964   /* forward solve the lower triangular */
1965   idx    = 6*r[0];
1966   t[0] = b[idx];   t[1] = b[1+idx];
1967   t[2] = b[2+idx]; t[3] = b[3+idx];
1968   t[4] = b[4+idx]; t[5] = b[5+idx];
1969   for (i=1; i<n; i++) {
1970     v     = aa + 36*ai[i];
1971     vi    = aj + ai[i];
1972     nz    = ai[i+1] - ai[i];
1973     idx   = 6*r[i];
1974     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1975     s5  = b[4+idx]; s6 = b[5+idx];
1976     for(m=0;m<nz;m++){
1977       idx   = 6*vi[m];
1978       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1979       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1980       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1981       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1982       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1983       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1984       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1985       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1986       v += 36;
1987     }
1988     idx = 6*i;
1989     t[idx]   = s1;t[1+idx] = s2;
1990     t[2+idx] = s3;t[3+idx] = s4;
1991     t[4+idx] = s5;t[5+idx] = s6;
1992   }
1993   /* backward solve the upper triangular */
1994   for (i=n-1; i>=0; i--){
1995     v    = aa + 36*(adiag[i+1]+1);
1996     vi   = aj + adiag[i+1]+1;
1997     nz   = adiag[i] - adiag[i+1] - 1;
1998     idt  = 6*i;
1999     s1 = t[idt];  s2 = t[1+idt];
2000     s3 = t[2+idt];s4 = t[3+idt];
2001     s5 = t[4+idt];s6 = t[5+idt];
2002     for(m=0;m<nz;m++){
2003       idx   = 6*vi[m];
2004       x1    = t[idx];   x2 = t[1+idx];
2005       x3    = t[2+idx]; x4 = t[3+idx];
2006       x5    = t[4+idx]; x6 = t[5+idx];
2007       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2008       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2009       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2010       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2011       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2012       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2013       v += 36;
2014     }
2015     idc = 6*c[i];
2016     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2017                                  v[18]*s4+v[24]*s5+v[30]*s6;
2018     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2019                                  v[19]*s4+v[25]*s5+v[31]*s6;
2020     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2021                                  v[20]*s4+v[26]*s5+v[32]*s6;
2022     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2023                                  v[21]*s4+v[27]*s5+v[33]*s6;
2024     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2025                                  v[22]*s4+v[28]*s5+v[34]*s6;
2026     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2027                                  v[23]*s4+v[29]*s5+v[35]*s6;
2028   }
2029 
2030   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2031   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2032   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2033   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2034   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2035   PetscFunctionReturn(0);
2036 }
2037 
2038 #undef __FUNCT__
2039 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
2040 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
2041 {
2042   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2043   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2044   PetscErrorCode    ierr;
2045   PetscInt          *diag = a->diag,jdx;
2046   const MatScalar   *aa=a->a,*v;
2047   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2048   const PetscScalar *b;
2049 
2050   PetscFunctionBegin;
2051   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2052   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2053   /* forward solve the lower triangular */
2054   idx    = 0;
2055   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2056   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2057   for (i=1; i<n; i++) {
2058     v     =  aa + 36*ai[i];
2059     vi    =  aj + ai[i];
2060     nz    =  diag[i] - ai[i];
2061     idx   =  6*i;
2062     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2063     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2064     while (nz--) {
2065       jdx   = 6*(*vi++);
2066       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2067       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2068       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2069       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2070       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2071       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2072       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2073       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2074       v += 36;
2075      }
2076     x[idx]   = s1;
2077     x[1+idx] = s2;
2078     x[2+idx] = s3;
2079     x[3+idx] = s4;
2080     x[4+idx] = s5;
2081     x[5+idx] = s6;
2082   }
2083   /* backward solve the upper triangular */
2084   for (i=n-1; i>=0; i--){
2085     v    = aa + 36*diag[i] + 36;
2086     vi   = aj + diag[i] + 1;
2087     nz   = ai[i+1] - diag[i] - 1;
2088     idt  = 6*i;
2089     s1 = x[idt];   s2 = x[1+idt];
2090     s3 = x[2+idt]; s4 = x[3+idt];
2091     s5 = x[4+idt]; s6 = x[5+idt];
2092     while (nz--) {
2093       idx   = 6*(*vi++);
2094       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2095       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2096       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2097       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2098       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2099       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2100       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2101       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2102       v += 36;
2103     }
2104     v        = aa + 36*diag[i];
2105     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2106     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2107     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2108     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2109     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2110     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2111   }
2112 
2113   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2114   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2115   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2116   PetscFunctionReturn(0);
2117 }
2118 
2119 #undef __FUNCT__
2120 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
2121 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2122 {
2123     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2124     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2125     PetscErrorCode    ierr;
2126     PetscInt          idx,jdx,idt;
2127     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2128     const MatScalar   *aa=a->a,*v;
2129     PetscScalar       *x;
2130     const PetscScalar *b;
2131     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2132 
2133     PetscFunctionBegin;
2134     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2135     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2136     /* forward solve the lower triangular */
2137     idx    = 0;
2138     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2139     x[4] = b[4+idx];x[5] = b[5+idx];
2140     for (i=1; i<n; i++) {
2141        v    = aa + bs2*ai[i];
2142        vi   = aj + ai[i];
2143        nz   = ai[i+1] - ai[i];
2144       idx   = bs*i;
2145        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2146        s5   = b[4+idx];s6 = b[5+idx];
2147        for(k=0;k<nz;k++){
2148           jdx   = bs*vi[k];
2149           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2150 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2151           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2152           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2153           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2154 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2155           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2156 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2157           v   +=  bs2;
2158         }
2159 
2160        x[idx]   = s1;
2161        x[1+idx] = s2;
2162        x[2+idx] = s3;
2163        x[3+idx] = s4;
2164        x[4+idx] = s5;
2165        x[5+idx] = s6;
2166     }
2167 
2168    /* backward solve the upper triangular */
2169   for (i=n-1; i>=0; i--){
2170      v   = aa + bs2*ai[2*n-i];
2171      vi  = aj + ai[2*n-i];
2172      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2173      idt = bs*i;
2174      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2175      s5 = x[4+idt];s6 = x[5+idt];
2176      for(k=0;k<nz;k++){
2177       idx   = bs*vi[k];
2178        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2179        x5    = x[4+idx];x6 = x[5+idx];
2180        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2181        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2182        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2183        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2184        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2185        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2186         v   +=  bs2;
2187     }
2188     /* x = inv_diagonal*x */
2189    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2190    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2191    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2192    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2193    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2194    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2195   }
2196 
2197   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2198   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2199   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2200   PetscFunctionReturn(0);
2201 }
2202 
2203 #undef __FUNCT__
2204 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2"
2205 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2206 {
2207     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2208     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2209     PetscErrorCode    ierr;
2210     PetscInt          idx,jdx,idt;
2211     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2212     const MatScalar   *aa=a->a,*v;
2213     PetscScalar       *x;
2214     const PetscScalar *b;
2215     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2216 
2217     PetscFunctionBegin;
2218     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2219     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2220     /* forward solve the lower triangular */
2221     idx    = 0;
2222     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2223     x[4] = b[4+idx];x[5] = b[5+idx];
2224     for (i=1; i<n; i++) {
2225        v    = aa + bs2*ai[i];
2226        vi   = aj + ai[i];
2227        nz   = ai[i+1] - ai[i];
2228       idx   = bs*i;
2229        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2230        s5   = b[4+idx];s6 = b[5+idx];
2231        for(k=0;k<nz;k++){
2232           jdx   = bs*vi[k];
2233           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2234 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2235           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2236           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2237           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2238 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2239           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2240 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2241           v   +=  bs2;
2242         }
2243 
2244        x[idx]   = s1;
2245        x[1+idx] = s2;
2246        x[2+idx] = s3;
2247        x[3+idx] = s4;
2248        x[4+idx] = s5;
2249        x[5+idx] = s6;
2250     }
2251 
2252    /* backward solve the upper triangular */
2253   for (i=n-1; i>=0; i--){
2254     v   = aa + bs2*(adiag[i+1]+1);
2255      vi  = aj + adiag[i+1]+1;
2256      nz  = adiag[i] - adiag[i+1]-1;
2257      idt = bs*i;
2258      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2259      s5 = x[4+idt];s6 = x[5+idt];
2260      for(k=0;k<nz;k++){
2261       idx   = bs*vi[k];
2262        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2263        x5    = x[4+idx];x6 = x[5+idx];
2264        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2265        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2266        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2267        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2268        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2269        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2270         v   +=  bs2;
2271     }
2272     /* x = inv_diagonal*x */
2273    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2274    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2275    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2276    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2277    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2278    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2279   }
2280 
2281   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2282   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2283   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2284   PetscFunctionReturn(0);
2285 }
2286 
2287 #undef __FUNCT__
2288 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2289 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
2290 {
2291   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2292   IS                iscol=a->col,isrow=a->row;
2293   PetscErrorCode    ierr;
2294   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
2295   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2296   const MatScalar   *aa=a->a,*v;
2297   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2298   const PetscScalar *b;
2299 
2300   PetscFunctionBegin;
2301   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2302   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2303   t  = a->solve_work;
2304 
2305   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2306   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2307 
2308   /* forward solve the lower triangular */
2309   idx    = 5*(*r++);
2310   t[0] = b[idx];   t[1] = b[1+idx];
2311   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2312   for (i=1; i<n; i++) {
2313     v     = aa + 25*ai[i];
2314     vi    = aj + ai[i];
2315     nz    = diag[i] - ai[i];
2316     idx   = 5*(*r++);
2317     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2318     s5  = b[4+idx];
2319     while (nz--) {
2320       idx   = 5*(*vi++);
2321       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2322       x4    = t[3+idx];x5 = t[4+idx];
2323       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2324       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2325       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2326       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2327       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2328       v += 25;
2329     }
2330     idx = 5*i;
2331     t[idx]   = s1;t[1+idx] = s2;
2332     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2333   }
2334   /* backward solve the upper triangular */
2335   for (i=n-1; i>=0; i--){
2336     v    = aa + 25*diag[i] + 25;
2337     vi   = aj + diag[i] + 1;
2338     nz   = ai[i+1] - diag[i] - 1;
2339     idt  = 5*i;
2340     s1 = t[idt];  s2 = t[1+idt];
2341     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2342     while (nz--) {
2343       idx   = 5*(*vi++);
2344       x1    = t[idx];   x2 = t[1+idx];
2345       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2346       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2347       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2348       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2349       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2350       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2351       v += 25;
2352     }
2353     idc = 5*(*c--);
2354     v   = aa + 25*diag[i];
2355     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2356                                  v[15]*s4+v[20]*s5;
2357     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2358                                  v[16]*s4+v[21]*s5;
2359     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2360                                  v[17]*s4+v[22]*s5;
2361     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2362                                  v[18]*s4+v[23]*s5;
2363     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2364                                  v[19]*s4+v[24]*s5;
2365   }
2366 
2367   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2368   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2369   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2370   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2371   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2372   PetscFunctionReturn(0);
2373 }
2374 
2375 #undef __FUNCT__
2376 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2377 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
2378 {
2379   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2380   IS                iscol=a->col,isrow=a->row;
2381   PetscErrorCode    ierr;
2382   const PetscInt    *r,*c,*rout,*cout;
2383   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2384   const MatScalar   *aa=a->a,*v;
2385   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2386   const PetscScalar *b;
2387 
2388   PetscFunctionBegin;
2389   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2390   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2391   t  = a->solve_work;
2392 
2393   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2394   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2395 
2396   /* forward solve the lower triangular */
2397   idx    = 5*r[0];
2398   t[0] = b[idx];   t[1] = b[1+idx];
2399   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2400   for (i=1; i<n; i++) {
2401     v     = aa + 25*ai[i];
2402     vi    = aj + ai[i];
2403     nz    = ai[i+1] - ai[i];
2404     idx   = 5*r[i];
2405     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2406     s5  = b[4+idx];
2407     for(m=0;m<nz;m++){
2408       idx   = 5*vi[m];
2409       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2410       x4    = t[3+idx];x5 = t[4+idx];
2411       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2412       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2413       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2414       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2415       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2416       v += 25;
2417     }
2418     idx = 5*i;
2419     t[idx]   = s1;t[1+idx] = s2;
2420     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2421   }
2422   /* backward solve the upper triangular */
2423   for (i=n-1; i>=0; i--){
2424     k    = 2*n-i;
2425     v    = aa + 25*ai[k];
2426     vi   = aj + ai[k];
2427     nz   = ai[k+1] - ai[k] - 1;
2428     idt  = 5*i;
2429     s1 = t[idt];  s2 = t[1+idt];
2430     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2431     for(m=0;m<nz;m++){
2432       idx   = 5*vi[m];
2433       x1    = t[idx];   x2 = t[1+idx];
2434       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2435       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2436       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2437       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2438       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2439       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2440       v += 25;
2441     }
2442     idc = 5*c[i];
2443     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2444                                  v[15]*s4+v[20]*s5;
2445     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2446                                  v[16]*s4+v[21]*s5;
2447     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2448                                  v[17]*s4+v[22]*s5;
2449     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2450                                  v[18]*s4+v[23]*s5;
2451     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2452                                  v[19]*s4+v[24]*s5;
2453   }
2454 
2455   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2456   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2457   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2459   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2460   PetscFunctionReturn(0);
2461 }
2462 
2463 #undef __FUNCT__
2464 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2"
2465 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2466 {
2467   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2468   IS                iscol=a->col,isrow=a->row;
2469   PetscErrorCode    ierr;
2470   const PetscInt    *r,*c,*rout,*cout;
2471   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2472   const MatScalar   *aa=a->a,*v;
2473   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2474   const PetscScalar *b;
2475 
2476   PetscFunctionBegin;
2477   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2478   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2479   t  = a->solve_work;
2480 
2481   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2482   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2483 
2484   /* forward solve the lower triangular */
2485   idx    = 5*r[0];
2486   t[0] = b[idx];   t[1] = b[1+idx];
2487   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2488   for (i=1; i<n; i++) {
2489     v     = aa + 25*ai[i];
2490     vi    = aj + ai[i];
2491     nz    = ai[i+1] - ai[i];
2492     idx   = 5*r[i];
2493     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2494     s5  = b[4+idx];
2495     for(m=0;m<nz;m++){
2496       idx   = 5*vi[m];
2497       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2498       x4    = t[3+idx];x5 = t[4+idx];
2499       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2500       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2501       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2502       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2503       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2504       v += 25;
2505     }
2506     idx = 5*i;
2507     t[idx]   = s1;t[1+idx] = s2;
2508     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2509   }
2510   /* backward solve the upper triangular */
2511   for (i=n-1; i>=0; i--){
2512     v    = aa + 25*(adiag[i+1]+1);
2513     vi   = aj + adiag[i+1]+1;
2514     nz   = adiag[i] - adiag[i+1] - 1;
2515     idt  = 5*i;
2516     s1 = t[idt];  s2 = t[1+idt];
2517     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2518     for(m=0;m<nz;m++){
2519       idx   = 5*vi[m];
2520       x1    = t[idx];   x2 = t[1+idx];
2521       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2522       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2523       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2524       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2525       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2526       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2527       v += 25;
2528     }
2529     idc = 5*c[i];
2530     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2531                                  v[15]*s4+v[20]*s5;
2532     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2533                                  v[16]*s4+v[21]*s5;
2534     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2535                                  v[17]*s4+v[22]*s5;
2536     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2537                                  v[18]*s4+v[23]*s5;
2538     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2539                                  v[19]*s4+v[24]*s5;
2540   }
2541 
2542   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2543   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2544   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2545   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2546   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2547   PetscFunctionReturn(0);
2548 }
2549 
2550 #undef __FUNCT__
2551 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2552 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
2553 {
2554   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2555   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2556   PetscErrorCode    ierr;
2557   PetscInt          *diag = a->diag,jdx;
2558   const MatScalar   *aa=a->a,*v;
2559   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2560   const PetscScalar *b;
2561 
2562   PetscFunctionBegin;
2563   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2564   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2565   /* forward solve the lower triangular */
2566   idx    = 0;
2567   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2568   for (i=1; i<n; i++) {
2569     v     =  aa + 25*ai[i];
2570     vi    =  aj + ai[i];
2571     nz    =  diag[i] - ai[i];
2572     idx   =  5*i;
2573     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2574     while (nz--) {
2575       jdx   = 5*(*vi++);
2576       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2577       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2578       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2579       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2580       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2581       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2582       v    += 25;
2583     }
2584     x[idx]   = s1;
2585     x[1+idx] = s2;
2586     x[2+idx] = s3;
2587     x[3+idx] = s4;
2588     x[4+idx] = s5;
2589   }
2590   /* backward solve the upper triangular */
2591   for (i=n-1; i>=0; i--){
2592     v    = aa + 25*diag[i] + 25;
2593     vi   = aj + diag[i] + 1;
2594     nz   = ai[i+1] - diag[i] - 1;
2595     idt  = 5*i;
2596     s1 = x[idt];  s2 = x[1+idt];
2597     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2598     while (nz--) {
2599       idx   = 5*(*vi++);
2600       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2601       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2602       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2603       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2604       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2605       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2606       v    += 25;
2607     }
2608     v        = aa + 25*diag[i];
2609     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2610     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2611     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2612     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2613     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2614   }
2615 
2616   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2617   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2618   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2619   PetscFunctionReturn(0);
2620 }
2621 
2622 #undef __FUNCT__
2623 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2624 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2625 {
2626   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2627   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2628   PetscErrorCode    ierr;
2629   PetscInt          jdx;
2630   const MatScalar   *aa=a->a,*v;
2631   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2632   const PetscScalar *b;
2633 
2634   PetscFunctionBegin;
2635   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2636   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2637   /* forward solve the lower triangular */
2638   idx    = 0;
2639   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2640   for (i=1; i<n; i++) {
2641     v   = aa + 25*ai[i];
2642     vi  = aj + ai[i];
2643     nz  = ai[i+1] - ai[i];
2644     idx = 5*i;
2645     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2646     for(k=0;k<nz;k++) {
2647       jdx   = 5*vi[k];
2648       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2649       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2650       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2651       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2652       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2653       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2654       v    += 25;
2655     }
2656     x[idx]   = s1;
2657     x[1+idx] = s2;
2658     x[2+idx] = s3;
2659     x[3+idx] = s4;
2660     x[4+idx] = s5;
2661   }
2662 
2663   /* backward solve the upper triangular */
2664   for (i=n-1; i>=0; i--){
2665     v   = aa + 25*ai[2*n-i];
2666     vi  = aj + ai[2*n-i];
2667     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2668     idt = 5*i;
2669     s1 = x[idt];  s2 = x[1+idt];
2670     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2671     for(k=0;k<nz;k++){
2672       idx   = 5*vi[k];
2673       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2674       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2675       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2676       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2677       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2678       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2679       v    += 25;
2680     }
2681     /* x = inv_diagonal*x */
2682     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2683     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2684     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2685     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2686     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2687   }
2688 
2689   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2690   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2691   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2692   PetscFunctionReturn(0);
2693 }
2694 
2695 #undef __FUNCT__
2696 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2"
2697 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2698 {
2699   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2700   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
2701   PetscErrorCode    ierr;
2702   PetscInt          jdx;
2703   const MatScalar   *aa=a->a,*v;
2704   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2705   const PetscScalar *b;
2706 
2707   PetscFunctionBegin;
2708   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2709   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2710   /* forward solve the lower triangular */
2711   idx    = 0;
2712   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2713   for (i=1; i<n; i++) {
2714     v   = aa + 25*ai[i];
2715     vi  = aj + ai[i];
2716     nz  = ai[i+1] - ai[i];
2717     idx = 5*i;
2718     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2719     for(k=0;k<nz;k++) {
2720       jdx   = 5*vi[k];
2721       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2722       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2723       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2724       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2725       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2726       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2727       v    += 25;
2728     }
2729     x[idx]   = s1;
2730     x[1+idx] = s2;
2731     x[2+idx] = s3;
2732     x[3+idx] = s4;
2733     x[4+idx] = s5;
2734   }
2735 
2736   /* backward solve the upper triangular */
2737   for (i=n-1; i>=0; i--){
2738     v   = aa + 25*(adiag[i+1]+1);
2739     vi  = aj + adiag[i+1]+1;
2740     nz  = adiag[i] - adiag[i+1]-1;
2741     idt = 5*i;
2742     s1 = x[idt];  s2 = x[1+idt];
2743     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2744     for(k=0;k<nz;k++){
2745       idx   = 5*vi[k];
2746       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2747       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2748       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2749       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2750       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2751       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2752       v    += 25;
2753     }
2754     /* x = inv_diagonal*x */
2755     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2756     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2757     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2758     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2759     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2760   }
2761 
2762   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2763   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2764   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2765   PetscFunctionReturn(0);
2766 }
2767 
2768 #undef __FUNCT__
2769 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2770 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2771 {
2772   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2773   IS                iscol=a->col,isrow=a->row;
2774   PetscErrorCode    ierr;
2775   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2776   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2777   const MatScalar   *aa=a->a,*v;
2778   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2779   const PetscScalar *b;
2780 
2781   PetscFunctionBegin;
2782   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2783   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2784   t  = a->solve_work;
2785 
2786   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2787   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2788 
2789   /* forward solve the lower triangular */
2790   idx    = 4*(*r++);
2791   t[0] = b[idx];   t[1] = b[1+idx];
2792   t[2] = b[2+idx]; t[3] = b[3+idx];
2793   for (i=1; i<n; i++) {
2794     v     = aa + 16*ai[i];
2795     vi    = aj + ai[i];
2796     nz    = diag[i] - ai[i];
2797     idx   = 4*(*r++);
2798     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2799     while (nz--) {
2800       idx   = 4*(*vi++);
2801       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2802       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2803       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2804       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2805       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2806       v    += 16;
2807     }
2808     idx        = 4*i;
2809     t[idx]   = s1;t[1+idx] = s2;
2810     t[2+idx] = s3;t[3+idx] = s4;
2811   }
2812   /* backward solve the upper triangular */
2813   for (i=n-1; i>=0; i--){
2814     v    = aa + 16*diag[i] + 16;
2815     vi   = aj + diag[i] + 1;
2816     nz   = ai[i+1] - diag[i] - 1;
2817     idt  = 4*i;
2818     s1 = t[idt];  s2 = t[1+idt];
2819     s3 = t[2+idt];s4 = t[3+idt];
2820     while (nz--) {
2821       idx   = 4*(*vi++);
2822       x1    = t[idx];   x2 = t[1+idx];
2823       x3    = t[2+idx]; x4 = t[3+idx];
2824       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2825       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2826       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2827       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2828       v += 16;
2829     }
2830     idc      = 4*(*c--);
2831     v        = aa + 16*diag[i];
2832     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2833     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2834     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2835     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2836   }
2837 
2838   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2839   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2840   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2841   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2842   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2843   PetscFunctionReturn(0);
2844 }
2845 
2846 #undef __FUNCT__
2847 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
2848 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
2849 {
2850   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2851   IS                iscol=a->col,isrow=a->row;
2852   PetscErrorCode    ierr;
2853   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2854   const PetscInt    *r,*c,*rout,*cout;
2855   const MatScalar   *aa=a->a,*v;
2856   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2857   const PetscScalar *b;
2858 
2859   PetscFunctionBegin;
2860   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2861   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2862   t  = a->solve_work;
2863 
2864   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2865   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2866 
2867   /* forward solve the lower triangular */
2868   idx    = 4*r[0];
2869   t[0] = b[idx];   t[1] = b[1+idx];
2870   t[2] = b[2+idx]; t[3] = b[3+idx];
2871   for (i=1; i<n; i++) {
2872     v     = aa + 16*ai[i];
2873     vi    = aj + ai[i];
2874     nz    = ai[i+1] - ai[i];
2875     idx   = 4*r[i];
2876     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2877     for(m=0;m<nz;m++){
2878       idx   = 4*vi[m];
2879       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2880       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2881       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2882       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2883       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2884       v    += 16;
2885     }
2886     idx        = 4*i;
2887     t[idx]   = s1;t[1+idx] = s2;
2888     t[2+idx] = s3;t[3+idx] = s4;
2889   }
2890   /* backward solve the upper triangular */
2891   for (i=n-1; i>=0; i--){
2892     k    = 2*n-i;
2893     v    = aa + 16*ai[k];
2894     vi   = aj + ai[k];
2895     nz   = ai[k+1] - ai[k] - 1;
2896     idt  = 4*i;
2897     s1 = t[idt];  s2 = t[1+idt];
2898     s3 = t[2+idt];s4 = t[3+idt];
2899     for(m=0;m<nz;m++){
2900       idx   = 4*vi[m];
2901       x1    = t[idx];   x2 = t[1+idx];
2902       x3    = t[2+idx]; x4 = t[3+idx];
2903       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2904       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2905       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2906       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2907       v += 16;
2908     }
2909     idc      = 4*c[i];
2910     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2911     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2912     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2913     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2914   }
2915 
2916   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2917   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2918   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2919   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2920   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2921   PetscFunctionReturn(0);
2922 }
2923 
2924 #undef __FUNCT__
2925 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2"
2926 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2927 {
2928   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2929   IS                iscol=a->col,isrow=a->row;
2930   PetscErrorCode    ierr;
2931   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2932   const PetscInt    *r,*c,*rout,*cout;
2933   const MatScalar   *aa=a->a,*v;
2934   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2935   const PetscScalar *b;
2936 
2937   PetscFunctionBegin;
2938   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2939   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2940   t  = a->solve_work;
2941 
2942   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2943   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2944 
2945   /* forward solve the lower triangular */
2946   idx    = 4*r[0];
2947   t[0] = b[idx];   t[1] = b[1+idx];
2948   t[2] = b[2+idx]; t[3] = b[3+idx];
2949   for (i=1; i<n; i++) {
2950     v     = aa + 16*ai[i];
2951     vi    = aj + ai[i];
2952     nz    = ai[i+1] - ai[i];
2953     idx   = 4*r[i];
2954     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2955     for(m=0;m<nz;m++){
2956       idx   = 4*vi[m];
2957       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2958       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2959       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2960       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2961       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2962       v    += 16;
2963     }
2964     idx        = 4*i;
2965     t[idx]   = s1;t[1+idx] = s2;
2966     t[2+idx] = s3;t[3+idx] = s4;
2967   }
2968   /* backward solve the upper triangular */
2969   for (i=n-1; i>=0; i--){
2970     v    = aa + 16*(adiag[i+1]+1);
2971     vi   = aj + adiag[i+1]+1;
2972     nz   = adiag[i] - adiag[i+1] - 1;
2973     idt  = 4*i;
2974     s1 = t[idt];  s2 = t[1+idt];
2975     s3 = t[2+idt];s4 = t[3+idt];
2976     for(m=0;m<nz;m++){
2977       idx   = 4*vi[m];
2978       x1    = t[idx];   x2 = t[1+idx];
2979       x3    = t[2+idx]; x4 = t[3+idx];
2980       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2981       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2982       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2983       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2984       v += 16;
2985     }
2986     idc      = 4*c[i];
2987     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2988     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2989     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2990     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2991   }
2992 
2993   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2994   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2995   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2996   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2997   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2998   PetscFunctionReturn(0);
2999 }
3000 
3001 #undef __FUNCT__
3002 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3003 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3004 {
3005   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3006   IS                iscol=a->col,isrow=a->row;
3007   PetscErrorCode    ierr;
3008   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3009   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3010   const MatScalar   *aa=a->a,*v;
3011   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3012   PetscScalar       *x;
3013   const PetscScalar *b;
3014 
3015   PetscFunctionBegin;
3016   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3017   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3018   t  = (MatScalar *)a->solve_work;
3019 
3020   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3021   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3022 
3023   /* forward solve the lower triangular */
3024   idx    = 4*(*r++);
3025   t[0] = (MatScalar)b[idx];
3026   t[1] = (MatScalar)b[1+idx];
3027   t[2] = (MatScalar)b[2+idx];
3028   t[3] = (MatScalar)b[3+idx];
3029   for (i=1; i<n; i++) {
3030     v     = aa + 16*ai[i];
3031     vi    = aj + ai[i];
3032     nz    = diag[i] - ai[i];
3033     idx   = 4*(*r++);
3034     s1 = (MatScalar)b[idx];
3035     s2 = (MatScalar)b[1+idx];
3036     s3 = (MatScalar)b[2+idx];
3037     s4 = (MatScalar)b[3+idx];
3038     while (nz--) {
3039       idx   = 4*(*vi++);
3040       x1  = t[idx];
3041       x2  = t[1+idx];
3042       x3  = t[2+idx];
3043       x4  = t[3+idx];
3044       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3045       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3046       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3047       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3048       v    += 16;
3049     }
3050     idx        = 4*i;
3051     t[idx]   = s1;
3052     t[1+idx] = s2;
3053     t[2+idx] = s3;
3054     t[3+idx] = s4;
3055   }
3056   /* backward solve the upper triangular */
3057   for (i=n-1; i>=0; i--){
3058     v    = aa + 16*diag[i] + 16;
3059     vi   = aj + diag[i] + 1;
3060     nz   = ai[i+1] - diag[i] - 1;
3061     idt  = 4*i;
3062     s1 = t[idt];
3063     s2 = t[1+idt];
3064     s3 = t[2+idt];
3065     s4 = t[3+idt];
3066     while (nz--) {
3067       idx   = 4*(*vi++);
3068       x1  = t[idx];
3069       x2  = t[1+idx];
3070       x3  = t[2+idx];
3071       x4  = t[3+idx];
3072       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3073       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3074       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3075       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3076       v += 16;
3077     }
3078     idc      = 4*(*c--);
3079     v        = aa + 16*diag[i];
3080     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3081     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3082     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3083     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3084     x[idc]   = (PetscScalar)t[idt];
3085     x[1+idc] = (PetscScalar)t[1+idt];
3086     x[2+idc] = (PetscScalar)t[2+idt];
3087     x[3+idc] = (PetscScalar)t[3+idt];
3088  }
3089 
3090   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3091   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3092   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3093   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3094   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3095   PetscFunctionReturn(0);
3096 }
3097 
3098 #if defined (PETSC_HAVE_SSE)
3099 
3100 #include PETSC_HAVE_SSE
3101 
3102 #undef __FUNCT__
3103 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3104 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3105 {
3106   /*
3107      Note: This code uses demotion of double
3108      to float when performing the mixed-mode computation.
3109      This may not be numerically reasonable for all applications.
3110   */
3111   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3112   IS             iscol=a->col,isrow=a->row;
3113   PetscErrorCode ierr;
3114   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3115   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3116   MatScalar      *aa=a->a,*v;
3117   PetscScalar    *x,*b,*t;
3118 
3119   /* Make space in temp stack for 16 Byte Aligned arrays */
3120   float           ssealignedspace[11],*tmps,*tmpx;
3121   unsigned long   offset;
3122 
3123   PetscFunctionBegin;
3124   SSE_SCOPE_BEGIN;
3125 
3126     offset = (unsigned long)ssealignedspace % 16;
3127     if (offset) offset = (16 - offset)/4;
3128     tmps = &ssealignedspace[offset];
3129     tmpx = &ssealignedspace[offset+4];
3130     PREFETCH_NTA(aa+16*ai[1]);
3131 
3132     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3133     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3134     t  = a->solve_work;
3135 
3136     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3137     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3138 
3139     /* forward solve the lower triangular */
3140     idx  = 4*(*r++);
3141     t[0] = b[idx];   t[1] = b[1+idx];
3142     t[2] = b[2+idx]; t[3] = b[3+idx];
3143     v    =  aa + 16*ai[1];
3144 
3145     for (i=1; i<n;) {
3146       PREFETCH_NTA(&v[8]);
3147       vi   =  aj      + ai[i];
3148       nz   =  diag[i] - ai[i];
3149       idx  =  4*(*r++);
3150 
3151       /* Demote sum from double to float */
3152       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3153       LOAD_PS(tmps,XMM7);
3154 
3155       while (nz--) {
3156         PREFETCH_NTA(&v[16]);
3157         idx = 4*(*vi++);
3158 
3159         /* Demote solution (so far) from double to float */
3160         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3161 
3162         /* 4x4 Matrix-Vector product with negative accumulation: */
3163         SSE_INLINE_BEGIN_2(tmpx,v)
3164           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3165 
3166           /* First Column */
3167           SSE_COPY_PS(XMM0,XMM6)
3168           SSE_SHUFFLE(XMM0,XMM0,0x00)
3169           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3170           SSE_SUB_PS(XMM7,XMM0)
3171 
3172           /* Second Column */
3173           SSE_COPY_PS(XMM1,XMM6)
3174           SSE_SHUFFLE(XMM1,XMM1,0x55)
3175           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3176           SSE_SUB_PS(XMM7,XMM1)
3177 
3178           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3179 
3180           /* Third Column */
3181           SSE_COPY_PS(XMM2,XMM6)
3182           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3183           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3184           SSE_SUB_PS(XMM7,XMM2)
3185 
3186           /* Fourth Column */
3187           SSE_COPY_PS(XMM3,XMM6)
3188           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3189           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3190           SSE_SUB_PS(XMM7,XMM3)
3191         SSE_INLINE_END_2
3192 
3193         v  += 16;
3194       }
3195       idx = 4*i;
3196       v   = aa + 16*ai[++i];
3197       PREFETCH_NTA(v);
3198       STORE_PS(tmps,XMM7);
3199 
3200       /* Promote result from float to double */
3201       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3202     }
3203     /* backward solve the upper triangular */
3204     idt  = 4*(n-1);
3205     ai16 = 16*diag[n-1];
3206     v    = aa + ai16 + 16;
3207     for (i=n-1; i>=0;){
3208       PREFETCH_NTA(&v[8]);
3209       vi = aj + diag[i] + 1;
3210       nz = ai[i+1] - diag[i] - 1;
3211 
3212       /* Demote accumulator from double to float */
3213       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3214       LOAD_PS(tmps,XMM7);
3215 
3216       while (nz--) {
3217         PREFETCH_NTA(&v[16]);
3218         idx = 4*(*vi++);
3219 
3220         /* Demote solution (so far) from double to float */
3221         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3222 
3223         /* 4x4 Matrix-Vector Product with negative accumulation: */
3224         SSE_INLINE_BEGIN_2(tmpx,v)
3225           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3226 
3227           /* First Column */
3228           SSE_COPY_PS(XMM0,XMM6)
3229           SSE_SHUFFLE(XMM0,XMM0,0x00)
3230           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3231           SSE_SUB_PS(XMM7,XMM0)
3232 
3233           /* Second Column */
3234           SSE_COPY_PS(XMM1,XMM6)
3235           SSE_SHUFFLE(XMM1,XMM1,0x55)
3236           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3237           SSE_SUB_PS(XMM7,XMM1)
3238 
3239           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3240 
3241           /* Third Column */
3242           SSE_COPY_PS(XMM2,XMM6)
3243           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3244           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3245           SSE_SUB_PS(XMM7,XMM2)
3246 
3247           /* Fourth Column */
3248           SSE_COPY_PS(XMM3,XMM6)
3249           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3250           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3251           SSE_SUB_PS(XMM7,XMM3)
3252         SSE_INLINE_END_2
3253         v  += 16;
3254       }
3255       v    = aa + ai16;
3256       ai16 = 16*diag[--i];
3257       PREFETCH_NTA(aa+ai16+16);
3258       /*
3259          Scale the result by the diagonal 4x4 block,
3260          which was inverted as part of the factorization
3261       */
3262       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3263         /* First Column */
3264         SSE_COPY_PS(XMM0,XMM7)
3265         SSE_SHUFFLE(XMM0,XMM0,0x00)
3266         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3267 
3268         /* Second Column */
3269         SSE_COPY_PS(XMM1,XMM7)
3270         SSE_SHUFFLE(XMM1,XMM1,0x55)
3271         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3272         SSE_ADD_PS(XMM0,XMM1)
3273 
3274         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3275 
3276         /* Third Column */
3277         SSE_COPY_PS(XMM2,XMM7)
3278         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3279         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3280         SSE_ADD_PS(XMM0,XMM2)
3281 
3282         /* Fourth Column */
3283         SSE_COPY_PS(XMM3,XMM7)
3284         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3285         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3286         SSE_ADD_PS(XMM0,XMM3)
3287 
3288         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3289       SSE_INLINE_END_3
3290 
3291       /* Promote solution from float to double */
3292       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3293 
3294       /* Apply reordering to t and stream into x.    */
3295       /* This way, x doesn't pollute the cache.      */
3296       /* Be careful with size: 2 doubles = 4 floats! */
3297       idc  = 4*(*c--);
3298       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3299         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3300         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3301         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3302         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3303         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3304         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3305       SSE_INLINE_END_2
3306       v    = aa + ai16 + 16;
3307       idt -= 4;
3308     }
3309 
3310     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3311     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3312     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3313     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3314     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3315   SSE_SCOPE_END;
3316   PetscFunctionReturn(0);
3317 }
3318 
3319 #endif
3320 
3321 
3322 /*
3323       Special case where the matrix was ILU(0) factored in the natural
3324    ordering. This eliminates the need for the column and row permutation.
3325 */
3326 #undef __FUNCT__
3327 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3328 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3329 {
3330   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3331   PetscInt          n=a->mbs;
3332   const PetscInt    *ai=a->i,*aj=a->j;
3333   PetscErrorCode    ierr;
3334   const PetscInt    *diag = a->diag;
3335   const MatScalar   *aa=a->a;
3336   PetscScalar       *x;
3337   const PetscScalar *b;
3338 
3339   PetscFunctionBegin;
3340   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3341   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3342 
3343 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3344   {
3345     static PetscScalar w[2000]; /* very BAD need to fix */
3346     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3347   }
3348 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3349   {
3350     static PetscScalar w[2000]; /* very BAD need to fix */
3351     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3352   }
3353 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3354   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3355 #else
3356   {
3357     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3358     const MatScalar *v;
3359     PetscInt        jdx,idt,idx,nz,i,ai16;
3360     const PetscInt  *vi;
3361 
3362   /* forward solve the lower triangular */
3363   idx    = 0;
3364   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3365   for (i=1; i<n; i++) {
3366     v     =  aa      + 16*ai[i];
3367     vi    =  aj      + ai[i];
3368     nz    =  diag[i] - ai[i];
3369     idx   +=  4;
3370     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3371     while (nz--) {
3372       jdx   = 4*(*vi++);
3373       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3374       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3375       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3376       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3377       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3378       v    += 16;
3379     }
3380     x[idx]   = s1;
3381     x[1+idx] = s2;
3382     x[2+idx] = s3;
3383     x[3+idx] = s4;
3384   }
3385   /* backward solve the upper triangular */
3386   idt = 4*(n-1);
3387   for (i=n-1; i>=0; i--){
3388     ai16 = 16*diag[i];
3389     v    = aa + ai16 + 16;
3390     vi   = aj + diag[i] + 1;
3391     nz   = ai[i+1] - diag[i] - 1;
3392     s1 = x[idt];  s2 = x[1+idt];
3393     s3 = x[2+idt];s4 = x[3+idt];
3394     while (nz--) {
3395       idx   = 4*(*vi++);
3396       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3397       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3398       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3399       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3400       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3401       v    += 16;
3402     }
3403     v        = aa + ai16;
3404     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3405     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3406     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3407     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3408     idt -= 4;
3409   }
3410   }
3411 #endif
3412 
3413   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3414   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3415   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3416   PetscFunctionReturn(0);
3417 }
3418 
3419 #undef __FUNCT__
3420 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3421 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3422 {
3423     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3424     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3425     PetscErrorCode    ierr;
3426     PetscInt          idx,jdx,idt;
3427     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3428     const MatScalar   *aa=a->a,*v;
3429     PetscScalar       *x;
3430     const PetscScalar *b;
3431     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3432 
3433     PetscFunctionBegin;
3434     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3435     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3436     /* forward solve the lower triangular */
3437     idx    = 0;
3438     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3439     for (i=1; i<n; i++) {
3440        v    = aa + bs2*ai[i];
3441        vi   = aj + ai[i];
3442        nz   = ai[i+1] - ai[i];
3443       idx   = bs*i;
3444        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3445       for(k=0;k<nz;k++) {
3446           jdx   = bs*vi[k];
3447           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3448           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3449           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3450           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3451 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3452 
3453           v   +=  bs2;
3454         }
3455 
3456        x[idx]   = s1;
3457        x[1+idx] = s2;
3458        x[2+idx] = s3;
3459        x[3+idx] = s4;
3460     }
3461 
3462    /* backward solve the upper triangular */
3463   for (i=n-1; i>=0; i--){
3464      v   = aa + bs2*ai[2*n-i];
3465      vi  = aj + ai[2*n-i];
3466      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3467      idt = bs*i;
3468      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3469 
3470     for(k=0;k<nz;k++){
3471       idx   = bs*vi[k];
3472        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3473        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3474        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3475        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3476        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3477 
3478         v   +=  bs2;
3479     }
3480     /* x = inv_diagonal*x */
3481    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3482    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3483    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3484    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3485 
3486   }
3487 
3488   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3489   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3490   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3491   PetscFunctionReturn(0);
3492 }
3493 
3494 #undef __FUNCT__
3495 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
3496 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3497 {
3498     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3499     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3500     PetscErrorCode    ierr;
3501     PetscInt          idx,jdx,idt;
3502     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3503     const MatScalar   *aa=a->a,*v;
3504     PetscScalar       *x;
3505     const PetscScalar *b;
3506     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3507 
3508     PetscFunctionBegin;
3509     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3510     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3511     /* forward solve the lower triangular */
3512     idx    = 0;
3513     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3514     for (i=1; i<n; i++) {
3515        v    = aa + bs2*ai[i];
3516        vi   = aj + ai[i];
3517        nz   = ai[i+1] - ai[i];
3518       idx   = bs*i;
3519        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3520       for(k=0;k<nz;k++) {
3521           jdx   = bs*vi[k];
3522           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3523           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3524           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3525           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3526 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3527 
3528           v   +=  bs2;
3529         }
3530 
3531        x[idx]   = s1;
3532        x[1+idx] = s2;
3533        x[2+idx] = s3;
3534        x[3+idx] = s4;
3535     }
3536 
3537    /* backward solve the upper triangular */
3538   for (i=n-1; i>=0; i--){
3539     v   = aa + bs2*(adiag[i+1]+1);
3540      vi  = aj + adiag[i+1]+1;
3541      nz  = adiag[i] - adiag[i+1]-1;
3542      idt = bs*i;
3543      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3544 
3545     for(k=0;k<nz;k++){
3546       idx   = bs*vi[k];
3547        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3548        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3549        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3550        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3551        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3552 
3553         v   +=  bs2;
3554     }
3555     /* x = inv_diagonal*x */
3556    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3557    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3558    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3559    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3560 
3561   }
3562 
3563   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3564   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3565   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3566   PetscFunctionReturn(0);
3567 }
3568 
3569 #undef __FUNCT__
3570 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3571 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3572 {
3573   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3574   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3575   PetscErrorCode ierr;
3576   PetscInt       *diag = a->diag;
3577   MatScalar      *aa=a->a;
3578   PetscScalar    *x,*b;
3579 
3580   PetscFunctionBegin;
3581   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3583 
3584   {
3585     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3586     MatScalar  *v,*t=(MatScalar *)x;
3587     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3588 
3589     /* forward solve the lower triangular */
3590     idx  = 0;
3591     t[0] = (MatScalar)b[0];
3592     t[1] = (MatScalar)b[1];
3593     t[2] = (MatScalar)b[2];
3594     t[3] = (MatScalar)b[3];
3595     for (i=1; i<n; i++) {
3596       v     =  aa      + 16*ai[i];
3597       vi    =  aj      + ai[i];
3598       nz    =  diag[i] - ai[i];
3599       idx   +=  4;
3600       s1 = (MatScalar)b[idx];
3601       s2 = (MatScalar)b[1+idx];
3602       s3 = (MatScalar)b[2+idx];
3603       s4 = (MatScalar)b[3+idx];
3604       while (nz--) {
3605         jdx = 4*(*vi++);
3606         x1  = t[jdx];
3607         x2  = t[1+jdx];
3608         x3  = t[2+jdx];
3609         x4  = t[3+jdx];
3610         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3611         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3612         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3613         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3614         v    += 16;
3615       }
3616       t[idx]   = s1;
3617       t[1+idx] = s2;
3618       t[2+idx] = s3;
3619       t[3+idx] = s4;
3620     }
3621     /* backward solve the upper triangular */
3622     idt = 4*(n-1);
3623     for (i=n-1; i>=0; i--){
3624       ai16 = 16*diag[i];
3625       v    = aa + ai16 + 16;
3626       vi   = aj + diag[i] + 1;
3627       nz   = ai[i+1] - diag[i] - 1;
3628       s1   = t[idt];
3629       s2   = t[1+idt];
3630       s3   = t[2+idt];
3631       s4   = t[3+idt];
3632       while (nz--) {
3633         idx = 4*(*vi++);
3634         x1  = (MatScalar)x[idx];
3635         x2  = (MatScalar)x[1+idx];
3636         x3  = (MatScalar)x[2+idx];
3637         x4  = (MatScalar)x[3+idx];
3638         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3639         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3640         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3641         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3642         v    += 16;
3643       }
3644       v        = aa + ai16;
3645       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3646       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3647       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3648       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3649       idt -= 4;
3650     }
3651   }
3652 
3653   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3654   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3655   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3656   PetscFunctionReturn(0);
3657 }
3658 
3659 #if defined (PETSC_HAVE_SSE)
3660 
3661 #include PETSC_HAVE_SSE
3662 #undef __FUNCT__
3663 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3664 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
3665 {
3666   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3667   unsigned short *aj=(unsigned short *)a->j;
3668   PetscErrorCode ierr;
3669   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3670   MatScalar      *aa=a->a;
3671   PetscScalar    *x,*b;
3672 
3673   PetscFunctionBegin;
3674   SSE_SCOPE_BEGIN;
3675   /*
3676      Note: This code currently uses demotion of double
3677      to float when performing the mixed-mode computation.
3678      This may not be numerically reasonable for all applications.
3679   */
3680   PREFETCH_NTA(aa+16*ai[1]);
3681 
3682   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3683   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3684   {
3685     /* x will first be computed in single precision then promoted inplace to double */
3686     MatScalar      *v,*t=(MatScalar *)x;
3687     int            nz,i,idt,ai16;
3688     unsigned int   jdx,idx;
3689     unsigned short *vi;
3690     /* Forward solve the lower triangular factor. */
3691 
3692     /* First block is the identity. */
3693     idx  = 0;
3694     CONVERT_DOUBLE4_FLOAT4(t,b);
3695     v    =  aa + 16*((unsigned int)ai[1]);
3696 
3697     for (i=1; i<n;) {
3698       PREFETCH_NTA(&v[8]);
3699       vi   =  aj      + ai[i];
3700       nz   =  diag[i] - ai[i];
3701       idx +=  4;
3702 
3703       /* Demote RHS from double to float. */
3704       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3705       LOAD_PS(&t[idx],XMM7);
3706 
3707       while (nz--) {
3708         PREFETCH_NTA(&v[16]);
3709         jdx = 4*((unsigned int)(*vi++));
3710 
3711         /* 4x4 Matrix-Vector product with negative accumulation: */
3712         SSE_INLINE_BEGIN_2(&t[jdx],v)
3713           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3714 
3715           /* First Column */
3716           SSE_COPY_PS(XMM0,XMM6)
3717           SSE_SHUFFLE(XMM0,XMM0,0x00)
3718           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3719           SSE_SUB_PS(XMM7,XMM0)
3720 
3721           /* Second Column */
3722           SSE_COPY_PS(XMM1,XMM6)
3723           SSE_SHUFFLE(XMM1,XMM1,0x55)
3724           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3725           SSE_SUB_PS(XMM7,XMM1)
3726 
3727           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3728 
3729           /* Third Column */
3730           SSE_COPY_PS(XMM2,XMM6)
3731           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3732           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3733           SSE_SUB_PS(XMM7,XMM2)
3734 
3735           /* Fourth Column */
3736           SSE_COPY_PS(XMM3,XMM6)
3737           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3738           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3739           SSE_SUB_PS(XMM7,XMM3)
3740         SSE_INLINE_END_2
3741 
3742         v  += 16;
3743       }
3744       v    =  aa + 16*ai[++i];
3745       PREFETCH_NTA(v);
3746       STORE_PS(&t[idx],XMM7);
3747     }
3748 
3749     /* Backward solve the upper triangular factor.*/
3750 
3751     idt  = 4*(n-1);
3752     ai16 = 16*diag[n-1];
3753     v    = aa + ai16 + 16;
3754     for (i=n-1; i>=0;){
3755       PREFETCH_NTA(&v[8]);
3756       vi = aj + diag[i] + 1;
3757       nz = ai[i+1] - diag[i] - 1;
3758 
3759       LOAD_PS(&t[idt],XMM7);
3760 
3761       while (nz--) {
3762         PREFETCH_NTA(&v[16]);
3763         idx = 4*((unsigned int)(*vi++));
3764 
3765         /* 4x4 Matrix-Vector Product with negative accumulation: */
3766         SSE_INLINE_BEGIN_2(&t[idx],v)
3767           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3768 
3769           /* First Column */
3770           SSE_COPY_PS(XMM0,XMM6)
3771           SSE_SHUFFLE(XMM0,XMM0,0x00)
3772           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3773           SSE_SUB_PS(XMM7,XMM0)
3774 
3775           /* Second Column */
3776           SSE_COPY_PS(XMM1,XMM6)
3777           SSE_SHUFFLE(XMM1,XMM1,0x55)
3778           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3779           SSE_SUB_PS(XMM7,XMM1)
3780 
3781           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3782 
3783           /* Third Column */
3784           SSE_COPY_PS(XMM2,XMM6)
3785           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3786           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3787           SSE_SUB_PS(XMM7,XMM2)
3788 
3789           /* Fourth Column */
3790           SSE_COPY_PS(XMM3,XMM6)
3791           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3792           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3793           SSE_SUB_PS(XMM7,XMM3)
3794         SSE_INLINE_END_2
3795         v  += 16;
3796       }
3797       v    = aa + ai16;
3798       ai16 = 16*diag[--i];
3799       PREFETCH_NTA(aa+ai16+16);
3800       /*
3801          Scale the result by the diagonal 4x4 block,
3802          which was inverted as part of the factorization
3803       */
3804       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3805         /* First Column */
3806         SSE_COPY_PS(XMM0,XMM7)
3807         SSE_SHUFFLE(XMM0,XMM0,0x00)
3808         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3809 
3810         /* Second Column */
3811         SSE_COPY_PS(XMM1,XMM7)
3812         SSE_SHUFFLE(XMM1,XMM1,0x55)
3813         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3814         SSE_ADD_PS(XMM0,XMM1)
3815 
3816         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3817 
3818         /* Third Column */
3819         SSE_COPY_PS(XMM2,XMM7)
3820         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3821         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3822         SSE_ADD_PS(XMM0,XMM2)
3823 
3824         /* Fourth Column */
3825         SSE_COPY_PS(XMM3,XMM7)
3826         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3827         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3828         SSE_ADD_PS(XMM0,XMM3)
3829 
3830         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3831       SSE_INLINE_END_3
3832 
3833       v    = aa + ai16 + 16;
3834       idt -= 4;
3835     }
3836 
3837     /* Convert t from single precision back to double precision (inplace)*/
3838     idt = 4*(n-1);
3839     for (i=n-1;i>=0;i--) {
3840       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3841       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3842       PetscScalar *xtemp=&x[idt];
3843       MatScalar   *ttemp=&t[idt];
3844       xtemp[3] = (PetscScalar)ttemp[3];
3845       xtemp[2] = (PetscScalar)ttemp[2];
3846       xtemp[1] = (PetscScalar)ttemp[1];
3847       xtemp[0] = (PetscScalar)ttemp[0];
3848       idt -= 4;
3849     }
3850 
3851   } /* End of artificial scope. */
3852   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3853   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3854   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3855   SSE_SCOPE_END;
3856   PetscFunctionReturn(0);
3857 }
3858 
3859 #undef __FUNCT__
3860 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3861 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
3862 {
3863   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3864   int            *aj=a->j;
3865   PetscErrorCode ierr;
3866   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3867   MatScalar      *aa=a->a;
3868   PetscScalar    *x,*b;
3869 
3870   PetscFunctionBegin;
3871   SSE_SCOPE_BEGIN;
3872   /*
3873      Note: This code currently uses demotion of double
3874      to float when performing the mixed-mode computation.
3875      This may not be numerically reasonable for all applications.
3876   */
3877   PREFETCH_NTA(aa+16*ai[1]);
3878 
3879   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3880   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3881   {
3882     /* x will first be computed in single precision then promoted inplace to double */
3883     MatScalar *v,*t=(MatScalar *)x;
3884     int       nz,i,idt,ai16;
3885     int       jdx,idx;
3886     int       *vi;
3887     /* Forward solve the lower triangular factor. */
3888 
3889     /* First block is the identity. */
3890     idx  = 0;
3891     CONVERT_DOUBLE4_FLOAT4(t,b);
3892     v    =  aa + 16*ai[1];
3893 
3894     for (i=1; i<n;) {
3895       PREFETCH_NTA(&v[8]);
3896       vi   =  aj      + ai[i];
3897       nz   =  diag[i] - ai[i];
3898       idx +=  4;
3899 
3900       /* Demote RHS from double to float. */
3901       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3902       LOAD_PS(&t[idx],XMM7);
3903 
3904       while (nz--) {
3905         PREFETCH_NTA(&v[16]);
3906         jdx = 4*(*vi++);
3907 /*          jdx = *vi++; */
3908 
3909         /* 4x4 Matrix-Vector product with negative accumulation: */
3910         SSE_INLINE_BEGIN_2(&t[jdx],v)
3911           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3912 
3913           /* First Column */
3914           SSE_COPY_PS(XMM0,XMM6)
3915           SSE_SHUFFLE(XMM0,XMM0,0x00)
3916           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3917           SSE_SUB_PS(XMM7,XMM0)
3918 
3919           /* Second Column */
3920           SSE_COPY_PS(XMM1,XMM6)
3921           SSE_SHUFFLE(XMM1,XMM1,0x55)
3922           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3923           SSE_SUB_PS(XMM7,XMM1)
3924 
3925           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3926 
3927           /* Third Column */
3928           SSE_COPY_PS(XMM2,XMM6)
3929           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3930           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3931           SSE_SUB_PS(XMM7,XMM2)
3932 
3933           /* Fourth Column */
3934           SSE_COPY_PS(XMM3,XMM6)
3935           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3936           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3937           SSE_SUB_PS(XMM7,XMM3)
3938         SSE_INLINE_END_2
3939 
3940         v  += 16;
3941       }
3942       v    =  aa + 16*ai[++i];
3943       PREFETCH_NTA(v);
3944       STORE_PS(&t[idx],XMM7);
3945     }
3946 
3947     /* Backward solve the upper triangular factor.*/
3948 
3949     idt  = 4*(n-1);
3950     ai16 = 16*diag[n-1];
3951     v    = aa + ai16 + 16;
3952     for (i=n-1; i>=0;){
3953       PREFETCH_NTA(&v[8]);
3954       vi = aj + diag[i] + 1;
3955       nz = ai[i+1] - diag[i] - 1;
3956 
3957       LOAD_PS(&t[idt],XMM7);
3958 
3959       while (nz--) {
3960         PREFETCH_NTA(&v[16]);
3961         idx = 4*(*vi++);
3962 /*          idx = *vi++; */
3963 
3964         /* 4x4 Matrix-Vector Product with negative accumulation: */
3965         SSE_INLINE_BEGIN_2(&t[idx],v)
3966           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3967 
3968           /* First Column */
3969           SSE_COPY_PS(XMM0,XMM6)
3970           SSE_SHUFFLE(XMM0,XMM0,0x00)
3971           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3972           SSE_SUB_PS(XMM7,XMM0)
3973 
3974           /* Second Column */
3975           SSE_COPY_PS(XMM1,XMM6)
3976           SSE_SHUFFLE(XMM1,XMM1,0x55)
3977           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3978           SSE_SUB_PS(XMM7,XMM1)
3979 
3980           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3981 
3982           /* Third Column */
3983           SSE_COPY_PS(XMM2,XMM6)
3984           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3985           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3986           SSE_SUB_PS(XMM7,XMM2)
3987 
3988           /* Fourth Column */
3989           SSE_COPY_PS(XMM3,XMM6)
3990           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3991           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3992           SSE_SUB_PS(XMM7,XMM3)
3993         SSE_INLINE_END_2
3994         v  += 16;
3995       }
3996       v    = aa + ai16;
3997       ai16 = 16*diag[--i];
3998       PREFETCH_NTA(aa+ai16+16);
3999       /*
4000          Scale the result by the diagonal 4x4 block,
4001          which was inverted as part of the factorization
4002       */
4003       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4004         /* First Column */
4005         SSE_COPY_PS(XMM0,XMM7)
4006         SSE_SHUFFLE(XMM0,XMM0,0x00)
4007         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4008 
4009         /* Second Column */
4010         SSE_COPY_PS(XMM1,XMM7)
4011         SSE_SHUFFLE(XMM1,XMM1,0x55)
4012         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4013         SSE_ADD_PS(XMM0,XMM1)
4014 
4015         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4016 
4017         /* Third Column */
4018         SSE_COPY_PS(XMM2,XMM7)
4019         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4020         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4021         SSE_ADD_PS(XMM0,XMM2)
4022 
4023         /* Fourth Column */
4024         SSE_COPY_PS(XMM3,XMM7)
4025         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4026         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4027         SSE_ADD_PS(XMM0,XMM3)
4028 
4029         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4030       SSE_INLINE_END_3
4031 
4032       v    = aa + ai16 + 16;
4033       idt -= 4;
4034     }
4035 
4036     /* Convert t from single precision back to double precision (inplace)*/
4037     idt = 4*(n-1);
4038     for (i=n-1;i>=0;i--) {
4039       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4040       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4041       PetscScalar *xtemp=&x[idt];
4042       MatScalar   *ttemp=&t[idt];
4043       xtemp[3] = (PetscScalar)ttemp[3];
4044       xtemp[2] = (PetscScalar)ttemp[2];
4045       xtemp[1] = (PetscScalar)ttemp[1];
4046       xtemp[0] = (PetscScalar)ttemp[0];
4047       idt -= 4;
4048     }
4049 
4050   } /* End of artificial scope. */
4051   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4052   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4053   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4054   SSE_SCOPE_END;
4055   PetscFunctionReturn(0);
4056 }
4057 
4058 #endif
4059 
4060 #undef __FUNCT__
4061 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4062 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4063 {
4064   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4065   IS                iscol=a->col,isrow=a->row;
4066   PetscErrorCode    ierr;
4067   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4068   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4069   const MatScalar   *aa=a->a,*v;
4070   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4071   const PetscScalar *b;
4072 
4073   PetscFunctionBegin;
4074   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4075   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4076   t  = a->solve_work;
4077 
4078   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4079   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4080 
4081   /* forward solve the lower triangular */
4082   idx    = 3*(*r++);
4083   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4084   for (i=1; i<n; i++) {
4085     v     = aa + 9*ai[i];
4086     vi    = aj + ai[i];
4087     nz    = diag[i] - ai[i];
4088     idx   = 3*(*r++);
4089     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4090     while (nz--) {
4091       idx   = 3*(*vi++);
4092       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4093       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4094       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4095       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4096       v += 9;
4097     }
4098     idx = 3*i;
4099     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4100   }
4101   /* backward solve the upper triangular */
4102   for (i=n-1; i>=0; i--){
4103     v    = aa + 9*diag[i] + 9;
4104     vi   = aj + diag[i] + 1;
4105     nz   = ai[i+1] - diag[i] - 1;
4106     idt  = 3*i;
4107     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4108     while (nz--) {
4109       idx   = 3*(*vi++);
4110       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4111       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4112       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4113       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4114       v += 9;
4115     }
4116     idc = 3*(*c--);
4117     v   = aa + 9*diag[i];
4118     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4119     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4120     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4121   }
4122   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4123   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4124   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4125   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4126   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4127   PetscFunctionReturn(0);
4128 }
4129 
4130 #undef __FUNCT__
4131 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
4132 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
4133 {
4134   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4135   IS                iscol=a->col,isrow=a->row;
4136   PetscErrorCode    ierr;
4137   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
4138   const PetscInt    *r,*c,*rout,*cout;
4139   const MatScalar   *aa=a->a,*v;
4140   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4141   const PetscScalar *b;
4142 
4143   PetscFunctionBegin;
4144   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4145   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4146   t  = a->solve_work;
4147 
4148   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4149   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4150 
4151   /* forward solve the lower triangular */
4152   idx    = 3*r[0];
4153   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4154   for (i=1; i<n; i++) {
4155     v     = aa + 9*ai[i];
4156     vi    = aj + ai[i];
4157     nz    = ai[i+1] - ai[i];
4158     idx   = 3*r[i];
4159     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4160     for(m=0;m<nz;m++){
4161       idx   = 3*vi[m];
4162       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4163       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4164       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4165       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4166       v += 9;
4167     }
4168     idx = 3*i;
4169     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4170   }
4171   /* backward solve the upper triangular */
4172   for (i=n-1; i>=0; i--){
4173     k    = 2*n-i;
4174     v    = aa + 9*ai[k];
4175     vi   = aj + ai[k];
4176     nz   = ai[k +1] - ai[k] - 1;
4177     idt  = 3*i;
4178     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4179     for(m=0;m<nz;m++){
4180       idx   = 3*vi[m];
4181       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4182       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4183       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4184       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4185       v += 9;
4186     }
4187     idc = 3*c[i];
4188     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4189     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4190     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4191   }
4192   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4193   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4194   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4195   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4196   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4197   PetscFunctionReturn(0);
4198 }
4199 
4200 #undef __FUNCT__
4201 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2"
4202 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4203 {
4204   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4205   IS                iscol=a->col,isrow=a->row;
4206   PetscErrorCode    ierr;
4207   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
4208   const PetscInt    *r,*c,*rout,*cout;
4209   const MatScalar   *aa=a->a,*v;
4210   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4211   const PetscScalar *b;
4212 
4213   PetscFunctionBegin;
4214   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4215   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4216   t  = a->solve_work;
4217 
4218   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4219   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4220 
4221   /* forward solve the lower triangular */
4222   idx    = 3*r[0];
4223   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4224   for (i=1; i<n; i++) {
4225     v     = aa + 9*ai[i];
4226     vi    = aj + ai[i];
4227     nz    = ai[i+1] - ai[i];
4228     idx   = 3*r[i];
4229     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4230     for(m=0;m<nz;m++){
4231       idx   = 3*vi[m];
4232       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4233       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4234       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4235       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4236       v += 9;
4237     }
4238     idx = 3*i;
4239     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4240   }
4241   /* backward solve the upper triangular */
4242   for (i=n-1; i>=0; i--){
4243     v    = aa + 9*(adiag[i+1]+1);
4244     vi   = aj + adiag[i+1]+1;
4245     nz   = adiag[i] - adiag[i+1] - 1;
4246     idt  = 3*i;
4247     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4248     for(m=0;m<nz;m++){
4249       idx   = 3*vi[m];
4250       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4251       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4252       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4253       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4254       v += 9;
4255     }
4256     idc = 3*c[i];
4257     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4258     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4259     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4260   }
4261   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4262   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4263   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4264   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4265   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4266   PetscFunctionReturn(0);
4267 }
4268 
4269 /*
4270       Special case where the matrix was ILU(0) factored in the natural
4271    ordering. This eliminates the need for the column and row permutation.
4272 */
4273 #undef __FUNCT__
4274 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4275 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4276 {
4277   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4278   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4279   PetscErrorCode    ierr;
4280   PetscInt          *diag = a->diag;
4281   const MatScalar   *aa=a->a,*v;
4282   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4283   const PetscScalar *b;
4284   PetscInt          jdx,idt,idx,nz,*vi,i;
4285 
4286   PetscFunctionBegin;
4287   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4288   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4289 
4290   /* forward solve the lower triangular */
4291   idx    = 0;
4292   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4293   for (i=1; i<n; i++) {
4294     v     =  aa      + 9*ai[i];
4295     vi    =  aj      + ai[i];
4296     nz    =  diag[i] - ai[i];
4297     idx   +=  3;
4298     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4299     while (nz--) {
4300       jdx   = 3*(*vi++);
4301       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4302       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4303       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4304       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4305       v    += 9;
4306     }
4307     x[idx]   = s1;
4308     x[1+idx] = s2;
4309     x[2+idx] = s3;
4310   }
4311   /* backward solve the upper triangular */
4312   for (i=n-1; i>=0; i--){
4313     v    = aa + 9*diag[i] + 9;
4314     vi   = aj + diag[i] + 1;
4315     nz   = ai[i+1] - diag[i] - 1;
4316     idt  = 3*i;
4317     s1 = x[idt];  s2 = x[1+idt];
4318     s3 = x[2+idt];
4319     while (nz--) {
4320       idx   = 3*(*vi++);
4321       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4322       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4323       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4324       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4325       v    += 9;
4326     }
4327     v        = aa +  9*diag[i];
4328     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4329     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4330     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4331   }
4332 
4333   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4334   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4335   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4336   PetscFunctionReturn(0);
4337 }
4338 
4339 #undef __FUNCT__
4340 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4341 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4342 {
4343     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4344     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4345     PetscErrorCode    ierr;
4346     PetscInt          idx,jdx,idt;
4347     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4348     const MatScalar   *aa=a->a,*v;
4349     PetscScalar       *x;
4350     const PetscScalar *b;
4351     PetscScalar        s1,s2,s3,x1,x2,x3;
4352 
4353     PetscFunctionBegin;
4354     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4355     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4356     /* forward solve the lower triangular */
4357     idx    = 0;
4358     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4359     for (i=1; i<n; i++) {
4360        v    = aa + bs2*ai[i];
4361        vi   = aj + ai[i];
4362        nz   = ai[i+1] - ai[i];
4363       idx   = bs*i;
4364        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4365       for(k=0;k<nz;k++){
4366          jdx   = bs*vi[k];
4367           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4368           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4369           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4370           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4371 
4372           v   +=  bs2;
4373         }
4374 
4375        x[idx]   = s1;
4376        x[1+idx] = s2;
4377        x[2+idx] = s3;
4378     }
4379 
4380    /* backward solve the upper triangular */
4381   for (i=n-1; i>=0; i--){
4382      v   = aa + bs2*ai[2*n-i];
4383      vi  = aj + ai[2*n-i];
4384      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4385      idt = bs*i;
4386      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4387 
4388      for(k=0;k<nz;k++){
4389        idx   = bs*vi[k];
4390        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4391        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4392        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4393        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4394 
4395         v   +=  bs2;
4396     }
4397     /* x = inv_diagonal*x */
4398    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4399    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4400    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4401 
4402   }
4403 
4404   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4405   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4406   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4407   PetscFunctionReturn(0);
4408 }
4409 
4410 #undef __FUNCT__
4411 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
4412 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4413 {
4414     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4415     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4416     PetscErrorCode    ierr;
4417     PetscInt          idx,jdx,idt;
4418     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4419     const MatScalar   *aa=a->a,*v;
4420     PetscScalar       *x;
4421     const PetscScalar *b;
4422     PetscScalar        s1,s2,s3,x1,x2,x3;
4423 
4424     PetscFunctionBegin;
4425     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4426     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4427     /* forward solve the lower triangular */
4428     idx    = 0;
4429     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4430     for (i=1; i<n; i++) {
4431        v    = aa + bs2*ai[i];
4432        vi   = aj + ai[i];
4433        nz   = ai[i+1] - ai[i];
4434       idx   = bs*i;
4435        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4436       for(k=0;k<nz;k++){
4437          jdx   = bs*vi[k];
4438           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4439           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4440           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4441           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4442 
4443           v   +=  bs2;
4444         }
4445 
4446        x[idx]   = s1;
4447        x[1+idx] = s2;
4448        x[2+idx] = s3;
4449     }
4450 
4451    /* backward solve the upper triangular */
4452   for (i=n-1; i>=0; i--){
4453     v   = aa + bs2*(adiag[i+1]+1);
4454      vi  = aj + adiag[i+1]+1;
4455      nz  = adiag[i] - adiag[i+1]-1;
4456      idt = bs*i;
4457      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4458 
4459      for(k=0;k<nz;k++){
4460        idx   = bs*vi[k];
4461        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4462        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4463        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4464        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4465 
4466         v   +=  bs2;
4467     }
4468     /* x = inv_diagonal*x */
4469    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4470    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4471    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4472 
4473   }
4474 
4475   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4476   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4477   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4478   PetscFunctionReturn(0);
4479 }
4480 
4481 #undef __FUNCT__
4482 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4483 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4484 {
4485   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4486   IS                iscol=a->col,isrow=a->row;
4487   PetscErrorCode    ierr;
4488   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4489   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4490   const MatScalar   *aa=a->a,*v;
4491   PetscScalar       *x,s1,s2,x1,x2,*t;
4492   const PetscScalar *b;
4493 
4494   PetscFunctionBegin;
4495   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4496   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4497   t  = a->solve_work;
4498 
4499   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4500   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4501 
4502   /* forward solve the lower triangular */
4503   idx    = 2*(*r++);
4504   t[0] = b[idx]; t[1] = b[1+idx];
4505   for (i=1; i<n; i++) {
4506     v     = aa + 4*ai[i];
4507     vi    = aj + ai[i];
4508     nz    = diag[i] - ai[i];
4509     idx   = 2*(*r++);
4510     s1  = b[idx]; s2 = b[1+idx];
4511     while (nz--) {
4512       idx   = 2*(*vi++);
4513       x1    = t[idx]; x2 = t[1+idx];
4514       s1 -= v[0]*x1 + v[2]*x2;
4515       s2 -= v[1]*x1 + v[3]*x2;
4516       v += 4;
4517     }
4518     idx = 2*i;
4519     t[idx] = s1; t[1+idx] = s2;
4520   }
4521   /* backward solve the upper triangular */
4522   for (i=n-1; i>=0; i--){
4523     v    = aa + 4*diag[i] + 4;
4524     vi   = aj + diag[i] + 1;
4525     nz   = ai[i+1] - diag[i] - 1;
4526     idt  = 2*i;
4527     s1 = t[idt]; s2 = t[1+idt];
4528     while (nz--) {
4529       idx   = 2*(*vi++);
4530       x1    = t[idx]; x2 = t[1+idx];
4531       s1 -= v[0]*x1 + v[2]*x2;
4532       s2 -= v[1]*x1 + v[3]*x2;
4533       v += 4;
4534     }
4535     idc = 2*(*c--);
4536     v   = aa + 4*diag[i];
4537     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4538     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4539   }
4540   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4541   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4542   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4543   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4544   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4545   PetscFunctionReturn(0);
4546 }
4547 
4548 #undef __FUNCT__
4549 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4550 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
4551 {
4552   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4553   IS                iscol=a->col,isrow=a->row;
4554   PetscErrorCode    ierr;
4555   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
4556   const PetscInt    *r,*c,*rout,*cout;
4557   const MatScalar   *aa=a->a,*v;
4558   PetscScalar       *x,s1,s2,x1,x2,*t;
4559   const PetscScalar *b;
4560 
4561   PetscFunctionBegin;
4562   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4563   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4564   t  = a->solve_work;
4565 
4566   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4567   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4568 
4569   /* forward solve the lower triangular */
4570   idx    = 2*r[0];
4571   t[0] = b[idx]; t[1] = b[1+idx];
4572   for (i=1; i<n; i++) {
4573     v     = aa + 4*ai[i];
4574     vi    = aj + ai[i];
4575     nz    = ai[i+1] - ai[i];
4576     idx   = 2*r[i];
4577     s1  = b[idx]; s2 = b[1+idx];
4578     for(m=0;m<nz;m++){
4579       jdx   = 2*vi[m];
4580       x1    = t[jdx]; x2 = t[1+jdx];
4581       s1 -= v[0]*x1 + v[2]*x2;
4582       s2 -= v[1]*x1 + v[3]*x2;
4583       v += 4;
4584     }
4585     idx = 2*i;
4586     t[idx] = s1; t[1+idx] = s2;
4587   }
4588   /* backward solve the upper triangular */
4589   for (i=n-1; i>=0; i--){
4590     k = 2*n-i;
4591     v    = aa + 4*ai[k];
4592     vi   = aj + ai[k];
4593     nz   = ai[k +1] - ai[k] - 1;
4594     idt  = 2*i;
4595     s1 = t[idt]; s2 = t[1+idt];
4596     for(m=0;m<nz;m++){
4597       idx   = 2*vi[m];
4598       x1    = t[idx]; x2 = t[1+idx];
4599       s1 -= v[0]*x1 + v[2]*x2;
4600       s2 -= v[1]*x1 + v[3]*x2;
4601       v += 4;
4602     }
4603     idc = 2*c[i];
4604     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4605     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4606   }
4607   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4608   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4609   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4610   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4611   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4612   PetscFunctionReturn(0);
4613 }
4614 
4615 #undef __FUNCT__
4616 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2"
4617 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4618 {
4619   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4620   IS                iscol=a->col,isrow=a->row;
4621   PetscErrorCode    ierr;
4622   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4623   const PetscInt    *r,*c,*rout,*cout;
4624   const MatScalar   *aa=a->a,*v;
4625   PetscScalar       *x,s1,s2,x1,x2,*t;
4626   const PetscScalar *b;
4627 
4628   PetscFunctionBegin;
4629   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4630   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4631   t  = a->solve_work;
4632 
4633   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4634   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4635 
4636   /* forward solve the lower triangular */
4637   idx    = 2*r[0];
4638   t[0] = b[idx]; t[1] = b[1+idx];
4639   for (i=1; i<n; i++) {
4640     v     = aa + 4*ai[i];
4641     vi    = aj + ai[i];
4642     nz    = ai[i+1] - ai[i];
4643     idx   = 2*r[i];
4644     s1  = b[idx]; s2 = b[1+idx];
4645     for(m=0;m<nz;m++){
4646       jdx   = 2*vi[m];
4647       x1    = t[jdx]; x2 = t[1+jdx];
4648       s1 -= v[0]*x1 + v[2]*x2;
4649       s2 -= v[1]*x1 + v[3]*x2;
4650       v += 4;
4651     }
4652     idx = 2*i;
4653     t[idx] = s1; t[1+idx] = s2;
4654   }
4655   /* backward solve the upper triangular */
4656   for (i=n-1; i>=0; i--){
4657     v    = aa + 4*(adiag[i+1]+1);
4658     vi   = aj + adiag[i+1]+1;
4659     nz   = adiag[i] - adiag[i+1] - 1;
4660     idt  = 2*i;
4661     s1 = t[idt]; s2 = t[1+idt];
4662     for(m=0;m<nz;m++){
4663       idx   = 2*vi[m];
4664       x1    = t[idx]; x2 = t[1+idx];
4665       s1 -= v[0]*x1 + v[2]*x2;
4666       s2 -= v[1]*x1 + v[3]*x2;
4667       v += 4;
4668     }
4669     idc = 2*c[i];
4670     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4671     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4672   }
4673   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4674   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4675   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4676   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4677   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4678   PetscFunctionReturn(0);
4679 }
4680 
4681 /*
4682       Special case where the matrix was ILU(0) factored in the natural
4683    ordering. This eliminates the need for the column and row permutation.
4684 */
4685 #undef __FUNCT__
4686 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4687 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
4688 {
4689   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4690   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4691   PetscErrorCode    ierr;
4692   PetscInt          *diag = a->diag;
4693   const MatScalar   *aa=a->a,*v;
4694   PetscScalar       *x,s1,s2,x1,x2;
4695   const PetscScalar *b;
4696   PetscInt          jdx,idt,idx,nz,*vi,i;
4697 
4698   PetscFunctionBegin;
4699   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4700   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4701 
4702   /* forward solve the lower triangular */
4703   idx    = 0;
4704   x[0]   = b[0]; x[1] = b[1];
4705   for (i=1; i<n; i++) {
4706     v     =  aa      + 4*ai[i];
4707     vi    =  aj      + ai[i];
4708     nz    =  diag[i] - ai[i];
4709     idx   +=  2;
4710     s1  =  b[idx];s2 = b[1+idx];
4711     while (nz--) {
4712       jdx   = 2*(*vi++);
4713       x1    = x[jdx];x2 = x[1+jdx];
4714       s1 -= v[0]*x1 + v[2]*x2;
4715       s2 -= v[1]*x1 + v[3]*x2;
4716       v    += 4;
4717     }
4718     x[idx]   = s1;
4719     x[1+idx] = s2;
4720   }
4721   /* backward solve the upper triangular */
4722   for (i=n-1; i>=0; i--){
4723     v    = aa + 4*diag[i] + 4;
4724     vi   = aj + diag[i] + 1;
4725     nz   = ai[i+1] - diag[i] - 1;
4726     idt  = 2*i;
4727     s1 = x[idt];  s2 = x[1+idt];
4728     while (nz--) {
4729       idx   = 2*(*vi++);
4730       x1    = x[idx];   x2 = x[1+idx];
4731       s1 -= v[0]*x1 + v[2]*x2;
4732       s2 -= v[1]*x1 + v[3]*x2;
4733       v    += 4;
4734     }
4735     v        = aa +  4*diag[i];
4736     x[idt]   = v[0]*s1 + v[2]*s2;
4737     x[1+idt] = v[1]*s1 + v[3]*s2;
4738   }
4739 
4740   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4742   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4743   PetscFunctionReturn(0);
4744 }
4745 
4746 #undef __FUNCT__
4747 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4748 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4749 {
4750     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4751     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4752     PetscErrorCode    ierr;
4753     PetscInt          jdx;
4754     const MatScalar   *aa=a->a,*v;
4755     PetscScalar       *x,s1,s2,x1,x2;
4756     const PetscScalar *b;
4757 
4758     PetscFunctionBegin;
4759     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4760     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4761     /* forward solve the lower triangular */
4762     idx    = 0;
4763     x[0] = b[idx]; x[1] = b[1+idx];
4764     for (i=1; i<n; i++) {
4765         v   = aa + 4*ai[i];
4766        vi   = aj + ai[i];
4767        nz   = ai[i+1] - ai[i];
4768        idx  = 2*i;
4769        s1   = b[idx];s2 = b[1+idx];
4770       for(k=0;k<nz;k++){
4771          jdx   = 2*vi[k];
4772           x1    = x[jdx];x2 = x[1+jdx];
4773           s1   -= v[0]*x1 + v[2]*x2;
4774           s2   -= v[1]*x1 + v[3]*x2;
4775            v   +=  4;
4776         }
4777        x[idx]   = s1;
4778        x[1+idx] = s2;
4779     }
4780 
4781    /* backward solve the upper triangular */
4782   for (i=n-1; i>=0; i--){
4783      v   = aa + 4*ai[2*n-i];
4784      vi  = aj + ai[2*n-i];
4785      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4786      idt = 2*i;
4787      s1 = x[idt];  s2 = x[1+idt];
4788      for(k=0;k<nz;k++){
4789       idx   = 2*vi[k];
4790        x1    = x[idx];   x2 = x[1+idx];
4791        s1 -= v[0]*x1 + v[2]*x2;
4792        s2 -= v[1]*x1 + v[3]*x2;
4793          v    += 4;
4794     }
4795     /* x = inv_diagonal*x */
4796    x[idt]   = v[0]*s1 + v[2]*s2;
4797    x[1+idt] = v[1]*s1 + v[3]*s2;
4798   }
4799 
4800   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4801   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4802   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4803   PetscFunctionReturn(0);
4804 }
4805 
4806 #undef __FUNCT__
4807 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4808 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4809 {
4810     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4811     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4812     PetscErrorCode    ierr;
4813     PetscInt          jdx;
4814     const MatScalar   *aa=a->a,*v;
4815     PetscScalar       *x,s1,s2,x1,x2;
4816     const PetscScalar *b;
4817 
4818     PetscFunctionBegin;
4819     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4820     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4821     /* forward solve the lower triangular */
4822     idx    = 0;
4823     x[0] = b[idx]; x[1] = b[1+idx];
4824     for (i=1; i<n; i++) {
4825         v   = aa + 4*ai[i];
4826        vi   = aj + ai[i];
4827        nz   = ai[i+1] - ai[i];
4828        idx  = 2*i;
4829        s1   = b[idx];s2 = b[1+idx];
4830       for(k=0;k<nz;k++){
4831          jdx   = 2*vi[k];
4832           x1    = x[jdx];x2 = x[1+jdx];
4833           s1   -= v[0]*x1 + v[2]*x2;
4834           s2   -= v[1]*x1 + v[3]*x2;
4835            v   +=  4;
4836         }
4837        x[idx]   = s1;
4838        x[1+idx] = s2;
4839     }
4840 
4841    /* backward solve the upper triangular */
4842   for (i=n-1; i>=0; i--){
4843      v   = aa + 4*(adiag[i+1]+1);
4844      vi  = aj + adiag[i+1]+1;
4845      nz  = adiag[i] - adiag[i+1]-1;
4846      idt = 2*i;
4847      s1 = x[idt];  s2 = x[1+idt];
4848      for(k=0;k<nz;k++){
4849       idx   = 2*vi[k];
4850        x1    = x[idx];   x2 = x[1+idx];
4851        s1 -= v[0]*x1 + v[2]*x2;
4852        s2 -= v[1]*x1 + v[3]*x2;
4853          v    += 4;
4854     }
4855     /* x = inv_diagonal*x */
4856    x[idt]   = v[0]*s1 + v[2]*s2;
4857    x[1+idt] = v[1]*s1 + v[3]*s2;
4858   }
4859 
4860   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4861   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4862   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4863   PetscFunctionReturn(0);
4864 }
4865 
4866 #undef __FUNCT__
4867 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4868 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
4869 {
4870   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
4871   IS             iscol=a->col,isrow=a->row;
4872   PetscErrorCode ierr;
4873   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4874   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4875   MatScalar      *aa=a->a,*v;
4876   PetscScalar    *x,*b,s1,*t;
4877 
4878   PetscFunctionBegin;
4879   if (!n) PetscFunctionReturn(0);
4880 
4881   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4882   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4883   t  = a->solve_work;
4884 
4885   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4886   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4887 
4888   /* forward solve the lower triangular */
4889   t[0] = b[*r++];
4890   for (i=1; i<n; i++) {
4891     v     = aa + ai[i];
4892     vi    = aj + ai[i];
4893     nz    = diag[i] - ai[i];
4894     s1  = b[*r++];
4895     while (nz--) {
4896       s1 -= (*v++)*t[*vi++];
4897     }
4898     t[i] = s1;
4899   }
4900   /* backward solve the upper triangular */
4901   for (i=n-1; i>=0; i--){
4902     v    = aa + diag[i] + 1;
4903     vi   = aj + diag[i] + 1;
4904     nz   = ai[i+1] - diag[i] - 1;
4905     s1 = t[i];
4906     while (nz--) {
4907       s1 -= (*v++)*t[*vi++];
4908     }
4909     x[*c--] = t[i] = aa[diag[i]]*s1;
4910   }
4911 
4912   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4913   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4914   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4915   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4916   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4917   PetscFunctionReturn(0);
4918 }
4919 /*
4920       Special case where the matrix was ILU(0) factored in the natural
4921    ordering. This eliminates the need for the column and row permutation.
4922 */
4923 #undef __FUNCT__
4924 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4925 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
4926 {
4927   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4928   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4929   PetscErrorCode ierr;
4930   PetscInt       *diag = a->diag;
4931   MatScalar      *aa=a->a;
4932   PetscScalar    *x,*b;
4933   PetscScalar    s1,x1;
4934   MatScalar      *v;
4935   PetscInt       jdx,idt,idx,nz,*vi,i;
4936 
4937   PetscFunctionBegin;
4938   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4939   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4940 
4941   /* forward solve the lower triangular */
4942   idx    = 0;
4943   x[0]   = b[0];
4944   for (i=1; i<n; i++) {
4945     v     =  aa      + ai[i];
4946     vi    =  aj      + ai[i];
4947     nz    =  diag[i] - ai[i];
4948     idx   +=  1;
4949     s1  =  b[idx];
4950     while (nz--) {
4951       jdx   = *vi++;
4952       x1    = x[jdx];
4953       s1 -= v[0]*x1;
4954       v    += 1;
4955     }
4956     x[idx]   = s1;
4957   }
4958   /* backward solve the upper triangular */
4959   for (i=n-1; i>=0; i--){
4960     v    = aa + diag[i] + 1;
4961     vi   = aj + diag[i] + 1;
4962     nz   = ai[i+1] - diag[i] - 1;
4963     idt  = i;
4964     s1 = x[idt];
4965     while (nz--) {
4966       idx   = *vi++;
4967       x1    = x[idx];
4968       s1 -= v[0]*x1;
4969       v    += 1;
4970     }
4971     v        = aa +  diag[i];
4972     x[idt]   = v[0]*s1;
4973   }
4974   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4975   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4976   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4977   PetscFunctionReturn(0);
4978 }
4979 
4980 /* ----------------------------------------------------------------*/
4981 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
4982 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
4983 
4984 #undef __FUNCT__
4985 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
4986 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
4987 {
4988   Mat            C=B;
4989   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
4990   IS             isrow = b->row,isicol = b->icol;
4991   PetscErrorCode ierr;
4992   const PetscInt *r,*ic,*ics;
4993   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
4994   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4995   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4996   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4997   MatScalar      *v_work;
4998 
4999   PetscFunctionBegin;
5000   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5001   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5002   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5003   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
5004   ics  = ic;
5005 
5006   /* generate work space needed by dense LU factorization */
5007   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
5008   mwork    = v_work + bs;
5009   v_pivots = (PetscInt*)(mwork + bs2);
5010 
5011   for (i=0; i<n; i++){
5012     /* zero rtmp */
5013     /* L part */
5014     nz    = bi[i+1] - bi[i];
5015     bjtmp = bj + bi[i];
5016     for  (j=0; j<nz; j++){
5017       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5018     }
5019 
5020     /* U part */
5021     nz = bdiag[i] - bdiag[i+1];
5022     bjtmp = bj + bdiag[i+1]+1;
5023     for  (j=0; j<nz; j++){
5024       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5025     }
5026 
5027     /* load in initial (unfactored row) */
5028     nz    = ai[r[i]+1] - ai[r[i]];
5029     ajtmp = aj + ai[r[i]];
5030     v     = aa + bs2*ai[r[i]];
5031     for (j=0; j<nz; j++) {
5032       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5033     }
5034 
5035     /* elimination */
5036     bjtmp = bj + bi[i];
5037     nzL   = bi[i+1] - bi[i];
5038     for(k=0;k < nzL;k++) {
5039       row = bjtmp[k];
5040       pc = rtmp + bs2*row;
5041       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5042       if (flg) {
5043         pv         = b->a + bs2*bdiag[row];
5044         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5045         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5046         pv         = b->a + bs2*(bdiag[row+1]+1);
5047         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5048         for (j=0; j<nz; j++) {
5049           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5050         }
5051         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5052       }
5053     }
5054 
5055     /* finished row so stick it into b->a */
5056     /* L part */
5057     pv   = b->a + bs2*bi[i] ;
5058     pj   = b->j + bi[i] ;
5059     nz   = bi[i+1] - bi[i];
5060     for (j=0; j<nz; j++) {
5061       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5062     }
5063 
5064     /* Mark diagonal and invert diagonal for simplier triangular solves */
5065     pv  = b->a + bs2*bdiag[i];
5066     pj  = b->j + bdiag[i];
5067     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5068     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5069     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5070 
5071     /* U part */
5072     pv = b->a + bs2*(bdiag[i+1]+1);
5073     pj = b->j + bdiag[i+1]+1;
5074     nz = bdiag[i] - bdiag[i+1] - 1;
5075     for (j=0; j<nz; j++){
5076       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5077     }
5078   }
5079 
5080   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5081   ierr = PetscFree(v_work);CHKERRQ(ierr);
5082   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5083   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5084 
5085   C->assembled = PETSC_TRUE;
5086   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5087   PetscFunctionReturn(0);
5088 }
5089 
5090 /*
5091    ilu(0) with natural ordering under new data structure.
5092    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
5093    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
5094 */
5095 
5096 #undef __FUNCT__
5097 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
5098 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5099 {
5100 
5101   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5102   PetscErrorCode     ierr;
5103   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5104   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5105 
5106   PetscFunctionBegin;
5107   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5108   b    = (Mat_SeqBAIJ*)(fact)->data;
5109 
5110   /* allocate matrix arrays for new data structure */
5111   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5112   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5113   b->singlemalloc = PETSC_TRUE;
5114   if (!b->diag){
5115     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5116     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5117   }
5118   bdiag = b->diag;
5119 
5120   if (n > 0) {
5121     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5122   }
5123 
5124   /* set bi and bj with new data structure */
5125   bi = b->i;
5126   bj = b->j;
5127 
5128   /* L part */
5129   bi[0] = 0;
5130   for (i=0; i<n; i++){
5131     nz = adiag[i] - ai[i];
5132     bi[i+1] = bi[i] + nz;
5133     aj = a->j + ai[i];
5134     for (j=0; j<nz; j++){
5135       *bj = aj[j]; bj++;
5136     }
5137   }
5138 
5139   /* U part */
5140   bi_temp = bi[n];
5141   bdiag[n] = bi[n]-1;
5142   for (i=n-1; i>=0; i--){
5143     nz = ai[i+1] - adiag[i] - 1;
5144     bi_temp = bi_temp + nz + 1;
5145     aj = a->j + adiag[i] + 1;
5146     for (j=0; j<nz; j++){
5147       *bj = aj[j]; bj++;
5148     }
5149     /* diag[i] */
5150     *bj = i; bj++;
5151     bdiag[i] = bi_temp - 1;
5152   }
5153   PetscFunctionReturn(0);
5154 }
5155 
5156 #undef __FUNCT__
5157 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
5158 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5159 {
5160   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5161   IS                 isicol;
5162   PetscErrorCode     ierr;
5163   const PetscInt     *r,*ic;
5164   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5165   PetscInt           *bi,*cols,nnz,*cols_lvl;
5166   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5167   PetscInt           i,levels,diagonal_fill;
5168   PetscTruth         col_identity,row_identity,both_identity;
5169   PetscReal          f;
5170   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5171   PetscBT            lnkbt;
5172   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5173   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5174   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5175   PetscTruth         missing;
5176   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5177 
5178   PetscFunctionBegin;
5179   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5180   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5181   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5182 
5183   f             = info->fill;
5184   levels        = (PetscInt)info->levels;
5185   diagonal_fill = (PetscInt)info->diagonal_fill;
5186   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5187 
5188   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5189   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5190   both_identity = (PetscTruth) (row_identity && col_identity);
5191 
5192   if (!levels && both_identity) {
5193     /* special case: ilu(0) with natural ordering */
5194     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5195     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
5196     /* set MatSolve routines */
5197     switch (bs){
5198     case 2:
5199       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2;
5200       break;
5201     case 3:
5202       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2;
5203       break;
5204     case 4:
5205       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2;
5206       break;
5207     case 5:
5208       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2;
5209       break;
5210     case 6:
5211       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2;
5212       break;
5213     case 7:
5214       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2;
5215       break;
5216     default:
5217       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2;
5218       break;
5219     }
5220 
5221     fact->factor = MAT_FACTOR_ILU;
5222     (fact)->info.factor_mallocs    = 0;
5223     (fact)->info.fill_ratio_given  = info->fill;
5224     (fact)->info.fill_ratio_needed = 1.0;
5225     b                = (Mat_SeqBAIJ*)(fact)->data;
5226     b->row           = isrow;
5227     b->col           = iscol;
5228     b->icol          = isicol;
5229     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5230     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5231     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5232     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5233     PetscFunctionReturn(0);
5234   }
5235 
5236   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5237   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5238 
5239   /* get new row pointers */
5240   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5241   bi[0] = 0;
5242   /* bdiag is location of diagonal in factor */
5243   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5244   bdiag[0]  = 0;
5245 
5246   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
5247   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
5248 
5249   /* create a linked list for storing column indices of the active row */
5250   nlnk = n + 1;
5251   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5252 
5253   /* initial FreeSpace size is f*(ai[n]+1) */
5254   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5255   current_space = free_space;
5256   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5257   current_space_lvl = free_space_lvl;
5258 
5259   for (i=0; i<n; i++) {
5260     nzi = 0;
5261     /* copy current row into linked list */
5262     nnz  = ai[r[i]+1] - ai[r[i]];
5263     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5264     cols = aj + ai[r[i]];
5265     lnk[i] = -1; /* marker to indicate if diagonal exists */
5266     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5267     nzi += nlnk;
5268 
5269     /* make sure diagonal entry is included */
5270     if (diagonal_fill && lnk[i] == -1) {
5271       fm = n;
5272       while (lnk[fm] < i) fm = lnk[fm];
5273       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5274       lnk[fm]    = i;
5275       lnk_lvl[i] = 0;
5276       nzi++; dcount++;
5277     }
5278 
5279     /* add pivot rows into the active row */
5280     nzbd = 0;
5281     prow = lnk[n];
5282     while (prow < i) {
5283       nnz      = bdiag[prow];
5284       cols     = bj_ptr[prow] + nnz + 1;
5285       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5286       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5287       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5288       nzi += nlnk;
5289       prow = lnk[prow];
5290       nzbd++;
5291     }
5292     bdiag[i] = nzbd;
5293     bi[i+1]  = bi[i] + nzi;
5294 
5295     /* if free space is not available, make more free space */
5296     if (current_space->local_remaining<nzi) {
5297       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5298       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5299       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5300       reallocs++;
5301     }
5302 
5303     /* copy data into free_space and free_space_lvl, then initialize lnk */
5304     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5305     bj_ptr[i]    = current_space->array;
5306     bjlvl_ptr[i] = current_space_lvl->array;
5307 
5308     /* make sure the active row i has diagonal entry */
5309     if (*(bj_ptr[i]+bdiag[i]) != i) {
5310       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5311     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5312     }
5313 
5314     current_space->array           += nzi;
5315     current_space->local_used      += nzi;
5316     current_space->local_remaining -= nzi;
5317     current_space_lvl->array           += nzi;
5318     current_space_lvl->local_used      += nzi;
5319     current_space_lvl->local_remaining -= nzi;
5320   }
5321 
5322   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5323   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5324 
5325   /* destroy list of free space and other temporary arrays */
5326   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5327 
5328   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5329   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5330 
5331   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5332   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5333   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
5334 
5335 #if defined(PETSC_USE_INFO)
5336   {
5337     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5338     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5339     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5340     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5341     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5342     if (diagonal_fill) {
5343       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5344     }
5345   }
5346 #endif
5347 
5348   /* put together the new matrix */
5349   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5350   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5351   b = (Mat_SeqBAIJ*)(fact)->data;
5352   b->free_a       = PETSC_TRUE;
5353   b->free_ij      = PETSC_TRUE;
5354   b->singlemalloc = PETSC_FALSE;
5355   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5356   b->j          = bj;
5357   b->i          = bi;
5358   b->diag       = bdiag;
5359   b->free_diag  = PETSC_TRUE;
5360   b->ilen       = 0;
5361   b->imax       = 0;
5362   b->row        = isrow;
5363   b->col        = iscol;
5364   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5365   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5366   b->icol       = isicol;
5367   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5368   /* In b structure:  Free imax, ilen, old a, old j.
5369      Allocate bdiag, solve_work, new a, new j */
5370   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5371   b->maxnz = b->nz = bdiag[0]+1;
5372   (fact)->info.factor_mallocs    = reallocs;
5373   (fact)->info.fill_ratio_given  = f;
5374   (fact)->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5375   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
5376   /* set MatSolve routines */
5377   if (both_identity){
5378     switch (bs){
5379     case 2:
5380       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2;
5381       break;
5382     case 3:
5383       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2;
5384       break;
5385     case 4:
5386       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2;
5387       break;
5388     case 5:
5389       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2;
5390       break;
5391     case 6:
5392       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2;
5393       break;
5394     case 7:
5395       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2;
5396       break;
5397     default:
5398       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2;
5399       break;
5400     }
5401   } else {
5402     switch (bs){
5403     case 2:
5404       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct_v2;
5405       break;
5406     case 3:
5407       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct_v2;
5408       break;
5409     case 4:
5410       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct_v2;
5411       break;
5412     case 5:
5413       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct_v2;
5414       break;
5415     case 6:
5416       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct_v2;
5417       break;
5418     case 7:
5419       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct_v2;
5420       break;
5421     default:
5422       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct_v2;
5423       break;
5424     }
5425   }
5426   PetscFunctionReturn(0);
5427 }
5428 
5429 
5430 /*
5431      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5432    except that the data structure of Mat_SeqAIJ is slightly different.
5433    Not a good example of code reuse.
5434 */
5435 #undef __FUNCT__
5436 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5437 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5438 {
5439   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5440   IS             isicol;
5441   PetscErrorCode ierr;
5442   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5443   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5444   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5445   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5446   PetscTruth     col_identity,row_identity,both_identity,flg;
5447   PetscReal      f;
5448   PetscTruth     newdatastruct = PETSC_FALSE;
5449 
5450   PetscFunctionBegin;
5451   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
5452   if (newdatastruct){
5453     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5454     PetscFunctionReturn(0);
5455   }
5456 
5457   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5458   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5459 
5460   f             = info->fill;
5461   levels        = (PetscInt)info->levels;
5462   diagonal_fill = (PetscInt)info->diagonal_fill;
5463   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5464 
5465   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5466   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5467   both_identity = (PetscTruth) (row_identity && col_identity);
5468 
5469   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5470     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5471     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5472 
5473     fact->factor = MAT_FACTOR_ILU;
5474     b            = (Mat_SeqBAIJ*)(fact)->data;
5475     b->row       = isrow;
5476     b->col       = iscol;
5477     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5478     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5479     b->icol      = isicol;
5480     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5481     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5482     PetscFunctionReturn(0);
5483   }
5484 
5485   /* general case perform the symbolic factorization */
5486     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5487     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5488 
5489     /* get new row pointers */
5490     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5491     ainew[0] = 0;
5492     /* don't know how many column pointers are needed so estimate */
5493     jmax = (PetscInt)(f*ai[n] + 1);
5494     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5495     /* ajfill is level of fill for each fill entry */
5496     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5497     /* fill is a linked list of nonzeros in active row */
5498     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5499     /* im is level for each filled value */
5500     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5501     /* dloc is location of diagonal in factor */
5502     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5503     dloc[0]  = 0;
5504     for (prow=0; prow<n; prow++) {
5505 
5506       /* copy prow into linked list */
5507       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5508       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5509       xi         = aj + ai[r[prow]];
5510       fill[n]    = n;
5511       fill[prow] = -1; /* marker for diagonal entry */
5512       while (nz--) {
5513 	fm  = n;
5514 	idx = ic[*xi++];
5515 	do {
5516 	  m  = fm;
5517 	  fm = fill[m];
5518 	} while (fm < idx);
5519 	fill[m]   = idx;
5520 	fill[idx] = fm;
5521 	im[idx]   = 0;
5522       }
5523 
5524       /* make sure diagonal entry is included */
5525       if (diagonal_fill && fill[prow] == -1) {
5526 	fm = n;
5527 	while (fill[fm] < prow) fm = fill[fm];
5528 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5529 	fill[fm]   = prow;
5530 	im[prow]   = 0;
5531 	nzf++;
5532 	dcount++;
5533       }
5534 
5535       nzi = 0;
5536       row = fill[n];
5537       while (row < prow) {
5538 	incrlev = im[row] + 1;
5539 	nz      = dloc[row];
5540 	xi      = ajnew  + ainew[row] + nz + 1;
5541 	flev    = ajfill + ainew[row] + nz + 1;
5542 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5543 	fm      = row;
5544 	while (nnz-- > 0) {
5545 	  idx = *xi++;
5546 	  if (*flev + incrlev > levels) {
5547 	    flev++;
5548 	    continue;
5549 	  }
5550 	  do {
5551 	    m  = fm;
5552 	    fm = fill[m];
5553 	  } while (fm < idx);
5554 	  if (fm != idx) {
5555 	    im[idx]   = *flev + incrlev;
5556 	    fill[m]   = idx;
5557 	    fill[idx] = fm;
5558 	    fm        = idx;
5559 	    nzf++;
5560 	  } else {
5561 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5562 	  }
5563 	  flev++;
5564 	}
5565 	row = fill[row];
5566 	nzi++;
5567       }
5568       /* copy new filled row into permanent storage */
5569       ainew[prow+1] = ainew[prow] + nzf;
5570       if (ainew[prow+1] > jmax) {
5571 
5572 	/* estimate how much additional space we will need */
5573 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5574 	/* just double the memory each time */
5575 	PetscInt maxadd = jmax;
5576 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5577 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5578 	jmax += maxadd;
5579 
5580 	/* allocate a longer ajnew and ajfill */
5581 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5582 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5583 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5584 	ajnew = xitmp;
5585 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5586 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5587 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5588 	ajfill = xitmp;
5589 	reallocate++; /* count how many reallocations are needed */
5590       }
5591       xitmp       = ajnew + ainew[prow];
5592       flev        = ajfill + ainew[prow];
5593       dloc[prow]  = nzi;
5594       fm          = fill[n];
5595       while (nzf--) {
5596 	*xitmp++ = fm;
5597 	*flev++ = im[fm];
5598 	fm      = fill[fm];
5599       }
5600       /* make sure row has diagonal entry */
5601       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5602 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5603     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5604       }
5605     }
5606     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5607     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5608     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5609     ierr = PetscFree(fill);CHKERRQ(ierr);
5610     ierr = PetscFree(im);CHKERRQ(ierr);
5611 
5612 #if defined(PETSC_USE_INFO)
5613     {
5614       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5615       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5616       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5617       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5618       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5619       if (diagonal_fill) {
5620 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5621       }
5622     }
5623 #endif
5624 
5625     /* put together the new matrix */
5626     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5627     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5628     b    = (Mat_SeqBAIJ*)(fact)->data;
5629     b->free_a       = PETSC_TRUE;
5630     b->free_ij      = PETSC_TRUE;
5631     b->singlemalloc = PETSC_FALSE;
5632     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5633     b->j          = ajnew;
5634     b->i          = ainew;
5635     for (i=0; i<n; i++) dloc[i] += ainew[i];
5636     b->diag       = dloc;
5637     b->free_diag  = PETSC_TRUE;
5638     b->ilen       = 0;
5639     b->imax       = 0;
5640     b->row        = isrow;
5641     b->col        = iscol;
5642     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5643     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5644     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5645     b->icol       = isicol;
5646     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5647     /* In b structure:  Free imax, ilen, old a, old j.
5648        Allocate dloc, solve_work, new a, new j */
5649     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5650     b->maxnz          = b->nz = ainew[n];
5651 
5652     (fact)->info.factor_mallocs    = reallocate;
5653     (fact)->info.fill_ratio_given  = f;
5654     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5655 
5656   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5657   PetscFunctionReturn(0);
5658 }
5659 
5660 #undef __FUNCT__
5661 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5662 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5663 {
5664   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5665   /* int i,*AJ=a->j,nz=a->nz; */
5666   PetscFunctionBegin;
5667   /* Undo Column scaling */
5668 /*    while (nz--) { */
5669 /*      AJ[i] = AJ[i]/4; */
5670 /*    } */
5671   /* This should really invoke a push/pop logic, but we don't have that yet. */
5672   A->ops->setunfactored = PETSC_NULL;
5673   PetscFunctionReturn(0);
5674 }
5675 
5676 #undef __FUNCT__
5677 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5678 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5679 {
5680   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5681   PetscInt       *AJ=a->j,nz=a->nz;
5682   unsigned short *aj=(unsigned short *)AJ;
5683   PetscFunctionBegin;
5684   /* Is this really necessary? */
5685   while (nz--) {
5686     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5687   }
5688   A->ops->setunfactored = PETSC_NULL;
5689   PetscFunctionReturn(0);
5690 }
5691 
5692 
5693