xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision a4efd8ea4bb0600a5aef92b2c23cbfa314827193)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16   PetscErrorCode ierr;
17   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18   PetscInt       *diag = a->diag;
19   MatScalar      *aa=a->a,*v;
20   PetscScalar    s1,*x,*b;
21 
22   PetscFunctionBegin;
23   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
24   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26 
27   /* forward solve the U^T */
28   for (i=0; i<n; i++) {
29 
30     v     = aa + diag[i];
31     /* multiply by the inverse of the block diagonal */
32     s1    = (*v++)*x[i];
33     vi    = aj + diag[i] + 1;
34     nz    = ai[i+1] - diag[i] - 1;
35     while (nz--) {
36       x[*vi++]  -= (*v++)*s1;
37     }
38     x[i]   = s1;
39   }
40   /* backward solve the L^T */
41   for (i=n-1; i>=0; i--){
42     v    = aa + diag[i] - 1;
43     vi   = aj + diag[i] - 1;
44     nz   = diag[i] - ai[i];
45     s1   = x[i];
46     while (nz--) {
47       x[*vi--]   -=  (*v--)*s1;
48     }
49   }
50   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
51   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53   PetscFunctionReturn(0);
54 }
55 
56 #undef __FUNCT__
57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59 {
60   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61   PetscErrorCode ierr;
62   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63   PetscInt       *diag = a->diag,oidx;
64   MatScalar      *aa=a->a,*v;
65   PetscScalar    s1,s2,x1,x2;
66   PetscScalar    *x,*b;
67 
68   PetscFunctionBegin;
69   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
70   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
71   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72 
73   /* forward solve the U^T */
74   idx = 0;
75   for (i=0; i<n; i++) {
76 
77     v     = aa + 4*diag[i];
78     /* multiply by the inverse of the block diagonal */
79     x1 = x[idx];   x2 = x[1+idx];
80     s1 = v[0]*x1  +  v[1]*x2;
81     s2 = v[2]*x1  +  v[3]*x2;
82     v += 4;
83 
84     vi    = aj + diag[i] + 1;
85     nz    = ai[i+1] - diag[i] - 1;
86     while (nz--) {
87       oidx = 2*(*vi++);
88       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90       v  += 4;
91     }
92     x[idx]   = s1;x[1+idx] = s2;
93     idx += 2;
94   }
95   /* backward solve the L^T */
96   for (i=n-1; i>=0; i--){
97     v    = aa + 4*diag[i] - 4;
98     vi   = aj + diag[i] - 1;
99     nz   = diag[i] - ai[i];
100     idt  = 2*i;
101     s1   = x[idt];  s2 = x[1+idt];
102     while (nz--) {
103       idx   = 2*(*vi--);
104       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106       v -= 4;
107     }
108   }
109   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
110   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
112   PetscFunctionReturn(0);
113 }
114 
115 #undef __FUNCT__
116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118 {
119   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120   PetscErrorCode ierr;
121   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122   PetscInt       *diag = a->diag,oidx;
123   MatScalar      *aa=a->a,*v;
124   PetscScalar    s1,s2,s3,x1,x2,x3;
125   PetscScalar    *x,*b;
126 
127   PetscFunctionBegin;
128   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
129   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
130   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131 
132   /* forward solve the U^T */
133   idx = 0;
134   for (i=0; i<n; i++) {
135 
136     v     = aa + 9*diag[i];
137     /* multiply by the inverse of the block diagonal */
138     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142     v += 9;
143 
144     vi    = aj + diag[i] + 1;
145     nz    = ai[i+1] - diag[i] - 1;
146     while (nz--) {
147       oidx = 3*(*vi++);
148       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151       v  += 9;
152     }
153     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154     idx += 3;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + 9*diag[i] - 9;
159     vi   = aj + diag[i] - 1;
160     nz   = diag[i] - ai[i];
161     idt  = 3*i;
162     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163     while (nz--) {
164       idx   = 3*(*vi--);
165       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168       v -= 9;
169     }
170   }
171   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
172   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180 {
181   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182   PetscErrorCode ierr;
183   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184   PetscInt       *diag = a->diag,oidx;
185   MatScalar      *aa=a->a,*v;
186   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
187   PetscScalar    *x,*b;
188 
189   PetscFunctionBegin;
190   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
191   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
192   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193 
194   /* forward solve the U^T */
195   idx = 0;
196   for (i=0; i<n; i++) {
197 
198     v     = aa + 16*diag[i];
199     /* multiply by the inverse of the block diagonal */
200     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205     v += 16;
206 
207     vi    = aj + diag[i] + 1;
208     nz    = ai[i+1] - diag[i] - 1;
209     while (nz--) {
210       oidx = 4*(*vi++);
211       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215       v  += 16;
216     }
217     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218     idx += 4;
219   }
220   /* backward solve the L^T */
221   for (i=n-1; i>=0; i--){
222     v    = aa + 16*diag[i] - 16;
223     vi   = aj + diag[i] - 1;
224     nz   = diag[i] - ai[i];
225     idt  = 4*i;
226     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227     while (nz--) {
228       idx   = 4*(*vi--);
229       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233       v -= 16;
234     }
235   }
236   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
237   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 #undef __FUNCT__
243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245 {
246   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247   PetscErrorCode ierr;
248   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249   PetscInt       *diag = a->diag,oidx;
250   MatScalar      *aa=a->a,*v;
251   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252   PetscScalar    *x,*b;
253 
254   PetscFunctionBegin;
255   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258 
259   /* forward solve the U^T */
260   idx = 0;
261   for (i=0; i<n; i++) {
262 
263     v     = aa + 25*diag[i];
264     /* multiply by the inverse of the block diagonal */
265     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271     v += 25;
272 
273     vi    = aj + diag[i] + 1;
274     nz    = ai[i+1] - diag[i] - 1;
275     while (nz--) {
276       oidx = 5*(*vi++);
277       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282       v  += 25;
283     }
284     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285     idx += 5;
286   }
287   /* backward solve the L^T */
288   for (i=n-1; i>=0; i--){
289     v    = aa + 25*diag[i] - 25;
290     vi   = aj + diag[i] - 1;
291     nz   = diag[i] - ai[i];
292     idt  = 5*i;
293     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294     while (nz--) {
295       idx   = 5*(*vi--);
296       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301       v -= 25;
302     }
303   }
304   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
305   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
307   PetscFunctionReturn(0);
308 }
309 
310 #undef __FUNCT__
311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313 {
314   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315   PetscErrorCode ierr;
316   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317   PetscInt       *diag = a->diag,oidx;
318   MatScalar      *aa=a->a,*v;
319   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320   PetscScalar    *x,*b;
321 
322   PetscFunctionBegin;
323   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
324   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
325   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326 
327   /* forward solve the U^T */
328   idx = 0;
329   for (i=0; i<n; i++) {
330 
331     v     = aa + 36*diag[i];
332     /* multiply by the inverse of the block diagonal */
333     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334     x6    = x[5+idx];
335     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341     v += 36;
342 
343     vi    = aj + diag[i] + 1;
344     nz    = ai[i+1] - diag[i] - 1;
345     while (nz--) {
346       oidx = 6*(*vi++);
347       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353       v  += 36;
354     }
355     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356     x[5+idx] = s6;
357     idx += 6;
358   }
359   /* backward solve the L^T */
360   for (i=n-1; i>=0; i--){
361     v    = aa + 36*diag[i] - 36;
362     vi   = aj + diag[i] - 1;
363     nz   = diag[i] - ai[i];
364     idt  = 6*i;
365     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366     s6 = x[5+idt];
367     while (nz--) {
368       idx   = 6*(*vi--);
369       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375       v -= 36;
376     }
377   }
378   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
379   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
381   PetscFunctionReturn(0);
382 }
383 
384 #undef __FUNCT__
385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387 {
388   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389   PetscErrorCode ierr;
390   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391   PetscInt       *diag = a->diag,oidx;
392   MatScalar      *aa=a->a,*v;
393   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394   PetscScalar    *x,*b;
395 
396   PetscFunctionBegin;
397   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
398   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
399   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400 
401   /* forward solve the U^T */
402   idx = 0;
403   for (i=0; i<n; i++) {
404 
405     v     = aa + 49*diag[i];
406     /* multiply by the inverse of the block diagonal */
407     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408     x6    = x[5+idx]; x7 = x[6+idx];
409     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416     v += 49;
417 
418     vi    = aj + diag[i] + 1;
419     nz    = ai[i+1] - diag[i] - 1;
420     while (nz--) {
421       oidx = 7*(*vi++);
422       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429       v  += 49;
430     }
431     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432     x[5+idx] = s6;x[6+idx] = s7;
433     idx += 7;
434   }
435   /* backward solve the L^T */
436   for (i=n-1; i>=0; i--){
437     v    = aa + 49*diag[i] - 49;
438     vi   = aj + diag[i] - 1;
439     nz   = diag[i] - ai[i];
440     idt  = 7*i;
441     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442     s6 = x[5+idt];s7 = x[6+idt];
443     while (nz--) {
444       idx   = 7*(*vi--);
445       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452       v -= 49;
453     }
454   }
455   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
458   PetscFunctionReturn(0);
459 }
460 
461 /*---------------------------------------------------------------------------------------------*/
462 #undef __FUNCT__
463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465 {
466   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467   IS             iscol=a->col,isrow=a->row;
468   PetscErrorCode ierr;
469   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
470   PetscInt       *diag = a->diag;
471   MatScalar      *aa=a->a,*v;
472   PetscScalar    s1,*x,*b,*t;
473 
474   PetscFunctionBegin;
475   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
476   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
477   t  = a->solve_work;
478 
479   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
480   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
481 
482   /* copy the b into temp work space according to permutation */
483   for (i=0; i<n; i++) {
484     t[i] = b[c[i]];
485   }
486 
487   /* forward solve the U^T */
488   for (i=0; i<n; i++) {
489 
490     v     = aa + diag[i];
491     /* multiply by the inverse of the block diagonal */
492     s1    = (*v++)*t[i];
493     vi    = aj + diag[i] + 1;
494     nz    = ai[i+1] - diag[i] - 1;
495     while (nz--) {
496       t[*vi++]  -= (*v++)*s1;
497     }
498     t[i]   = s1;
499   }
500   /* backward solve the L^T */
501   for (i=n-1; i>=0; i--){
502     v    = aa + diag[i] - 1;
503     vi   = aj + diag[i] - 1;
504     nz   = diag[i] - ai[i];
505     s1   = t[i];
506     while (nz--) {
507       t[*vi--]   -=  (*v--)*s1;
508     }
509   }
510 
511   /* copy t into x according to permutation */
512   for (i=0; i<n; i++) {
513     x[r[i]]   = t[i];
514   }
515 
516   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
517   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
518   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
519   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
520   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
521   PetscFunctionReturn(0);
522 }
523 
524 #undef __FUNCT__
525 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
526 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527 {
528   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
529   IS             iscol=a->col,isrow=a->row;
530   PetscErrorCode ierr;
531   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
532   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
533   MatScalar      *aa=a->a,*v;
534   PetscScalar    s1,s2,x1,x2;
535   PetscScalar    *x,*b,*t;
536 
537   PetscFunctionBegin;
538   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
539   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
540   t  = a->solve_work;
541 
542   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
543   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
544 
545   /* copy the b into temp work space according to permutation */
546   ii = 0;
547   for (i=0; i<n; i++) {
548     ic      = 2*c[i];
549     t[ii]   = b[ic];
550     t[ii+1] = b[ic+1];
551     ii += 2;
552   }
553 
554   /* forward solve the U^T */
555   idx = 0;
556   for (i=0; i<n; i++) {
557 
558     v     = aa + 4*diag[i];
559     /* multiply by the inverse of the block diagonal */
560     x1    = t[idx];   x2 = t[1+idx];
561     s1 = v[0]*x1  +  v[1]*x2;
562     s2 = v[2]*x1  +  v[3]*x2;
563     v += 4;
564 
565     vi    = aj + diag[i] + 1;
566     nz    = ai[i+1] - diag[i] - 1;
567     while (nz--) {
568       oidx = 2*(*vi++);
569       t[oidx]   -= v[0]*s1  +  v[1]*s2;
570       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
571       v  += 4;
572     }
573     t[idx]   = s1;t[1+idx] = s2;
574     idx += 2;
575   }
576   /* backward solve the L^T */
577   for (i=n-1; i>=0; i--){
578     v    = aa + 4*diag[i] - 4;
579     vi   = aj + diag[i] - 1;
580     nz   = diag[i] - ai[i];
581     idt  = 2*i;
582     s1 = t[idt];  s2 = t[1+idt];
583     while (nz--) {
584       idx   = 2*(*vi--);
585       t[idx]   -=  v[0]*s1 +  v[1]*s2;
586       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
587       v -= 4;
588     }
589   }
590 
591   /* copy t into x according to permutation */
592   ii = 0;
593   for (i=0; i<n; i++) {
594     ir      = 2*r[i];
595     x[ir]   = t[ii];
596     x[ir+1] = t[ii+1];
597     ii += 2;
598   }
599 
600   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
601   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
602   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
603   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
604   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
605   PetscFunctionReturn(0);
606 }
607 
608 #undef __FUNCT__
609 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
610 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
611 {
612   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
613   IS             iscol=a->col,isrow=a->row;
614   PetscErrorCode ierr;
615   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
616   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
617   MatScalar      *aa=a->a,*v;
618   PetscScalar    s1,s2,s3,x1,x2,x3;
619   PetscScalar    *x,*b,*t;
620 
621   PetscFunctionBegin;
622   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
623   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
624   t  = a->solve_work;
625 
626   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
627   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
628 
629   /* copy the b into temp work space according to permutation */
630   ii = 0;
631   for (i=0; i<n; i++) {
632     ic      = 3*c[i];
633     t[ii]   = b[ic];
634     t[ii+1] = b[ic+1];
635     t[ii+2] = b[ic+2];
636     ii += 3;
637   }
638 
639   /* forward solve the U^T */
640   idx = 0;
641   for (i=0; i<n; i++) {
642 
643     v     = aa + 9*diag[i];
644     /* multiply by the inverse of the block diagonal */
645     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
646     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
647     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
648     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
649     v += 9;
650 
651     vi    = aj + diag[i] + 1;
652     nz    = ai[i+1] - diag[i] - 1;
653     while (nz--) {
654       oidx = 3*(*vi++);
655       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
656       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
657       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658       v  += 9;
659     }
660     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
661     idx += 3;
662   }
663   /* backward solve the L^T */
664   for (i=n-1; i>=0; i--){
665     v    = aa + 9*diag[i] - 9;
666     vi   = aj + diag[i] - 1;
667     nz   = diag[i] - ai[i];
668     idt  = 3*i;
669     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
670     while (nz--) {
671       idx   = 3*(*vi--);
672       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
673       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
674       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
675       v -= 9;
676     }
677   }
678 
679   /* copy t into x according to permutation */
680   ii = 0;
681   for (i=0; i<n; i++) {
682     ir      = 3*r[i];
683     x[ir]   = t[ii];
684     x[ir+1] = t[ii+1];
685     x[ir+2] = t[ii+2];
686     ii += 3;
687   }
688 
689   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
690   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
691   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
692   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
693   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
694   PetscFunctionReturn(0);
695 }
696 
697 #undef __FUNCT__
698 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
699 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
700 {
701   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
702   IS             iscol=a->col,isrow=a->row;
703   PetscErrorCode ierr;
704   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
705   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
706   MatScalar      *aa=a->a,*v;
707   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
708   PetscScalar    *x,*b,*t;
709 
710   PetscFunctionBegin;
711   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
712   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
713   t  = a->solve_work;
714 
715   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
716   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
717 
718   /* copy the b into temp work space according to permutation */
719   ii = 0;
720   for (i=0; i<n; i++) {
721     ic      = 4*c[i];
722     t[ii]   = b[ic];
723     t[ii+1] = b[ic+1];
724     t[ii+2] = b[ic+2];
725     t[ii+3] = b[ic+3];
726     ii += 4;
727   }
728 
729   /* forward solve the U^T */
730   idx = 0;
731   for (i=0; i<n; i++) {
732 
733     v     = aa + 16*diag[i];
734     /* multiply by the inverse of the block diagonal */
735     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
736     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
737     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
738     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
739     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
740     v += 16;
741 
742     vi    = aj + diag[i] + 1;
743     nz    = ai[i+1] - diag[i] - 1;
744     while (nz--) {
745       oidx = 4*(*vi++);
746       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750       v  += 16;
751     }
752     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
753     idx += 4;
754   }
755   /* backward solve the L^T */
756   for (i=n-1; i>=0; i--){
757     v    = aa + 16*diag[i] - 16;
758     vi   = aj + diag[i] - 1;
759     nz   = diag[i] - ai[i];
760     idt  = 4*i;
761     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
762     while (nz--) {
763       idx   = 4*(*vi--);
764       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
765       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
766       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
767       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
768       v -= 16;
769     }
770   }
771 
772   /* copy t into x according to permutation */
773   ii = 0;
774   for (i=0; i<n; i++) {
775     ir      = 4*r[i];
776     x[ir]   = t[ii];
777     x[ir+1] = t[ii+1];
778     x[ir+2] = t[ii+2];
779     x[ir+3] = t[ii+3];
780     ii += 4;
781   }
782 
783   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
784   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
785   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
786   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
787   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
788   PetscFunctionReturn(0);
789 }
790 
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796   IS             iscol=a->col,isrow=a->row;
797   PetscErrorCode ierr;
798   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
799   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
800   MatScalar      *aa=a->a,*v;
801   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
802   PetscScalar    *x,*b,*t;
803 
804   PetscFunctionBegin;
805   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
806   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
807   t  = a->solve_work;
808 
809   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
810   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
811 
812   /* copy the b into temp work space according to permutation */
813   ii = 0;
814   for (i=0; i<n; i++) {
815     ic      = 5*c[i];
816     t[ii]   = b[ic];
817     t[ii+1] = b[ic+1];
818     t[ii+2] = b[ic+2];
819     t[ii+3] = b[ic+3];
820     t[ii+4] = b[ic+4];
821     ii += 5;
822   }
823 
824   /* forward solve the U^T */
825   idx = 0;
826   for (i=0; i<n; i++) {
827 
828     v     = aa + 25*diag[i];
829     /* multiply by the inverse of the block diagonal */
830     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
831     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
832     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
833     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
834     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
835     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
836     v += 25;
837 
838     vi    = aj + diag[i] + 1;
839     nz    = ai[i+1] - diag[i] - 1;
840     while (nz--) {
841       oidx = 5*(*vi++);
842       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847       v  += 25;
848     }
849     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
850     idx += 5;
851   }
852   /* backward solve the L^T */
853   for (i=n-1; i>=0; i--){
854     v    = aa + 25*diag[i] - 25;
855     vi   = aj + diag[i] - 1;
856     nz   = diag[i] - ai[i];
857     idt  = 5*i;
858     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
859     while (nz--) {
860       idx   = 5*(*vi--);
861       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
862       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
863       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
864       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
865       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
866       v -= 25;
867     }
868   }
869 
870   /* copy t into x according to permutation */
871   ii = 0;
872   for (i=0; i<n; i++) {
873     ir      = 5*r[i];
874     x[ir]   = t[ii];
875     x[ir+1] = t[ii+1];
876     x[ir+2] = t[ii+2];
877     x[ir+3] = t[ii+3];
878     x[ir+4] = t[ii+4];
879     ii += 5;
880   }
881 
882   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
883   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
884   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
885   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
886   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
887   PetscFunctionReturn(0);
888 }
889 
890 #undef __FUNCT__
891 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
892 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
893 {
894   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
895   IS             iscol=a->col,isrow=a->row;
896   PetscErrorCode ierr;
897   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
898   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
899   MatScalar      *aa=a->a,*v;
900   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
901   PetscScalar    *x,*b,*t;
902 
903   PetscFunctionBegin;
904   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
905   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
906   t  = a->solve_work;
907 
908   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
909   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
910 
911   /* copy the b into temp work space according to permutation */
912   ii = 0;
913   for (i=0; i<n; i++) {
914     ic      = 6*c[i];
915     t[ii]   = b[ic];
916     t[ii+1] = b[ic+1];
917     t[ii+2] = b[ic+2];
918     t[ii+3] = b[ic+3];
919     t[ii+4] = b[ic+4];
920     t[ii+5] = b[ic+5];
921     ii += 6;
922   }
923 
924   /* forward solve the U^T */
925   idx = 0;
926   for (i=0; i<n; i++) {
927 
928     v     = aa + 36*diag[i];
929     /* multiply by the inverse of the block diagonal */
930     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
931     x6    = t[5+idx];
932     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
933     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
934     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
935     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
936     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
937     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
938     v += 36;
939 
940     vi    = aj + diag[i] + 1;
941     nz    = ai[i+1] - diag[i] - 1;
942     while (nz--) {
943       oidx = 6*(*vi++);
944       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
945       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
946       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
947       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
948       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
949       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
950       v  += 36;
951     }
952     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
953     t[5+idx] = s6;
954     idx += 6;
955   }
956   /* backward solve the L^T */
957   for (i=n-1; i>=0; i--){
958     v    = aa + 36*diag[i] - 36;
959     vi   = aj + diag[i] - 1;
960     nz   = diag[i] - ai[i];
961     idt  = 6*i;
962     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
963     s6 = t[5+idt];
964     while (nz--) {
965       idx   = 6*(*vi--);
966       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
967       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
968       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
969       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
970       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
971       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
972       v -= 36;
973     }
974   }
975 
976   /* copy t into x according to permutation */
977   ii = 0;
978   for (i=0; i<n; i++) {
979     ir      = 6*r[i];
980     x[ir]   = t[ii];
981     x[ir+1] = t[ii+1];
982     x[ir+2] = t[ii+2];
983     x[ir+3] = t[ii+3];
984     x[ir+4] = t[ii+4];
985     x[ir+5] = t[ii+5];
986     ii += 6;
987   }
988 
989   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
990   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
991   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
992   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
993   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
994   PetscFunctionReturn(0);
995 }
996 
997 #undef __FUNCT__
998 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
999 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1000 {
1001   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1002   IS             iscol=a->col,isrow=a->row;
1003   PetscErrorCode ierr;
1004   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1005   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1006   MatScalar      *aa=a->a,*v;
1007   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1008   PetscScalar    *x,*b,*t;
1009 
1010   PetscFunctionBegin;
1011   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1012   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1013   t  = a->solve_work;
1014 
1015   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1016   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1017 
1018   /* copy the b into temp work space according to permutation */
1019   ii = 0;
1020   for (i=0; i<n; i++) {
1021     ic      = 7*c[i];
1022     t[ii]   = b[ic];
1023     t[ii+1] = b[ic+1];
1024     t[ii+2] = b[ic+2];
1025     t[ii+3] = b[ic+3];
1026     t[ii+4] = b[ic+4];
1027     t[ii+5] = b[ic+5];
1028     t[ii+6] = b[ic+6];
1029     ii += 7;
1030   }
1031 
1032   /* forward solve the U^T */
1033   idx = 0;
1034   for (i=0; i<n; i++) {
1035 
1036     v     = aa + 49*diag[i];
1037     /* multiply by the inverse of the block diagonal */
1038     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1039     x6    = t[5+idx]; x7 = t[6+idx];
1040     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1041     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1042     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1043     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1044     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1045     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1046     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1047     v += 49;
1048 
1049     vi    = aj + diag[i] + 1;
1050     nz    = ai[i+1] - diag[i] - 1;
1051     while (nz--) {
1052       oidx = 7*(*vi++);
1053       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1054       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1055       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1056       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1057       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1058       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1059       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1060       v  += 49;
1061     }
1062     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1063     t[5+idx] = s6;t[6+idx] = s7;
1064     idx += 7;
1065   }
1066   /* backward solve the L^T */
1067   for (i=n-1; i>=0; i--){
1068     v    = aa + 49*diag[i] - 49;
1069     vi   = aj + diag[i] - 1;
1070     nz   = diag[i] - ai[i];
1071     idt  = 7*i;
1072     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1073     s6 = t[5+idt];s7 = t[6+idt];
1074     while (nz--) {
1075       idx   = 7*(*vi--);
1076       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1077       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1078       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1079       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1080       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1081       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1082       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1083       v -= 49;
1084     }
1085   }
1086 
1087   /* copy t into x according to permutation */
1088   ii = 0;
1089   for (i=0; i<n; i++) {
1090     ir      = 7*r[i];
1091     x[ir]   = t[ii];
1092     x[ir+1] = t[ii+1];
1093     x[ir+2] = t[ii+2];
1094     x[ir+3] = t[ii+3];
1095     x[ir+4] = t[ii+4];
1096     x[ir+5] = t[ii+5];
1097     x[ir+6] = t[ii+6];
1098     ii += 7;
1099   }
1100 
1101   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1102   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1103   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1104   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1105   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
1106   PetscFunctionReturn(0);
1107 }
1108 
1109 /* ----------------------------------------------------------- */
1110 #undef __FUNCT__
1111 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1112 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1113 {
1114   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1115   IS             iscol=a->col,isrow=a->row;
1116   PetscErrorCode ierr;
1117   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1118   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2,*rout,*cout;
1119   MatScalar      *aa=a->a,*v;
1120   PetscScalar    *x,*b,*s,*t,*ls;
1121 
1122   PetscFunctionBegin;
1123   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1124   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1125   t  = a->solve_work;
1126 
1127   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1128   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1129 
1130   /* forward solve the lower triangular */
1131   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1132   for (i=1; i<n; i++) {
1133     v   = aa + bs2*ai[i];
1134     vi  = aj + ai[i];
1135     nz  = a->diag[i] - ai[i];
1136     s = t + bs*i;
1137     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1138     while (nz--) {
1139       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1140       v += bs2;
1141     }
1142   }
1143   /* backward solve the upper triangular */
1144   ls = a->solve_work + A->cmap->n;
1145   for (i=n-1; i>=0; i--){
1146     v   = aa + bs2*(a->diag[i] + 1);
1147     vi  = aj + a->diag[i] + 1;
1148     nz  = ai[i+1] - a->diag[i] - 1;
1149     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1150     while (nz--) {
1151       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1152       v += bs2;
1153     }
1154     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1155     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1156   }
1157 
1158   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1159   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1160   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1162   ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 #undef __FUNCT__
1167 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1168 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1169 {
1170   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1171   IS             iscol=a->col,isrow=a->row;
1172   PetscErrorCode ierr;
1173   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1174   PetscInt       *diag = a->diag;
1175   MatScalar      *aa=a->a,*v;
1176   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1177   PetscScalar    *x,*b,*t;
1178 
1179   PetscFunctionBegin;
1180   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1181   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182   t  = a->solve_work;
1183 
1184   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1185   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1186 
1187   /* forward solve the lower triangular */
1188   idx    = 7*(*r++);
1189   t[0] = b[idx];   t[1] = b[1+idx];
1190   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1191   t[5] = b[5+idx]; t[6] = b[6+idx];
1192 
1193   for (i=1; i<n; i++) {
1194     v     = aa + 49*ai[i];
1195     vi    = aj + ai[i];
1196     nz    = diag[i] - ai[i];
1197     idx   = 7*(*r++);
1198     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1199     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1200     while (nz--) {
1201       idx   = 7*(*vi++);
1202       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1203       x4    = t[3+idx];x5 = t[4+idx];
1204       x6    = t[5+idx];x7 = t[6+idx];
1205       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1206       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1207       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1208       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1209       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1210       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1211       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1212       v += 49;
1213     }
1214     idx = 7*i;
1215     t[idx]   = s1;t[1+idx] = s2;
1216     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1217     t[5+idx] = s6;t[6+idx] = s7;
1218   }
1219   /* backward solve the upper triangular */
1220   for (i=n-1; i>=0; i--){
1221     v    = aa + 49*diag[i] + 49;
1222     vi   = aj + diag[i] + 1;
1223     nz   = ai[i+1] - diag[i] - 1;
1224     idt  = 7*i;
1225     s1 = t[idt];  s2 = t[1+idt];
1226     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1227     s6 = t[5+idt];s7 = t[6+idt];
1228     while (nz--) {
1229       idx   = 7*(*vi++);
1230       x1    = t[idx];   x2 = t[1+idx];
1231       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1232       x6    = t[5+idx]; x7 = t[6+idx];
1233       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1234       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1235       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1236       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1237       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1238       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1239       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1240       v += 49;
1241     }
1242     idc = 7*(*c--);
1243     v   = aa + 49*diag[i];
1244     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1245                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1246     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1247                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1248     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1249                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1250     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1251                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1252     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1253                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1254     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1255                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1256     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1257                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1258   }
1259 
1260   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1261   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1263   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1264   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #undef __FUNCT__
1269 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1270 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1271 {
1272   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1273   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1274   PetscErrorCode    ierr;
1275   PetscInt          *diag = a->diag,jdx;
1276   const MatScalar   *aa=a->a,*v;
1277   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1278   const PetscScalar *b;
1279 
1280   PetscFunctionBegin;
1281   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1282   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1283   /* forward solve the lower triangular */
1284   idx    = 0;
1285   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1286   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1287   x[6] = b[6+idx];
1288   for (i=1; i<n; i++) {
1289     v     =  aa + 49*ai[i];
1290     vi    =  aj + ai[i];
1291     nz    =  diag[i] - ai[i];
1292     idx   =  7*i;
1293     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1294     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1295     s7  =  b[6+idx];
1296     while (nz--) {
1297       jdx   = 7*(*vi++);
1298       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1299       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1300       x7    = x[6+jdx];
1301       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1302       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1303       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1304       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1305       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1306       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1307       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1308       v += 49;
1309      }
1310     x[idx]   = s1;
1311     x[1+idx] = s2;
1312     x[2+idx] = s3;
1313     x[3+idx] = s4;
1314     x[4+idx] = s5;
1315     x[5+idx] = s6;
1316     x[6+idx] = s7;
1317   }
1318   /* backward solve the upper triangular */
1319   for (i=n-1; i>=0; i--){
1320     v    = aa + 49*diag[i] + 49;
1321     vi   = aj + diag[i] + 1;
1322     nz   = ai[i+1] - diag[i] - 1;
1323     idt  = 7*i;
1324     s1 = x[idt];   s2 = x[1+idt];
1325     s3 = x[2+idt]; s4 = x[3+idt];
1326     s5 = x[4+idt]; s6 = x[5+idt];
1327     s7 = x[6+idt];
1328     while (nz--) {
1329       idx   = 7*(*vi++);
1330       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1331       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1332       x7    = x[6+idx];
1333       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1334       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1335       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1336       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1337       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1338       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1339       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1340       v += 49;
1341     }
1342     v        = aa + 49*diag[i];
1343     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1344                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1345     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1346                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1347     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1348                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1349     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1350                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1351     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1352                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1353     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1354                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1355     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1356                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1357   }
1358 
1359   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1361   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
1362   PetscFunctionReturn(0);
1363 }
1364 
1365 #undef __FUNCT__
1366 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1367 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1368 {
1369   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1370   IS                iscol=a->col,isrow=a->row;
1371   PetscErrorCode    ierr;
1372   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1373   PetscInt          *diag = a->diag;
1374   const MatScalar   *aa=a->a,*v;
1375   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1376   const PetscScalar *b;
1377   PetscFunctionBegin;
1378   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1379   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1380   t  = a->solve_work;
1381 
1382   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1383   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1384 
1385   /* forward solve the lower triangular */
1386   idx    = 6*(*r++);
1387   t[0] = b[idx];   t[1] = b[1+idx];
1388   t[2] = b[2+idx]; t[3] = b[3+idx];
1389   t[4] = b[4+idx]; t[5] = b[5+idx];
1390   for (i=1; i<n; i++) {
1391     v     = aa + 36*ai[i];
1392     vi    = aj + ai[i];
1393     nz    = diag[i] - ai[i];
1394     idx   = 6*(*r++);
1395     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1396     s5  = b[4+idx]; s6 = b[5+idx];
1397     while (nz--) {
1398       idx   = 6*(*vi++);
1399       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1400       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1401       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1402       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1403       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1404       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1405       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1406       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1407       v += 36;
1408     }
1409     idx = 6*i;
1410     t[idx]   = s1;t[1+idx] = s2;
1411     t[2+idx] = s3;t[3+idx] = s4;
1412     t[4+idx] = s5;t[5+idx] = s6;
1413   }
1414   /* backward solve the upper triangular */
1415   for (i=n-1; i>=0; i--){
1416     v    = aa + 36*diag[i] + 36;
1417     vi   = aj + diag[i] + 1;
1418     nz   = ai[i+1] - diag[i] - 1;
1419     idt  = 6*i;
1420     s1 = t[idt];  s2 = t[1+idt];
1421     s3 = t[2+idt];s4 = t[3+idt];
1422     s5 = t[4+idt];s6 = t[5+idt];
1423     while (nz--) {
1424       idx   = 6*(*vi++);
1425       x1    = t[idx];   x2 = t[1+idx];
1426       x3    = t[2+idx]; x4 = t[3+idx];
1427       x5    = t[4+idx]; x6 = t[5+idx];
1428       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1429       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1430       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1431       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1432       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1433       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1434       v += 36;
1435     }
1436     idc = 6*(*c--);
1437     v   = aa + 36*diag[i];
1438     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1439                                  v[18]*s4+v[24]*s5+v[30]*s6;
1440     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1441                                  v[19]*s4+v[25]*s5+v[31]*s6;
1442     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1443                                  v[20]*s4+v[26]*s5+v[32]*s6;
1444     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1445                                  v[21]*s4+v[27]*s5+v[33]*s6;
1446     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1447                                  v[22]*s4+v[28]*s5+v[34]*s6;
1448     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1449                                  v[23]*s4+v[29]*s5+v[35]*s6;
1450   }
1451 
1452   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1453   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1454   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1455   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1456   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
1457   PetscFunctionReturn(0);
1458 }
1459 
1460 #undef __FUNCT__
1461 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1462 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1463 {
1464   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1465   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1466   PetscErrorCode    ierr;
1467   PetscInt          *diag = a->diag,jdx;
1468   const MatScalar   *aa=a->a,*v;
1469   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1470   const PetscScalar *b;
1471 
1472   PetscFunctionBegin;
1473   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1474   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1475   /* forward solve the lower triangular */
1476   idx    = 0;
1477   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1478   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1479   for (i=1; i<n; i++) {
1480     v     =  aa + 36*ai[i];
1481     vi    =  aj + ai[i];
1482     nz    =  diag[i] - ai[i];
1483     idx   =  6*i;
1484     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1485     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1486     while (nz--) {
1487       jdx   = 6*(*vi++);
1488       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1489       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1490       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1496       v += 36;
1497      }
1498     x[idx]   = s1;
1499     x[1+idx] = s2;
1500     x[2+idx] = s3;
1501     x[3+idx] = s4;
1502     x[4+idx] = s5;
1503     x[5+idx] = s6;
1504   }
1505   /* backward solve the upper triangular */
1506   for (i=n-1; i>=0; i--){
1507     v    = aa + 36*diag[i] + 36;
1508     vi   = aj + diag[i] + 1;
1509     nz   = ai[i+1] - diag[i] - 1;
1510     idt  = 6*i;
1511     s1 = x[idt];   s2 = x[1+idt];
1512     s3 = x[2+idt]; s4 = x[3+idt];
1513     s5 = x[4+idt]; s6 = x[5+idt];
1514     while (nz--) {
1515       idx   = 6*(*vi++);
1516       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1517       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1518       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1519       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1520       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1521       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1522       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1523       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1524       v += 36;
1525     }
1526     v        = aa + 36*diag[i];
1527     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1528     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1529     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1530     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1531     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1532     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1533   }
1534 
1535   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1536   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1537   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
1538   PetscFunctionReturn(0);
1539 }
1540 
1541 #undef __FUNCT__
1542 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1543 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1544 {
1545   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1546   IS                iscol=a->col,isrow=a->row;
1547   PetscErrorCode    ierr;
1548   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1549   PetscInt          *diag = a->diag;
1550   const MatScalar   *aa=a->a,*v;
1551   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1552   const PetscScalar *b;
1553 
1554   PetscFunctionBegin;
1555   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1556   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1557   t  = a->solve_work;
1558 
1559   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1560   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1561 
1562   /* forward solve the lower triangular */
1563   idx    = 5*(*r++);
1564   t[0] = b[idx];   t[1] = b[1+idx];
1565   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1566   for (i=1; i<n; i++) {
1567     v     = aa + 25*ai[i];
1568     vi    = aj + ai[i];
1569     nz    = diag[i] - ai[i];
1570     idx   = 5*(*r++);
1571     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1572     s5  = b[4+idx];
1573     while (nz--) {
1574       idx   = 5*(*vi++);
1575       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1576       x4    = t[3+idx];x5 = t[4+idx];
1577       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582       v += 25;
1583     }
1584     idx = 5*i;
1585     t[idx]   = s1;t[1+idx] = s2;
1586     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1587   }
1588   /* backward solve the upper triangular */
1589   for (i=n-1; i>=0; i--){
1590     v    = aa + 25*diag[i] + 25;
1591     vi   = aj + diag[i] + 1;
1592     nz   = ai[i+1] - diag[i] - 1;
1593     idt  = 5*i;
1594     s1 = t[idt];  s2 = t[1+idt];
1595     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1596     while (nz--) {
1597       idx   = 5*(*vi++);
1598       x1    = t[idx];   x2 = t[1+idx];
1599       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1600       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1601       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1602       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1603       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1604       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1605       v += 25;
1606     }
1607     idc = 5*(*c--);
1608     v   = aa + 25*diag[i];
1609     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1610                                  v[15]*s4+v[20]*s5;
1611     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1612                                  v[16]*s4+v[21]*s5;
1613     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1614                                  v[17]*s4+v[22]*s5;
1615     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1616                                  v[18]*s4+v[23]*s5;
1617     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1618                                  v[19]*s4+v[24]*s5;
1619   }
1620 
1621   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1622   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1623   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1624   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1625   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
1626   PetscFunctionReturn(0);
1627 }
1628 
1629 #undef __FUNCT__
1630 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1631 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1632 {
1633   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1634   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1635   PetscErrorCode    ierr;
1636   PetscInt          *diag = a->diag,jdx;
1637   const MatScalar   *aa=a->a,*v;
1638   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1639   const PetscScalar *b;
1640 
1641   PetscFunctionBegin;
1642   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1643   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1644   /* forward solve the lower triangular */
1645   idx    = 0;
1646   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1647   for (i=1; i<n; i++) {
1648     v     =  aa + 25*ai[i];
1649     vi    =  aj + ai[i];
1650     nz    =  diag[i] - ai[i];
1651     idx   =  5*i;
1652     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1653     while (nz--) {
1654       jdx   = 5*(*vi++);
1655       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1656       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1657       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1658       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1659       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1660       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1661       v    += 25;
1662     }
1663     x[idx]   = s1;
1664     x[1+idx] = s2;
1665     x[2+idx] = s3;
1666     x[3+idx] = s4;
1667     x[4+idx] = s5;
1668   }
1669   /* backward solve the upper triangular */
1670   for (i=n-1; i>=0; i--){
1671     v    = aa + 25*diag[i] + 25;
1672     vi   = aj + diag[i] + 1;
1673     nz   = ai[i+1] - diag[i] - 1;
1674     idt  = 5*i;
1675     s1 = x[idt];  s2 = x[1+idt];
1676     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1677     while (nz--) {
1678       idx   = 5*(*vi++);
1679       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1680       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1681       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1682       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1683       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1684       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1685       v    += 25;
1686     }
1687     v        = aa + 25*diag[i];
1688     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1689     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1690     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1691     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1692     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1693   }
1694 
1695   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1696   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1697   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
1698   PetscFunctionReturn(0);
1699 }
1700 
1701 #undef __FUNCT__
1702 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1703 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1704 {
1705   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1706   IS                iscol=a->col,isrow=a->row;
1707   PetscErrorCode    ierr;
1708   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1709   PetscInt          *diag = a->diag;
1710   const MatScalar   *aa=a->a,*v;
1711   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1712   const PetscScalar *b;
1713 
1714   PetscFunctionBegin;
1715   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1716   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1717   t  = a->solve_work;
1718 
1719   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1720   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1721 
1722   /* forward solve the lower triangular */
1723   idx    = 4*(*r++);
1724   t[0] = b[idx];   t[1] = b[1+idx];
1725   t[2] = b[2+idx]; t[3] = b[3+idx];
1726   for (i=1; i<n; i++) {
1727     v     = aa + 16*ai[i];
1728     vi    = aj + ai[i];
1729     nz    = diag[i] - ai[i];
1730     idx   = 4*(*r++);
1731     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1732     while (nz--) {
1733       idx   = 4*(*vi++);
1734       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1735       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1736       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1737       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1738       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1739       v    += 16;
1740     }
1741     idx        = 4*i;
1742     t[idx]   = s1;t[1+idx] = s2;
1743     t[2+idx] = s3;t[3+idx] = s4;
1744   }
1745   /* backward solve the upper triangular */
1746   for (i=n-1; i>=0; i--){
1747     v    = aa + 16*diag[i] + 16;
1748     vi   = aj + diag[i] + 1;
1749     nz   = ai[i+1] - diag[i] - 1;
1750     idt  = 4*i;
1751     s1 = t[idt];  s2 = t[1+idt];
1752     s3 = t[2+idt];s4 = t[3+idt];
1753     while (nz--) {
1754       idx   = 4*(*vi++);
1755       x1    = t[idx];   x2 = t[1+idx];
1756       x3    = t[2+idx]; x4 = t[3+idx];
1757       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1758       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1759       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1760       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1761       v += 16;
1762     }
1763     idc      = 4*(*c--);
1764     v        = aa + 16*diag[i];
1765     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1766     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1767     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1768     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1769   }
1770 
1771   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1772   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1773   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1774   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1775   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
1776   PetscFunctionReturn(0);
1777 }
1778 
1779 #undef __FUNCT__
1780 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1781 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1782 {
1783   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1784   IS                iscol=a->col,isrow=a->row;
1785   PetscErrorCode    ierr;
1786   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1787   PetscInt          *diag = a->diag;
1788   const MatScalar   *aa=a->a,*v;
1789   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1790   PetscScalar       *x;
1791   const PetscScalar *b;
1792 
1793   PetscFunctionBegin;
1794   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1796   t  = (MatScalar *)a->solve_work;
1797 
1798   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1799   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1800 
1801   /* forward solve the lower triangular */
1802   idx    = 4*(*r++);
1803   t[0] = (MatScalar)b[idx];
1804   t[1] = (MatScalar)b[1+idx];
1805   t[2] = (MatScalar)b[2+idx];
1806   t[3] = (MatScalar)b[3+idx];
1807   for (i=1; i<n; i++) {
1808     v     = aa + 16*ai[i];
1809     vi    = aj + ai[i];
1810     nz    = diag[i] - ai[i];
1811     idx   = 4*(*r++);
1812     s1 = (MatScalar)b[idx];
1813     s2 = (MatScalar)b[1+idx];
1814     s3 = (MatScalar)b[2+idx];
1815     s4 = (MatScalar)b[3+idx];
1816     while (nz--) {
1817       idx   = 4*(*vi++);
1818       x1  = t[idx];
1819       x2  = t[1+idx];
1820       x3  = t[2+idx];
1821       x4  = t[3+idx];
1822       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1823       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1824       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1825       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1826       v    += 16;
1827     }
1828     idx        = 4*i;
1829     t[idx]   = s1;
1830     t[1+idx] = s2;
1831     t[2+idx] = s3;
1832     t[3+idx] = s4;
1833   }
1834   /* backward solve the upper triangular */
1835   for (i=n-1; i>=0; i--){
1836     v    = aa + 16*diag[i] + 16;
1837     vi   = aj + diag[i] + 1;
1838     nz   = ai[i+1] - diag[i] - 1;
1839     idt  = 4*i;
1840     s1 = t[idt];
1841     s2 = t[1+idt];
1842     s3 = t[2+idt];
1843     s4 = t[3+idt];
1844     while (nz--) {
1845       idx   = 4*(*vi++);
1846       x1  = t[idx];
1847       x2  = t[1+idx];
1848       x3  = t[2+idx];
1849       x4  = t[3+idx];
1850       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1851       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1852       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1853       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1854       v += 16;
1855     }
1856     idc      = 4*(*c--);
1857     v        = aa + 16*diag[i];
1858     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1859     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1860     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1861     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1862     x[idc]   = (PetscScalar)t[idt];
1863     x[1+idc] = (PetscScalar)t[1+idt];
1864     x[2+idc] = (PetscScalar)t[2+idt];
1865     x[3+idc] = (PetscScalar)t[3+idt];
1866  }
1867 
1868   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1869   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1870   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1871   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1872   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
1873   PetscFunctionReturn(0);
1874 }
1875 
1876 #if defined (PETSC_HAVE_SSE)
1877 
1878 #include PETSC_HAVE_SSE
1879 
1880 #undef __FUNCT__
1881 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1882 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1883 {
1884   /*
1885      Note: This code uses demotion of double
1886      to float when performing the mixed-mode computation.
1887      This may not be numerically reasonable for all applications.
1888   */
1889   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1890   IS             iscol=a->col,isrow=a->row;
1891   PetscErrorCode ierr;
1892   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1893   PetscInt       *diag = a->diag,ai16;
1894   MatScalar      *aa=a->a,*v;
1895   PetscScalar    *x,*b,*t;
1896 
1897   /* Make space in temp stack for 16 Byte Aligned arrays */
1898   float           ssealignedspace[11],*tmps,*tmpx;
1899   unsigned long   offset;
1900 
1901   PetscFunctionBegin;
1902   SSE_SCOPE_BEGIN;
1903 
1904     offset = (unsigned long)ssealignedspace % 16;
1905     if (offset) offset = (16 - offset)/4;
1906     tmps = &ssealignedspace[offset];
1907     tmpx = &ssealignedspace[offset+4];
1908     PREFETCH_NTA(aa+16*ai[1]);
1909 
1910     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1911     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1912     t  = a->solve_work;
1913 
1914     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1915     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1916 
1917     /* forward solve the lower triangular */
1918     idx  = 4*(*r++);
1919     t[0] = b[idx];   t[1] = b[1+idx];
1920     t[2] = b[2+idx]; t[3] = b[3+idx];
1921     v    =  aa + 16*ai[1];
1922 
1923     for (i=1; i<n;) {
1924       PREFETCH_NTA(&v[8]);
1925       vi   =  aj      + ai[i];
1926       nz   =  diag[i] - ai[i];
1927       idx  =  4*(*r++);
1928 
1929       /* Demote sum from double to float */
1930       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1931       LOAD_PS(tmps,XMM7);
1932 
1933       while (nz--) {
1934         PREFETCH_NTA(&v[16]);
1935         idx = 4*(*vi++);
1936 
1937         /* Demote solution (so far) from double to float */
1938         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1939 
1940         /* 4x4 Matrix-Vector product with negative accumulation: */
1941         SSE_INLINE_BEGIN_2(tmpx,v)
1942           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1943 
1944           /* First Column */
1945           SSE_COPY_PS(XMM0,XMM6)
1946           SSE_SHUFFLE(XMM0,XMM0,0x00)
1947           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1948           SSE_SUB_PS(XMM7,XMM0)
1949 
1950           /* Second Column */
1951           SSE_COPY_PS(XMM1,XMM6)
1952           SSE_SHUFFLE(XMM1,XMM1,0x55)
1953           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1954           SSE_SUB_PS(XMM7,XMM1)
1955 
1956           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1957 
1958           /* Third Column */
1959           SSE_COPY_PS(XMM2,XMM6)
1960           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1961           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1962           SSE_SUB_PS(XMM7,XMM2)
1963 
1964           /* Fourth Column */
1965           SSE_COPY_PS(XMM3,XMM6)
1966           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1967           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1968           SSE_SUB_PS(XMM7,XMM3)
1969         SSE_INLINE_END_2
1970 
1971         v  += 16;
1972       }
1973       idx = 4*i;
1974       v   = aa + 16*ai[++i];
1975       PREFETCH_NTA(v);
1976       STORE_PS(tmps,XMM7);
1977 
1978       /* Promote result from float to double */
1979       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1980     }
1981     /* backward solve the upper triangular */
1982     idt  = 4*(n-1);
1983     ai16 = 16*diag[n-1];
1984     v    = aa + ai16 + 16;
1985     for (i=n-1; i>=0;){
1986       PREFETCH_NTA(&v[8]);
1987       vi = aj + diag[i] + 1;
1988       nz = ai[i+1] - diag[i] - 1;
1989 
1990       /* Demote accumulator from double to float */
1991       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1992       LOAD_PS(tmps,XMM7);
1993 
1994       while (nz--) {
1995         PREFETCH_NTA(&v[16]);
1996         idx = 4*(*vi++);
1997 
1998         /* Demote solution (so far) from double to float */
1999         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2000 
2001         /* 4x4 Matrix-Vector Product with negative accumulation: */
2002         SSE_INLINE_BEGIN_2(tmpx,v)
2003           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2004 
2005           /* First Column */
2006           SSE_COPY_PS(XMM0,XMM6)
2007           SSE_SHUFFLE(XMM0,XMM0,0x00)
2008           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2009           SSE_SUB_PS(XMM7,XMM0)
2010 
2011           /* Second Column */
2012           SSE_COPY_PS(XMM1,XMM6)
2013           SSE_SHUFFLE(XMM1,XMM1,0x55)
2014           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2015           SSE_SUB_PS(XMM7,XMM1)
2016 
2017           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2018 
2019           /* Third Column */
2020           SSE_COPY_PS(XMM2,XMM6)
2021           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2022           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2023           SSE_SUB_PS(XMM7,XMM2)
2024 
2025           /* Fourth Column */
2026           SSE_COPY_PS(XMM3,XMM6)
2027           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2028           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2029           SSE_SUB_PS(XMM7,XMM3)
2030         SSE_INLINE_END_2
2031         v  += 16;
2032       }
2033       v    = aa + ai16;
2034       ai16 = 16*diag[--i];
2035       PREFETCH_NTA(aa+ai16+16);
2036       /*
2037          Scale the result by the diagonal 4x4 block,
2038          which was inverted as part of the factorization
2039       */
2040       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2041         /* First Column */
2042         SSE_COPY_PS(XMM0,XMM7)
2043         SSE_SHUFFLE(XMM0,XMM0,0x00)
2044         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2045 
2046         /* Second Column */
2047         SSE_COPY_PS(XMM1,XMM7)
2048         SSE_SHUFFLE(XMM1,XMM1,0x55)
2049         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2050         SSE_ADD_PS(XMM0,XMM1)
2051 
2052         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2053 
2054         /* Third Column */
2055         SSE_COPY_PS(XMM2,XMM7)
2056         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2057         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2058         SSE_ADD_PS(XMM0,XMM2)
2059 
2060         /* Fourth Column */
2061         SSE_COPY_PS(XMM3,XMM7)
2062         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2063         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2064         SSE_ADD_PS(XMM0,XMM3)
2065 
2066         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2067       SSE_INLINE_END_3
2068 
2069       /* Promote solution from float to double */
2070       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2071 
2072       /* Apply reordering to t and stream into x.    */
2073       /* This way, x doesn't pollute the cache.      */
2074       /* Be careful with size: 2 doubles = 4 floats! */
2075       idc  = 4*(*c--);
2076       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2077         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2078         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2079         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2080         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2081         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2082         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2083       SSE_INLINE_END_2
2084       v    = aa + ai16 + 16;
2085       idt -= 4;
2086     }
2087 
2088     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2089     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2090     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2091     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2092     ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2093   SSE_SCOPE_END;
2094   PetscFunctionReturn(0);
2095 }
2096 
2097 #endif
2098 
2099 
2100 /*
2101       Special case where the matrix was ILU(0) factored in the natural
2102    ordering. This eliminates the need for the column and row permutation.
2103 */
2104 #undef __FUNCT__
2105 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2106 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2107 {
2108   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2109   PetscInt          n=a->mbs;
2110   const PetscInt    *ai=a->i,*aj=a->j;
2111   PetscErrorCode    ierr;
2112   const PetscInt    *diag = a->diag;
2113   const MatScalar   *aa=a->a;
2114   PetscScalar       *x;
2115   const PetscScalar *b;
2116 
2117   PetscFunctionBegin;
2118   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2119   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2120 
2121 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2122   {
2123     static PetscScalar w[2000]; /* very BAD need to fix */
2124     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2125   }
2126 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2127   {
2128     static PetscScalar w[2000]; /* very BAD need to fix */
2129     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2130   }
2131 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2132   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2133 #else
2134   {
2135     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2136     const MatScalar *v;
2137     PetscInt        jdx,idt,idx,nz,i,ai16;
2138     const PetscInt  *vi;
2139 
2140   /* forward solve the lower triangular */
2141   idx    = 0;
2142   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2143   for (i=1; i<n; i++) {
2144     v     =  aa      + 16*ai[i];
2145     vi    =  aj      + ai[i];
2146     nz    =  diag[i] - ai[i];
2147     idx   +=  4;
2148     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2149     while (nz--) {
2150       jdx   = 4*(*vi++);
2151       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2152       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2153       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2154       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2155       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2156       v    += 16;
2157     }
2158     x[idx]   = s1;
2159     x[1+idx] = s2;
2160     x[2+idx] = s3;
2161     x[3+idx] = s4;
2162   }
2163   /* backward solve the upper triangular */
2164   idt = 4*(n-1);
2165   for (i=n-1; i>=0; i--){
2166     ai16 = 16*diag[i];
2167     v    = aa + ai16 + 16;
2168     vi   = aj + diag[i] + 1;
2169     nz   = ai[i+1] - diag[i] - 1;
2170     s1 = x[idt];  s2 = x[1+idt];
2171     s3 = x[2+idt];s4 = x[3+idt];
2172     while (nz--) {
2173       idx   = 4*(*vi++);
2174       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2175       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2176       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2177       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2178       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2179       v    += 16;
2180     }
2181     v        = aa + ai16;
2182     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2183     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2184     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2185     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2186     idt -= 4;
2187   }
2188   }
2189 #endif
2190 
2191   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2192   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2193   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2194   PetscFunctionReturn(0);
2195 }
2196 
2197 #undef __FUNCT__
2198 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2199 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2200 {
2201   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2202   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2203   PetscErrorCode ierr;
2204   PetscInt       *diag = a->diag;
2205   MatScalar      *aa=a->a;
2206   PetscScalar    *x,*b;
2207 
2208   PetscFunctionBegin;
2209   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2210   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2211 
2212   {
2213     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2214     MatScalar  *v,*t=(MatScalar *)x;
2215     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2216 
2217     /* forward solve the lower triangular */
2218     idx  = 0;
2219     t[0] = (MatScalar)b[0];
2220     t[1] = (MatScalar)b[1];
2221     t[2] = (MatScalar)b[2];
2222     t[3] = (MatScalar)b[3];
2223     for (i=1; i<n; i++) {
2224       v     =  aa      + 16*ai[i];
2225       vi    =  aj      + ai[i];
2226       nz    =  diag[i] - ai[i];
2227       idx   +=  4;
2228       s1 = (MatScalar)b[idx];
2229       s2 = (MatScalar)b[1+idx];
2230       s3 = (MatScalar)b[2+idx];
2231       s4 = (MatScalar)b[3+idx];
2232       while (nz--) {
2233         jdx = 4*(*vi++);
2234         x1  = t[jdx];
2235         x2  = t[1+jdx];
2236         x3  = t[2+jdx];
2237         x4  = t[3+jdx];
2238         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2239         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2240         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2241         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2242         v    += 16;
2243       }
2244       t[idx]   = s1;
2245       t[1+idx] = s2;
2246       t[2+idx] = s3;
2247       t[3+idx] = s4;
2248     }
2249     /* backward solve the upper triangular */
2250     idt = 4*(n-1);
2251     for (i=n-1; i>=0; i--){
2252       ai16 = 16*diag[i];
2253       v    = aa + ai16 + 16;
2254       vi   = aj + diag[i] + 1;
2255       nz   = ai[i+1] - diag[i] - 1;
2256       s1   = t[idt];
2257       s2   = t[1+idt];
2258       s3   = t[2+idt];
2259       s4   = t[3+idt];
2260       while (nz--) {
2261         idx = 4*(*vi++);
2262         x1  = (MatScalar)x[idx];
2263         x2  = (MatScalar)x[1+idx];
2264         x3  = (MatScalar)x[2+idx];
2265         x4  = (MatScalar)x[3+idx];
2266         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2267         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2268         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2269         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2270         v    += 16;
2271       }
2272       v        = aa + ai16;
2273       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2274       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2275       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2276       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2277       idt -= 4;
2278     }
2279   }
2280 
2281   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2282   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2283   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2284   PetscFunctionReturn(0);
2285 }
2286 
2287 #if defined (PETSC_HAVE_SSE)
2288 
2289 #include PETSC_HAVE_SSE
2290 #undef __FUNCT__
2291 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2292 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2293 {
2294   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2295   unsigned short *aj=(unsigned short *)a->j;
2296   PetscErrorCode ierr;
2297   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2298   MatScalar      *aa=a->a;
2299   PetscScalar    *x,*b;
2300 
2301   PetscFunctionBegin;
2302   SSE_SCOPE_BEGIN;
2303   /*
2304      Note: This code currently uses demotion of double
2305      to float when performing the mixed-mode computation.
2306      This may not be numerically reasonable for all applications.
2307   */
2308   PREFETCH_NTA(aa+16*ai[1]);
2309 
2310   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2311   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2312   {
2313     /* x will first be computed in single precision then promoted inplace to double */
2314     MatScalar      *v,*t=(MatScalar *)x;
2315     int            nz,i,idt,ai16;
2316     unsigned int   jdx,idx;
2317     unsigned short *vi;
2318     /* Forward solve the lower triangular factor. */
2319 
2320     /* First block is the identity. */
2321     idx  = 0;
2322     CONVERT_DOUBLE4_FLOAT4(t,b);
2323     v    =  aa + 16*((unsigned int)ai[1]);
2324 
2325     for (i=1; i<n;) {
2326       PREFETCH_NTA(&v[8]);
2327       vi   =  aj      + ai[i];
2328       nz   =  diag[i] - ai[i];
2329       idx +=  4;
2330 
2331       /* Demote RHS from double to float. */
2332       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2333       LOAD_PS(&t[idx],XMM7);
2334 
2335       while (nz--) {
2336         PREFETCH_NTA(&v[16]);
2337         jdx = 4*((unsigned int)(*vi++));
2338 
2339         /* 4x4 Matrix-Vector product with negative accumulation: */
2340         SSE_INLINE_BEGIN_2(&t[jdx],v)
2341           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2342 
2343           /* First Column */
2344           SSE_COPY_PS(XMM0,XMM6)
2345           SSE_SHUFFLE(XMM0,XMM0,0x00)
2346           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2347           SSE_SUB_PS(XMM7,XMM0)
2348 
2349           /* Second Column */
2350           SSE_COPY_PS(XMM1,XMM6)
2351           SSE_SHUFFLE(XMM1,XMM1,0x55)
2352           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2353           SSE_SUB_PS(XMM7,XMM1)
2354 
2355           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2356 
2357           /* Third Column */
2358           SSE_COPY_PS(XMM2,XMM6)
2359           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2360           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2361           SSE_SUB_PS(XMM7,XMM2)
2362 
2363           /* Fourth Column */
2364           SSE_COPY_PS(XMM3,XMM6)
2365           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2366           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2367           SSE_SUB_PS(XMM7,XMM3)
2368         SSE_INLINE_END_2
2369 
2370         v  += 16;
2371       }
2372       v    =  aa + 16*ai[++i];
2373       PREFETCH_NTA(v);
2374       STORE_PS(&t[idx],XMM7);
2375     }
2376 
2377     /* Backward solve the upper triangular factor.*/
2378 
2379     idt  = 4*(n-1);
2380     ai16 = 16*diag[n-1];
2381     v    = aa + ai16 + 16;
2382     for (i=n-1; i>=0;){
2383       PREFETCH_NTA(&v[8]);
2384       vi = aj + diag[i] + 1;
2385       nz = ai[i+1] - diag[i] - 1;
2386 
2387       LOAD_PS(&t[idt],XMM7);
2388 
2389       while (nz--) {
2390         PREFETCH_NTA(&v[16]);
2391         idx = 4*((unsigned int)(*vi++));
2392 
2393         /* 4x4 Matrix-Vector Product with negative accumulation: */
2394         SSE_INLINE_BEGIN_2(&t[idx],v)
2395           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2396 
2397           /* First Column */
2398           SSE_COPY_PS(XMM0,XMM6)
2399           SSE_SHUFFLE(XMM0,XMM0,0x00)
2400           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2401           SSE_SUB_PS(XMM7,XMM0)
2402 
2403           /* Second Column */
2404           SSE_COPY_PS(XMM1,XMM6)
2405           SSE_SHUFFLE(XMM1,XMM1,0x55)
2406           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2407           SSE_SUB_PS(XMM7,XMM1)
2408 
2409           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2410 
2411           /* Third Column */
2412           SSE_COPY_PS(XMM2,XMM6)
2413           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2414           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2415           SSE_SUB_PS(XMM7,XMM2)
2416 
2417           /* Fourth Column */
2418           SSE_COPY_PS(XMM3,XMM6)
2419           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2420           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2421           SSE_SUB_PS(XMM7,XMM3)
2422         SSE_INLINE_END_2
2423         v  += 16;
2424       }
2425       v    = aa + ai16;
2426       ai16 = 16*diag[--i];
2427       PREFETCH_NTA(aa+ai16+16);
2428       /*
2429          Scale the result by the diagonal 4x4 block,
2430          which was inverted as part of the factorization
2431       */
2432       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2433         /* First Column */
2434         SSE_COPY_PS(XMM0,XMM7)
2435         SSE_SHUFFLE(XMM0,XMM0,0x00)
2436         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2437 
2438         /* Second Column */
2439         SSE_COPY_PS(XMM1,XMM7)
2440         SSE_SHUFFLE(XMM1,XMM1,0x55)
2441         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2442         SSE_ADD_PS(XMM0,XMM1)
2443 
2444         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2445 
2446         /* Third Column */
2447         SSE_COPY_PS(XMM2,XMM7)
2448         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2449         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2450         SSE_ADD_PS(XMM0,XMM2)
2451 
2452         /* Fourth Column */
2453         SSE_COPY_PS(XMM3,XMM7)
2454         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2455         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2456         SSE_ADD_PS(XMM0,XMM3)
2457 
2458         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2459       SSE_INLINE_END_3
2460 
2461       v    = aa + ai16 + 16;
2462       idt -= 4;
2463     }
2464 
2465     /* Convert t from single precision back to double precision (inplace)*/
2466     idt = 4*(n-1);
2467     for (i=n-1;i>=0;i--) {
2468       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2469       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2470       PetscScalar *xtemp=&x[idt];
2471       MatScalar   *ttemp=&t[idt];
2472       xtemp[3] = (PetscScalar)ttemp[3];
2473       xtemp[2] = (PetscScalar)ttemp[2];
2474       xtemp[1] = (PetscScalar)ttemp[1];
2475       xtemp[0] = (PetscScalar)ttemp[0];
2476       idt -= 4;
2477     }
2478 
2479   } /* End of artificial scope. */
2480   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2481   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2482   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2483   SSE_SCOPE_END;
2484   PetscFunctionReturn(0);
2485 }
2486 
2487 #undef __FUNCT__
2488 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2489 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2490 {
2491   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2492   int            *aj=a->j;
2493   PetscErrorCode ierr;
2494   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2495   MatScalar      *aa=a->a;
2496   PetscScalar    *x,*b;
2497 
2498   PetscFunctionBegin;
2499   SSE_SCOPE_BEGIN;
2500   /*
2501      Note: This code currently uses demotion of double
2502      to float when performing the mixed-mode computation.
2503      This may not be numerically reasonable for all applications.
2504   */
2505   PREFETCH_NTA(aa+16*ai[1]);
2506 
2507   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2508   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2509   {
2510     /* x will first be computed in single precision then promoted inplace to double */
2511     MatScalar *v,*t=(MatScalar *)x;
2512     int       nz,i,idt,ai16;
2513     int       jdx,idx;
2514     int       *vi;
2515     /* Forward solve the lower triangular factor. */
2516 
2517     /* First block is the identity. */
2518     idx  = 0;
2519     CONVERT_DOUBLE4_FLOAT4(t,b);
2520     v    =  aa + 16*ai[1];
2521 
2522     for (i=1; i<n;) {
2523       PREFETCH_NTA(&v[8]);
2524       vi   =  aj      + ai[i];
2525       nz   =  diag[i] - ai[i];
2526       idx +=  4;
2527 
2528       /* Demote RHS from double to float. */
2529       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2530       LOAD_PS(&t[idx],XMM7);
2531 
2532       while (nz--) {
2533         PREFETCH_NTA(&v[16]);
2534         jdx = 4*(*vi++);
2535 /*          jdx = *vi++; */
2536 
2537         /* 4x4 Matrix-Vector product with negative accumulation: */
2538         SSE_INLINE_BEGIN_2(&t[jdx],v)
2539           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2540 
2541           /* First Column */
2542           SSE_COPY_PS(XMM0,XMM6)
2543           SSE_SHUFFLE(XMM0,XMM0,0x00)
2544           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2545           SSE_SUB_PS(XMM7,XMM0)
2546 
2547           /* Second Column */
2548           SSE_COPY_PS(XMM1,XMM6)
2549           SSE_SHUFFLE(XMM1,XMM1,0x55)
2550           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2551           SSE_SUB_PS(XMM7,XMM1)
2552 
2553           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2554 
2555           /* Third Column */
2556           SSE_COPY_PS(XMM2,XMM6)
2557           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2558           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2559           SSE_SUB_PS(XMM7,XMM2)
2560 
2561           /* Fourth Column */
2562           SSE_COPY_PS(XMM3,XMM6)
2563           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2564           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2565           SSE_SUB_PS(XMM7,XMM3)
2566         SSE_INLINE_END_2
2567 
2568         v  += 16;
2569       }
2570       v    =  aa + 16*ai[++i];
2571       PREFETCH_NTA(v);
2572       STORE_PS(&t[idx],XMM7);
2573     }
2574 
2575     /* Backward solve the upper triangular factor.*/
2576 
2577     idt  = 4*(n-1);
2578     ai16 = 16*diag[n-1];
2579     v    = aa + ai16 + 16;
2580     for (i=n-1; i>=0;){
2581       PREFETCH_NTA(&v[8]);
2582       vi = aj + diag[i] + 1;
2583       nz = ai[i+1] - diag[i] - 1;
2584 
2585       LOAD_PS(&t[idt],XMM7);
2586 
2587       while (nz--) {
2588         PREFETCH_NTA(&v[16]);
2589         idx = 4*(*vi++);
2590 /*          idx = *vi++; */
2591 
2592         /* 4x4 Matrix-Vector Product with negative accumulation: */
2593         SSE_INLINE_BEGIN_2(&t[idx],v)
2594           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2595 
2596           /* First Column */
2597           SSE_COPY_PS(XMM0,XMM6)
2598           SSE_SHUFFLE(XMM0,XMM0,0x00)
2599           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2600           SSE_SUB_PS(XMM7,XMM0)
2601 
2602           /* Second Column */
2603           SSE_COPY_PS(XMM1,XMM6)
2604           SSE_SHUFFLE(XMM1,XMM1,0x55)
2605           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2606           SSE_SUB_PS(XMM7,XMM1)
2607 
2608           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2609 
2610           /* Third Column */
2611           SSE_COPY_PS(XMM2,XMM6)
2612           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2613           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2614           SSE_SUB_PS(XMM7,XMM2)
2615 
2616           /* Fourth Column */
2617           SSE_COPY_PS(XMM3,XMM6)
2618           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2619           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2620           SSE_SUB_PS(XMM7,XMM3)
2621         SSE_INLINE_END_2
2622         v  += 16;
2623       }
2624       v    = aa + ai16;
2625       ai16 = 16*diag[--i];
2626       PREFETCH_NTA(aa+ai16+16);
2627       /*
2628          Scale the result by the diagonal 4x4 block,
2629          which was inverted as part of the factorization
2630       */
2631       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2632         /* First Column */
2633         SSE_COPY_PS(XMM0,XMM7)
2634         SSE_SHUFFLE(XMM0,XMM0,0x00)
2635         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2636 
2637         /* Second Column */
2638         SSE_COPY_PS(XMM1,XMM7)
2639         SSE_SHUFFLE(XMM1,XMM1,0x55)
2640         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2641         SSE_ADD_PS(XMM0,XMM1)
2642 
2643         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2644 
2645         /* Third Column */
2646         SSE_COPY_PS(XMM2,XMM7)
2647         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2648         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2649         SSE_ADD_PS(XMM0,XMM2)
2650 
2651         /* Fourth Column */
2652         SSE_COPY_PS(XMM3,XMM7)
2653         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2654         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2655         SSE_ADD_PS(XMM0,XMM3)
2656 
2657         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2658       SSE_INLINE_END_3
2659 
2660       v    = aa + ai16 + 16;
2661       idt -= 4;
2662     }
2663 
2664     /* Convert t from single precision back to double precision (inplace)*/
2665     idt = 4*(n-1);
2666     for (i=n-1;i>=0;i--) {
2667       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2668       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2669       PetscScalar *xtemp=&x[idt];
2670       MatScalar   *ttemp=&t[idt];
2671       xtemp[3] = (PetscScalar)ttemp[3];
2672       xtemp[2] = (PetscScalar)ttemp[2];
2673       xtemp[1] = (PetscScalar)ttemp[1];
2674       xtemp[0] = (PetscScalar)ttemp[0];
2675       idt -= 4;
2676     }
2677 
2678   } /* End of artificial scope. */
2679   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2680   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2681   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2682   SSE_SCOPE_END;
2683   PetscFunctionReturn(0);
2684 }
2685 
2686 #endif
2687 #undef __FUNCT__
2688 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2689 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2690 {
2691   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2692   IS                iscol=a->col,isrow=a->row;
2693   PetscErrorCode    ierr;
2694   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2695   PetscInt          *diag = a->diag;
2696   const MatScalar   *aa=a->a,*v;
2697   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2698   const PetscScalar *b;
2699 
2700   PetscFunctionBegin;
2701   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2702   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2703   t  = a->solve_work;
2704 
2705   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2706   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2707 
2708   /* forward solve the lower triangular */
2709   idx    = 3*(*r++);
2710   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2711   for (i=1; i<n; i++) {
2712     v     = aa + 9*ai[i];
2713     vi    = aj + ai[i];
2714     nz    = diag[i] - ai[i];
2715     idx   = 3*(*r++);
2716     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2717     while (nz--) {
2718       idx   = 3*(*vi++);
2719       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2720       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2721       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2722       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2723       v += 9;
2724     }
2725     idx = 3*i;
2726     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2727   }
2728   /* backward solve the upper triangular */
2729   for (i=n-1; i>=0; i--){
2730     v    = aa + 9*diag[i] + 9;
2731     vi   = aj + diag[i] + 1;
2732     nz   = ai[i+1] - diag[i] - 1;
2733     idt  = 3*i;
2734     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2735     while (nz--) {
2736       idx   = 3*(*vi++);
2737       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2738       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2739       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2740       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2741       v += 9;
2742     }
2743     idc = 3*(*c--);
2744     v   = aa + 9*diag[i];
2745     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2746     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2747     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2748   }
2749   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2750   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2751   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2752   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2753   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
2754   PetscFunctionReturn(0);
2755 }
2756 
2757 /*
2758       Special case where the matrix was ILU(0) factored in the natural
2759    ordering. This eliminates the need for the column and row permutation.
2760 */
2761 #undef __FUNCT__
2762 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2763 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2764 {
2765   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2766   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2767   PetscErrorCode    ierr;
2768   PetscInt          *diag = a->diag;
2769   const MatScalar   *aa=a->a,*v;
2770   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2771   const PetscScalar *b;
2772   PetscInt          jdx,idt,idx,nz,*vi,i;
2773 
2774   PetscFunctionBegin;
2775   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2776   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2777 
2778   /* forward solve the lower triangular */
2779   idx    = 0;
2780   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2781   for (i=1; i<n; i++) {
2782     v     =  aa      + 9*ai[i];
2783     vi    =  aj      + ai[i];
2784     nz    =  diag[i] - ai[i];
2785     idx   +=  3;
2786     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2787     while (nz--) {
2788       jdx   = 3*(*vi++);
2789       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2790       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2791       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2792       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2793       v    += 9;
2794     }
2795     x[idx]   = s1;
2796     x[1+idx] = s2;
2797     x[2+idx] = s3;
2798   }
2799   /* backward solve the upper triangular */
2800   for (i=n-1; i>=0; i--){
2801     v    = aa + 9*diag[i] + 9;
2802     vi   = aj + diag[i] + 1;
2803     nz   = ai[i+1] - diag[i] - 1;
2804     idt  = 3*i;
2805     s1 = x[idt];  s2 = x[1+idt];
2806     s3 = x[2+idt];
2807     while (nz--) {
2808       idx   = 3*(*vi++);
2809       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2810       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2811       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2812       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2813       v    += 9;
2814     }
2815     v        = aa +  9*diag[i];
2816     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2817     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2818     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2819   }
2820 
2821   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2822   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2823   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
2824   PetscFunctionReturn(0);
2825 }
2826 
2827 #undef __FUNCT__
2828 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2829 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2830 {
2831   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2832   IS                iscol=a->col,isrow=a->row;
2833   PetscErrorCode    ierr;
2834   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2835   PetscInt          *diag = a->diag;
2836   const MatScalar   *aa=a->a,*v;
2837   PetscScalar       *x,s1,s2,x1,x2,*t;
2838   const PetscScalar *b;
2839 
2840   PetscFunctionBegin;
2841   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2842   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2843   t  = a->solve_work;
2844 
2845   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2846   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2847 
2848   /* forward solve the lower triangular */
2849   idx    = 2*(*r++);
2850   t[0] = b[idx]; t[1] = b[1+idx];
2851   for (i=1; i<n; i++) {
2852     v     = aa + 4*ai[i];
2853     vi    = aj + ai[i];
2854     nz    = diag[i] - ai[i];
2855     idx   = 2*(*r++);
2856     s1  = b[idx]; s2 = b[1+idx];
2857     while (nz--) {
2858       idx   = 2*(*vi++);
2859       x1    = t[idx]; x2 = t[1+idx];
2860       s1 -= v[0]*x1 + v[2]*x2;
2861       s2 -= v[1]*x1 + v[3]*x2;
2862       v += 4;
2863     }
2864     idx = 2*i;
2865     t[idx] = s1; t[1+idx] = s2;
2866   }
2867   /* backward solve the upper triangular */
2868   for (i=n-1; i>=0; i--){
2869     v    = aa + 4*diag[i] + 4;
2870     vi   = aj + diag[i] + 1;
2871     nz   = ai[i+1] - diag[i] - 1;
2872     idt  = 2*i;
2873     s1 = t[idt]; s2 = t[1+idt];
2874     while (nz--) {
2875       idx   = 2*(*vi++);
2876       x1    = t[idx]; x2 = t[1+idx];
2877       s1 -= v[0]*x1 + v[2]*x2;
2878       s2 -= v[1]*x1 + v[3]*x2;
2879       v += 4;
2880     }
2881     idc = 2*(*c--);
2882     v   = aa + 4*diag[i];
2883     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2884     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2885   }
2886   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2887   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2888   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2889   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2890   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
2891   PetscFunctionReturn(0);
2892 }
2893 
2894 /*
2895       Special case where the matrix was ILU(0) factored in the natural
2896    ordering. This eliminates the need for the column and row permutation.
2897 */
2898 #undef __FUNCT__
2899 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2900 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2901 {
2902   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2903   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2904   PetscErrorCode    ierr;
2905   PetscInt          *diag = a->diag;
2906   const MatScalar   *aa=a->a,*v;
2907   PetscScalar       *x,s1,s2,x1,x2;
2908   const PetscScalar *b;
2909   PetscInt          jdx,idt,idx,nz,*vi,i;
2910 
2911   PetscFunctionBegin;
2912   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2914 
2915   /* forward solve the lower triangular */
2916   idx    = 0;
2917   x[0]   = b[0]; x[1] = b[1];
2918   for (i=1; i<n; i++) {
2919     v     =  aa      + 4*ai[i];
2920     vi    =  aj      + ai[i];
2921     nz    =  diag[i] - ai[i];
2922     idx   +=  2;
2923     s1  =  b[idx];s2 = b[1+idx];
2924     while (nz--) {
2925       jdx   = 2*(*vi++);
2926       x1    = x[jdx];x2 = x[1+jdx];
2927       s1 -= v[0]*x1 + v[2]*x2;
2928       s2 -= v[1]*x1 + v[3]*x2;
2929       v    += 4;
2930     }
2931     x[idx]   = s1;
2932     x[1+idx] = s2;
2933   }
2934   /* backward solve the upper triangular */
2935   for (i=n-1; i>=0; i--){
2936     v    = aa + 4*diag[i] + 4;
2937     vi   = aj + diag[i] + 1;
2938     nz   = ai[i+1] - diag[i] - 1;
2939     idt  = 2*i;
2940     s1 = x[idt];  s2 = x[1+idt];
2941     while (nz--) {
2942       idx   = 2*(*vi++);
2943       x1    = x[idx];   x2 = x[1+idx];
2944       s1 -= v[0]*x1 + v[2]*x2;
2945       s2 -= v[1]*x1 + v[3]*x2;
2946       v    += 4;
2947     }
2948     v        = aa +  4*diag[i];
2949     x[idt]   = v[0]*s1 + v[2]*s2;
2950     x[1+idt] = v[1]*s1 + v[3]*s2;
2951   }
2952 
2953   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2954   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2955   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
2956   PetscFunctionReturn(0);
2957 }
2958 
2959 #undef __FUNCT__
2960 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2961 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2962 {
2963   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2964   IS             iscol=a->col,isrow=a->row;
2965   PetscErrorCode ierr;
2966   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2967   PetscInt       *diag = a->diag;
2968   MatScalar      *aa=a->a,*v;
2969   PetscScalar    *x,*b,s1,*t;
2970 
2971   PetscFunctionBegin;
2972   if (!n) PetscFunctionReturn(0);
2973 
2974   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2975   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2976   t  = a->solve_work;
2977 
2978   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2979   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2980 
2981   /* forward solve the lower triangular */
2982   t[0] = b[*r++];
2983   for (i=1; i<n; i++) {
2984     v     = aa + ai[i];
2985     vi    = aj + ai[i];
2986     nz    = diag[i] - ai[i];
2987     s1  = b[*r++];
2988     while (nz--) {
2989       s1 -= (*v++)*t[*vi++];
2990     }
2991     t[i] = s1;
2992   }
2993   /* backward solve the upper triangular */
2994   for (i=n-1; i>=0; i--){
2995     v    = aa + diag[i] + 1;
2996     vi   = aj + diag[i] + 1;
2997     nz   = ai[i+1] - diag[i] - 1;
2998     s1 = t[i];
2999     while (nz--) {
3000       s1 -= (*v++)*t[*vi++];
3001     }
3002     x[*c--] = t[i] = aa[diag[i]]*s1;
3003   }
3004 
3005   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3006   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3007   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3008   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3009   ierr = PetscLogFlops(2*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3010   PetscFunctionReturn(0);
3011 }
3012 /*
3013       Special case where the matrix was ILU(0) factored in the natural
3014    ordering. This eliminates the need for the column and row permutation.
3015 */
3016 #undef __FUNCT__
3017 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3018 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3019 {
3020   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3021   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3022   PetscErrorCode ierr;
3023   PetscInt       *diag = a->diag;
3024   MatScalar      *aa=a->a;
3025   PetscScalar    *x,*b;
3026   PetscScalar    s1,x1;
3027   MatScalar      *v;
3028   PetscInt       jdx,idt,idx,nz,*vi,i;
3029 
3030   PetscFunctionBegin;
3031   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3032   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3033 
3034   /* forward solve the lower triangular */
3035   idx    = 0;
3036   x[0]   = b[0];
3037   for (i=1; i<n; i++) {
3038     v     =  aa      + ai[i];
3039     vi    =  aj      + ai[i];
3040     nz    =  diag[i] - ai[i];
3041     idx   +=  1;
3042     s1  =  b[idx];
3043     while (nz--) {
3044       jdx   = *vi++;
3045       x1    = x[jdx];
3046       s1 -= v[0]*x1;
3047       v    += 1;
3048     }
3049     x[idx]   = s1;
3050   }
3051   /* backward solve the upper triangular */
3052   for (i=n-1; i>=0; i--){
3053     v    = aa + diag[i] + 1;
3054     vi   = aj + diag[i] + 1;
3055     nz   = ai[i+1] - diag[i] - 1;
3056     idt  = i;
3057     s1 = x[idt];
3058     while (nz--) {
3059       idx   = *vi++;
3060       x1    = x[idx];
3061       s1 -= v[0]*x1;
3062       v    += 1;
3063     }
3064     v        = aa +  diag[i];
3065     x[idt]   = v[0]*s1;
3066   }
3067   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3068   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3069   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3070   PetscFunctionReturn(0);
3071 }
3072 
3073 /* ----------------------------------------------------------------*/
3074 /*
3075      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3076    except that the data structure of Mat_SeqAIJ is slightly different.
3077    Not a good example of code reuse.
3078 */
3079 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,MatDuplicateOption,Mat*);
3080 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3081 
3082 #undef __FUNCT__
3083 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3084 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3085 {
3086   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3087   IS             isicol;
3088   PetscErrorCode ierr;
3089   PetscInt       *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3090   PetscInt       *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3091   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3092   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3093   PetscTruth     col_identity,row_identity,flg;
3094   PetscReal      f;
3095 
3096   PetscFunctionBegin;
3097   f             = info->fill;
3098   levels        = (PetscInt)info->levels;
3099   diagonal_fill = (PetscInt)info->diagonal_fill;
3100   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3101   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3102   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3103 
3104   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3105     ierr = MatDuplicateNoCreate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3106     (*fact)->factor = MAT_FACTOR_LU;
3107     b               = (Mat_SeqBAIJ*)(*fact)->data;
3108     ierr = MatMissingDiagonal_SeqBAIJ(*fact,&flg,&dd);CHKERRQ(ierr);
3109     if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd);
3110     b->row        = isrow;
3111     b->col        = iscol;
3112     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3113     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3114     b->icol       = isicol;
3115     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3116     ierr          = PetscMalloc(((*fact)->rmap->N+1+(*fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3117   } else { /* general case perform the symbolic factorization */
3118     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3119     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3120 
3121     /* get new row pointers */
3122     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
3123     ainew[0] = 0;
3124     /* don't know how many column pointers are needed so estimate */
3125     jmax = (PetscInt)(f*ai[n] + 1);
3126     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
3127     /* ajfill is level of fill for each fill entry */
3128     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
3129     /* fill is a linked list of nonzeros in active row */
3130     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
3131     /* im is level for each filled value */
3132     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
3133     /* dloc is location of diagonal in factor */
3134     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
3135     dloc[0]  = 0;
3136     for (prow=0; prow<n; prow++) {
3137 
3138       /* copy prow into linked list */
3139       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3140       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3141       xi         = aj + ai[r[prow]];
3142       fill[n]    = n;
3143       fill[prow] = -1; /* marker for diagonal entry */
3144       while (nz--) {
3145 	fm  = n;
3146 	idx = ic[*xi++];
3147 	do {
3148 	  m  = fm;
3149 	  fm = fill[m];
3150 	} while (fm < idx);
3151 	fill[m]   = idx;
3152 	fill[idx] = fm;
3153 	im[idx]   = 0;
3154       }
3155 
3156       /* make sure diagonal entry is included */
3157       if (diagonal_fill && fill[prow] == -1) {
3158 	fm = n;
3159 	while (fill[fm] < prow) fm = fill[fm];
3160 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3161 	fill[fm]   = prow;
3162 	im[prow]   = 0;
3163 	nzf++;
3164 	dcount++;
3165       }
3166 
3167       nzi = 0;
3168       row = fill[n];
3169       while (row < prow) {
3170 	incrlev = im[row] + 1;
3171 	nz      = dloc[row];
3172 	xi      = ajnew  + ainew[row] + nz + 1;
3173 	flev    = ajfill + ainew[row] + nz + 1;
3174 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3175 	fm      = row;
3176 	while (nnz-- > 0) {
3177 	  idx = *xi++;
3178 	  if (*flev + incrlev > levels) {
3179 	    flev++;
3180 	    continue;
3181 	  }
3182 	  do {
3183 	    m  = fm;
3184 	    fm = fill[m];
3185 	  } while (fm < idx);
3186 	  if (fm != idx) {
3187 	    im[idx]   = *flev + incrlev;
3188 	    fill[m]   = idx;
3189 	    fill[idx] = fm;
3190 	    fm        = idx;
3191 	    nzf++;
3192 	  } else {
3193 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3194 	  }
3195 	  flev++;
3196 	}
3197 	row = fill[row];
3198 	nzi++;
3199       }
3200       /* copy new filled row into permanent storage */
3201       ainew[prow+1] = ainew[prow] + nzf;
3202       if (ainew[prow+1] > jmax) {
3203 
3204 	/* estimate how much additional space we will need */
3205 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3206 	/* just double the memory each time */
3207 	PetscInt maxadd = jmax;
3208 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3209 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3210 	jmax += maxadd;
3211 
3212 	/* allocate a longer ajnew and ajfill */
3213 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr);
3214 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3215 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3216 	ajnew = xi;
3217 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr);
3218 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3219 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3220 	ajfill = xi;
3221 	reallocate++; /* count how many reallocations are needed */
3222       }
3223       xi          = ajnew + ainew[prow];
3224       flev        = ajfill + ainew[prow];
3225       dloc[prow]  = nzi;
3226       fm          = fill[n];
3227       while (nzf--) {
3228 	*xi++   = fm;
3229 	*flev++ = im[fm];
3230 	fm      = fill[fm];
3231       }
3232       /* make sure row has diagonal entry */
3233       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3234 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3235     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3236       }
3237     }
3238     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3239     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3240     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3241     ierr = PetscFree(fill);CHKERRQ(ierr);
3242     ierr = PetscFree(im);CHKERRQ(ierr);
3243 
3244 #if defined(PETSC_USE_INFO)
3245     {
3246       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3247       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3248       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3249       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3250       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3251       if (diagonal_fill) {
3252 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3253       }
3254     }
3255 #endif
3256 
3257     /* put together the new matrix */
3258     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3259     ierr = PetscLogObjectParent(*fact,isicol);CHKERRQ(ierr);
3260     b    = (Mat_SeqBAIJ*)(*fact)->data;
3261     b->free_a       = PETSC_TRUE;
3262     b->free_ij      = PETSC_TRUE;
3263     b->singlemalloc = PETSC_FALSE;
3264     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
3265     b->j          = ajnew;
3266     b->i          = ainew;
3267     for (i=0; i<n; i++) dloc[i] += ainew[i];
3268     b->diag       = dloc;
3269     b->ilen       = 0;
3270     b->imax       = 0;
3271     b->row        = isrow;
3272     b->col        = iscol;
3273     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3274     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3275     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3276     b->icol       = isicol;
3277     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3278     /* In b structure:  Free imax, ilen, old a, old j.
3279        Allocate dloc, solve_work, new a, new j */
3280     ierr = PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
3281     b->maxnz          = b->nz = ainew[n];
3282     (*fact)->factor   = MAT_FACTOR_LU;
3283 
3284     (*fact)->info.factor_mallocs    = reallocate;
3285     (*fact)->info.fill_ratio_given  = f;
3286     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3287   }
3288   ierr = MatSeqBAIJSetNumericFactorization(*fact,(PetscTruth)(row_identity && col_identity));CHKERRQ(ierr);
3289   PetscFunctionReturn(0);
3290 }
3291 
3292 #undef __FUNCT__
3293 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3294 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3295 {
3296   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3297   /* int i,*AJ=a->j,nz=a->nz; */
3298   PetscFunctionBegin;
3299   /* Undo Column scaling */
3300 /*    while (nz--) { */
3301 /*      AJ[i] = AJ[i]/4; */
3302 /*    } */
3303   /* This should really invoke a push/pop logic, but we don't have that yet. */
3304   A->ops->setunfactored = PETSC_NULL;
3305   PetscFunctionReturn(0);
3306 }
3307 
3308 #undef __FUNCT__
3309 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3310 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3311 {
3312   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3313   PetscInt       *AJ=a->j,nz=a->nz;
3314   unsigned short *aj=(unsigned short *)AJ;
3315   PetscFunctionBegin;
3316   /* Is this really necessary? */
3317   while (nz--) {
3318     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3319   }
3320   A->ops->setunfactored = PETSC_NULL;
3321   PetscFunctionReturn(0);
3322 }
3323 
3324 
3325