xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 09f3b4e5628a00a1eaf17d80982cfbcc515cc9c1)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16   PetscErrorCode ierr;
17   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18   PetscInt       *diag = a->diag;
19   MatScalar      *aa=a->a,*v;
20   PetscScalar    s1,*x,*b;
21 
22   PetscFunctionBegin;
23   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
24   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26 
27   /* forward solve the U^T */
28   for (i=0; i<n; i++) {
29 
30     v     = aa + diag[i];
31     /* multiply by the inverse of the block diagonal */
32     s1    = (*v++)*x[i];
33     vi    = aj + diag[i] + 1;
34     nz    = ai[i+1] - diag[i] - 1;
35     while (nz--) {
36       x[*vi++]  -= (*v++)*s1;
37     }
38     x[i]   = s1;
39   }
40   /* backward solve the L^T */
41   for (i=n-1; i>=0; i--){
42     v    = aa + diag[i] - 1;
43     vi   = aj + diag[i] - 1;
44     nz   = diag[i] - ai[i];
45     s1   = x[i];
46     while (nz--) {
47       x[*vi--]   -=  (*v--)*s1;
48     }
49   }
50   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
51   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52   ierr = PetscLogFlops(2*(a->nz) - A->n);CHKERRQ(ierr);
53   PetscFunctionReturn(0);
54 }
55 
56 #undef __FUNCT__
57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59 {
60   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61   PetscErrorCode ierr;
62   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63   PetscInt       *diag = a->diag,oidx;
64   MatScalar      *aa=a->a,*v;
65   PetscScalar    s1,s2,x1,x2;
66   PetscScalar    *x,*b;
67 
68   PetscFunctionBegin;
69   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
70   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
71   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72 
73   /* forward solve the U^T */
74   idx = 0;
75   for (i=0; i<n; i++) {
76 
77     v     = aa + 4*diag[i];
78     /* multiply by the inverse of the block diagonal */
79     x1 = x[idx];   x2 = x[1+idx];
80     s1 = v[0]*x1  +  v[1]*x2;
81     s2 = v[2]*x1  +  v[3]*x2;
82     v += 4;
83 
84     vi    = aj + diag[i] + 1;
85     nz    = ai[i+1] - diag[i] - 1;
86     while (nz--) {
87       oidx = 2*(*vi++);
88       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90       v  += 4;
91     }
92     x[idx]   = s1;x[1+idx] = s2;
93     idx += 2;
94   }
95   /* backward solve the L^T */
96   for (i=n-1; i>=0; i--){
97     v    = aa + 4*diag[i] - 4;
98     vi   = aj + diag[i] - 1;
99     nz   = diag[i] - ai[i];
100     idt  = 2*i;
101     s1   = x[idt];  s2 = x[1+idt];
102     while (nz--) {
103       idx   = 2*(*vi--);
104       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106       v -= 4;
107     }
108   }
109   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
110   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->n);CHKERRQ(ierr);
112   PetscFunctionReturn(0);
113 }
114 
115 #undef __FUNCT__
116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118 {
119   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120   PetscErrorCode ierr;
121   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122   PetscInt       *diag = a->diag,oidx;
123   MatScalar      *aa=a->a,*v;
124   PetscScalar    s1,s2,s3,x1,x2,x3;
125   PetscScalar    *x,*b;
126 
127   PetscFunctionBegin;
128   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
129   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
130   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131 
132   /* forward solve the U^T */
133   idx = 0;
134   for (i=0; i<n; i++) {
135 
136     v     = aa + 9*diag[i];
137     /* multiply by the inverse of the block diagonal */
138     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142     v += 9;
143 
144     vi    = aj + diag[i] + 1;
145     nz    = ai[i+1] - diag[i] - 1;
146     while (nz--) {
147       oidx = 3*(*vi++);
148       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151       v  += 9;
152     }
153     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154     idx += 3;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + 9*diag[i] - 9;
159     vi   = aj + diag[i] - 1;
160     nz   = diag[i] - ai[i];
161     idt  = 3*i;
162     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163     while (nz--) {
164       idx   = 3*(*vi--);
165       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168       v -= 9;
169     }
170   }
171   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
172   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->n);CHKERRQ(ierr);
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180 {
181   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182   PetscErrorCode ierr;
183   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184   PetscInt       *diag = a->diag,oidx;
185   MatScalar      *aa=a->a,*v;
186   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
187   PetscScalar    *x,*b;
188 
189   PetscFunctionBegin;
190   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
191   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
192   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193 
194   /* forward solve the U^T */
195   idx = 0;
196   for (i=0; i<n; i++) {
197 
198     v     = aa + 16*diag[i];
199     /* multiply by the inverse of the block diagonal */
200     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205     v += 16;
206 
207     vi    = aj + diag[i] + 1;
208     nz    = ai[i+1] - diag[i] - 1;
209     while (nz--) {
210       oidx = 4*(*vi++);
211       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215       v  += 16;
216     }
217     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218     idx += 4;
219   }
220   /* backward solve the L^T */
221   for (i=n-1; i>=0; i--){
222     v    = aa + 16*diag[i] - 16;
223     vi   = aj + diag[i] - 1;
224     nz   = diag[i] - ai[i];
225     idt  = 4*i;
226     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227     while (nz--) {
228       idx   = 4*(*vi--);
229       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233       v -= 16;
234     }
235   }
236   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
237   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 #undef __FUNCT__
243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245 {
246   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247   PetscErrorCode ierr;
248   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249   PetscInt       *diag = a->diag,oidx;
250   MatScalar      *aa=a->a,*v;
251   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252   PetscScalar    *x,*b;
253 
254   PetscFunctionBegin;
255   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258 
259   /* forward solve the U^T */
260   idx = 0;
261   for (i=0; i<n; i++) {
262 
263     v     = aa + 25*diag[i];
264     /* multiply by the inverse of the block diagonal */
265     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271     v += 25;
272 
273     vi    = aj + diag[i] + 1;
274     nz    = ai[i+1] - diag[i] - 1;
275     while (nz--) {
276       oidx = 5*(*vi++);
277       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282       v  += 25;
283     }
284     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285     idx += 5;
286   }
287   /* backward solve the L^T */
288   for (i=n-1; i>=0; i--){
289     v    = aa + 25*diag[i] - 25;
290     vi   = aj + diag[i] - 1;
291     nz   = diag[i] - ai[i];
292     idt  = 5*i;
293     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294     while (nz--) {
295       idx   = 5*(*vi--);
296       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301       v -= 25;
302     }
303   }
304   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
305   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->n);CHKERRQ(ierr);
307   PetscFunctionReturn(0);
308 }
309 
310 #undef __FUNCT__
311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313 {
314   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315   PetscErrorCode ierr;
316   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317   PetscInt       *diag = a->diag,oidx;
318   MatScalar      *aa=a->a,*v;
319   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320   PetscScalar    *x,*b;
321 
322   PetscFunctionBegin;
323   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
324   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
325   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326 
327   /* forward solve the U^T */
328   idx = 0;
329   for (i=0; i<n; i++) {
330 
331     v     = aa + 36*diag[i];
332     /* multiply by the inverse of the block diagonal */
333     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334     x6    = x[5+idx];
335     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341     v += 36;
342 
343     vi    = aj + diag[i] + 1;
344     nz    = ai[i+1] - diag[i] - 1;
345     while (nz--) {
346       oidx = 6*(*vi++);
347       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353       v  += 36;
354     }
355     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356     x[5+idx] = s6;
357     idx += 6;
358   }
359   /* backward solve the L^T */
360   for (i=n-1; i>=0; i--){
361     v    = aa + 36*diag[i] - 36;
362     vi   = aj + diag[i] - 1;
363     nz   = diag[i] - ai[i];
364     idt  = 6*i;
365     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366     s6 = x[5+idt];
367     while (nz--) {
368       idx   = 6*(*vi--);
369       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375       v -= 36;
376     }
377   }
378   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
379   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr);
381   PetscFunctionReturn(0);
382 }
383 
384 #undef __FUNCT__
385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387 {
388   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389   PetscErrorCode ierr;
390   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391   PetscInt       *diag = a->diag,oidx;
392   MatScalar      *aa=a->a,*v;
393   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394   PetscScalar    *x,*b;
395 
396   PetscFunctionBegin;
397   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
398   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
399   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400 
401   /* forward solve the U^T */
402   idx = 0;
403   for (i=0; i<n; i++) {
404 
405     v     = aa + 49*diag[i];
406     /* multiply by the inverse of the block diagonal */
407     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408     x6    = x[5+idx]; x7 = x[6+idx];
409     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416     v += 49;
417 
418     vi    = aj + diag[i] + 1;
419     nz    = ai[i+1] - diag[i] - 1;
420     while (nz--) {
421       oidx = 7*(*vi++);
422       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429       v  += 49;
430     }
431     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432     x[5+idx] = s6;x[6+idx] = s7;
433     idx += 7;
434   }
435   /* backward solve the L^T */
436   for (i=n-1; i>=0; i--){
437     v    = aa + 49*diag[i] - 49;
438     vi   = aj + diag[i] - 1;
439     nz   = diag[i] - ai[i];
440     idt  = 7*i;
441     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442     s6 = x[5+idt];s7 = x[6+idt];
443     while (nz--) {
444       idx   = 7*(*vi--);
445       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452       v -= 49;
453     }
454   }
455   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->n);CHKERRQ(ierr);
458   PetscFunctionReturn(0);
459 }
460 
461 /*---------------------------------------------------------------------------------------------*/
462 #undef __FUNCT__
463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465 {
466   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467   IS             iscol=a->col,isrow=a->row;
468   PetscErrorCode ierr;
469   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
470   PetscInt       *diag = a->diag;
471   MatScalar      *aa=a->a,*v;
472   PetscScalar    s1,*x,*b,*t;
473 
474   PetscFunctionBegin;
475   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
476   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
477   t  = a->solve_work;
478 
479   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
480   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
481 
482   /* copy the b into temp work space according to permutation */
483   for (i=0; i<n; i++) {
484     t[i] = b[c[i]];
485   }
486 
487   /* forward solve the U^T */
488   for (i=0; i<n; i++) {
489 
490     v     = aa + diag[i];
491     /* multiply by the inverse of the block diagonal */
492     s1    = (*v++)*t[i];
493     vi    = aj + diag[i] + 1;
494     nz    = ai[i+1] - diag[i] - 1;
495     while (nz--) {
496       t[*vi++]  -= (*v++)*s1;
497     }
498     t[i]   = s1;
499   }
500   /* backward solve the L^T */
501   for (i=n-1; i>=0; i--){
502     v    = aa + diag[i] - 1;
503     vi   = aj + diag[i] - 1;
504     nz   = diag[i] - ai[i];
505     s1   = t[i];
506     while (nz--) {
507       t[*vi--]   -=  (*v--)*s1;
508     }
509   }
510 
511   /* copy t into x according to permutation */
512   for (i=0; i<n; i++) {
513     x[r[i]]   = t[i];
514   }
515 
516   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
517   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
518   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
519   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
520   ierr = PetscLogFlops(2*(a->nz) - A->n);CHKERRQ(ierr);
521   PetscFunctionReturn(0);
522 }
523 
524 #undef __FUNCT__
525 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
526 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527 {
528   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
529   IS             iscol=a->col,isrow=a->row;
530   PetscErrorCode ierr;
531   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
532   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
533   MatScalar      *aa=a->a,*v;
534   PetscScalar    s1,s2,x1,x2;
535   PetscScalar    *x,*b,*t;
536 
537   PetscFunctionBegin;
538   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
539   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
540   t  = a->solve_work;
541 
542   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
543   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
544 
545   /* copy the b into temp work space according to permutation */
546   ii = 0;
547   for (i=0; i<n; i++) {
548     ic      = 2*c[i];
549     t[ii]   = b[ic];
550     t[ii+1] = b[ic+1];
551     ii += 2;
552   }
553 
554   /* forward solve the U^T */
555   idx = 0;
556   for (i=0; i<n; i++) {
557 
558     v     = aa + 4*diag[i];
559     /* multiply by the inverse of the block diagonal */
560     x1    = t[idx];   x2 = t[1+idx];
561     s1 = v[0]*x1  +  v[1]*x2;
562     s2 = v[2]*x1  +  v[3]*x2;
563     v += 4;
564 
565     vi    = aj + diag[i] + 1;
566     nz    = ai[i+1] - diag[i] - 1;
567     while (nz--) {
568       oidx = 2*(*vi++);
569       t[oidx]   -= v[0]*s1  +  v[1]*s2;
570       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
571       v  += 4;
572     }
573     t[idx]   = s1;t[1+idx] = s2;
574     idx += 2;
575   }
576   /* backward solve the L^T */
577   for (i=n-1; i>=0; i--){
578     v    = aa + 4*diag[i] - 4;
579     vi   = aj + diag[i] - 1;
580     nz   = diag[i] - ai[i];
581     idt  = 2*i;
582     s1 = t[idt];  s2 = t[1+idt];
583     while (nz--) {
584       idx   = 2*(*vi--);
585       t[idx]   -=  v[0]*s1 +  v[1]*s2;
586       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
587       v -= 4;
588     }
589   }
590 
591   /* copy t into x according to permutation */
592   ii = 0;
593   for (i=0; i<n; i++) {
594     ir      = 2*r[i];
595     x[ir]   = t[ii];
596     x[ir+1] = t[ii+1];
597     ii += 2;
598   }
599 
600   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
601   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
602   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
603   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
604   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->n);CHKERRQ(ierr);
605   PetscFunctionReturn(0);
606 }
607 
608 #undef __FUNCT__
609 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
610 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
611 {
612   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
613   IS             iscol=a->col,isrow=a->row;
614   PetscErrorCode ierr;
615   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
616   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
617   MatScalar      *aa=a->a,*v;
618   PetscScalar    s1,s2,s3,x1,x2,x3;
619   PetscScalar    *x,*b,*t;
620 
621   PetscFunctionBegin;
622   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
623   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
624   t  = a->solve_work;
625 
626   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
627   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
628 
629   /* copy the b into temp work space according to permutation */
630   ii = 0;
631   for (i=0; i<n; i++) {
632     ic      = 3*c[i];
633     t[ii]   = b[ic];
634     t[ii+1] = b[ic+1];
635     t[ii+2] = b[ic+2];
636     ii += 3;
637   }
638 
639   /* forward solve the U^T */
640   idx = 0;
641   for (i=0; i<n; i++) {
642 
643     v     = aa + 9*diag[i];
644     /* multiply by the inverse of the block diagonal */
645     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
646     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
647     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
648     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
649     v += 9;
650 
651     vi    = aj + diag[i] + 1;
652     nz    = ai[i+1] - diag[i] - 1;
653     while (nz--) {
654       oidx = 3*(*vi++);
655       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
656       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
657       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658       v  += 9;
659     }
660     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
661     idx += 3;
662   }
663   /* backward solve the L^T */
664   for (i=n-1; i>=0; i--){
665     v    = aa + 9*diag[i] - 9;
666     vi   = aj + diag[i] - 1;
667     nz   = diag[i] - ai[i];
668     idt  = 3*i;
669     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
670     while (nz--) {
671       idx   = 3*(*vi--);
672       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
673       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
674       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
675       v -= 9;
676     }
677   }
678 
679   /* copy t into x according to permutation */
680   ii = 0;
681   for (i=0; i<n; i++) {
682     ir      = 3*r[i];
683     x[ir]   = t[ii];
684     x[ir+1] = t[ii+1];
685     x[ir+2] = t[ii+2];
686     ii += 3;
687   }
688 
689   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
690   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
691   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
692   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
693   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->n);CHKERRQ(ierr);
694   PetscFunctionReturn(0);
695 }
696 
697 #undef __FUNCT__
698 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
699 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
700 {
701   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
702   IS             iscol=a->col,isrow=a->row;
703   PetscErrorCode ierr;
704   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
705   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
706   MatScalar      *aa=a->a,*v;
707   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
708   PetscScalar    *x,*b,*t;
709 
710   PetscFunctionBegin;
711   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
712   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
713   t  = a->solve_work;
714 
715   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
716   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
717 
718   /* copy the b into temp work space according to permutation */
719   ii = 0;
720   for (i=0; i<n; i++) {
721     ic      = 4*c[i];
722     t[ii]   = b[ic];
723     t[ii+1] = b[ic+1];
724     t[ii+2] = b[ic+2];
725     t[ii+3] = b[ic+3];
726     ii += 4;
727   }
728 
729   /* forward solve the U^T */
730   idx = 0;
731   for (i=0; i<n; i++) {
732 
733     v     = aa + 16*diag[i];
734     /* multiply by the inverse of the block diagonal */
735     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
736     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
737     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
738     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
739     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
740     v += 16;
741 
742     vi    = aj + diag[i] + 1;
743     nz    = ai[i+1] - diag[i] - 1;
744     while (nz--) {
745       oidx = 4*(*vi++);
746       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750       v  += 16;
751     }
752     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
753     idx += 4;
754   }
755   /* backward solve the L^T */
756   for (i=n-1; i>=0; i--){
757     v    = aa + 16*diag[i] - 16;
758     vi   = aj + diag[i] - 1;
759     nz   = diag[i] - ai[i];
760     idt  = 4*i;
761     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
762     while (nz--) {
763       idx   = 4*(*vi--);
764       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
765       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
766       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
767       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
768       v -= 16;
769     }
770   }
771 
772   /* copy t into x according to permutation */
773   ii = 0;
774   for (i=0; i<n; i++) {
775     ir      = 4*r[i];
776     x[ir]   = t[ii];
777     x[ir+1] = t[ii+1];
778     x[ir+2] = t[ii+2];
779     x[ir+3] = t[ii+3];
780     ii += 4;
781   }
782 
783   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
784   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
785   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
786   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
787   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr);
788   PetscFunctionReturn(0);
789 }
790 
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796   IS             iscol=a->col,isrow=a->row;
797   PetscErrorCode ierr;
798   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
799   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
800   MatScalar      *aa=a->a,*v;
801   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
802   PetscScalar    *x,*b,*t;
803 
804   PetscFunctionBegin;
805   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
806   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
807   t  = a->solve_work;
808 
809   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
810   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
811 
812   /* copy the b into temp work space according to permutation */
813   ii = 0;
814   for (i=0; i<n; i++) {
815     ic      = 5*c[i];
816     t[ii]   = b[ic];
817     t[ii+1] = b[ic+1];
818     t[ii+2] = b[ic+2];
819     t[ii+3] = b[ic+3];
820     t[ii+4] = b[ic+4];
821     ii += 5;
822   }
823 
824   /* forward solve the U^T */
825   idx = 0;
826   for (i=0; i<n; i++) {
827 
828     v     = aa + 25*diag[i];
829     /* multiply by the inverse of the block diagonal */
830     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
831     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
832     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
833     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
834     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
835     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
836     v += 25;
837 
838     vi    = aj + diag[i] + 1;
839     nz    = ai[i+1] - diag[i] - 1;
840     while (nz--) {
841       oidx = 5*(*vi++);
842       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847       v  += 25;
848     }
849     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
850     idx += 5;
851   }
852   /* backward solve the L^T */
853   for (i=n-1; i>=0; i--){
854     v    = aa + 25*diag[i] - 25;
855     vi   = aj + diag[i] - 1;
856     nz   = diag[i] - ai[i];
857     idt  = 5*i;
858     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
859     while (nz--) {
860       idx   = 5*(*vi--);
861       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
862       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
863       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
864       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
865       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
866       v -= 25;
867     }
868   }
869 
870   /* copy t into x according to permutation */
871   ii = 0;
872   for (i=0; i<n; i++) {
873     ir      = 5*r[i];
874     x[ir]   = t[ii];
875     x[ir+1] = t[ii+1];
876     x[ir+2] = t[ii+2];
877     x[ir+3] = t[ii+3];
878     x[ir+4] = t[ii+4];
879     ii += 5;
880   }
881 
882   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
883   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
884   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
885   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
886   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->n);CHKERRQ(ierr);
887   PetscFunctionReturn(0);
888 }
889 
890 #undef __FUNCT__
891 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
892 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
893 {
894   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
895   IS             iscol=a->col,isrow=a->row;
896   PetscErrorCode ierr;
897   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
898   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
899   MatScalar      *aa=a->a,*v;
900   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
901   PetscScalar    *x,*b,*t;
902 
903   PetscFunctionBegin;
904   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
905   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
906   t  = a->solve_work;
907 
908   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
909   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
910 
911   /* copy the b into temp work space according to permutation */
912   ii = 0;
913   for (i=0; i<n; i++) {
914     ic      = 6*c[i];
915     t[ii]   = b[ic];
916     t[ii+1] = b[ic+1];
917     t[ii+2] = b[ic+2];
918     t[ii+3] = b[ic+3];
919     t[ii+4] = b[ic+4];
920     t[ii+5] = b[ic+5];
921     ii += 6;
922   }
923 
924   /* forward solve the U^T */
925   idx = 0;
926   for (i=0; i<n; i++) {
927 
928     v     = aa + 36*diag[i];
929     /* multiply by the inverse of the block diagonal */
930     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
931     x6    = t[5+idx];
932     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
933     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
934     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
935     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
936     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
937     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
938     v += 36;
939 
940     vi    = aj + diag[i] + 1;
941     nz    = ai[i+1] - diag[i] - 1;
942     while (nz--) {
943       oidx = 6*(*vi++);
944       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
945       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
946       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
947       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
948       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
949       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
950       v  += 36;
951     }
952     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
953     t[5+idx] = s6;
954     idx += 6;
955   }
956   /* backward solve the L^T */
957   for (i=n-1; i>=0; i--){
958     v    = aa + 36*diag[i] - 36;
959     vi   = aj + diag[i] - 1;
960     nz   = diag[i] - ai[i];
961     idt  = 6*i;
962     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
963     s6 = t[5+idt];
964     while (nz--) {
965       idx   = 6*(*vi--);
966       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
967       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
968       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
969       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
970       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
971       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
972       v -= 36;
973     }
974   }
975 
976   /* copy t into x according to permutation */
977   ii = 0;
978   for (i=0; i<n; i++) {
979     ir      = 6*r[i];
980     x[ir]   = t[ii];
981     x[ir+1] = t[ii+1];
982     x[ir+2] = t[ii+2];
983     x[ir+3] = t[ii+3];
984     x[ir+4] = t[ii+4];
985     x[ir+5] = t[ii+5];
986     ii += 6;
987   }
988 
989   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
990   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
991   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
992   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
993   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr);
994   PetscFunctionReturn(0);
995 }
996 
997 #undef __FUNCT__
998 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
999 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1000 {
1001   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1002   IS             iscol=a->col,isrow=a->row;
1003   PetscErrorCode ierr;
1004   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1005   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1006   MatScalar      *aa=a->a,*v;
1007   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1008   PetscScalar    *x,*b,*t;
1009 
1010   PetscFunctionBegin;
1011   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1012   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1013   t  = a->solve_work;
1014 
1015   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1016   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1017 
1018   /* copy the b into temp work space according to permutation */
1019   ii = 0;
1020   for (i=0; i<n; i++) {
1021     ic      = 7*c[i];
1022     t[ii]   = b[ic];
1023     t[ii+1] = b[ic+1];
1024     t[ii+2] = b[ic+2];
1025     t[ii+3] = b[ic+3];
1026     t[ii+4] = b[ic+4];
1027     t[ii+5] = b[ic+5];
1028     t[ii+6] = b[ic+6];
1029     ii += 7;
1030   }
1031 
1032   /* forward solve the U^T */
1033   idx = 0;
1034   for (i=0; i<n; i++) {
1035 
1036     v     = aa + 49*diag[i];
1037     /* multiply by the inverse of the block diagonal */
1038     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1039     x6    = t[5+idx]; x7 = t[6+idx];
1040     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1041     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1042     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1043     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1044     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1045     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1046     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1047     v += 49;
1048 
1049     vi    = aj + diag[i] + 1;
1050     nz    = ai[i+1] - diag[i] - 1;
1051     while (nz--) {
1052       oidx = 7*(*vi++);
1053       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1054       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1055       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1056       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1057       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1058       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1059       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1060       v  += 49;
1061     }
1062     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1063     t[5+idx] = s6;t[6+idx] = s7;
1064     idx += 7;
1065   }
1066   /* backward solve the L^T */
1067   for (i=n-1; i>=0; i--){
1068     v    = aa + 49*diag[i] - 49;
1069     vi   = aj + diag[i] - 1;
1070     nz   = diag[i] - ai[i];
1071     idt  = 7*i;
1072     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1073     s6 = t[5+idt];s7 = t[6+idt];
1074     while (nz--) {
1075       idx   = 7*(*vi--);
1076       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1077       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1078       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1079       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1080       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1081       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1082       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1083       v -= 49;
1084     }
1085   }
1086 
1087   /* copy t into x according to permutation */
1088   ii = 0;
1089   for (i=0; i<n; i++) {
1090     ir      = 7*r[i];
1091     x[ir]   = t[ii];
1092     x[ir+1] = t[ii+1];
1093     x[ir+2] = t[ii+2];
1094     x[ir+3] = t[ii+3];
1095     x[ir+4] = t[ii+4];
1096     x[ir+5] = t[ii+5];
1097     x[ir+6] = t[ii+6];
1098     ii += 7;
1099   }
1100 
1101   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1102   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1103   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1104   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1105   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->n);CHKERRQ(ierr);
1106   PetscFunctionReturn(0);
1107 }
1108 
1109 /* ----------------------------------------------------------- */
1110 #undef __FUNCT__
1111 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1112 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1113 {
1114   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1115   IS             iscol=a->col,isrow=a->row;
1116   PetscErrorCode ierr;
1117   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1118   PetscInt       nz,bs=A->bs,bs2=a->bs2,*rout,*cout;
1119   MatScalar      *aa=a->a,*v;
1120   PetscScalar    *x,*b,*s,*t,*ls;
1121 
1122   PetscFunctionBegin;
1123   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1124   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1125   t  = a->solve_work;
1126 
1127   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1128   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1129 
1130   /* forward solve the lower triangular */
1131   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1132   for (i=1; i<n; i++) {
1133     v   = aa + bs2*ai[i];
1134     vi  = aj + ai[i];
1135     nz  = a->diag[i] - ai[i];
1136     s = t + bs*i;
1137     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1138     while (nz--) {
1139       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1140       v += bs2;
1141     }
1142   }
1143   /* backward solve the upper triangular */
1144   ls = a->solve_work + A->n;
1145   for (i=n-1; i>=0; i--){
1146     v   = aa + bs2*(a->diag[i] + 1);
1147     vi  = aj + a->diag[i] + 1;
1148     nz  = ai[i+1] - a->diag[i] - 1;
1149     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1150     while (nz--) {
1151       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1152       v += bs2;
1153     }
1154     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1155     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1156   }
1157 
1158   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1159   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1160   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1162   ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->bs*A->n);CHKERRQ(ierr);
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 #undef __FUNCT__
1167 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1168 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1169 {
1170   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1171   IS             iscol=a->col,isrow=a->row;
1172   PetscErrorCode ierr;
1173   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1174   PetscInt       *diag = a->diag;
1175   MatScalar      *aa=a->a,*v;
1176   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1177   PetscScalar    *x,*b,*t;
1178 
1179   PetscFunctionBegin;
1180   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1181   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182   t  = a->solve_work;
1183 
1184   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1185   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1186 
1187   /* forward solve the lower triangular */
1188   idx    = 7*(*r++);
1189   t[0] = b[idx];   t[1] = b[1+idx];
1190   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1191   t[5] = b[5+idx]; t[6] = b[6+idx];
1192 
1193   for (i=1; i<n; i++) {
1194     v     = aa + 49*ai[i];
1195     vi    = aj + ai[i];
1196     nz    = diag[i] - ai[i];
1197     idx   = 7*(*r++);
1198     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1199     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1200     while (nz--) {
1201       idx   = 7*(*vi++);
1202       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1203       x4    = t[3+idx];x5 = t[4+idx];
1204       x6    = t[5+idx];x7 = t[6+idx];
1205       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1206       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1207       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1208       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1209       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1210       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1211       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1212       v += 49;
1213     }
1214     idx = 7*i;
1215     t[idx]   = s1;t[1+idx] = s2;
1216     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1217     t[5+idx] = s6;t[6+idx] = s7;
1218   }
1219   /* backward solve the upper triangular */
1220   for (i=n-1; i>=0; i--){
1221     v    = aa + 49*diag[i] + 49;
1222     vi   = aj + diag[i] + 1;
1223     nz   = ai[i+1] - diag[i] - 1;
1224     idt  = 7*i;
1225     s1 = t[idt];  s2 = t[1+idt];
1226     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1227     s6 = t[5+idt];s7 = t[6+idt];
1228     while (nz--) {
1229       idx   = 7*(*vi++);
1230       x1    = t[idx];   x2 = t[1+idx];
1231       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1232       x6    = t[5+idx]; x7 = t[6+idx];
1233       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1234       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1235       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1236       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1237       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1238       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1239       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1240       v += 49;
1241     }
1242     idc = 7*(*c--);
1243     v   = aa + 49*diag[i];
1244     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1245                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1246     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1247                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1248     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1249                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1250     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1251                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1252     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1253                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1254     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1255                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1256     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1257                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1258   }
1259 
1260   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1261   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1263   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1264   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->n);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #undef __FUNCT__
1269 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1270 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1271 {
1272   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1273   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1274   PetscErrorCode ierr;
1275   PetscInt       *diag = a->diag,jdx;
1276   MatScalar      *aa=a->a,*v;
1277   PetscScalar    *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1278 
1279   PetscFunctionBegin;
1280   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1281   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1282   /* forward solve the lower triangular */
1283   idx    = 0;
1284   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1285   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1286   x[6] = b[6+idx];
1287   for (i=1; i<n; i++) {
1288     v     =  aa + 49*ai[i];
1289     vi    =  aj + ai[i];
1290     nz    =  diag[i] - ai[i];
1291     idx   =  7*i;
1292     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1293     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1294     s7  =  b[6+idx];
1295     while (nz--) {
1296       jdx   = 7*(*vi++);
1297       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1298       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1299       x7    = x[6+jdx];
1300       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1301       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1302       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1303       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1304       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1305       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1306       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1307       v += 49;
1308      }
1309     x[idx]   = s1;
1310     x[1+idx] = s2;
1311     x[2+idx] = s3;
1312     x[3+idx] = s4;
1313     x[4+idx] = s5;
1314     x[5+idx] = s6;
1315     x[6+idx] = s7;
1316   }
1317   /* backward solve the upper triangular */
1318   for (i=n-1; i>=0; i--){
1319     v    = aa + 49*diag[i] + 49;
1320     vi   = aj + diag[i] + 1;
1321     nz   = ai[i+1] - diag[i] - 1;
1322     idt  = 7*i;
1323     s1 = x[idt];   s2 = x[1+idt];
1324     s3 = x[2+idt]; s4 = x[3+idt];
1325     s5 = x[4+idt]; s6 = x[5+idt];
1326     s7 = x[6+idt];
1327     while (nz--) {
1328       idx   = 7*(*vi++);
1329       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1330       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1331       x7    = x[6+idx];
1332       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1333       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1334       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1335       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1336       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1337       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1338       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1339       v += 49;
1340     }
1341     v        = aa + 49*diag[i];
1342     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1343                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1344     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1345                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1346     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1347                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1348     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1349                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1350     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1351                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1352     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1353                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1354     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1355                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1356   }
1357 
1358   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1359   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1360   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr);
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 #undef __FUNCT__
1365 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1366 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1367 {
1368   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1369   IS             iscol=a->col,isrow=a->row;
1370   PetscErrorCode ierr;
1371   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1372   PetscInt       *diag = a->diag;
1373   MatScalar      *aa=a->a,*v;
1374   PetscScalar    *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1375 
1376   PetscFunctionBegin;
1377   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1378   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1379   t  = a->solve_work;
1380 
1381   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1382   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1383 
1384   /* forward solve the lower triangular */
1385   idx    = 6*(*r++);
1386   t[0] = b[idx];   t[1] = b[1+idx];
1387   t[2] = b[2+idx]; t[3] = b[3+idx];
1388   t[4] = b[4+idx]; t[5] = b[5+idx];
1389   for (i=1; i<n; i++) {
1390     v     = aa + 36*ai[i];
1391     vi    = aj + ai[i];
1392     nz    = diag[i] - ai[i];
1393     idx   = 6*(*r++);
1394     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1395     s5  = b[4+idx]; s6 = b[5+idx];
1396     while (nz--) {
1397       idx   = 6*(*vi++);
1398       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1399       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1400       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1401       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1402       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1403       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1404       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1405       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1406       v += 36;
1407     }
1408     idx = 6*i;
1409     t[idx]   = s1;t[1+idx] = s2;
1410     t[2+idx] = s3;t[3+idx] = s4;
1411     t[4+idx] = s5;t[5+idx] = s6;
1412   }
1413   /* backward solve the upper triangular */
1414   for (i=n-1; i>=0; i--){
1415     v    = aa + 36*diag[i] + 36;
1416     vi   = aj + diag[i] + 1;
1417     nz   = ai[i+1] - diag[i] - 1;
1418     idt  = 6*i;
1419     s1 = t[idt];  s2 = t[1+idt];
1420     s3 = t[2+idt];s4 = t[3+idt];
1421     s5 = t[4+idt];s6 = t[5+idt];
1422     while (nz--) {
1423       idx   = 6*(*vi++);
1424       x1    = t[idx];   x2 = t[1+idx];
1425       x3    = t[2+idx]; x4 = t[3+idx];
1426       x5    = t[4+idx]; x6 = t[5+idx];
1427       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1428       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1429       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1430       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1431       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1432       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1433       v += 36;
1434     }
1435     idc = 6*(*c--);
1436     v   = aa + 36*diag[i];
1437     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1438                                  v[18]*s4+v[24]*s5+v[30]*s6;
1439     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1440                                  v[19]*s4+v[25]*s5+v[31]*s6;
1441     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1442                                  v[20]*s4+v[26]*s5+v[32]*s6;
1443     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1444                                  v[21]*s4+v[27]*s5+v[33]*s6;
1445     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1446                                  v[22]*s4+v[28]*s5+v[34]*s6;
1447     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1448                                  v[23]*s4+v[29]*s5+v[35]*s6;
1449   }
1450 
1451   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1452   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1453   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1454   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1455   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr);
1456   PetscFunctionReturn(0);
1457 }
1458 
1459 #undef __FUNCT__
1460 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1461 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1462 {
1463   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1464   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1465   PetscErrorCode ierr;
1466   PetscInt       *diag = a->diag,jdx;
1467   MatScalar      *aa=a->a,*v;
1468   PetscScalar    *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1469 
1470   PetscFunctionBegin;
1471   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1472   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1473   /* forward solve the lower triangular */
1474   idx    = 0;
1475   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1476   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1477   for (i=1; i<n; i++) {
1478     v     =  aa + 36*ai[i];
1479     vi    =  aj + ai[i];
1480     nz    =  diag[i] - ai[i];
1481     idx   =  6*i;
1482     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1483     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1484     while (nz--) {
1485       jdx   = 6*(*vi++);
1486       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1487       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1488       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1489       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1490       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1491       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1492       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1493       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1494       v += 36;
1495      }
1496     x[idx]   = s1;
1497     x[1+idx] = s2;
1498     x[2+idx] = s3;
1499     x[3+idx] = s4;
1500     x[4+idx] = s5;
1501     x[5+idx] = s6;
1502   }
1503   /* backward solve the upper triangular */
1504   for (i=n-1; i>=0; i--){
1505     v    = aa + 36*diag[i] + 36;
1506     vi   = aj + diag[i] + 1;
1507     nz   = ai[i+1] - diag[i] - 1;
1508     idt  = 6*i;
1509     s1 = x[idt];   s2 = x[1+idt];
1510     s3 = x[2+idt]; s4 = x[3+idt];
1511     s5 = x[4+idt]; s6 = x[5+idt];
1512     while (nz--) {
1513       idx   = 6*(*vi++);
1514       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1515       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1516       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1517       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1518       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1519       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1520       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1521       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1522       v += 36;
1523     }
1524     v        = aa + 36*diag[i];
1525     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1526     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1527     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1528     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1529     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1530     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1531   }
1532 
1533   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1534   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1535   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr);
1536   PetscFunctionReturn(0);
1537 }
1538 
1539 #undef __FUNCT__
1540 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1541 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1542 {
1543   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1544   IS             iscol=a->col,isrow=a->row;
1545   PetscErrorCode ierr;
1546   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1547   PetscInt       *diag = a->diag;
1548   MatScalar      *aa=a->a,*v;
1549   PetscScalar    *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1550 
1551   PetscFunctionBegin;
1552   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1553   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1554   t  = a->solve_work;
1555 
1556   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1557   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1558 
1559   /* forward solve the lower triangular */
1560   idx    = 5*(*r++);
1561   t[0] = b[idx];   t[1] = b[1+idx];
1562   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1563   for (i=1; i<n; i++) {
1564     v     = aa + 25*ai[i];
1565     vi    = aj + ai[i];
1566     nz    = diag[i] - ai[i];
1567     idx   = 5*(*r++);
1568     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1569     s5  = b[4+idx];
1570     while (nz--) {
1571       idx   = 5*(*vi++);
1572       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1573       x4    = t[3+idx];x5 = t[4+idx];
1574       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1575       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1576       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1577       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1578       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1579       v += 25;
1580     }
1581     idx = 5*i;
1582     t[idx]   = s1;t[1+idx] = s2;
1583     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1584   }
1585   /* backward solve the upper triangular */
1586   for (i=n-1; i>=0; i--){
1587     v    = aa + 25*diag[i] + 25;
1588     vi   = aj + diag[i] + 1;
1589     nz   = ai[i+1] - diag[i] - 1;
1590     idt  = 5*i;
1591     s1 = t[idt];  s2 = t[1+idt];
1592     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1593     while (nz--) {
1594       idx   = 5*(*vi++);
1595       x1    = t[idx];   x2 = t[1+idx];
1596       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1597       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1598       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1599       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1600       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1601       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1602       v += 25;
1603     }
1604     idc = 5*(*c--);
1605     v   = aa + 25*diag[i];
1606     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1607                                  v[15]*s4+v[20]*s5;
1608     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1609                                  v[16]*s4+v[21]*s5;
1610     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1611                                  v[17]*s4+v[22]*s5;
1612     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1613                                  v[18]*s4+v[23]*s5;
1614     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1615                                  v[19]*s4+v[24]*s5;
1616   }
1617 
1618   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1619   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1620   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1621   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1622   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->n);CHKERRQ(ierr);
1623   PetscFunctionReturn(0);
1624 }
1625 
1626 #undef __FUNCT__
1627 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1628 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1629 {
1630   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1631   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1632   PetscErrorCode ierr;
1633   PetscInt       *diag = a->diag,jdx;
1634   MatScalar      *aa=a->a,*v;
1635   PetscScalar    *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1636 
1637   PetscFunctionBegin;
1638   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1639   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1640   /* forward solve the lower triangular */
1641   idx    = 0;
1642   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1643   for (i=1; i<n; i++) {
1644     v     =  aa + 25*ai[i];
1645     vi    =  aj + ai[i];
1646     nz    =  diag[i] - ai[i];
1647     idx   =  5*i;
1648     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1649     while (nz--) {
1650       jdx   = 5*(*vi++);
1651       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1652       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1653       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1654       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1655       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1656       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1657       v    += 25;
1658     }
1659     x[idx]   = s1;
1660     x[1+idx] = s2;
1661     x[2+idx] = s3;
1662     x[3+idx] = s4;
1663     x[4+idx] = s5;
1664   }
1665   /* backward solve the upper triangular */
1666   for (i=n-1; i>=0; i--){
1667     v    = aa + 25*diag[i] + 25;
1668     vi   = aj + diag[i] + 1;
1669     nz   = ai[i+1] - diag[i] - 1;
1670     idt  = 5*i;
1671     s1 = x[idt];  s2 = x[1+idt];
1672     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1673     while (nz--) {
1674       idx   = 5*(*vi++);
1675       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1676       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1677       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1678       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1679       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1680       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1681       v    += 25;
1682     }
1683     v        = aa + 25*diag[i];
1684     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1685     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1686     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1687     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1688     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1689   }
1690 
1691   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1692   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1693   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->n);CHKERRQ(ierr);
1694   PetscFunctionReturn(0);
1695 }
1696 
1697 #undef __FUNCT__
1698 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1699 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1700 {
1701   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1702   IS             iscol=a->col,isrow=a->row;
1703   PetscErrorCode ierr;
1704   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1705   PetscInt       *diag = a->diag;
1706   MatScalar      *aa=a->a,*v;
1707   PetscScalar    *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1708 
1709   PetscFunctionBegin;
1710   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1711   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1712   t  = a->solve_work;
1713 
1714   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1715   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1716 
1717   /* forward solve the lower triangular */
1718   idx    = 4*(*r++);
1719   t[0] = b[idx];   t[1] = b[1+idx];
1720   t[2] = b[2+idx]; t[3] = b[3+idx];
1721   for (i=1; i<n; i++) {
1722     v     = aa + 16*ai[i];
1723     vi    = aj + ai[i];
1724     nz    = diag[i] - ai[i];
1725     idx   = 4*(*r++);
1726     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1727     while (nz--) {
1728       idx   = 4*(*vi++);
1729       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1730       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1731       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1732       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1733       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1734       v    += 16;
1735     }
1736     idx        = 4*i;
1737     t[idx]   = s1;t[1+idx] = s2;
1738     t[2+idx] = s3;t[3+idx] = s4;
1739   }
1740   /* backward solve the upper triangular */
1741   for (i=n-1; i>=0; i--){
1742     v    = aa + 16*diag[i] + 16;
1743     vi   = aj + diag[i] + 1;
1744     nz   = ai[i+1] - diag[i] - 1;
1745     idt  = 4*i;
1746     s1 = t[idt];  s2 = t[1+idt];
1747     s3 = t[2+idt];s4 = t[3+idt];
1748     while (nz--) {
1749       idx   = 4*(*vi++);
1750       x1    = t[idx];   x2 = t[1+idx];
1751       x3    = t[2+idx]; x4 = t[3+idx];
1752       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1753       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1754       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1755       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1756       v += 16;
1757     }
1758     idc      = 4*(*c--);
1759     v        = aa + 16*diag[i];
1760     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1761     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1762     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1763     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1764   }
1765 
1766   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1767   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1768   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1770   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr);
1771   PetscFunctionReturn(0);
1772 }
1773 
1774 #undef __FUNCT__
1775 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1776 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1777 {
1778   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1779   IS             iscol=a->col,isrow=a->row;
1780   PetscErrorCode ierr;
1781   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1782   PetscInt       *diag = a->diag;
1783   MatScalar      *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1784   PetscScalar    *x,*b;
1785 
1786   PetscFunctionBegin;
1787   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1788   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1789   t  = (MatScalar *)a->solve_work;
1790 
1791   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1792   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1793 
1794   /* forward solve the lower triangular */
1795   idx    = 4*(*r++);
1796   t[0] = (MatScalar)b[idx];
1797   t[1] = (MatScalar)b[1+idx];
1798   t[2] = (MatScalar)b[2+idx];
1799   t[3] = (MatScalar)b[3+idx];
1800   for (i=1; i<n; i++) {
1801     v     = aa + 16*ai[i];
1802     vi    = aj + ai[i];
1803     nz    = diag[i] - ai[i];
1804     idx   = 4*(*r++);
1805     s1 = (MatScalar)b[idx];
1806     s2 = (MatScalar)b[1+idx];
1807     s3 = (MatScalar)b[2+idx];
1808     s4 = (MatScalar)b[3+idx];
1809     while (nz--) {
1810       idx   = 4*(*vi++);
1811       x1  = t[idx];
1812       x2  = t[1+idx];
1813       x3  = t[2+idx];
1814       x4  = t[3+idx];
1815       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1816       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1817       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1818       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1819       v    += 16;
1820     }
1821     idx        = 4*i;
1822     t[idx]   = s1;
1823     t[1+idx] = s2;
1824     t[2+idx] = s3;
1825     t[3+idx] = s4;
1826   }
1827   /* backward solve the upper triangular */
1828   for (i=n-1; i>=0; i--){
1829     v    = aa + 16*diag[i] + 16;
1830     vi   = aj + diag[i] + 1;
1831     nz   = ai[i+1] - diag[i] - 1;
1832     idt  = 4*i;
1833     s1 = t[idt];
1834     s2 = t[1+idt];
1835     s3 = t[2+idt];
1836     s4 = t[3+idt];
1837     while (nz--) {
1838       idx   = 4*(*vi++);
1839       x1  = t[idx];
1840       x2  = t[1+idx];
1841       x3  = t[2+idx];
1842       x4  = t[3+idx];
1843       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1844       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1845       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1846       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1847       v += 16;
1848     }
1849     idc      = 4*(*c--);
1850     v        = aa + 16*diag[i];
1851     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1852     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1853     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1854     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1855     x[idc]   = (PetscScalar)t[idt];
1856     x[1+idc] = (PetscScalar)t[1+idt];
1857     x[2+idc] = (PetscScalar)t[2+idt];
1858     x[3+idc] = (PetscScalar)t[3+idt];
1859  }
1860 
1861   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1862   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1863   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1864   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1865   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr);
1866   PetscFunctionReturn(0);
1867 }
1868 
1869 #if defined (PETSC_HAVE_SSE)
1870 
1871 #include PETSC_HAVE_SSE
1872 
1873 #undef __FUNCT__
1874 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1875 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1876 {
1877   /*
1878      Note: This code uses demotion of double
1879      to float when performing the mixed-mode computation.
1880      This may not be numerically reasonable for all applications.
1881   */
1882   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1883   IS             iscol=a->col,isrow=a->row;
1884   PetscErrorCode ierr;
1885   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1886   PetscInt       *diag = a->diag,ai16;
1887   MatScalar      *aa=a->a,*v;
1888   PetscScalar    *x,*b,*t;
1889 
1890   /* Make space in temp stack for 16 Byte Aligned arrays */
1891   float           ssealignedspace[11],*tmps,*tmpx;
1892   unsigned long   offset;
1893 
1894   PetscFunctionBegin;
1895   SSE_SCOPE_BEGIN;
1896 
1897     offset = (unsigned long)ssealignedspace % 16;
1898     if (offset) offset = (16 - offset)/4;
1899     tmps = &ssealignedspace[offset];
1900     tmpx = &ssealignedspace[offset+4];
1901     PREFETCH_NTA(aa+16*ai[1]);
1902 
1903     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1904     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1905     t  = a->solve_work;
1906 
1907     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1908     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1909 
1910     /* forward solve the lower triangular */
1911     idx  = 4*(*r++);
1912     t[0] = b[idx];   t[1] = b[1+idx];
1913     t[2] = b[2+idx]; t[3] = b[3+idx];
1914     v    =  aa + 16*ai[1];
1915 
1916     for (i=1; i<n;) {
1917       PREFETCH_NTA(&v[8]);
1918       vi   =  aj      + ai[i];
1919       nz   =  diag[i] - ai[i];
1920       idx  =  4*(*r++);
1921 
1922       /* Demote sum from double to float */
1923       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1924       LOAD_PS(tmps,XMM7);
1925 
1926       while (nz--) {
1927         PREFETCH_NTA(&v[16]);
1928         idx = 4*(*vi++);
1929 
1930         /* Demote solution (so far) from double to float */
1931         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1932 
1933         /* 4x4 Matrix-Vector product with negative accumulation: */
1934         SSE_INLINE_BEGIN_2(tmpx,v)
1935           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1936 
1937           /* First Column */
1938           SSE_COPY_PS(XMM0,XMM6)
1939           SSE_SHUFFLE(XMM0,XMM0,0x00)
1940           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1941           SSE_SUB_PS(XMM7,XMM0)
1942 
1943           /* Second Column */
1944           SSE_COPY_PS(XMM1,XMM6)
1945           SSE_SHUFFLE(XMM1,XMM1,0x55)
1946           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1947           SSE_SUB_PS(XMM7,XMM1)
1948 
1949           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1950 
1951           /* Third Column */
1952           SSE_COPY_PS(XMM2,XMM6)
1953           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1954           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1955           SSE_SUB_PS(XMM7,XMM2)
1956 
1957           /* Fourth Column */
1958           SSE_COPY_PS(XMM3,XMM6)
1959           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1960           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1961           SSE_SUB_PS(XMM7,XMM3)
1962         SSE_INLINE_END_2
1963 
1964         v  += 16;
1965       }
1966       idx = 4*i;
1967       v   = aa + 16*ai[++i];
1968       PREFETCH_NTA(v);
1969       STORE_PS(tmps,XMM7);
1970 
1971       /* Promote result from float to double */
1972       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1973     }
1974     /* backward solve the upper triangular */
1975     idt  = 4*(n-1);
1976     ai16 = 16*diag[n-1];
1977     v    = aa + ai16 + 16;
1978     for (i=n-1; i>=0;){
1979       PREFETCH_NTA(&v[8]);
1980       vi = aj + diag[i] + 1;
1981       nz = ai[i+1] - diag[i] - 1;
1982 
1983       /* Demote accumulator from double to float */
1984       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1985       LOAD_PS(tmps,XMM7);
1986 
1987       while (nz--) {
1988         PREFETCH_NTA(&v[16]);
1989         idx = 4*(*vi++);
1990 
1991         /* Demote solution (so far) from double to float */
1992         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1993 
1994         /* 4x4 Matrix-Vector Product with negative accumulation: */
1995         SSE_INLINE_BEGIN_2(tmpx,v)
1996           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1997 
1998           /* First Column */
1999           SSE_COPY_PS(XMM0,XMM6)
2000           SSE_SHUFFLE(XMM0,XMM0,0x00)
2001           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2002           SSE_SUB_PS(XMM7,XMM0)
2003 
2004           /* Second Column */
2005           SSE_COPY_PS(XMM1,XMM6)
2006           SSE_SHUFFLE(XMM1,XMM1,0x55)
2007           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2008           SSE_SUB_PS(XMM7,XMM1)
2009 
2010           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2011 
2012           /* Third Column */
2013           SSE_COPY_PS(XMM2,XMM6)
2014           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2015           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2016           SSE_SUB_PS(XMM7,XMM2)
2017 
2018           /* Fourth Column */
2019           SSE_COPY_PS(XMM3,XMM6)
2020           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2021           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2022           SSE_SUB_PS(XMM7,XMM3)
2023         SSE_INLINE_END_2
2024         v  += 16;
2025       }
2026       v    = aa + ai16;
2027       ai16 = 16*diag[--i];
2028       PREFETCH_NTA(aa+ai16+16);
2029       /*
2030          Scale the result by the diagonal 4x4 block,
2031          which was inverted as part of the factorization
2032       */
2033       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2034         /* First Column */
2035         SSE_COPY_PS(XMM0,XMM7)
2036         SSE_SHUFFLE(XMM0,XMM0,0x00)
2037         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2038 
2039         /* Second Column */
2040         SSE_COPY_PS(XMM1,XMM7)
2041         SSE_SHUFFLE(XMM1,XMM1,0x55)
2042         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2043         SSE_ADD_PS(XMM0,XMM1)
2044 
2045         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2046 
2047         /* Third Column */
2048         SSE_COPY_PS(XMM2,XMM7)
2049         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2050         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2051         SSE_ADD_PS(XMM0,XMM2)
2052 
2053         /* Fourth Column */
2054         SSE_COPY_PS(XMM3,XMM7)
2055         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2056         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2057         SSE_ADD_PS(XMM0,XMM3)
2058 
2059         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2060       SSE_INLINE_END_3
2061 
2062       /* Promote solution from float to double */
2063       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2064 
2065       /* Apply reordering to t and stream into x.    */
2066       /* This way, x doesn't pollute the cache.      */
2067       /* Be careful with size: 2 doubles = 4 floats! */
2068       idc  = 4*(*c--);
2069       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2070         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2071         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2072         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2073         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2074         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2075         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2076       SSE_INLINE_END_2
2077       v    = aa + ai16 + 16;
2078       idt -= 4;
2079     }
2080 
2081     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2082     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2083     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2084     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2085     ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr);
2086   SSE_SCOPE_END;
2087   PetscFunctionReturn(0);
2088 }
2089 
2090 #endif
2091 
2092 
2093 /*
2094       Special case where the matrix was ILU(0) factored in the natural
2095    ordering. This eliminates the need for the column and row permutation.
2096 */
2097 #undef __FUNCT__
2098 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2099 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2100 {
2101   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2102   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2103   PetscErrorCode ierr;
2104   PetscInt       *diag = a->diag;
2105   MatScalar      *aa=a->a;
2106   PetscScalar    *x,*b;
2107 
2108   PetscFunctionBegin;
2109   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2110   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2111 
2112 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2113   {
2114     static PetscScalar w[2000]; /* very BAD need to fix */
2115     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2116   }
2117 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2118   {
2119     static PetscScalar w[2000]; /* very BAD need to fix */
2120     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2121   }
2122 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2123   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2124 #else
2125   {
2126     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2127     MatScalar    *v;
2128     PetscInt     jdx,idt,idx,nz,*vi,i,ai16;
2129 
2130   /* forward solve the lower triangular */
2131   idx    = 0;
2132   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2133   for (i=1; i<n; i++) {
2134     v     =  aa      + 16*ai[i];
2135     vi    =  aj      + ai[i];
2136     nz    =  diag[i] - ai[i];
2137     idx   +=  4;
2138     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2139     while (nz--) {
2140       jdx   = 4*(*vi++);
2141       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2142       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2143       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2144       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2145       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2146       v    += 16;
2147     }
2148     x[idx]   = s1;
2149     x[1+idx] = s2;
2150     x[2+idx] = s3;
2151     x[3+idx] = s4;
2152   }
2153   /* backward solve the upper triangular */
2154   idt = 4*(n-1);
2155   for (i=n-1; i>=0; i--){
2156     ai16 = 16*diag[i];
2157     v    = aa + ai16 + 16;
2158     vi   = aj + diag[i] + 1;
2159     nz   = ai[i+1] - diag[i] - 1;
2160     s1 = x[idt];  s2 = x[1+idt];
2161     s3 = x[2+idt];s4 = x[3+idt];
2162     while (nz--) {
2163       idx   = 4*(*vi++);
2164       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2165       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2166       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2167       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2168       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2169       v    += 16;
2170     }
2171     v        = aa + ai16;
2172     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2173     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2174     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2175     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2176     idt -= 4;
2177   }
2178   }
2179 #endif
2180 
2181   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2182   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2183   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr);
2184   PetscFunctionReturn(0);
2185 }
2186 
2187 #undef __FUNCT__
2188 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2189 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2190 {
2191   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2192   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2193   PetscErrorCode ierr;
2194   PetscInt       *diag = a->diag;
2195   MatScalar      *aa=a->a;
2196   PetscScalar    *x,*b;
2197 
2198   PetscFunctionBegin;
2199   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2200   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2201 
2202   {
2203     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2204     MatScalar  *v,*t=(MatScalar *)x;
2205     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2206 
2207     /* forward solve the lower triangular */
2208     idx  = 0;
2209     t[0] = (MatScalar)b[0];
2210     t[1] = (MatScalar)b[1];
2211     t[2] = (MatScalar)b[2];
2212     t[3] = (MatScalar)b[3];
2213     for (i=1; i<n; i++) {
2214       v     =  aa      + 16*ai[i];
2215       vi    =  aj      + ai[i];
2216       nz    =  diag[i] - ai[i];
2217       idx   +=  4;
2218       s1 = (MatScalar)b[idx];
2219       s2 = (MatScalar)b[1+idx];
2220       s3 = (MatScalar)b[2+idx];
2221       s4 = (MatScalar)b[3+idx];
2222       while (nz--) {
2223         jdx = 4*(*vi++);
2224         x1  = t[jdx];
2225         x2  = t[1+jdx];
2226         x3  = t[2+jdx];
2227         x4  = t[3+jdx];
2228         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2229         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2230         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2231         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2232         v    += 16;
2233       }
2234       t[idx]   = s1;
2235       t[1+idx] = s2;
2236       t[2+idx] = s3;
2237       t[3+idx] = s4;
2238     }
2239     /* backward solve the upper triangular */
2240     idt = 4*(n-1);
2241     for (i=n-1; i>=0; i--){
2242       ai16 = 16*diag[i];
2243       v    = aa + ai16 + 16;
2244       vi   = aj + diag[i] + 1;
2245       nz   = ai[i+1] - diag[i] - 1;
2246       s1   = t[idt];
2247       s2   = t[1+idt];
2248       s3   = t[2+idt];
2249       s4   = t[3+idt];
2250       while (nz--) {
2251         idx = 4*(*vi++);
2252         x1  = (MatScalar)x[idx];
2253         x2  = (MatScalar)x[1+idx];
2254         x3  = (MatScalar)x[2+idx];
2255         x4  = (MatScalar)x[3+idx];
2256         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2257         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2258         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2259         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2260         v    += 16;
2261       }
2262       v        = aa + ai16;
2263       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2264       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2265       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2266       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2267       idt -= 4;
2268     }
2269   }
2270 
2271   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2272   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2273   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr);
2274   PetscFunctionReturn(0);
2275 }
2276 
2277 #if defined (PETSC_HAVE_SSE)
2278 
2279 #include PETSC_HAVE_SSE
2280 #undef __FUNCT__
2281 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2282 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2283 {
2284   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2285   unsigned short *aj=(unsigned short *)a->j;
2286   PetscErrorCode ierr;
2287   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2288   MatScalar      *aa=a->a;
2289   PetscScalar    *x,*b;
2290 
2291   PetscFunctionBegin;
2292   SSE_SCOPE_BEGIN;
2293   /*
2294      Note: This code currently uses demotion of double
2295      to float when performing the mixed-mode computation.
2296      This may not be numerically reasonable for all applications.
2297   */
2298   PREFETCH_NTA(aa+16*ai[1]);
2299 
2300   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2301   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2302   {
2303     /* x will first be computed in single precision then promoted inplace to double */
2304     MatScalar      *v,*t=(MatScalar *)x;
2305     int            nz,i,idt,ai16;
2306     unsigned int   jdx,idx;
2307     unsigned short *vi;
2308     /* Forward solve the lower triangular factor. */
2309 
2310     /* First block is the identity. */
2311     idx  = 0;
2312     CONVERT_DOUBLE4_FLOAT4(t,b);
2313     v    =  aa + 16*((unsigned int)ai[1]);
2314 
2315     for (i=1; i<n;) {
2316       PREFETCH_NTA(&v[8]);
2317       vi   =  aj      + ai[i];
2318       nz   =  diag[i] - ai[i];
2319       idx +=  4;
2320 
2321       /* Demote RHS from double to float. */
2322       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2323       LOAD_PS(&t[idx],XMM7);
2324 
2325       while (nz--) {
2326         PREFETCH_NTA(&v[16]);
2327         jdx = 4*((unsigned int)(*vi++));
2328 
2329         /* 4x4 Matrix-Vector product with negative accumulation: */
2330         SSE_INLINE_BEGIN_2(&t[jdx],v)
2331           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2332 
2333           /* First Column */
2334           SSE_COPY_PS(XMM0,XMM6)
2335           SSE_SHUFFLE(XMM0,XMM0,0x00)
2336           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2337           SSE_SUB_PS(XMM7,XMM0)
2338 
2339           /* Second Column */
2340           SSE_COPY_PS(XMM1,XMM6)
2341           SSE_SHUFFLE(XMM1,XMM1,0x55)
2342           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2343           SSE_SUB_PS(XMM7,XMM1)
2344 
2345           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2346 
2347           /* Third Column */
2348           SSE_COPY_PS(XMM2,XMM6)
2349           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2350           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2351           SSE_SUB_PS(XMM7,XMM2)
2352 
2353           /* Fourth Column */
2354           SSE_COPY_PS(XMM3,XMM6)
2355           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2356           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2357           SSE_SUB_PS(XMM7,XMM3)
2358         SSE_INLINE_END_2
2359 
2360         v  += 16;
2361       }
2362       v    =  aa + 16*ai[++i];
2363       PREFETCH_NTA(v);
2364       STORE_PS(&t[idx],XMM7);
2365     }
2366 
2367     /* Backward solve the upper triangular factor.*/
2368 
2369     idt  = 4*(n-1);
2370     ai16 = 16*diag[n-1];
2371     v    = aa + ai16 + 16;
2372     for (i=n-1; i>=0;){
2373       PREFETCH_NTA(&v[8]);
2374       vi = aj + diag[i] + 1;
2375       nz = ai[i+1] - diag[i] - 1;
2376 
2377       LOAD_PS(&t[idt],XMM7);
2378 
2379       while (nz--) {
2380         PREFETCH_NTA(&v[16]);
2381         idx = 4*((unsigned int)(*vi++));
2382 
2383         /* 4x4 Matrix-Vector Product with negative accumulation: */
2384         SSE_INLINE_BEGIN_2(&t[idx],v)
2385           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2386 
2387           /* First Column */
2388           SSE_COPY_PS(XMM0,XMM6)
2389           SSE_SHUFFLE(XMM0,XMM0,0x00)
2390           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2391           SSE_SUB_PS(XMM7,XMM0)
2392 
2393           /* Second Column */
2394           SSE_COPY_PS(XMM1,XMM6)
2395           SSE_SHUFFLE(XMM1,XMM1,0x55)
2396           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2397           SSE_SUB_PS(XMM7,XMM1)
2398 
2399           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2400 
2401           /* Third Column */
2402           SSE_COPY_PS(XMM2,XMM6)
2403           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2404           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2405           SSE_SUB_PS(XMM7,XMM2)
2406 
2407           /* Fourth Column */
2408           SSE_COPY_PS(XMM3,XMM6)
2409           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2410           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2411           SSE_SUB_PS(XMM7,XMM3)
2412         SSE_INLINE_END_2
2413         v  += 16;
2414       }
2415       v    = aa + ai16;
2416       ai16 = 16*diag[--i];
2417       PREFETCH_NTA(aa+ai16+16);
2418       /*
2419          Scale the result by the diagonal 4x4 block,
2420          which was inverted as part of the factorization
2421       */
2422       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2423         /* First Column */
2424         SSE_COPY_PS(XMM0,XMM7)
2425         SSE_SHUFFLE(XMM0,XMM0,0x00)
2426         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2427 
2428         /* Second Column */
2429         SSE_COPY_PS(XMM1,XMM7)
2430         SSE_SHUFFLE(XMM1,XMM1,0x55)
2431         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2432         SSE_ADD_PS(XMM0,XMM1)
2433 
2434         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2435 
2436         /* Third Column */
2437         SSE_COPY_PS(XMM2,XMM7)
2438         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2439         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2440         SSE_ADD_PS(XMM0,XMM2)
2441 
2442         /* Fourth Column */
2443         SSE_COPY_PS(XMM3,XMM7)
2444         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2445         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2446         SSE_ADD_PS(XMM0,XMM3)
2447 
2448         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2449       SSE_INLINE_END_3
2450 
2451       v    = aa + ai16 + 16;
2452       idt -= 4;
2453     }
2454 
2455     /* Convert t from single precision back to double precision (inplace)*/
2456     idt = 4*(n-1);
2457     for (i=n-1;i>=0;i--) {
2458       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2459       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2460       PetscScalar *xtemp=&x[idt];
2461       MatScalar   *ttemp=&t[idt];
2462       xtemp[3] = (PetscScalar)ttemp[3];
2463       xtemp[2] = (PetscScalar)ttemp[2];
2464       xtemp[1] = (PetscScalar)ttemp[1];
2465       xtemp[0] = (PetscScalar)ttemp[0];
2466       idt -= 4;
2467     }
2468 
2469   } /* End of artificial scope. */
2470   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2471   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2472   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr);
2473   SSE_SCOPE_END;
2474   PetscFunctionReturn(0);
2475 }
2476 
2477 #undef __FUNCT__
2478 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2479 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2480 {
2481   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2482   int            *aj=a->j;
2483   PetscErrorCode ierr;
2484   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2485   MatScalar      *aa=a->a;
2486   PetscScalar    *x,*b;
2487 
2488   PetscFunctionBegin;
2489   SSE_SCOPE_BEGIN;
2490   /*
2491      Note: This code currently uses demotion of double
2492      to float when performing the mixed-mode computation.
2493      This may not be numerically reasonable for all applications.
2494   */
2495   PREFETCH_NTA(aa+16*ai[1]);
2496 
2497   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2498   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2499   {
2500     /* x will first be computed in single precision then promoted inplace to double */
2501     MatScalar *v,*t=(MatScalar *)x;
2502     int       nz,i,idt,ai16;
2503     int       jdx,idx;
2504     int       *vi;
2505     /* Forward solve the lower triangular factor. */
2506 
2507     /* First block is the identity. */
2508     idx  = 0;
2509     CONVERT_DOUBLE4_FLOAT4(t,b);
2510     v    =  aa + 16*ai[1];
2511 
2512     for (i=1; i<n;) {
2513       PREFETCH_NTA(&v[8]);
2514       vi   =  aj      + ai[i];
2515       nz   =  diag[i] - ai[i];
2516       idx +=  4;
2517 
2518       /* Demote RHS from double to float. */
2519       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2520       LOAD_PS(&t[idx],XMM7);
2521 
2522       while (nz--) {
2523         PREFETCH_NTA(&v[16]);
2524         jdx = 4*(*vi++);
2525 /*          jdx = *vi++; */
2526 
2527         /* 4x4 Matrix-Vector product with negative accumulation: */
2528         SSE_INLINE_BEGIN_2(&t[jdx],v)
2529           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2530 
2531           /* First Column */
2532           SSE_COPY_PS(XMM0,XMM6)
2533           SSE_SHUFFLE(XMM0,XMM0,0x00)
2534           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2535           SSE_SUB_PS(XMM7,XMM0)
2536 
2537           /* Second Column */
2538           SSE_COPY_PS(XMM1,XMM6)
2539           SSE_SHUFFLE(XMM1,XMM1,0x55)
2540           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2541           SSE_SUB_PS(XMM7,XMM1)
2542 
2543           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2544 
2545           /* Third Column */
2546           SSE_COPY_PS(XMM2,XMM6)
2547           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2548           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2549           SSE_SUB_PS(XMM7,XMM2)
2550 
2551           /* Fourth Column */
2552           SSE_COPY_PS(XMM3,XMM6)
2553           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2554           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2555           SSE_SUB_PS(XMM7,XMM3)
2556         SSE_INLINE_END_2
2557 
2558         v  += 16;
2559       }
2560       v    =  aa + 16*ai[++i];
2561       PREFETCH_NTA(v);
2562       STORE_PS(&t[idx],XMM7);
2563     }
2564 
2565     /* Backward solve the upper triangular factor.*/
2566 
2567     idt  = 4*(n-1);
2568     ai16 = 16*diag[n-1];
2569     v    = aa + ai16 + 16;
2570     for (i=n-1; i>=0;){
2571       PREFETCH_NTA(&v[8]);
2572       vi = aj + diag[i] + 1;
2573       nz = ai[i+1] - diag[i] - 1;
2574 
2575       LOAD_PS(&t[idt],XMM7);
2576 
2577       while (nz--) {
2578         PREFETCH_NTA(&v[16]);
2579         idx = 4*(*vi++);
2580 /*          idx = *vi++; */
2581 
2582         /* 4x4 Matrix-Vector Product with negative accumulation: */
2583         SSE_INLINE_BEGIN_2(&t[idx],v)
2584           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2585 
2586           /* First Column */
2587           SSE_COPY_PS(XMM0,XMM6)
2588           SSE_SHUFFLE(XMM0,XMM0,0x00)
2589           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2590           SSE_SUB_PS(XMM7,XMM0)
2591 
2592           /* Second Column */
2593           SSE_COPY_PS(XMM1,XMM6)
2594           SSE_SHUFFLE(XMM1,XMM1,0x55)
2595           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2596           SSE_SUB_PS(XMM7,XMM1)
2597 
2598           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2599 
2600           /* Third Column */
2601           SSE_COPY_PS(XMM2,XMM6)
2602           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2603           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2604           SSE_SUB_PS(XMM7,XMM2)
2605 
2606           /* Fourth Column */
2607           SSE_COPY_PS(XMM3,XMM6)
2608           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2609           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2610           SSE_SUB_PS(XMM7,XMM3)
2611         SSE_INLINE_END_2
2612         v  += 16;
2613       }
2614       v    = aa + ai16;
2615       ai16 = 16*diag[--i];
2616       PREFETCH_NTA(aa+ai16+16);
2617       /*
2618          Scale the result by the diagonal 4x4 block,
2619          which was inverted as part of the factorization
2620       */
2621       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2622         /* First Column */
2623         SSE_COPY_PS(XMM0,XMM7)
2624         SSE_SHUFFLE(XMM0,XMM0,0x00)
2625         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2626 
2627         /* Second Column */
2628         SSE_COPY_PS(XMM1,XMM7)
2629         SSE_SHUFFLE(XMM1,XMM1,0x55)
2630         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2631         SSE_ADD_PS(XMM0,XMM1)
2632 
2633         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2634 
2635         /* Third Column */
2636         SSE_COPY_PS(XMM2,XMM7)
2637         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2638         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2639         SSE_ADD_PS(XMM0,XMM2)
2640 
2641         /* Fourth Column */
2642         SSE_COPY_PS(XMM3,XMM7)
2643         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2644         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2645         SSE_ADD_PS(XMM0,XMM3)
2646 
2647         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2648       SSE_INLINE_END_3
2649 
2650       v    = aa + ai16 + 16;
2651       idt -= 4;
2652     }
2653 
2654     /* Convert t from single precision back to double precision (inplace)*/
2655     idt = 4*(n-1);
2656     for (i=n-1;i>=0;i--) {
2657       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2658       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2659       PetscScalar *xtemp=&x[idt];
2660       MatScalar   *ttemp=&t[idt];
2661       xtemp[3] = (PetscScalar)ttemp[3];
2662       xtemp[2] = (PetscScalar)ttemp[2];
2663       xtemp[1] = (PetscScalar)ttemp[1];
2664       xtemp[0] = (PetscScalar)ttemp[0];
2665       idt -= 4;
2666     }
2667 
2668   } /* End of artificial scope. */
2669   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2670   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2671   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr);
2672   SSE_SCOPE_END;
2673   PetscFunctionReturn(0);
2674 }
2675 
2676 #endif
2677 #undef __FUNCT__
2678 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2679 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2680 {
2681   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2682   IS             iscol=a->col,isrow=a->row;
2683   PetscErrorCode ierr;
2684   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2685   PetscInt       *diag = a->diag;
2686   MatScalar      *aa=a->a,*v;
2687   PetscScalar    *x,*b,s1,s2,s3,x1,x2,x3,*t;
2688 
2689   PetscFunctionBegin;
2690   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2691   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2692   t  = a->solve_work;
2693 
2694   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2695   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2696 
2697   /* forward solve the lower triangular */
2698   idx    = 3*(*r++);
2699   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2700   for (i=1; i<n; i++) {
2701     v     = aa + 9*ai[i];
2702     vi    = aj + ai[i];
2703     nz    = diag[i] - ai[i];
2704     idx   = 3*(*r++);
2705     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2706     while (nz--) {
2707       idx   = 3*(*vi++);
2708       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2709       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2710       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2711       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2712       v += 9;
2713     }
2714     idx = 3*i;
2715     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2716   }
2717   /* backward solve the upper triangular */
2718   for (i=n-1; i>=0; i--){
2719     v    = aa + 9*diag[i] + 9;
2720     vi   = aj + diag[i] + 1;
2721     nz   = ai[i+1] - diag[i] - 1;
2722     idt  = 3*i;
2723     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2724     while (nz--) {
2725       idx   = 3*(*vi++);
2726       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2727       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2728       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2729       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2730       v += 9;
2731     }
2732     idc = 3*(*c--);
2733     v   = aa + 9*diag[i];
2734     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2735     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2736     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2737   }
2738   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2739   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2740   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2742   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->n);CHKERRQ(ierr);
2743   PetscFunctionReturn(0);
2744 }
2745 
2746 /*
2747       Special case where the matrix was ILU(0) factored in the natural
2748    ordering. This eliminates the need for the column and row permutation.
2749 */
2750 #undef __FUNCT__
2751 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2752 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2753 {
2754   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2755   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2756   PetscErrorCode ierr;
2757   PetscInt       *diag = a->diag;
2758   MatScalar      *aa=a->a,*v;
2759   PetscScalar    *x,*b,s1,s2,s3,x1,x2,x3;
2760   PetscInt       jdx,idt,idx,nz,*vi,i;
2761 
2762   PetscFunctionBegin;
2763   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2764   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2765 
2766 
2767   /* forward solve the lower triangular */
2768   idx    = 0;
2769   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2770   for (i=1; i<n; i++) {
2771     v     =  aa      + 9*ai[i];
2772     vi    =  aj      + ai[i];
2773     nz    =  diag[i] - ai[i];
2774     idx   +=  3;
2775     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2776     while (nz--) {
2777       jdx   = 3*(*vi++);
2778       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2779       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2780       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2781       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2782       v    += 9;
2783     }
2784     x[idx]   = s1;
2785     x[1+idx] = s2;
2786     x[2+idx] = s3;
2787   }
2788   /* backward solve the upper triangular */
2789   for (i=n-1; i>=0; i--){
2790     v    = aa + 9*diag[i] + 9;
2791     vi   = aj + diag[i] + 1;
2792     nz   = ai[i+1] - diag[i] - 1;
2793     idt  = 3*i;
2794     s1 = x[idt];  s2 = x[1+idt];
2795     s3 = x[2+idt];
2796     while (nz--) {
2797       idx   = 3*(*vi++);
2798       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2799       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2800       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2801       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2802       v    += 9;
2803     }
2804     v        = aa +  9*diag[i];
2805     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2806     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2807     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2808   }
2809 
2810   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2811   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2812   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->n);CHKERRQ(ierr);
2813   PetscFunctionReturn(0);
2814 }
2815 
2816 #undef __FUNCT__
2817 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2818 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2819 {
2820   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2821   IS             iscol=a->col,isrow=a->row;
2822   PetscErrorCode ierr;
2823   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2824   PetscInt       *diag = a->diag;
2825   MatScalar      *aa=a->a,*v;
2826   PetscScalar    *x,*b,s1,s2,x1,x2,*t;
2827 
2828   PetscFunctionBegin;
2829   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2830   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2831   t  = a->solve_work;
2832 
2833   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2834   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2835 
2836   /* forward solve the lower triangular */
2837   idx    = 2*(*r++);
2838   t[0] = b[idx]; t[1] = b[1+idx];
2839   for (i=1; i<n; i++) {
2840     v     = aa + 4*ai[i];
2841     vi    = aj + ai[i];
2842     nz    = diag[i] - ai[i];
2843     idx   = 2*(*r++);
2844     s1  = b[idx]; s2 = b[1+idx];
2845     while (nz--) {
2846       idx   = 2*(*vi++);
2847       x1    = t[idx]; x2 = t[1+idx];
2848       s1 -= v[0]*x1 + v[2]*x2;
2849       s2 -= v[1]*x1 + v[3]*x2;
2850       v += 4;
2851     }
2852     idx = 2*i;
2853     t[idx] = s1; t[1+idx] = s2;
2854   }
2855   /* backward solve the upper triangular */
2856   for (i=n-1; i>=0; i--){
2857     v    = aa + 4*diag[i] + 4;
2858     vi   = aj + diag[i] + 1;
2859     nz   = ai[i+1] - diag[i] - 1;
2860     idt  = 2*i;
2861     s1 = t[idt]; s2 = t[1+idt];
2862     while (nz--) {
2863       idx   = 2*(*vi++);
2864       x1    = t[idx]; x2 = t[1+idx];
2865       s1 -= v[0]*x1 + v[2]*x2;
2866       s2 -= v[1]*x1 + v[3]*x2;
2867       v += 4;
2868     }
2869     idc = 2*(*c--);
2870     v   = aa + 4*diag[i];
2871     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2872     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2873   }
2874   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2875   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2876   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2877   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2878   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->n);CHKERRQ(ierr);
2879   PetscFunctionReturn(0);
2880 }
2881 
2882 /*
2883       Special case where the matrix was ILU(0) factored in the natural
2884    ordering. This eliminates the need for the column and row permutation.
2885 */
2886 #undef __FUNCT__
2887 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2888 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2889 {
2890   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2891   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2892   PetscErrorCode ierr;
2893   PetscInt       *diag = a->diag;
2894   MatScalar      *aa=a->a,*v;
2895   PetscScalar    *x,*b,s1,s2,x1,x2;
2896   PetscInt       jdx,idt,idx,nz,*vi,i;
2897 
2898   PetscFunctionBegin;
2899   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2900   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2901 
2902   /* forward solve the lower triangular */
2903   idx    = 0;
2904   x[0]   = b[0]; x[1] = b[1];
2905   for (i=1; i<n; i++) {
2906     v     =  aa      + 4*ai[i];
2907     vi    =  aj      + ai[i];
2908     nz    =  diag[i] - ai[i];
2909     idx   +=  2;
2910     s1  =  b[idx];s2 = b[1+idx];
2911     while (nz--) {
2912       jdx   = 2*(*vi++);
2913       x1    = x[jdx];x2 = x[1+jdx];
2914       s1 -= v[0]*x1 + v[2]*x2;
2915       s2 -= v[1]*x1 + v[3]*x2;
2916       v    += 4;
2917     }
2918     x[idx]   = s1;
2919     x[1+idx] = s2;
2920   }
2921   /* backward solve the upper triangular */
2922   for (i=n-1; i>=0; i--){
2923     v    = aa + 4*diag[i] + 4;
2924     vi   = aj + diag[i] + 1;
2925     nz   = ai[i+1] - diag[i] - 1;
2926     idt  = 2*i;
2927     s1 = x[idt];  s2 = x[1+idt];
2928     while (nz--) {
2929       idx   = 2*(*vi++);
2930       x1    = x[idx];   x2 = x[1+idx];
2931       s1 -= v[0]*x1 + v[2]*x2;
2932       s2 -= v[1]*x1 + v[3]*x2;
2933       v    += 4;
2934     }
2935     v        = aa +  4*diag[i];
2936     x[idt]   = v[0]*s1 + v[2]*s2;
2937     x[1+idt] = v[1]*s1 + v[3]*s2;
2938   }
2939 
2940   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2941   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2942   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->n);CHKERRQ(ierr);
2943   PetscFunctionReturn(0);
2944 }
2945 
2946 #undef __FUNCT__
2947 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2948 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2949 {
2950   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2951   IS             iscol=a->col,isrow=a->row;
2952   PetscErrorCode ierr;
2953   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2954   PetscInt       *diag = a->diag;
2955   MatScalar      *aa=a->a,*v;
2956   PetscScalar    *x,*b,s1,*t;
2957 
2958   PetscFunctionBegin;
2959   if (!n) PetscFunctionReturn(0);
2960 
2961   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2962   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2963   t  = a->solve_work;
2964 
2965   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2966   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2967 
2968   /* forward solve the lower triangular */
2969   t[0] = b[*r++];
2970   for (i=1; i<n; i++) {
2971     v     = aa + ai[i];
2972     vi    = aj + ai[i];
2973     nz    = diag[i] - ai[i];
2974     s1  = b[*r++];
2975     while (nz--) {
2976       s1 -= (*v++)*t[*vi++];
2977     }
2978     t[i] = s1;
2979   }
2980   /* backward solve the upper triangular */
2981   for (i=n-1; i>=0; i--){
2982     v    = aa + diag[i] + 1;
2983     vi   = aj + diag[i] + 1;
2984     nz   = ai[i+1] - diag[i] - 1;
2985     s1 = t[i];
2986     while (nz--) {
2987       s1 -= (*v++)*t[*vi++];
2988     }
2989     x[*c--] = t[i] = aa[diag[i]]*s1;
2990   }
2991 
2992   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2993   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2994   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2995   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2996   ierr = PetscLogFlops(2*1*(a->nz) - A->n);CHKERRQ(ierr);
2997   PetscFunctionReturn(0);
2998 }
2999 /*
3000       Special case where the matrix was ILU(0) factored in the natural
3001    ordering. This eliminates the need for the column and row permutation.
3002 */
3003 #undef __FUNCT__
3004 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3005 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3006 {
3007   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3008   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3009   PetscErrorCode ierr;
3010   PetscInt       *diag = a->diag;
3011   MatScalar      *aa=a->a;
3012   PetscScalar    *x,*b;
3013   PetscScalar    s1,x1;
3014   MatScalar      *v;
3015   PetscInt       jdx,idt,idx,nz,*vi,i;
3016 
3017   PetscFunctionBegin;
3018   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3019   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3020 
3021   /* forward solve the lower triangular */
3022   idx    = 0;
3023   x[0]   = b[0];
3024   for (i=1; i<n; i++) {
3025     v     =  aa      + ai[i];
3026     vi    =  aj      + ai[i];
3027     nz    =  diag[i] - ai[i];
3028     idx   +=  1;
3029     s1  =  b[idx];
3030     while (nz--) {
3031       jdx   = *vi++;
3032       x1    = x[jdx];
3033       s1 -= v[0]*x1;
3034       v    += 1;
3035     }
3036     x[idx]   = s1;
3037   }
3038   /* backward solve the upper triangular */
3039   for (i=n-1; i>=0; i--){
3040     v    = aa + diag[i] + 1;
3041     vi   = aj + diag[i] + 1;
3042     nz   = ai[i+1] - diag[i] - 1;
3043     idt  = i;
3044     s1 = x[idt];
3045     while (nz--) {
3046       idx   = *vi++;
3047       x1    = x[idx];
3048       s1 -= v[0]*x1;
3049       v    += 1;
3050     }
3051     v        = aa +  diag[i];
3052     x[idt]   = v[0]*s1;
3053   }
3054   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3055   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3056   ierr = PetscLogFlops(2*(a->nz) - A->n);CHKERRQ(ierr);
3057   PetscFunctionReturn(0);
3058 }
3059 
3060 /* ----------------------------------------------------------------*/
3061 /*
3062      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3063    except that the data structure of Mat_SeqAIJ is slightly different.
3064    Not a good example of code reuse.
3065 */
3066 EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat);
3067 
3068 #undef __FUNCT__
3069 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3070 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3071 {
3072   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3073   IS             isicol;
3074   PetscErrorCode ierr;
3075   PetscInt       *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3076   PetscInt       *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3077   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3078   PetscInt       incrlev,nnz,i,bs = A->bs,bs2 = a->bs2,levels,diagonal_fill;
3079   PetscTruth     col_identity,row_identity;
3080   PetscReal      f;
3081 
3082   PetscFunctionBegin;
3083   f             = info->fill;
3084   levels        = (PetscInt)info->levels;
3085   diagonal_fill = (PetscInt)info->diagonal_fill;
3086   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3087   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3088   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3089 
3090   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3091     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3092     (*fact)->factor = FACTOR_LU;
3093     b               = (Mat_SeqBAIJ*)(*fact)->data;
3094     if (!b->diag) {
3095       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3096     }
3097     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3098     b->row        = isrow;
3099     b->col        = iscol;
3100     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3101     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3102     b->icol       = isicol;
3103     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3104     ierr          = PetscMalloc(((*fact)->m+1+(*fact)->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3105   } else { /* general case perform the symbolic factorization */
3106     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3107     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3108 
3109     /* get new row pointers */
3110     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
3111     ainew[0] = 0;
3112     /* don't know how many column pointers are needed so estimate */
3113     jmax = (PetscInt)(f*ai[n] + 1);
3114     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
3115     /* ajfill is level of fill for each fill entry */
3116     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
3117     /* fill is a linked list of nonzeros in active row */
3118     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
3119     /* im is level for each filled value */
3120     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
3121     /* dloc is location of diagonal in factor */
3122     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
3123     dloc[0]  = 0;
3124     for (prow=0; prow<n; prow++) {
3125 
3126       /* copy prow into linked list */
3127       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3128       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3129       xi         = aj + ai[r[prow]];
3130       fill[n]    = n;
3131       fill[prow] = -1; /* marker for diagonal entry */
3132       while (nz--) {
3133 	fm  = n;
3134 	idx = ic[*xi++];
3135 	do {
3136 	  m  = fm;
3137 	  fm = fill[m];
3138 	} while (fm < idx);
3139 	fill[m]   = idx;
3140 	fill[idx] = fm;
3141 	im[idx]   = 0;
3142       }
3143 
3144       /* make sure diagonal entry is included */
3145       if (diagonal_fill && fill[prow] == -1) {
3146 	fm = n;
3147 	while (fill[fm] < prow) fm = fill[fm];
3148 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3149 	fill[fm]   = prow;
3150 	im[prow]   = 0;
3151 	nzf++;
3152 	dcount++;
3153       }
3154 
3155       nzi = 0;
3156       row = fill[n];
3157       while (row < prow) {
3158 	incrlev = im[row] + 1;
3159 	nz      = dloc[row];
3160 	xi      = ajnew  + ainew[row] + nz + 1;
3161 	flev    = ajfill + ainew[row] + nz + 1;
3162 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3163 	fm      = row;
3164 	while (nnz-- > 0) {
3165 	  idx = *xi++;
3166 	  if (*flev + incrlev > levels) {
3167 	    flev++;
3168 	    continue;
3169 	  }
3170 	  do {
3171 	    m  = fm;
3172 	    fm = fill[m];
3173 	  } while (fm < idx);
3174 	  if (fm != idx) {
3175 	    im[idx]   = *flev + incrlev;
3176 	    fill[m]   = idx;
3177 	    fill[idx] = fm;
3178 	    fm        = idx;
3179 	    nzf++;
3180 	  } else {
3181 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3182 	  }
3183 	  flev++;
3184 	}
3185 	row = fill[row];
3186 	nzi++;
3187       }
3188       /* copy new filled row into permanent storage */
3189       ainew[prow+1] = ainew[prow] + nzf;
3190       if (ainew[prow+1] > jmax) {
3191 
3192 	/* estimate how much additional space we will need */
3193 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3194 	/* just double the memory each time */
3195 	PetscInt maxadd = jmax;
3196 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3197 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3198 	jmax += maxadd;
3199 
3200 	/* allocate a longer ajnew and ajfill */
3201 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr);
3202 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3203 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3204 	ajnew = xi;
3205 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr);
3206 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3207 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3208 	ajfill = xi;
3209 	reallocate++; /* count how many reallocations are needed */
3210       }
3211       xi          = ajnew + ainew[prow];
3212       flev        = ajfill + ainew[prow];
3213       dloc[prow]  = nzi;
3214       fm          = fill[n];
3215       while (nzf--) {
3216 	*xi++   = fm;
3217 	*flev++ = im[fm];
3218 	fm      = fill[fm];
3219       }
3220       /* make sure row has diagonal entry */
3221       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3222 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3223     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3224       }
3225     }
3226     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3227     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3228     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3229     ierr = PetscFree(fill);CHKERRQ(ierr);
3230     ierr = PetscFree(im);CHKERRQ(ierr);
3231 
3232 #if defined(PETSC_USE_VERBOSE)
3233     {
3234       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3235       ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocate,f,af));CHKERRQ(ierr);
3236       ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af));CHKERRQ(ierr);
3237       ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af));CHKERRQ(ierr);
3238       ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"));CHKERRQ(ierr);
3239       if (diagonal_fill) {
3240 	ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %D missing diagonals\n",dcount));CHKERRQ(ierr);
3241       }
3242     }
3243 #endif
3244 
3245     /* put together the new matrix */
3246     ierr = MatCreate(A->comm,fact);CHKERRQ(ierr);
3247     ierr = MatSetSizes(*fact,bs*n,bs*n,bs*n,bs*n);CHKERRQ(ierr);
3248     ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr);
3249     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3250     ierr = PetscLogObjectParent(*fact,isicol);CHKERRQ(ierr);
3251     b    = (Mat_SeqBAIJ*)(*fact)->data;
3252     b->singlemalloc = PETSC_FALSE;
3253     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
3254     b->j          = ajnew;
3255     b->i          = ainew;
3256     for (i=0; i<n; i++) dloc[i] += ainew[i];
3257     b->diag       = dloc;
3258     b->ilen       = 0;
3259     b->imax       = 0;
3260     b->row        = isrow;
3261     b->col        = iscol;
3262     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3263     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3264     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3265     b->icol       = isicol;
3266     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3267     /* In b structure:  Free imax, ilen, old a, old j.
3268        Allocate dloc, solve_work, new a, new j */
3269     ierr = PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
3270     b->maxnz          = b->nz = ainew[n];
3271     (*fact)->factor   = FACTOR_LU;
3272 
3273     (*fact)->info.factor_mallocs    = reallocate;
3274     (*fact)->info.fill_ratio_given  = f;
3275     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3276   }
3277 
3278   if (row_identity && col_identity) {
3279     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
3280   }
3281   PetscFunctionReturn(0);
3282 }
3283 
3284 #undef __FUNCT__
3285 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3286 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3287 {
3288   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3289   /* int i,*AJ=a->j,nz=a->nz; */
3290   PetscFunctionBegin;
3291   /* Undo Column scaling */
3292 /*    while (nz--) { */
3293 /*      AJ[i] = AJ[i]/4; */
3294 /*    } */
3295   /* This should really invoke a push/pop logic, but we don't have that yet. */
3296   A->ops->setunfactored = PETSC_NULL;
3297   PetscFunctionReturn(0);
3298 }
3299 
3300 #undef __FUNCT__
3301 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3302 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3303 {
3304   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3305   PetscInt       *AJ=a->j,nz=a->nz;
3306   unsigned short *aj=(unsigned short *)AJ;
3307   PetscFunctionBegin;
3308   /* Is this really necessary? */
3309   while (nz--) {
3310     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3311   }
3312   A->ops->setunfactored = PETSC_NULL;
3313   PetscFunctionReturn(0);
3314 }
3315 
3316 #undef __FUNCT__
3317 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3318 PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3319 {
3320   PetscErrorCode ierr;
3321   /*
3322       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3323       with natural ordering
3324   */
3325   PetscFunctionBegin;
3326   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3327   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3328   switch (inA->bs) {
3329   case 1:
3330     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3331     ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"));CHKERRQ(ierr);
3332     break;
3333   case 2:
3334     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3335     ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"));CHKERRQ(ierr);
3336     break;
3337   case 3:
3338     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3339     ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"));CHKERRQ(ierr);
3340     break;
3341   case 4:
3342 #if defined(PETSC_USE_MAT_SINGLE)
3343     {
3344       PetscTruth  sse_enabled_local;
3345       PetscErrorCode ierr;
3346       ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3347       if (sse_enabled_local) {
3348 #  if defined(PETSC_HAVE_SSE)
3349         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3350         if (n==(unsigned short)n) {
3351           unsigned short *aj=(unsigned short *)AJ;
3352           for (i=0;i<nz;i++) {
3353             aj[i] = (unsigned short)AJ[i];
3354           }
3355           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3356           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3357           ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"));CHKERRQ(ierr);
3358         } else {
3359         /* Scale the column indices for easier indexing in MatSolve. */
3360 /*            for (i=0;i<nz;i++) { */
3361 /*              AJ[i] = AJ[i]*4; */
3362 /*            } */
3363           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3364           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3365           ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"));CHKERRQ(ierr);
3366         }
3367 #  else
3368       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3369         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3370 #  endif
3371       } else {
3372         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3373         ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"));CHKERRQ(ierr);
3374       }
3375     }
3376 #else
3377     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3378     ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"));CHKERRQ(ierr);
3379 #endif
3380     break;
3381   case 5:
3382     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3383     ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"));CHKERRQ(ierr);
3384     break;
3385   case 6:
3386     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3387     ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"));CHKERRQ(ierr);
3388     break;
3389   case 7:
3390     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3391     ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"));CHKERRQ(ierr);
3392     break;
3393   }
3394   PetscFunctionReturn(0);
3395 }
3396 
3397 #undef __FUNCT__
3398 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3399 PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3400 {
3401   /*
3402       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3403       with natural ordering
3404   */
3405   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
3406   IS             row = a->row, col = a->col;
3407   PetscTruth     row_identity, col_identity;
3408   PetscTruth     use_natural;
3409   PetscErrorCode ierr;
3410 
3411   PetscFunctionBegin;
3412 
3413   use_natural = PETSC_FALSE;
3414   if (row && col) {
3415     ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3416     ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3417 
3418     if (row_identity && col_identity) {
3419       use_natural = PETSC_TRUE;
3420     }
3421   } else {
3422     use_natural = PETSC_TRUE;
3423   }
3424 
3425   switch (A->bs) {
3426   case 1:
3427     if (use_natural) {
3428       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3429       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3430       ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"));CHKERRQ(ierr);
3431       ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"));CHKERRQ(ierr);
3432     } else {
3433       A->ops->solve           = MatSolve_SeqBAIJ_1;
3434       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3435     }
3436     break;
3437   case 2:
3438     if (use_natural) {
3439       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3440       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3441       ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"));CHKERRQ(ierr);
3442       ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"));CHKERRQ(ierr);
3443     } else {
3444       A->ops->solve           = MatSolve_SeqBAIJ_2;
3445       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3446     }
3447     break;
3448   case 3:
3449     if (use_natural) {
3450       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3451       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3452       ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"));CHKERRQ(ierr);
3453       ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"));CHKERRQ(ierr);
3454     } else {
3455       A->ops->solve           = MatSolve_SeqBAIJ_3;
3456       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3457     }
3458     break;
3459   case 4:
3460     {
3461       PetscTruth sse_enabled_local;
3462       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3463       if (use_natural) {
3464 #if defined(PETSC_USE_MAT_SINGLE)
3465         if (sse_enabled_local) { /* Natural + Single + SSE */
3466 #  if defined(PETSC_HAVE_SSE)
3467           PetscInt n=a->mbs;
3468           if (n==(unsigned short)n) {
3469             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3470             ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"));CHKERRQ(ierr);
3471           } else {
3472             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3473             ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"));CHKERRQ(ierr);
3474           }
3475 #  else
3476           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3477           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3478 #  endif
3479         } else { /* Natural + Single */
3480           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3481           ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"));CHKERRQ(ierr);
3482         }
3483 #else
3484         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3485         ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"));CHKERRQ(ierr);
3486 #endif
3487         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3488         ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"));CHKERRQ(ierr);
3489       } else { /* Arbitrary ordering */
3490 #if defined(PETSC_USE_MAT_SINGLE)
3491         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3492 #  if defined(PETSC_HAVE_SSE)
3493           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3494           ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"));CHKERRQ(ierr);
3495 #  else
3496           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3497           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3498 #  endif
3499         } else { /* Arbitrary + Single */
3500           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3501           ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"));CHKERRQ(ierr);
3502         }
3503 #else
3504         A->ops->solve           = MatSolve_SeqBAIJ_4;
3505 #endif
3506         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3507       }
3508     }
3509     break;
3510   case 5:
3511     if (use_natural) {
3512       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3513       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3514       ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"));CHKERRQ(ierr);
3515       ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"));CHKERRQ(ierr);
3516     } else {
3517       A->ops->solve           = MatSolve_SeqBAIJ_5;
3518       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3519     }
3520     break;
3521   case 6:
3522     if (use_natural) {
3523       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3524       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3525       ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"));CHKERRQ(ierr);
3526       ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"));CHKERRQ(ierr);
3527     } else {
3528       A->ops->solve           = MatSolve_SeqBAIJ_6;
3529       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3530     }
3531     break;
3532   case 7:
3533     if (use_natural) {
3534       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3535       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3536       ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"));CHKERRQ(ierr);
3537       ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"));CHKERRQ(ierr);
3538     } else {
3539       A->ops->solve           = MatSolve_SeqBAIJ_7;
3540       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3541     }
3542     break;
3543   default:
3544     A->ops->solve             = MatSolve_SeqBAIJ_N;
3545     break;
3546   }
3547   PetscFunctionReturn(0);
3548 }
3549 
3550 #undef __FUNCT__
3551 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3552 PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3553 {
3554   PetscErrorCode ierr;
3555 
3556   PetscFunctionBegin;
3557   ierr = MatSeqBAIJ_UpdateSolvers(A);
3558   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3559     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3560   } else {
3561     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3562   }
3563   PetscFunctionReturn(0);
3564 }
3565 
3566 #undef __FUNCT__
3567 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3568 PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3569 {
3570   PetscErrorCode ierr;
3571 
3572   PetscFunctionBegin;
3573   ierr = MatSeqBAIJ_UpdateSolvers(A);
3574   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3575   PetscFunctionReturn(0);
3576 }
3577 
3578 
3579