xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision ce0a2cd1da0658c2b28aad1be2e2c8e41567bece)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16   PetscErrorCode ierr;
17   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18   PetscInt       *diag = a->diag;
19   MatScalar      *aa=a->a,*v;
20   PetscScalar    s1,*x,*b;
21 
22   PetscFunctionBegin;
23   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
24   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26 
27   /* forward solve the U^T */
28   for (i=0; i<n; i++) {
29 
30     v     = aa + diag[i];
31     /* multiply by the inverse of the block diagonal */
32     s1    = (*v++)*x[i];
33     vi    = aj + diag[i] + 1;
34     nz    = ai[i+1] - diag[i] - 1;
35     while (nz--) {
36       x[*vi++]  -= (*v++)*s1;
37     }
38     x[i]   = s1;
39   }
40   /* backward solve the L^T */
41   for (i=n-1; i>=0; i--){
42     v    = aa + diag[i] - 1;
43     vi   = aj + diag[i] - 1;
44     nz   = diag[i] - ai[i];
45     s1   = x[i];
46     while (nz--) {
47       x[*vi--]   -=  (*v--)*s1;
48     }
49   }
50   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
51   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52   ierr = PetscLogFlops(2*(a->nz) - A->cmap.n);CHKERRQ(ierr);
53   PetscFunctionReturn(0);
54 }
55 
56 #undef __FUNCT__
57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59 {
60   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61   PetscErrorCode ierr;
62   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63   PetscInt       *diag = a->diag,oidx;
64   MatScalar      *aa=a->a,*v;
65   PetscScalar    s1,s2,x1,x2;
66   PetscScalar    *x,*b;
67 
68   PetscFunctionBegin;
69   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
70   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
71   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72 
73   /* forward solve the U^T */
74   idx = 0;
75   for (i=0; i<n; i++) {
76 
77     v     = aa + 4*diag[i];
78     /* multiply by the inverse of the block diagonal */
79     x1 = x[idx];   x2 = x[1+idx];
80     s1 = v[0]*x1  +  v[1]*x2;
81     s2 = v[2]*x1  +  v[3]*x2;
82     v += 4;
83 
84     vi    = aj + diag[i] + 1;
85     nz    = ai[i+1] - diag[i] - 1;
86     while (nz--) {
87       oidx = 2*(*vi++);
88       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90       v  += 4;
91     }
92     x[idx]   = s1;x[1+idx] = s2;
93     idx += 2;
94   }
95   /* backward solve the L^T */
96   for (i=n-1; i>=0; i--){
97     v    = aa + 4*diag[i] - 4;
98     vi   = aj + diag[i] - 1;
99     nz   = diag[i] - ai[i];
100     idt  = 2*i;
101     s1   = x[idt];  s2 = x[1+idt];
102     while (nz--) {
103       idx   = 2*(*vi--);
104       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106       v -= 4;
107     }
108   }
109   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
110   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr);
112   PetscFunctionReturn(0);
113 }
114 
115 #undef __FUNCT__
116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118 {
119   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120   PetscErrorCode ierr;
121   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122   PetscInt       *diag = a->diag,oidx;
123   MatScalar      *aa=a->a,*v;
124   PetscScalar    s1,s2,s3,x1,x2,x3;
125   PetscScalar    *x,*b;
126 
127   PetscFunctionBegin;
128   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
129   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
130   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131 
132   /* forward solve the U^T */
133   idx = 0;
134   for (i=0; i<n; i++) {
135 
136     v     = aa + 9*diag[i];
137     /* multiply by the inverse of the block diagonal */
138     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142     v += 9;
143 
144     vi    = aj + diag[i] + 1;
145     nz    = ai[i+1] - diag[i] - 1;
146     while (nz--) {
147       oidx = 3*(*vi++);
148       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151       v  += 9;
152     }
153     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154     idx += 3;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + 9*diag[i] - 9;
159     vi   = aj + diag[i] - 1;
160     nz   = diag[i] - ai[i];
161     idt  = 3*i;
162     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163     while (nz--) {
164       idx   = 3*(*vi--);
165       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168       v -= 9;
169     }
170   }
171   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
172   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr);
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180 {
181   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182   PetscErrorCode ierr;
183   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184   PetscInt       *diag = a->diag,oidx;
185   MatScalar      *aa=a->a,*v;
186   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
187   PetscScalar    *x,*b;
188 
189   PetscFunctionBegin;
190   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
191   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
192   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193 
194   /* forward solve the U^T */
195   idx = 0;
196   for (i=0; i<n; i++) {
197 
198     v     = aa + 16*diag[i];
199     /* multiply by the inverse of the block diagonal */
200     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205     v += 16;
206 
207     vi    = aj + diag[i] + 1;
208     nz    = ai[i+1] - diag[i] - 1;
209     while (nz--) {
210       oidx = 4*(*vi++);
211       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215       v  += 16;
216     }
217     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218     idx += 4;
219   }
220   /* backward solve the L^T */
221   for (i=n-1; i>=0; i--){
222     v    = aa + 16*diag[i] - 16;
223     vi   = aj + diag[i] - 1;
224     nz   = diag[i] - ai[i];
225     idt  = 4*i;
226     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227     while (nz--) {
228       idx   = 4*(*vi--);
229       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233       v -= 16;
234     }
235   }
236   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
237   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 #undef __FUNCT__
243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245 {
246   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247   PetscErrorCode ierr;
248   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249   PetscInt       *diag = a->diag,oidx;
250   MatScalar      *aa=a->a,*v;
251   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252   PetscScalar    *x,*b;
253 
254   PetscFunctionBegin;
255   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258 
259   /* forward solve the U^T */
260   idx = 0;
261   for (i=0; i<n; i++) {
262 
263     v     = aa + 25*diag[i];
264     /* multiply by the inverse of the block diagonal */
265     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271     v += 25;
272 
273     vi    = aj + diag[i] + 1;
274     nz    = ai[i+1] - diag[i] - 1;
275     while (nz--) {
276       oidx = 5*(*vi++);
277       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282       v  += 25;
283     }
284     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285     idx += 5;
286   }
287   /* backward solve the L^T */
288   for (i=n-1; i>=0; i--){
289     v    = aa + 25*diag[i] - 25;
290     vi   = aj + diag[i] - 1;
291     nz   = diag[i] - ai[i];
292     idt  = 5*i;
293     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294     while (nz--) {
295       idx   = 5*(*vi--);
296       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301       v -= 25;
302     }
303   }
304   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
305   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr);
307   PetscFunctionReturn(0);
308 }
309 
310 #undef __FUNCT__
311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313 {
314   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315   PetscErrorCode ierr;
316   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317   PetscInt       *diag = a->diag,oidx;
318   MatScalar      *aa=a->a,*v;
319   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320   PetscScalar    *x,*b;
321 
322   PetscFunctionBegin;
323   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
324   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
325   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326 
327   /* forward solve the U^T */
328   idx = 0;
329   for (i=0; i<n; i++) {
330 
331     v     = aa + 36*diag[i];
332     /* multiply by the inverse of the block diagonal */
333     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334     x6    = x[5+idx];
335     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341     v += 36;
342 
343     vi    = aj + diag[i] + 1;
344     nz    = ai[i+1] - diag[i] - 1;
345     while (nz--) {
346       oidx = 6*(*vi++);
347       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353       v  += 36;
354     }
355     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356     x[5+idx] = s6;
357     idx += 6;
358   }
359   /* backward solve the L^T */
360   for (i=n-1; i>=0; i--){
361     v    = aa + 36*diag[i] - 36;
362     vi   = aj + diag[i] - 1;
363     nz   = diag[i] - ai[i];
364     idt  = 6*i;
365     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366     s6 = x[5+idt];
367     while (nz--) {
368       idx   = 6*(*vi--);
369       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375       v -= 36;
376     }
377   }
378   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
379   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr);
381   PetscFunctionReturn(0);
382 }
383 
384 #undef __FUNCT__
385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387 {
388   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389   PetscErrorCode ierr;
390   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391   PetscInt       *diag = a->diag,oidx;
392   MatScalar      *aa=a->a,*v;
393   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394   PetscScalar    *x,*b;
395 
396   PetscFunctionBegin;
397   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
398   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
399   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400 
401   /* forward solve the U^T */
402   idx = 0;
403   for (i=0; i<n; i++) {
404 
405     v     = aa + 49*diag[i];
406     /* multiply by the inverse of the block diagonal */
407     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408     x6    = x[5+idx]; x7 = x[6+idx];
409     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416     v += 49;
417 
418     vi    = aj + diag[i] + 1;
419     nz    = ai[i+1] - diag[i] - 1;
420     while (nz--) {
421       oidx = 7*(*vi++);
422       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429       v  += 49;
430     }
431     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432     x[5+idx] = s6;x[6+idx] = s7;
433     idx += 7;
434   }
435   /* backward solve the L^T */
436   for (i=n-1; i>=0; i--){
437     v    = aa + 49*diag[i] - 49;
438     vi   = aj + diag[i] - 1;
439     nz   = diag[i] - ai[i];
440     idt  = 7*i;
441     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442     s6 = x[5+idt];s7 = x[6+idt];
443     while (nz--) {
444       idx   = 7*(*vi--);
445       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452       v -= 49;
453     }
454   }
455   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);CHKERRQ(ierr);
458   PetscFunctionReturn(0);
459 }
460 
461 /*---------------------------------------------------------------------------------------------*/
462 #undef __FUNCT__
463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465 {
466   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467   IS             iscol=a->col,isrow=a->row;
468   PetscErrorCode ierr;
469   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
470   PetscInt       *diag = a->diag;
471   MatScalar      *aa=a->a,*v;
472   PetscScalar    s1,*x,*b,*t;
473 
474   PetscFunctionBegin;
475   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
476   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
477   t  = a->solve_work;
478 
479   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
480   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
481 
482   /* copy the b into temp work space according to permutation */
483   for (i=0; i<n; i++) {
484     t[i] = b[c[i]];
485   }
486 
487   /* forward solve the U^T */
488   for (i=0; i<n; i++) {
489 
490     v     = aa + diag[i];
491     /* multiply by the inverse of the block diagonal */
492     s1    = (*v++)*t[i];
493     vi    = aj + diag[i] + 1;
494     nz    = ai[i+1] - diag[i] - 1;
495     while (nz--) {
496       t[*vi++]  -= (*v++)*s1;
497     }
498     t[i]   = s1;
499   }
500   /* backward solve the L^T */
501   for (i=n-1; i>=0; i--){
502     v    = aa + diag[i] - 1;
503     vi   = aj + diag[i] - 1;
504     nz   = diag[i] - ai[i];
505     s1   = t[i];
506     while (nz--) {
507       t[*vi--]   -=  (*v--)*s1;
508     }
509   }
510 
511   /* copy t into x according to permutation */
512   for (i=0; i<n; i++) {
513     x[r[i]]   = t[i];
514   }
515 
516   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
517   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
518   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
519   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
520   ierr = PetscLogFlops(2*(a->nz) - A->cmap.n);CHKERRQ(ierr);
521   PetscFunctionReturn(0);
522 }
523 
524 #undef __FUNCT__
525 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
526 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527 {
528   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
529   IS             iscol=a->col,isrow=a->row;
530   PetscErrorCode ierr;
531   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
532   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
533   MatScalar      *aa=a->a,*v;
534   PetscScalar    s1,s2,x1,x2;
535   PetscScalar    *x,*b,*t;
536 
537   PetscFunctionBegin;
538   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
539   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
540   t  = a->solve_work;
541 
542   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
543   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
544 
545   /* copy the b into temp work space according to permutation */
546   ii = 0;
547   for (i=0; i<n; i++) {
548     ic      = 2*c[i];
549     t[ii]   = b[ic];
550     t[ii+1] = b[ic+1];
551     ii += 2;
552   }
553 
554   /* forward solve the U^T */
555   idx = 0;
556   for (i=0; i<n; i++) {
557 
558     v     = aa + 4*diag[i];
559     /* multiply by the inverse of the block diagonal */
560     x1    = t[idx];   x2 = t[1+idx];
561     s1 = v[0]*x1  +  v[1]*x2;
562     s2 = v[2]*x1  +  v[3]*x2;
563     v += 4;
564 
565     vi    = aj + diag[i] + 1;
566     nz    = ai[i+1] - diag[i] - 1;
567     while (nz--) {
568       oidx = 2*(*vi++);
569       t[oidx]   -= v[0]*s1  +  v[1]*s2;
570       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
571       v  += 4;
572     }
573     t[idx]   = s1;t[1+idx] = s2;
574     idx += 2;
575   }
576   /* backward solve the L^T */
577   for (i=n-1; i>=0; i--){
578     v    = aa + 4*diag[i] - 4;
579     vi   = aj + diag[i] - 1;
580     nz   = diag[i] - ai[i];
581     idt  = 2*i;
582     s1 = t[idt];  s2 = t[1+idt];
583     while (nz--) {
584       idx   = 2*(*vi--);
585       t[idx]   -=  v[0]*s1 +  v[1]*s2;
586       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
587       v -= 4;
588     }
589   }
590 
591   /* copy t into x according to permutation */
592   ii = 0;
593   for (i=0; i<n; i++) {
594     ir      = 2*r[i];
595     x[ir]   = t[ii];
596     x[ir+1] = t[ii+1];
597     ii += 2;
598   }
599 
600   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
601   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
602   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
603   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
604   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr);
605   PetscFunctionReturn(0);
606 }
607 
608 #undef __FUNCT__
609 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
610 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
611 {
612   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
613   IS             iscol=a->col,isrow=a->row;
614   PetscErrorCode ierr;
615   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
616   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
617   MatScalar      *aa=a->a,*v;
618   PetscScalar    s1,s2,s3,x1,x2,x3;
619   PetscScalar    *x,*b,*t;
620 
621   PetscFunctionBegin;
622   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
623   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
624   t  = a->solve_work;
625 
626   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
627   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
628 
629   /* copy the b into temp work space according to permutation */
630   ii = 0;
631   for (i=0; i<n; i++) {
632     ic      = 3*c[i];
633     t[ii]   = b[ic];
634     t[ii+1] = b[ic+1];
635     t[ii+2] = b[ic+2];
636     ii += 3;
637   }
638 
639   /* forward solve the U^T */
640   idx = 0;
641   for (i=0; i<n; i++) {
642 
643     v     = aa + 9*diag[i];
644     /* multiply by the inverse of the block diagonal */
645     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
646     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
647     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
648     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
649     v += 9;
650 
651     vi    = aj + diag[i] + 1;
652     nz    = ai[i+1] - diag[i] - 1;
653     while (nz--) {
654       oidx = 3*(*vi++);
655       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
656       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
657       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658       v  += 9;
659     }
660     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
661     idx += 3;
662   }
663   /* backward solve the L^T */
664   for (i=n-1; i>=0; i--){
665     v    = aa + 9*diag[i] - 9;
666     vi   = aj + diag[i] - 1;
667     nz   = diag[i] - ai[i];
668     idt  = 3*i;
669     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
670     while (nz--) {
671       idx   = 3*(*vi--);
672       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
673       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
674       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
675       v -= 9;
676     }
677   }
678 
679   /* copy t into x according to permutation */
680   ii = 0;
681   for (i=0; i<n; i++) {
682     ir      = 3*r[i];
683     x[ir]   = t[ii];
684     x[ir+1] = t[ii+1];
685     x[ir+2] = t[ii+2];
686     ii += 3;
687   }
688 
689   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
690   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
691   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
692   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
693   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr);
694   PetscFunctionReturn(0);
695 }
696 
697 #undef __FUNCT__
698 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
699 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
700 {
701   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
702   IS             iscol=a->col,isrow=a->row;
703   PetscErrorCode ierr;
704   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
705   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
706   MatScalar      *aa=a->a,*v;
707   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
708   PetscScalar    *x,*b,*t;
709 
710   PetscFunctionBegin;
711   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
712   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
713   t  = a->solve_work;
714 
715   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
716   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
717 
718   /* copy the b into temp work space according to permutation */
719   ii = 0;
720   for (i=0; i<n; i++) {
721     ic      = 4*c[i];
722     t[ii]   = b[ic];
723     t[ii+1] = b[ic+1];
724     t[ii+2] = b[ic+2];
725     t[ii+3] = b[ic+3];
726     ii += 4;
727   }
728 
729   /* forward solve the U^T */
730   idx = 0;
731   for (i=0; i<n; i++) {
732 
733     v     = aa + 16*diag[i];
734     /* multiply by the inverse of the block diagonal */
735     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
736     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
737     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
738     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
739     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
740     v += 16;
741 
742     vi    = aj + diag[i] + 1;
743     nz    = ai[i+1] - diag[i] - 1;
744     while (nz--) {
745       oidx = 4*(*vi++);
746       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750       v  += 16;
751     }
752     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
753     idx += 4;
754   }
755   /* backward solve the L^T */
756   for (i=n-1; i>=0; i--){
757     v    = aa + 16*diag[i] - 16;
758     vi   = aj + diag[i] - 1;
759     nz   = diag[i] - ai[i];
760     idt  = 4*i;
761     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
762     while (nz--) {
763       idx   = 4*(*vi--);
764       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
765       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
766       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
767       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
768       v -= 16;
769     }
770   }
771 
772   /* copy t into x according to permutation */
773   ii = 0;
774   for (i=0; i<n; i++) {
775     ir      = 4*r[i];
776     x[ir]   = t[ii];
777     x[ir+1] = t[ii+1];
778     x[ir+2] = t[ii+2];
779     x[ir+3] = t[ii+3];
780     ii += 4;
781   }
782 
783   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
784   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
785   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
786   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
787   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr);
788   PetscFunctionReturn(0);
789 }
790 
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796   IS             iscol=a->col,isrow=a->row;
797   PetscErrorCode ierr;
798   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
799   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
800   MatScalar      *aa=a->a,*v;
801   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
802   PetscScalar    *x,*b,*t;
803 
804   PetscFunctionBegin;
805   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
806   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
807   t  = a->solve_work;
808 
809   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
810   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
811 
812   /* copy the b into temp work space according to permutation */
813   ii = 0;
814   for (i=0; i<n; i++) {
815     ic      = 5*c[i];
816     t[ii]   = b[ic];
817     t[ii+1] = b[ic+1];
818     t[ii+2] = b[ic+2];
819     t[ii+3] = b[ic+3];
820     t[ii+4] = b[ic+4];
821     ii += 5;
822   }
823 
824   /* forward solve the U^T */
825   idx = 0;
826   for (i=0; i<n; i++) {
827 
828     v     = aa + 25*diag[i];
829     /* multiply by the inverse of the block diagonal */
830     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
831     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
832     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
833     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
834     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
835     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
836     v += 25;
837 
838     vi    = aj + diag[i] + 1;
839     nz    = ai[i+1] - diag[i] - 1;
840     while (nz--) {
841       oidx = 5*(*vi++);
842       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847       v  += 25;
848     }
849     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
850     idx += 5;
851   }
852   /* backward solve the L^T */
853   for (i=n-1; i>=0; i--){
854     v    = aa + 25*diag[i] - 25;
855     vi   = aj + diag[i] - 1;
856     nz   = diag[i] - ai[i];
857     idt  = 5*i;
858     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
859     while (nz--) {
860       idx   = 5*(*vi--);
861       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
862       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
863       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
864       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
865       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
866       v -= 25;
867     }
868   }
869 
870   /* copy t into x according to permutation */
871   ii = 0;
872   for (i=0; i<n; i++) {
873     ir      = 5*r[i];
874     x[ir]   = t[ii];
875     x[ir+1] = t[ii+1];
876     x[ir+2] = t[ii+2];
877     x[ir+3] = t[ii+3];
878     x[ir+4] = t[ii+4];
879     ii += 5;
880   }
881 
882   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
883   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
884   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
885   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
886   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr);
887   PetscFunctionReturn(0);
888 }
889 
890 #undef __FUNCT__
891 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
892 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
893 {
894   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
895   IS             iscol=a->col,isrow=a->row;
896   PetscErrorCode ierr;
897   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
898   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
899   MatScalar      *aa=a->a,*v;
900   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
901   PetscScalar    *x,*b,*t;
902 
903   PetscFunctionBegin;
904   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
905   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
906   t  = a->solve_work;
907 
908   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
909   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
910 
911   /* copy the b into temp work space according to permutation */
912   ii = 0;
913   for (i=0; i<n; i++) {
914     ic      = 6*c[i];
915     t[ii]   = b[ic];
916     t[ii+1] = b[ic+1];
917     t[ii+2] = b[ic+2];
918     t[ii+3] = b[ic+3];
919     t[ii+4] = b[ic+4];
920     t[ii+5] = b[ic+5];
921     ii += 6;
922   }
923 
924   /* forward solve the U^T */
925   idx = 0;
926   for (i=0; i<n; i++) {
927 
928     v     = aa + 36*diag[i];
929     /* multiply by the inverse of the block diagonal */
930     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
931     x6    = t[5+idx];
932     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
933     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
934     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
935     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
936     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
937     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
938     v += 36;
939 
940     vi    = aj + diag[i] + 1;
941     nz    = ai[i+1] - diag[i] - 1;
942     while (nz--) {
943       oidx = 6*(*vi++);
944       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
945       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
946       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
947       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
948       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
949       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
950       v  += 36;
951     }
952     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
953     t[5+idx] = s6;
954     idx += 6;
955   }
956   /* backward solve the L^T */
957   for (i=n-1; i>=0; i--){
958     v    = aa + 36*diag[i] - 36;
959     vi   = aj + diag[i] - 1;
960     nz   = diag[i] - ai[i];
961     idt  = 6*i;
962     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
963     s6 = t[5+idt];
964     while (nz--) {
965       idx   = 6*(*vi--);
966       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
967       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
968       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
969       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
970       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
971       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
972       v -= 36;
973     }
974   }
975 
976   /* copy t into x according to permutation */
977   ii = 0;
978   for (i=0; i<n; i++) {
979     ir      = 6*r[i];
980     x[ir]   = t[ii];
981     x[ir+1] = t[ii+1];
982     x[ir+2] = t[ii+2];
983     x[ir+3] = t[ii+3];
984     x[ir+4] = t[ii+4];
985     x[ir+5] = t[ii+5];
986     ii += 6;
987   }
988 
989   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
990   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
991   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
992   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
993   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr);
994   PetscFunctionReturn(0);
995 }
996 
997 #undef __FUNCT__
998 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
999 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1000 {
1001   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1002   IS             iscol=a->col,isrow=a->row;
1003   PetscErrorCode ierr;
1004   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1005   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1006   MatScalar      *aa=a->a,*v;
1007   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1008   PetscScalar    *x,*b,*t;
1009 
1010   PetscFunctionBegin;
1011   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1012   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1013   t  = a->solve_work;
1014 
1015   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1016   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1017 
1018   /* copy the b into temp work space according to permutation */
1019   ii = 0;
1020   for (i=0; i<n; i++) {
1021     ic      = 7*c[i];
1022     t[ii]   = b[ic];
1023     t[ii+1] = b[ic+1];
1024     t[ii+2] = b[ic+2];
1025     t[ii+3] = b[ic+3];
1026     t[ii+4] = b[ic+4];
1027     t[ii+5] = b[ic+5];
1028     t[ii+6] = b[ic+6];
1029     ii += 7;
1030   }
1031 
1032   /* forward solve the U^T */
1033   idx = 0;
1034   for (i=0; i<n; i++) {
1035 
1036     v     = aa + 49*diag[i];
1037     /* multiply by the inverse of the block diagonal */
1038     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1039     x6    = t[5+idx]; x7 = t[6+idx];
1040     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1041     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1042     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1043     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1044     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1045     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1046     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1047     v += 49;
1048 
1049     vi    = aj + diag[i] + 1;
1050     nz    = ai[i+1] - diag[i] - 1;
1051     while (nz--) {
1052       oidx = 7*(*vi++);
1053       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1054       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1055       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1056       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1057       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1058       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1059       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1060       v  += 49;
1061     }
1062     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1063     t[5+idx] = s6;t[6+idx] = s7;
1064     idx += 7;
1065   }
1066   /* backward solve the L^T */
1067   for (i=n-1; i>=0; i--){
1068     v    = aa + 49*diag[i] - 49;
1069     vi   = aj + diag[i] - 1;
1070     nz   = diag[i] - ai[i];
1071     idt  = 7*i;
1072     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1073     s6 = t[5+idt];s7 = t[6+idt];
1074     while (nz--) {
1075       idx   = 7*(*vi--);
1076       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1077       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1078       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1079       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1080       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1081       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1082       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1083       v -= 49;
1084     }
1085   }
1086 
1087   /* copy t into x according to permutation */
1088   ii = 0;
1089   for (i=0; i<n; i++) {
1090     ir      = 7*r[i];
1091     x[ir]   = t[ii];
1092     x[ir+1] = t[ii+1];
1093     x[ir+2] = t[ii+2];
1094     x[ir+3] = t[ii+3];
1095     x[ir+4] = t[ii+4];
1096     x[ir+5] = t[ii+5];
1097     x[ir+6] = t[ii+6];
1098     ii += 7;
1099   }
1100 
1101   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1102   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1103   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1104   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1105   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);CHKERRQ(ierr);
1106   PetscFunctionReturn(0);
1107 }
1108 
1109 /* ----------------------------------------------------------- */
1110 #undef __FUNCT__
1111 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1112 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1113 {
1114   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1115   IS             iscol=a->col,isrow=a->row;
1116   PetscErrorCode ierr;
1117   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1118   PetscInt       nz,bs=A->rmap.bs,bs2=a->bs2,*rout,*cout;
1119   MatScalar      *aa=a->a,*v;
1120   PetscScalar    *x,*b,*s,*t,*ls;
1121 
1122   PetscFunctionBegin;
1123   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1124   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1125   t  = a->solve_work;
1126 
1127   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1128   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1129 
1130   /* forward solve the lower triangular */
1131   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1132   for (i=1; i<n; i++) {
1133     v   = aa + bs2*ai[i];
1134     vi  = aj + ai[i];
1135     nz  = a->diag[i] - ai[i];
1136     s = t + bs*i;
1137     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1138     while (nz--) {
1139       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1140       v += bs2;
1141     }
1142   }
1143   /* backward solve the upper triangular */
1144   ls = a->solve_work + A->cmap.n;
1145   for (i=n-1; i>=0; i--){
1146     v   = aa + bs2*(a->diag[i] + 1);
1147     vi  = aj + a->diag[i] + 1;
1148     nz  = ai[i+1] - a->diag[i] - 1;
1149     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1150     while (nz--) {
1151       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1152       v += bs2;
1153     }
1154     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1155     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1156   }
1157 
1158   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1159   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1160   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1162   ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap.bs*A->cmap.n);CHKERRQ(ierr);
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 #undef __FUNCT__
1167 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1168 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1169 {
1170   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1171   IS             iscol=a->col,isrow=a->row;
1172   PetscErrorCode ierr;
1173   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1174   PetscInt       *diag = a->diag;
1175   MatScalar      *aa=a->a,*v;
1176   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1177   PetscScalar    *x,*b,*t;
1178 
1179   PetscFunctionBegin;
1180   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1181   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182   t  = a->solve_work;
1183 
1184   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1185   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1186 
1187   /* forward solve the lower triangular */
1188   idx    = 7*(*r++);
1189   t[0] = b[idx];   t[1] = b[1+idx];
1190   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1191   t[5] = b[5+idx]; t[6] = b[6+idx];
1192 
1193   for (i=1; i<n; i++) {
1194     v     = aa + 49*ai[i];
1195     vi    = aj + ai[i];
1196     nz    = diag[i] - ai[i];
1197     idx   = 7*(*r++);
1198     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1199     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1200     while (nz--) {
1201       idx   = 7*(*vi++);
1202       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1203       x4    = t[3+idx];x5 = t[4+idx];
1204       x6    = t[5+idx];x7 = t[6+idx];
1205       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1206       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1207       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1208       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1209       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1210       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1211       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1212       v += 49;
1213     }
1214     idx = 7*i;
1215     t[idx]   = s1;t[1+idx] = s2;
1216     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1217     t[5+idx] = s6;t[6+idx] = s7;
1218   }
1219   /* backward solve the upper triangular */
1220   for (i=n-1; i>=0; i--){
1221     v    = aa + 49*diag[i] + 49;
1222     vi   = aj + diag[i] + 1;
1223     nz   = ai[i+1] - diag[i] - 1;
1224     idt  = 7*i;
1225     s1 = t[idt];  s2 = t[1+idt];
1226     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1227     s6 = t[5+idt];s7 = t[6+idt];
1228     while (nz--) {
1229       idx   = 7*(*vi++);
1230       x1    = t[idx];   x2 = t[1+idx];
1231       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1232       x6    = t[5+idx]; x7 = t[6+idx];
1233       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1234       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1235       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1236       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1237       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1238       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1239       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1240       v += 49;
1241     }
1242     idc = 7*(*c--);
1243     v   = aa + 49*diag[i];
1244     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1245                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1246     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1247                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1248     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1249                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1250     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1251                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1252     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1253                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1254     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1255                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1256     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1257                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1258   }
1259 
1260   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1261   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1263   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1264   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #undef __FUNCT__
1269 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1270 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1271 {
1272   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1273   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1274   PetscErrorCode    ierr;
1275   PetscInt          *diag = a->diag,jdx;
1276   const MatScalar   *aa=a->a,*v;
1277   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1278   const PetscScalar *b;
1279 
1280   PetscFunctionBegin;
1281   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1282   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1283   /* forward solve the lower triangular */
1284   idx    = 0;
1285   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1286   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1287   x[6] = b[6+idx];
1288   for (i=1; i<n; i++) {
1289     v     =  aa + 49*ai[i];
1290     vi    =  aj + ai[i];
1291     nz    =  diag[i] - ai[i];
1292     idx   =  7*i;
1293     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1294     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1295     s7  =  b[6+idx];
1296     while (nz--) {
1297       jdx   = 7*(*vi++);
1298       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1299       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1300       x7    = x[6+jdx];
1301       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1302       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1303       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1304       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1305       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1306       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1307       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1308       v += 49;
1309      }
1310     x[idx]   = s1;
1311     x[1+idx] = s2;
1312     x[2+idx] = s3;
1313     x[3+idx] = s4;
1314     x[4+idx] = s5;
1315     x[5+idx] = s6;
1316     x[6+idx] = s7;
1317   }
1318   /* backward solve the upper triangular */
1319   for (i=n-1; i>=0; i--){
1320     v    = aa + 49*diag[i] + 49;
1321     vi   = aj + diag[i] + 1;
1322     nz   = ai[i+1] - diag[i] - 1;
1323     idt  = 7*i;
1324     s1 = x[idt];   s2 = x[1+idt];
1325     s3 = x[2+idt]; s4 = x[3+idt];
1326     s5 = x[4+idt]; s6 = x[5+idt];
1327     s7 = x[6+idt];
1328     while (nz--) {
1329       idx   = 7*(*vi++);
1330       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1331       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1332       x7    = x[6+idx];
1333       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1334       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1335       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1336       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1337       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1338       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1339       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1340       v += 49;
1341     }
1342     v        = aa + 49*diag[i];
1343     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1344                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1345     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1346                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1347     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1348                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1349     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1350                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1351     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1352                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1353     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1354                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1355     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1356                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1357   }
1358 
1359   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1361   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr);
1362   PetscFunctionReturn(0);
1363 }
1364 
1365 #undef __FUNCT__
1366 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1367 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1368 {
1369   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1370   IS                iscol=a->col,isrow=a->row;
1371   PetscErrorCode    ierr;
1372   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1373   PetscInt          *diag = a->diag;
1374   const MatScalar   *aa=a->a,*v;
1375   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1376   const PetscScalar *b;
1377   PetscFunctionBegin;
1378   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1379   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1380   t  = a->solve_work;
1381 
1382   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1383   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1384 
1385   /* forward solve the lower triangular */
1386   idx    = 6*(*r++);
1387   t[0] = b[idx];   t[1] = b[1+idx];
1388   t[2] = b[2+idx]; t[3] = b[3+idx];
1389   t[4] = b[4+idx]; t[5] = b[5+idx];
1390   for (i=1; i<n; i++) {
1391     v     = aa + 36*ai[i];
1392     vi    = aj + ai[i];
1393     nz    = diag[i] - ai[i];
1394     idx   = 6*(*r++);
1395     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1396     s5  = b[4+idx]; s6 = b[5+idx];
1397     while (nz--) {
1398       idx   = 6*(*vi++);
1399       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1400       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1401       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1402       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1403       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1404       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1405       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1406       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1407       v += 36;
1408     }
1409     idx = 6*i;
1410     t[idx]   = s1;t[1+idx] = s2;
1411     t[2+idx] = s3;t[3+idx] = s4;
1412     t[4+idx] = s5;t[5+idx] = s6;
1413   }
1414   /* backward solve the upper triangular */
1415   for (i=n-1; i>=0; i--){
1416     v    = aa + 36*diag[i] + 36;
1417     vi   = aj + diag[i] + 1;
1418     nz   = ai[i+1] - diag[i] - 1;
1419     idt  = 6*i;
1420     s1 = t[idt];  s2 = t[1+idt];
1421     s3 = t[2+idt];s4 = t[3+idt];
1422     s5 = t[4+idt];s6 = t[5+idt];
1423     while (nz--) {
1424       idx   = 6*(*vi++);
1425       x1    = t[idx];   x2 = t[1+idx];
1426       x3    = t[2+idx]; x4 = t[3+idx];
1427       x5    = t[4+idx]; x6 = t[5+idx];
1428       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1429       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1430       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1431       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1432       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1433       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1434       v += 36;
1435     }
1436     idc = 6*(*c--);
1437     v   = aa + 36*diag[i];
1438     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1439                                  v[18]*s4+v[24]*s5+v[30]*s6;
1440     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1441                                  v[19]*s4+v[25]*s5+v[31]*s6;
1442     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1443                                  v[20]*s4+v[26]*s5+v[32]*s6;
1444     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1445                                  v[21]*s4+v[27]*s5+v[33]*s6;
1446     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1447                                  v[22]*s4+v[28]*s5+v[34]*s6;
1448     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1449                                  v[23]*s4+v[29]*s5+v[35]*s6;
1450   }
1451 
1452   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1453   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1454   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1455   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1456   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr);
1457   PetscFunctionReturn(0);
1458 }
1459 
1460 #undef __FUNCT__
1461 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1462 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1463 {
1464   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1465   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1466   PetscErrorCode    ierr;
1467   PetscInt          *diag = a->diag,jdx;
1468   const MatScalar   *aa=a->a,*v;
1469   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1470   const PetscScalar *b;
1471 
1472   PetscFunctionBegin;
1473   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1474   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1475   /* forward solve the lower triangular */
1476   idx    = 0;
1477   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1478   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1479   for (i=1; i<n; i++) {
1480     v     =  aa + 36*ai[i];
1481     vi    =  aj + ai[i];
1482     nz    =  diag[i] - ai[i];
1483     idx   =  6*i;
1484     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1485     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1486     while (nz--) {
1487       jdx   = 6*(*vi++);
1488       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1489       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1490       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1496       v += 36;
1497      }
1498     x[idx]   = s1;
1499     x[1+idx] = s2;
1500     x[2+idx] = s3;
1501     x[3+idx] = s4;
1502     x[4+idx] = s5;
1503     x[5+idx] = s6;
1504   }
1505   /* backward solve the upper triangular */
1506   for (i=n-1; i>=0; i--){
1507     v    = aa + 36*diag[i] + 36;
1508     vi   = aj + diag[i] + 1;
1509     nz   = ai[i+1] - diag[i] - 1;
1510     idt  = 6*i;
1511     s1 = x[idt];   s2 = x[1+idt];
1512     s3 = x[2+idt]; s4 = x[3+idt];
1513     s5 = x[4+idt]; s6 = x[5+idt];
1514     while (nz--) {
1515       idx   = 6*(*vi++);
1516       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1517       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1518       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1519       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1520       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1521       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1522       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1523       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1524       v += 36;
1525     }
1526     v        = aa + 36*diag[i];
1527     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1528     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1529     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1530     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1531     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1532     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1533   }
1534 
1535   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1536   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1537   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr);
1538   PetscFunctionReturn(0);
1539 }
1540 
1541 #undef __FUNCT__
1542 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1543 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1544 {
1545   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1546   IS                iscol=a->col,isrow=a->row;
1547   PetscErrorCode    ierr;
1548   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1549   PetscInt          *diag = a->diag;
1550   const MatScalar   *aa=a->a,*v;
1551   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1552   const PetscScalar *b;
1553 
1554   PetscFunctionBegin;
1555   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1556   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1557   t  = a->solve_work;
1558 
1559   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1560   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1561 
1562   /* forward solve the lower triangular */
1563   idx    = 5*(*r++);
1564   t[0] = b[idx];   t[1] = b[1+idx];
1565   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1566   for (i=1; i<n; i++) {
1567     v     = aa + 25*ai[i];
1568     vi    = aj + ai[i];
1569     nz    = diag[i] - ai[i];
1570     idx   = 5*(*r++);
1571     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1572     s5  = b[4+idx];
1573     while (nz--) {
1574       idx   = 5*(*vi++);
1575       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1576       x4    = t[3+idx];x5 = t[4+idx];
1577       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582       v += 25;
1583     }
1584     idx = 5*i;
1585     t[idx]   = s1;t[1+idx] = s2;
1586     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1587   }
1588   /* backward solve the upper triangular */
1589   for (i=n-1; i>=0; i--){
1590     v    = aa + 25*diag[i] + 25;
1591     vi   = aj + diag[i] + 1;
1592     nz   = ai[i+1] - diag[i] - 1;
1593     idt  = 5*i;
1594     s1 = t[idt];  s2 = t[1+idt];
1595     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1596     while (nz--) {
1597       idx   = 5*(*vi++);
1598       x1    = t[idx];   x2 = t[1+idx];
1599       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1600       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1601       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1602       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1603       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1604       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1605       v += 25;
1606     }
1607     idc = 5*(*c--);
1608     v   = aa + 25*diag[i];
1609     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1610                                  v[15]*s4+v[20]*s5;
1611     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1612                                  v[16]*s4+v[21]*s5;
1613     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1614                                  v[17]*s4+v[22]*s5;
1615     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1616                                  v[18]*s4+v[23]*s5;
1617     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1618                                  v[19]*s4+v[24]*s5;
1619   }
1620 
1621   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1622   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1623   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1624   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1625   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr);
1626   PetscFunctionReturn(0);
1627 }
1628 
1629 #undef __FUNCT__
1630 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1631 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1632 {
1633   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1634   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1635   PetscErrorCode    ierr;
1636   PetscInt          *diag = a->diag,jdx;
1637   const MatScalar   *aa=a->a,*v;
1638   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1639   const PetscScalar *b;
1640 
1641   PetscFunctionBegin;
1642   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1643   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1644   /* forward solve the lower triangular */
1645   idx    = 0;
1646   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1647   for (i=1; i<n; i++) {
1648     v     =  aa + 25*ai[i];
1649     vi    =  aj + ai[i];
1650     nz    =  diag[i] - ai[i];
1651     idx   =  5*i;
1652     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1653     while (nz--) {
1654       jdx   = 5*(*vi++);
1655       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1656       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1657       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1658       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1659       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1660       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1661       v    += 25;
1662     }
1663     x[idx]   = s1;
1664     x[1+idx] = s2;
1665     x[2+idx] = s3;
1666     x[3+idx] = s4;
1667     x[4+idx] = s5;
1668   }
1669   /* backward solve the upper triangular */
1670   for (i=n-1; i>=0; i--){
1671     v    = aa + 25*diag[i] + 25;
1672     vi   = aj + diag[i] + 1;
1673     nz   = ai[i+1] - diag[i] - 1;
1674     idt  = 5*i;
1675     s1 = x[idt];  s2 = x[1+idt];
1676     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1677     while (nz--) {
1678       idx   = 5*(*vi++);
1679       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1680       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1681       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1682       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1683       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1684       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1685       v    += 25;
1686     }
1687     v        = aa + 25*diag[i];
1688     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1689     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1690     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1691     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1692     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1693   }
1694 
1695   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1696   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1697   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr);
1698   PetscFunctionReturn(0);
1699 }
1700 
1701 #undef __FUNCT__
1702 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1703 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1704 {
1705   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1706   IS                iscol=a->col,isrow=a->row;
1707   PetscErrorCode    ierr;
1708   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1709   PetscInt          *diag = a->diag;
1710   const MatScalar   *aa=a->a,*v;
1711   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1712   const PetscScalar *b;
1713 
1714   PetscFunctionBegin;
1715   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1716   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1717   t  = a->solve_work;
1718 
1719   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1720   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1721 
1722   /* forward solve the lower triangular */
1723   idx    = 4*(*r++);
1724   t[0] = b[idx];   t[1] = b[1+idx];
1725   t[2] = b[2+idx]; t[3] = b[3+idx];
1726   for (i=1; i<n; i++) {
1727     v     = aa + 16*ai[i];
1728     vi    = aj + ai[i];
1729     nz    = diag[i] - ai[i];
1730     idx   = 4*(*r++);
1731     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1732     while (nz--) {
1733       idx   = 4*(*vi++);
1734       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1735       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1736       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1737       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1738       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1739       v    += 16;
1740     }
1741     idx        = 4*i;
1742     t[idx]   = s1;t[1+idx] = s2;
1743     t[2+idx] = s3;t[3+idx] = s4;
1744   }
1745   /* backward solve the upper triangular */
1746   for (i=n-1; i>=0; i--){
1747     v    = aa + 16*diag[i] + 16;
1748     vi   = aj + diag[i] + 1;
1749     nz   = ai[i+1] - diag[i] - 1;
1750     idt  = 4*i;
1751     s1 = t[idt];  s2 = t[1+idt];
1752     s3 = t[2+idt];s4 = t[3+idt];
1753     while (nz--) {
1754       idx   = 4*(*vi++);
1755       x1    = t[idx];   x2 = t[1+idx];
1756       x3    = t[2+idx]; x4 = t[3+idx];
1757       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1758       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1759       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1760       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1761       v += 16;
1762     }
1763     idc      = 4*(*c--);
1764     v        = aa + 16*diag[i];
1765     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1766     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1767     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1768     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1769   }
1770 
1771   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1772   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1773   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1774   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1775   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr);
1776   PetscFunctionReturn(0);
1777 }
1778 
1779 #undef __FUNCT__
1780 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1781 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1782 {
1783   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1784   IS                iscol=a->col,isrow=a->row;
1785   PetscErrorCode    ierr;
1786   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1787   PetscInt          *diag = a->diag;
1788   const MatScalar   *aa=a->a,*v;
1789   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1790   PetscScalar       *x;
1791   const PetscScalar *b;
1792 
1793   PetscFunctionBegin;
1794   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1796   t  = (MatScalar *)a->solve_work;
1797 
1798   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1799   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1800 
1801   /* forward solve the lower triangular */
1802   idx    = 4*(*r++);
1803   t[0] = (MatScalar)b[idx];
1804   t[1] = (MatScalar)b[1+idx];
1805   t[2] = (MatScalar)b[2+idx];
1806   t[3] = (MatScalar)b[3+idx];
1807   for (i=1; i<n; i++) {
1808     v     = aa + 16*ai[i];
1809     vi    = aj + ai[i];
1810     nz    = diag[i] - ai[i];
1811     idx   = 4*(*r++);
1812     s1 = (MatScalar)b[idx];
1813     s2 = (MatScalar)b[1+idx];
1814     s3 = (MatScalar)b[2+idx];
1815     s4 = (MatScalar)b[3+idx];
1816     while (nz--) {
1817       idx   = 4*(*vi++);
1818       x1  = t[idx];
1819       x2  = t[1+idx];
1820       x3  = t[2+idx];
1821       x4  = t[3+idx];
1822       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1823       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1824       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1825       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1826       v    += 16;
1827     }
1828     idx        = 4*i;
1829     t[idx]   = s1;
1830     t[1+idx] = s2;
1831     t[2+idx] = s3;
1832     t[3+idx] = s4;
1833   }
1834   /* backward solve the upper triangular */
1835   for (i=n-1; i>=0; i--){
1836     v    = aa + 16*diag[i] + 16;
1837     vi   = aj + diag[i] + 1;
1838     nz   = ai[i+1] - diag[i] - 1;
1839     idt  = 4*i;
1840     s1 = t[idt];
1841     s2 = t[1+idt];
1842     s3 = t[2+idt];
1843     s4 = t[3+idt];
1844     while (nz--) {
1845       idx   = 4*(*vi++);
1846       x1  = t[idx];
1847       x2  = t[1+idx];
1848       x3  = t[2+idx];
1849       x4  = t[3+idx];
1850       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1851       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1852       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1853       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1854       v += 16;
1855     }
1856     idc      = 4*(*c--);
1857     v        = aa + 16*diag[i];
1858     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1859     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1860     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1861     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1862     x[idc]   = (PetscScalar)t[idt];
1863     x[1+idc] = (PetscScalar)t[1+idt];
1864     x[2+idc] = (PetscScalar)t[2+idt];
1865     x[3+idc] = (PetscScalar)t[3+idt];
1866  }
1867 
1868   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1869   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1870   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1871   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1872   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr);
1873   PetscFunctionReturn(0);
1874 }
1875 
1876 #if defined (PETSC_HAVE_SSE)
1877 
1878 #include PETSC_HAVE_SSE
1879 
1880 #undef __FUNCT__
1881 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1882 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1883 {
1884   /*
1885      Note: This code uses demotion of double
1886      to float when performing the mixed-mode computation.
1887      This may not be numerically reasonable for all applications.
1888   */
1889   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1890   IS             iscol=a->col,isrow=a->row;
1891   PetscErrorCode ierr;
1892   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1893   PetscInt       *diag = a->diag,ai16;
1894   MatScalar      *aa=a->a,*v;
1895   PetscScalar    *x,*b,*t;
1896 
1897   /* Make space in temp stack for 16 Byte Aligned arrays */
1898   float           ssealignedspace[11],*tmps,*tmpx;
1899   unsigned long   offset;
1900 
1901   PetscFunctionBegin;
1902   SSE_SCOPE_BEGIN;
1903 
1904     offset = (unsigned long)ssealignedspace % 16;
1905     if (offset) offset = (16 - offset)/4;
1906     tmps = &ssealignedspace[offset];
1907     tmpx = &ssealignedspace[offset+4];
1908     PREFETCH_NTA(aa+16*ai[1]);
1909 
1910     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1911     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1912     t  = a->solve_work;
1913 
1914     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1915     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1916 
1917     /* forward solve the lower triangular */
1918     idx  = 4*(*r++);
1919     t[0] = b[idx];   t[1] = b[1+idx];
1920     t[2] = b[2+idx]; t[3] = b[3+idx];
1921     v    =  aa + 16*ai[1];
1922 
1923     for (i=1; i<n;) {
1924       PREFETCH_NTA(&v[8]);
1925       vi   =  aj      + ai[i];
1926       nz   =  diag[i] - ai[i];
1927       idx  =  4*(*r++);
1928 
1929       /* Demote sum from double to float */
1930       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1931       LOAD_PS(tmps,XMM7);
1932 
1933       while (nz--) {
1934         PREFETCH_NTA(&v[16]);
1935         idx = 4*(*vi++);
1936 
1937         /* Demote solution (so far) from double to float */
1938         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1939 
1940         /* 4x4 Matrix-Vector product with negative accumulation: */
1941         SSE_INLINE_BEGIN_2(tmpx,v)
1942           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1943 
1944           /* First Column */
1945           SSE_COPY_PS(XMM0,XMM6)
1946           SSE_SHUFFLE(XMM0,XMM0,0x00)
1947           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1948           SSE_SUB_PS(XMM7,XMM0)
1949 
1950           /* Second Column */
1951           SSE_COPY_PS(XMM1,XMM6)
1952           SSE_SHUFFLE(XMM1,XMM1,0x55)
1953           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1954           SSE_SUB_PS(XMM7,XMM1)
1955 
1956           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1957 
1958           /* Third Column */
1959           SSE_COPY_PS(XMM2,XMM6)
1960           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1961           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1962           SSE_SUB_PS(XMM7,XMM2)
1963 
1964           /* Fourth Column */
1965           SSE_COPY_PS(XMM3,XMM6)
1966           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1967           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1968           SSE_SUB_PS(XMM7,XMM3)
1969         SSE_INLINE_END_2
1970 
1971         v  += 16;
1972       }
1973       idx = 4*i;
1974       v   = aa + 16*ai[++i];
1975       PREFETCH_NTA(v);
1976       STORE_PS(tmps,XMM7);
1977 
1978       /* Promote result from float to double */
1979       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1980     }
1981     /* backward solve the upper triangular */
1982     idt  = 4*(n-1);
1983     ai16 = 16*diag[n-1];
1984     v    = aa + ai16 + 16;
1985     for (i=n-1; i>=0;){
1986       PREFETCH_NTA(&v[8]);
1987       vi = aj + diag[i] + 1;
1988       nz = ai[i+1] - diag[i] - 1;
1989 
1990       /* Demote accumulator from double to float */
1991       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1992       LOAD_PS(tmps,XMM7);
1993 
1994       while (nz--) {
1995         PREFETCH_NTA(&v[16]);
1996         idx = 4*(*vi++);
1997 
1998         /* Demote solution (so far) from double to float */
1999         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2000 
2001         /* 4x4 Matrix-Vector Product with negative accumulation: */
2002         SSE_INLINE_BEGIN_2(tmpx,v)
2003           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2004 
2005           /* First Column */
2006           SSE_COPY_PS(XMM0,XMM6)
2007           SSE_SHUFFLE(XMM0,XMM0,0x00)
2008           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2009           SSE_SUB_PS(XMM7,XMM0)
2010 
2011           /* Second Column */
2012           SSE_COPY_PS(XMM1,XMM6)
2013           SSE_SHUFFLE(XMM1,XMM1,0x55)
2014           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2015           SSE_SUB_PS(XMM7,XMM1)
2016 
2017           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2018 
2019           /* Third Column */
2020           SSE_COPY_PS(XMM2,XMM6)
2021           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2022           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2023           SSE_SUB_PS(XMM7,XMM2)
2024 
2025           /* Fourth Column */
2026           SSE_COPY_PS(XMM3,XMM6)
2027           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2028           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2029           SSE_SUB_PS(XMM7,XMM3)
2030         SSE_INLINE_END_2
2031         v  += 16;
2032       }
2033       v    = aa + ai16;
2034       ai16 = 16*diag[--i];
2035       PREFETCH_NTA(aa+ai16+16);
2036       /*
2037          Scale the result by the diagonal 4x4 block,
2038          which was inverted as part of the factorization
2039       */
2040       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2041         /* First Column */
2042         SSE_COPY_PS(XMM0,XMM7)
2043         SSE_SHUFFLE(XMM0,XMM0,0x00)
2044         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2045 
2046         /* Second Column */
2047         SSE_COPY_PS(XMM1,XMM7)
2048         SSE_SHUFFLE(XMM1,XMM1,0x55)
2049         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2050         SSE_ADD_PS(XMM0,XMM1)
2051 
2052         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2053 
2054         /* Third Column */
2055         SSE_COPY_PS(XMM2,XMM7)
2056         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2057         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2058         SSE_ADD_PS(XMM0,XMM2)
2059 
2060         /* Fourth Column */
2061         SSE_COPY_PS(XMM3,XMM7)
2062         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2063         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2064         SSE_ADD_PS(XMM0,XMM3)
2065 
2066         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2067       SSE_INLINE_END_3
2068 
2069       /* Promote solution from float to double */
2070       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2071 
2072       /* Apply reordering to t and stream into x.    */
2073       /* This way, x doesn't pollute the cache.      */
2074       /* Be careful with size: 2 doubles = 4 floats! */
2075       idc  = 4*(*c--);
2076       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2077         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2078         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2079         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2080         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2081         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2082         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2083       SSE_INLINE_END_2
2084       v    = aa + ai16 + 16;
2085       idt -= 4;
2086     }
2087 
2088     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2089     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2090     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2091     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2092     ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr);
2093   SSE_SCOPE_END;
2094   PetscFunctionReturn(0);
2095 }
2096 
2097 #endif
2098 
2099 
2100 /*
2101       Special case where the matrix was ILU(0) factored in the natural
2102    ordering. This eliminates the need for the column and row permutation.
2103 */
2104 #undef __FUNCT__
2105 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2106 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2107 {
2108   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2109   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2110   PetscErrorCode    ierr;
2111   PetscInt          *diag = a->diag;
2112   const MatScalar   *aa=a->a;
2113   PetscScalar       *x;
2114   const PetscScalar *b;
2115 
2116   PetscFunctionBegin;
2117   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2118   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2119 
2120 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2121   {
2122     static PetscScalar w[2000]; /* very BAD need to fix */
2123     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2124   }
2125 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2126   {
2127     static PetscScalar w[2000]; /* very BAD need to fix */
2128     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2129   }
2130 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2131   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2132 #else
2133   {
2134     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2135     const MatScalar *v;
2136     PetscInt        jdx,idt,idx,nz,*vi,i,ai16;
2137 
2138   /* forward solve the lower triangular */
2139   idx    = 0;
2140   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2141   for (i=1; i<n; i++) {
2142     v     =  aa      + 16*ai[i];
2143     vi    =  aj      + ai[i];
2144     nz    =  diag[i] - ai[i];
2145     idx   +=  4;
2146     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2147     while (nz--) {
2148       jdx   = 4*(*vi++);
2149       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2150       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2151       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2152       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2153       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2154       v    += 16;
2155     }
2156     x[idx]   = s1;
2157     x[1+idx] = s2;
2158     x[2+idx] = s3;
2159     x[3+idx] = s4;
2160   }
2161   /* backward solve the upper triangular */
2162   idt = 4*(n-1);
2163   for (i=n-1; i>=0; i--){
2164     ai16 = 16*diag[i];
2165     v    = aa + ai16 + 16;
2166     vi   = aj + diag[i] + 1;
2167     nz   = ai[i+1] - diag[i] - 1;
2168     s1 = x[idt];  s2 = x[1+idt];
2169     s3 = x[2+idt];s4 = x[3+idt];
2170     while (nz--) {
2171       idx   = 4*(*vi++);
2172       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2173       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2174       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2175       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2176       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2177       v    += 16;
2178     }
2179     v        = aa + ai16;
2180     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2181     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2182     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2183     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2184     idt -= 4;
2185   }
2186   }
2187 #endif
2188 
2189   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2190   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2191   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr);
2192   PetscFunctionReturn(0);
2193 }
2194 
2195 #undef __FUNCT__
2196 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2197 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2198 {
2199   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2200   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2201   PetscErrorCode ierr;
2202   PetscInt       *diag = a->diag;
2203   MatScalar      *aa=a->a;
2204   PetscScalar    *x,*b;
2205 
2206   PetscFunctionBegin;
2207   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2208   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2209 
2210   {
2211     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2212     MatScalar  *v,*t=(MatScalar *)x;
2213     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2214 
2215     /* forward solve the lower triangular */
2216     idx  = 0;
2217     t[0] = (MatScalar)b[0];
2218     t[1] = (MatScalar)b[1];
2219     t[2] = (MatScalar)b[2];
2220     t[3] = (MatScalar)b[3];
2221     for (i=1; i<n; i++) {
2222       v     =  aa      + 16*ai[i];
2223       vi    =  aj      + ai[i];
2224       nz    =  diag[i] - ai[i];
2225       idx   +=  4;
2226       s1 = (MatScalar)b[idx];
2227       s2 = (MatScalar)b[1+idx];
2228       s3 = (MatScalar)b[2+idx];
2229       s4 = (MatScalar)b[3+idx];
2230       while (nz--) {
2231         jdx = 4*(*vi++);
2232         x1  = t[jdx];
2233         x2  = t[1+jdx];
2234         x3  = t[2+jdx];
2235         x4  = t[3+jdx];
2236         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2237         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2238         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2239         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2240         v    += 16;
2241       }
2242       t[idx]   = s1;
2243       t[1+idx] = s2;
2244       t[2+idx] = s3;
2245       t[3+idx] = s4;
2246     }
2247     /* backward solve the upper triangular */
2248     idt = 4*(n-1);
2249     for (i=n-1; i>=0; i--){
2250       ai16 = 16*diag[i];
2251       v    = aa + ai16 + 16;
2252       vi   = aj + diag[i] + 1;
2253       nz   = ai[i+1] - diag[i] - 1;
2254       s1   = t[idt];
2255       s2   = t[1+idt];
2256       s3   = t[2+idt];
2257       s4   = t[3+idt];
2258       while (nz--) {
2259         idx = 4*(*vi++);
2260         x1  = (MatScalar)x[idx];
2261         x2  = (MatScalar)x[1+idx];
2262         x3  = (MatScalar)x[2+idx];
2263         x4  = (MatScalar)x[3+idx];
2264         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2265         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2266         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2267         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2268         v    += 16;
2269       }
2270       v        = aa + ai16;
2271       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2272       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2273       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2274       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2275       idt -= 4;
2276     }
2277   }
2278 
2279   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2280   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2281   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr);
2282   PetscFunctionReturn(0);
2283 }
2284 
2285 #if defined (PETSC_HAVE_SSE)
2286 
2287 #include PETSC_HAVE_SSE
2288 #undef __FUNCT__
2289 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2290 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2291 {
2292   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2293   unsigned short *aj=(unsigned short *)a->j;
2294   PetscErrorCode ierr;
2295   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2296   MatScalar      *aa=a->a;
2297   PetscScalar    *x,*b;
2298 
2299   PetscFunctionBegin;
2300   SSE_SCOPE_BEGIN;
2301   /*
2302      Note: This code currently uses demotion of double
2303      to float when performing the mixed-mode computation.
2304      This may not be numerically reasonable for all applications.
2305   */
2306   PREFETCH_NTA(aa+16*ai[1]);
2307 
2308   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2309   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2310   {
2311     /* x will first be computed in single precision then promoted inplace to double */
2312     MatScalar      *v,*t=(MatScalar *)x;
2313     int            nz,i,idt,ai16;
2314     unsigned int   jdx,idx;
2315     unsigned short *vi;
2316     /* Forward solve the lower triangular factor. */
2317 
2318     /* First block is the identity. */
2319     idx  = 0;
2320     CONVERT_DOUBLE4_FLOAT4(t,b);
2321     v    =  aa + 16*((unsigned int)ai[1]);
2322 
2323     for (i=1; i<n;) {
2324       PREFETCH_NTA(&v[8]);
2325       vi   =  aj      + ai[i];
2326       nz   =  diag[i] - ai[i];
2327       idx +=  4;
2328 
2329       /* Demote RHS from double to float. */
2330       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2331       LOAD_PS(&t[idx],XMM7);
2332 
2333       while (nz--) {
2334         PREFETCH_NTA(&v[16]);
2335         jdx = 4*((unsigned int)(*vi++));
2336 
2337         /* 4x4 Matrix-Vector product with negative accumulation: */
2338         SSE_INLINE_BEGIN_2(&t[jdx],v)
2339           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2340 
2341           /* First Column */
2342           SSE_COPY_PS(XMM0,XMM6)
2343           SSE_SHUFFLE(XMM0,XMM0,0x00)
2344           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2345           SSE_SUB_PS(XMM7,XMM0)
2346 
2347           /* Second Column */
2348           SSE_COPY_PS(XMM1,XMM6)
2349           SSE_SHUFFLE(XMM1,XMM1,0x55)
2350           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2351           SSE_SUB_PS(XMM7,XMM1)
2352 
2353           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2354 
2355           /* Third Column */
2356           SSE_COPY_PS(XMM2,XMM6)
2357           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2358           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2359           SSE_SUB_PS(XMM7,XMM2)
2360 
2361           /* Fourth Column */
2362           SSE_COPY_PS(XMM3,XMM6)
2363           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2364           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2365           SSE_SUB_PS(XMM7,XMM3)
2366         SSE_INLINE_END_2
2367 
2368         v  += 16;
2369       }
2370       v    =  aa + 16*ai[++i];
2371       PREFETCH_NTA(v);
2372       STORE_PS(&t[idx],XMM7);
2373     }
2374 
2375     /* Backward solve the upper triangular factor.*/
2376 
2377     idt  = 4*(n-1);
2378     ai16 = 16*diag[n-1];
2379     v    = aa + ai16 + 16;
2380     for (i=n-1; i>=0;){
2381       PREFETCH_NTA(&v[8]);
2382       vi = aj + diag[i] + 1;
2383       nz = ai[i+1] - diag[i] - 1;
2384 
2385       LOAD_PS(&t[idt],XMM7);
2386 
2387       while (nz--) {
2388         PREFETCH_NTA(&v[16]);
2389         idx = 4*((unsigned int)(*vi++));
2390 
2391         /* 4x4 Matrix-Vector Product with negative accumulation: */
2392         SSE_INLINE_BEGIN_2(&t[idx],v)
2393           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2394 
2395           /* First Column */
2396           SSE_COPY_PS(XMM0,XMM6)
2397           SSE_SHUFFLE(XMM0,XMM0,0x00)
2398           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2399           SSE_SUB_PS(XMM7,XMM0)
2400 
2401           /* Second Column */
2402           SSE_COPY_PS(XMM1,XMM6)
2403           SSE_SHUFFLE(XMM1,XMM1,0x55)
2404           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2405           SSE_SUB_PS(XMM7,XMM1)
2406 
2407           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2408 
2409           /* Third Column */
2410           SSE_COPY_PS(XMM2,XMM6)
2411           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2412           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2413           SSE_SUB_PS(XMM7,XMM2)
2414 
2415           /* Fourth Column */
2416           SSE_COPY_PS(XMM3,XMM6)
2417           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2418           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2419           SSE_SUB_PS(XMM7,XMM3)
2420         SSE_INLINE_END_2
2421         v  += 16;
2422       }
2423       v    = aa + ai16;
2424       ai16 = 16*diag[--i];
2425       PREFETCH_NTA(aa+ai16+16);
2426       /*
2427          Scale the result by the diagonal 4x4 block,
2428          which was inverted as part of the factorization
2429       */
2430       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2431         /* First Column */
2432         SSE_COPY_PS(XMM0,XMM7)
2433         SSE_SHUFFLE(XMM0,XMM0,0x00)
2434         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2435 
2436         /* Second Column */
2437         SSE_COPY_PS(XMM1,XMM7)
2438         SSE_SHUFFLE(XMM1,XMM1,0x55)
2439         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2440         SSE_ADD_PS(XMM0,XMM1)
2441 
2442         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2443 
2444         /* Third Column */
2445         SSE_COPY_PS(XMM2,XMM7)
2446         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2447         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2448         SSE_ADD_PS(XMM0,XMM2)
2449 
2450         /* Fourth Column */
2451         SSE_COPY_PS(XMM3,XMM7)
2452         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2453         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2454         SSE_ADD_PS(XMM0,XMM3)
2455 
2456         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2457       SSE_INLINE_END_3
2458 
2459       v    = aa + ai16 + 16;
2460       idt -= 4;
2461     }
2462 
2463     /* Convert t from single precision back to double precision (inplace)*/
2464     idt = 4*(n-1);
2465     for (i=n-1;i>=0;i--) {
2466       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2467       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2468       PetscScalar *xtemp=&x[idt];
2469       MatScalar   *ttemp=&t[idt];
2470       xtemp[3] = (PetscScalar)ttemp[3];
2471       xtemp[2] = (PetscScalar)ttemp[2];
2472       xtemp[1] = (PetscScalar)ttemp[1];
2473       xtemp[0] = (PetscScalar)ttemp[0];
2474       idt -= 4;
2475     }
2476 
2477   } /* End of artificial scope. */
2478   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2479   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2480   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr);
2481   SSE_SCOPE_END;
2482   PetscFunctionReturn(0);
2483 }
2484 
2485 #undef __FUNCT__
2486 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2487 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2488 {
2489   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2490   int            *aj=a->j;
2491   PetscErrorCode ierr;
2492   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2493   MatScalar      *aa=a->a;
2494   PetscScalar    *x,*b;
2495 
2496   PetscFunctionBegin;
2497   SSE_SCOPE_BEGIN;
2498   /*
2499      Note: This code currently uses demotion of double
2500      to float when performing the mixed-mode computation.
2501      This may not be numerically reasonable for all applications.
2502   */
2503   PREFETCH_NTA(aa+16*ai[1]);
2504 
2505   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2506   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2507   {
2508     /* x will first be computed in single precision then promoted inplace to double */
2509     MatScalar *v,*t=(MatScalar *)x;
2510     int       nz,i,idt,ai16;
2511     int       jdx,idx;
2512     int       *vi;
2513     /* Forward solve the lower triangular factor. */
2514 
2515     /* First block is the identity. */
2516     idx  = 0;
2517     CONVERT_DOUBLE4_FLOAT4(t,b);
2518     v    =  aa + 16*ai[1];
2519 
2520     for (i=1; i<n;) {
2521       PREFETCH_NTA(&v[8]);
2522       vi   =  aj      + ai[i];
2523       nz   =  diag[i] - ai[i];
2524       idx +=  4;
2525 
2526       /* Demote RHS from double to float. */
2527       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2528       LOAD_PS(&t[idx],XMM7);
2529 
2530       while (nz--) {
2531         PREFETCH_NTA(&v[16]);
2532         jdx = 4*(*vi++);
2533 /*          jdx = *vi++; */
2534 
2535         /* 4x4 Matrix-Vector product with negative accumulation: */
2536         SSE_INLINE_BEGIN_2(&t[jdx],v)
2537           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2538 
2539           /* First Column */
2540           SSE_COPY_PS(XMM0,XMM6)
2541           SSE_SHUFFLE(XMM0,XMM0,0x00)
2542           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2543           SSE_SUB_PS(XMM7,XMM0)
2544 
2545           /* Second Column */
2546           SSE_COPY_PS(XMM1,XMM6)
2547           SSE_SHUFFLE(XMM1,XMM1,0x55)
2548           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2549           SSE_SUB_PS(XMM7,XMM1)
2550 
2551           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2552 
2553           /* Third Column */
2554           SSE_COPY_PS(XMM2,XMM6)
2555           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2556           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2557           SSE_SUB_PS(XMM7,XMM2)
2558 
2559           /* Fourth Column */
2560           SSE_COPY_PS(XMM3,XMM6)
2561           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2562           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2563           SSE_SUB_PS(XMM7,XMM3)
2564         SSE_INLINE_END_2
2565 
2566         v  += 16;
2567       }
2568       v    =  aa + 16*ai[++i];
2569       PREFETCH_NTA(v);
2570       STORE_PS(&t[idx],XMM7);
2571     }
2572 
2573     /* Backward solve the upper triangular factor.*/
2574 
2575     idt  = 4*(n-1);
2576     ai16 = 16*diag[n-1];
2577     v    = aa + ai16 + 16;
2578     for (i=n-1; i>=0;){
2579       PREFETCH_NTA(&v[8]);
2580       vi = aj + diag[i] + 1;
2581       nz = ai[i+1] - diag[i] - 1;
2582 
2583       LOAD_PS(&t[idt],XMM7);
2584 
2585       while (nz--) {
2586         PREFETCH_NTA(&v[16]);
2587         idx = 4*(*vi++);
2588 /*          idx = *vi++; */
2589 
2590         /* 4x4 Matrix-Vector Product with negative accumulation: */
2591         SSE_INLINE_BEGIN_2(&t[idx],v)
2592           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2593 
2594           /* First Column */
2595           SSE_COPY_PS(XMM0,XMM6)
2596           SSE_SHUFFLE(XMM0,XMM0,0x00)
2597           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2598           SSE_SUB_PS(XMM7,XMM0)
2599 
2600           /* Second Column */
2601           SSE_COPY_PS(XMM1,XMM6)
2602           SSE_SHUFFLE(XMM1,XMM1,0x55)
2603           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2604           SSE_SUB_PS(XMM7,XMM1)
2605 
2606           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2607 
2608           /* Third Column */
2609           SSE_COPY_PS(XMM2,XMM6)
2610           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2611           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2612           SSE_SUB_PS(XMM7,XMM2)
2613 
2614           /* Fourth Column */
2615           SSE_COPY_PS(XMM3,XMM6)
2616           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2617           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2618           SSE_SUB_PS(XMM7,XMM3)
2619         SSE_INLINE_END_2
2620         v  += 16;
2621       }
2622       v    = aa + ai16;
2623       ai16 = 16*diag[--i];
2624       PREFETCH_NTA(aa+ai16+16);
2625       /*
2626          Scale the result by the diagonal 4x4 block,
2627          which was inverted as part of the factorization
2628       */
2629       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2630         /* First Column */
2631         SSE_COPY_PS(XMM0,XMM7)
2632         SSE_SHUFFLE(XMM0,XMM0,0x00)
2633         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2634 
2635         /* Second Column */
2636         SSE_COPY_PS(XMM1,XMM7)
2637         SSE_SHUFFLE(XMM1,XMM1,0x55)
2638         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2639         SSE_ADD_PS(XMM0,XMM1)
2640 
2641         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2642 
2643         /* Third Column */
2644         SSE_COPY_PS(XMM2,XMM7)
2645         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2646         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2647         SSE_ADD_PS(XMM0,XMM2)
2648 
2649         /* Fourth Column */
2650         SSE_COPY_PS(XMM3,XMM7)
2651         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2652         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2653         SSE_ADD_PS(XMM0,XMM3)
2654 
2655         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2656       SSE_INLINE_END_3
2657 
2658       v    = aa + ai16 + 16;
2659       idt -= 4;
2660     }
2661 
2662     /* Convert t from single precision back to double precision (inplace)*/
2663     idt = 4*(n-1);
2664     for (i=n-1;i>=0;i--) {
2665       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2666       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2667       PetscScalar *xtemp=&x[idt];
2668       MatScalar   *ttemp=&t[idt];
2669       xtemp[3] = (PetscScalar)ttemp[3];
2670       xtemp[2] = (PetscScalar)ttemp[2];
2671       xtemp[1] = (PetscScalar)ttemp[1];
2672       xtemp[0] = (PetscScalar)ttemp[0];
2673       idt -= 4;
2674     }
2675 
2676   } /* End of artificial scope. */
2677   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2678   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2679   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr);
2680   SSE_SCOPE_END;
2681   PetscFunctionReturn(0);
2682 }
2683 
2684 #endif
2685 #undef __FUNCT__
2686 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2687 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2688 {
2689   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2690   IS                iscol=a->col,isrow=a->row;
2691   PetscErrorCode    ierr;
2692   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2693   PetscInt          *diag = a->diag;
2694   const MatScalar   *aa=a->a,*v;
2695   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2696   const PetscScalar *b;
2697 
2698   PetscFunctionBegin;
2699   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2700   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2701   t  = a->solve_work;
2702 
2703   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2704   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2705 
2706   /* forward solve the lower triangular */
2707   idx    = 3*(*r++);
2708   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2709   for (i=1; i<n; i++) {
2710     v     = aa + 9*ai[i];
2711     vi    = aj + ai[i];
2712     nz    = diag[i] - ai[i];
2713     idx   = 3*(*r++);
2714     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2715     while (nz--) {
2716       idx   = 3*(*vi++);
2717       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2718       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2719       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2720       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2721       v += 9;
2722     }
2723     idx = 3*i;
2724     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2725   }
2726   /* backward solve the upper triangular */
2727   for (i=n-1; i>=0; i--){
2728     v    = aa + 9*diag[i] + 9;
2729     vi   = aj + diag[i] + 1;
2730     nz   = ai[i+1] - diag[i] - 1;
2731     idt  = 3*i;
2732     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2733     while (nz--) {
2734       idx   = 3*(*vi++);
2735       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2736       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2737       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2738       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2739       v += 9;
2740     }
2741     idc = 3*(*c--);
2742     v   = aa + 9*diag[i];
2743     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2744     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2745     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2746   }
2747   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2748   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2749   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2750   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2751   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr);
2752   PetscFunctionReturn(0);
2753 }
2754 
2755 /*
2756       Special case where the matrix was ILU(0) factored in the natural
2757    ordering. This eliminates the need for the column and row permutation.
2758 */
2759 #undef __FUNCT__
2760 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2761 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2762 {
2763   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2764   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2765   PetscErrorCode    ierr;
2766   PetscInt          *diag = a->diag;
2767   const MatScalar   *aa=a->a,*v;
2768   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2769   const PetscScalar *b;
2770   PetscInt          jdx,idt,idx,nz,*vi,i;
2771 
2772   PetscFunctionBegin;
2773   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2774   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2775 
2776   /* forward solve the lower triangular */
2777   idx    = 0;
2778   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2779   for (i=1; i<n; i++) {
2780     v     =  aa      + 9*ai[i];
2781     vi    =  aj      + ai[i];
2782     nz    =  diag[i] - ai[i];
2783     idx   +=  3;
2784     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2785     while (nz--) {
2786       jdx   = 3*(*vi++);
2787       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2788       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2789       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2790       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2791       v    += 9;
2792     }
2793     x[idx]   = s1;
2794     x[1+idx] = s2;
2795     x[2+idx] = s3;
2796   }
2797   /* backward solve the upper triangular */
2798   for (i=n-1; i>=0; i--){
2799     v    = aa + 9*diag[i] + 9;
2800     vi   = aj + diag[i] + 1;
2801     nz   = ai[i+1] - diag[i] - 1;
2802     idt  = 3*i;
2803     s1 = x[idt];  s2 = x[1+idt];
2804     s3 = x[2+idt];
2805     while (nz--) {
2806       idx   = 3*(*vi++);
2807       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2808       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2809       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2810       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2811       v    += 9;
2812     }
2813     v        = aa +  9*diag[i];
2814     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2815     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2816     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2817   }
2818 
2819   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2820   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2821   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr);
2822   PetscFunctionReturn(0);
2823 }
2824 
2825 #undef __FUNCT__
2826 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2827 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2828 {
2829   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2830   IS                iscol=a->col,isrow=a->row;
2831   PetscErrorCode    ierr;
2832   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2833   PetscInt          *diag = a->diag;
2834   const MatScalar   *aa=a->a,*v;
2835   PetscScalar       *x,s1,s2,x1,x2,*t;
2836   const PetscScalar *b;
2837 
2838   PetscFunctionBegin;
2839   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2840   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2841   t  = a->solve_work;
2842 
2843   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2844   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2845 
2846   /* forward solve the lower triangular */
2847   idx    = 2*(*r++);
2848   t[0] = b[idx]; t[1] = b[1+idx];
2849   for (i=1; i<n; i++) {
2850     v     = aa + 4*ai[i];
2851     vi    = aj + ai[i];
2852     nz    = diag[i] - ai[i];
2853     idx   = 2*(*r++);
2854     s1  = b[idx]; s2 = b[1+idx];
2855     while (nz--) {
2856       idx   = 2*(*vi++);
2857       x1    = t[idx]; x2 = t[1+idx];
2858       s1 -= v[0]*x1 + v[2]*x2;
2859       s2 -= v[1]*x1 + v[3]*x2;
2860       v += 4;
2861     }
2862     idx = 2*i;
2863     t[idx] = s1; t[1+idx] = s2;
2864   }
2865   /* backward solve the upper triangular */
2866   for (i=n-1; i>=0; i--){
2867     v    = aa + 4*diag[i] + 4;
2868     vi   = aj + diag[i] + 1;
2869     nz   = ai[i+1] - diag[i] - 1;
2870     idt  = 2*i;
2871     s1 = t[idt]; s2 = t[1+idt];
2872     while (nz--) {
2873       idx   = 2*(*vi++);
2874       x1    = t[idx]; x2 = t[1+idx];
2875       s1 -= v[0]*x1 + v[2]*x2;
2876       s2 -= v[1]*x1 + v[3]*x2;
2877       v += 4;
2878     }
2879     idc = 2*(*c--);
2880     v   = aa + 4*diag[i];
2881     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2882     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2883   }
2884   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2885   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2886   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2887   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2888   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr);
2889   PetscFunctionReturn(0);
2890 }
2891 
2892 /*
2893       Special case where the matrix was ILU(0) factored in the natural
2894    ordering. This eliminates the need for the column and row permutation.
2895 */
2896 #undef __FUNCT__
2897 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2898 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2899 {
2900   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2901   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2902   PetscErrorCode    ierr;
2903   PetscInt          *diag = a->diag;
2904   const MatScalar   *aa=a->a,*v;
2905   PetscScalar       *x,s1,s2,x1,x2;
2906   const PetscScalar *b;
2907   PetscInt          jdx,idt,idx,nz,*vi,i;
2908 
2909   PetscFunctionBegin;
2910   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2911   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2912 
2913   /* forward solve the lower triangular */
2914   idx    = 0;
2915   x[0]   = b[0]; x[1] = b[1];
2916   for (i=1; i<n; i++) {
2917     v     =  aa      + 4*ai[i];
2918     vi    =  aj      + ai[i];
2919     nz    =  diag[i] - ai[i];
2920     idx   +=  2;
2921     s1  =  b[idx];s2 = b[1+idx];
2922     while (nz--) {
2923       jdx   = 2*(*vi++);
2924       x1    = x[jdx];x2 = x[1+jdx];
2925       s1 -= v[0]*x1 + v[2]*x2;
2926       s2 -= v[1]*x1 + v[3]*x2;
2927       v    += 4;
2928     }
2929     x[idx]   = s1;
2930     x[1+idx] = s2;
2931   }
2932   /* backward solve the upper triangular */
2933   for (i=n-1; i>=0; i--){
2934     v    = aa + 4*diag[i] + 4;
2935     vi   = aj + diag[i] + 1;
2936     nz   = ai[i+1] - diag[i] - 1;
2937     idt  = 2*i;
2938     s1 = x[idt];  s2 = x[1+idt];
2939     while (nz--) {
2940       idx   = 2*(*vi++);
2941       x1    = x[idx];   x2 = x[1+idx];
2942       s1 -= v[0]*x1 + v[2]*x2;
2943       s2 -= v[1]*x1 + v[3]*x2;
2944       v    += 4;
2945     }
2946     v        = aa +  4*diag[i];
2947     x[idt]   = v[0]*s1 + v[2]*s2;
2948     x[1+idt] = v[1]*s1 + v[3]*s2;
2949   }
2950 
2951   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2952   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2953   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr);
2954   PetscFunctionReturn(0);
2955 }
2956 
2957 #undef __FUNCT__
2958 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2959 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2960 {
2961   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2962   IS             iscol=a->col,isrow=a->row;
2963   PetscErrorCode ierr;
2964   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2965   PetscInt       *diag = a->diag;
2966   MatScalar      *aa=a->a,*v;
2967   PetscScalar    *x,*b,s1,*t;
2968 
2969   PetscFunctionBegin;
2970   if (!n) PetscFunctionReturn(0);
2971 
2972   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2973   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2974   t  = a->solve_work;
2975 
2976   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2977   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2978 
2979   /* forward solve the lower triangular */
2980   t[0] = b[*r++];
2981   for (i=1; i<n; i++) {
2982     v     = aa + ai[i];
2983     vi    = aj + ai[i];
2984     nz    = diag[i] - ai[i];
2985     s1  = b[*r++];
2986     while (nz--) {
2987       s1 -= (*v++)*t[*vi++];
2988     }
2989     t[i] = s1;
2990   }
2991   /* backward solve the upper triangular */
2992   for (i=n-1; i>=0; i--){
2993     v    = aa + diag[i] + 1;
2994     vi   = aj + diag[i] + 1;
2995     nz   = ai[i+1] - diag[i] - 1;
2996     s1 = t[i];
2997     while (nz--) {
2998       s1 -= (*v++)*t[*vi++];
2999     }
3000     x[*c--] = t[i] = aa[diag[i]]*s1;
3001   }
3002 
3003   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3004   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3005   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3006   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3007   ierr = PetscLogFlops(2*1*(a->nz) - A->cmap.n);CHKERRQ(ierr);
3008   PetscFunctionReturn(0);
3009 }
3010 /*
3011       Special case where the matrix was ILU(0) factored in the natural
3012    ordering. This eliminates the need for the column and row permutation.
3013 */
3014 #undef __FUNCT__
3015 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3016 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3017 {
3018   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3019   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3020   PetscErrorCode ierr;
3021   PetscInt       *diag = a->diag;
3022   MatScalar      *aa=a->a;
3023   PetscScalar    *x,*b;
3024   PetscScalar    s1,x1;
3025   MatScalar      *v;
3026   PetscInt       jdx,idt,idx,nz,*vi,i;
3027 
3028   PetscFunctionBegin;
3029   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3030   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3031 
3032   /* forward solve the lower triangular */
3033   idx    = 0;
3034   x[0]   = b[0];
3035   for (i=1; i<n; i++) {
3036     v     =  aa      + ai[i];
3037     vi    =  aj      + ai[i];
3038     nz    =  diag[i] - ai[i];
3039     idx   +=  1;
3040     s1  =  b[idx];
3041     while (nz--) {
3042       jdx   = *vi++;
3043       x1    = x[jdx];
3044       s1 -= v[0]*x1;
3045       v    += 1;
3046     }
3047     x[idx]   = s1;
3048   }
3049   /* backward solve the upper triangular */
3050   for (i=n-1; i>=0; i--){
3051     v    = aa + diag[i] + 1;
3052     vi   = aj + diag[i] + 1;
3053     nz   = ai[i+1] - diag[i] - 1;
3054     idt  = i;
3055     s1 = x[idt];
3056     while (nz--) {
3057       idx   = *vi++;
3058       x1    = x[idx];
3059       s1 -= v[0]*x1;
3060       v    += 1;
3061     }
3062     v        = aa +  diag[i];
3063     x[idt]   = v[0]*s1;
3064   }
3065   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3066   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3067   ierr = PetscLogFlops(2*(a->nz) - A->cmap.n);CHKERRQ(ierr);
3068   PetscFunctionReturn(0);
3069 }
3070 
3071 /* ----------------------------------------------------------------*/
3072 /*
3073      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3074    except that the data structure of Mat_SeqAIJ is slightly different.
3075    Not a good example of code reuse.
3076 */
3077 
3078 #undef __FUNCT__
3079 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3080 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3081 {
3082   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3083   IS             isicol;
3084   PetscErrorCode ierr;
3085   PetscInt       *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3086   PetscInt       *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3087   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3088   PetscInt       incrlev,nnz,i,bs = A->rmap.bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3089   PetscTruth     col_identity,row_identity,flg;
3090   PetscReal      f;
3091 
3092   PetscFunctionBegin;
3093   f             = info->fill;
3094   levels        = (PetscInt)info->levels;
3095   diagonal_fill = (PetscInt)info->diagonal_fill;
3096   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3097   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3098   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3099 
3100   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3101     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3102     (*fact)->factor = FACTOR_LU;
3103     b               = (Mat_SeqBAIJ*)(*fact)->data;
3104     ierr = MatMissingDiagonal_SeqBAIJ(*fact,&flg,&dd);CHKERRQ(ierr);
3105     if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd);
3106     b->row        = isrow;
3107     b->col        = iscol;
3108     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3109     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3110     b->icol       = isicol;
3111     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3112     ierr          = PetscMalloc(((*fact)->rmap.N+1+(*fact)->rmap.bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3113   } else { /* general case perform the symbolic factorization */
3114     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3115     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3116 
3117     /* get new row pointers */
3118     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
3119     ainew[0] = 0;
3120     /* don't know how many column pointers are needed so estimate */
3121     jmax = (PetscInt)(f*ai[n] + 1);
3122     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
3123     /* ajfill is level of fill for each fill entry */
3124     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
3125     /* fill is a linked list of nonzeros in active row */
3126     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
3127     /* im is level for each filled value */
3128     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
3129     /* dloc is location of diagonal in factor */
3130     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
3131     dloc[0]  = 0;
3132     for (prow=0; prow<n; prow++) {
3133 
3134       /* copy prow into linked list */
3135       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3136       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3137       xi         = aj + ai[r[prow]];
3138       fill[n]    = n;
3139       fill[prow] = -1; /* marker for diagonal entry */
3140       while (nz--) {
3141 	fm  = n;
3142 	idx = ic[*xi++];
3143 	do {
3144 	  m  = fm;
3145 	  fm = fill[m];
3146 	} while (fm < idx);
3147 	fill[m]   = idx;
3148 	fill[idx] = fm;
3149 	im[idx]   = 0;
3150       }
3151 
3152       /* make sure diagonal entry is included */
3153       if (diagonal_fill && fill[prow] == -1) {
3154 	fm = n;
3155 	while (fill[fm] < prow) fm = fill[fm];
3156 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3157 	fill[fm]   = prow;
3158 	im[prow]   = 0;
3159 	nzf++;
3160 	dcount++;
3161       }
3162 
3163       nzi = 0;
3164       row = fill[n];
3165       while (row < prow) {
3166 	incrlev = im[row] + 1;
3167 	nz      = dloc[row];
3168 	xi      = ajnew  + ainew[row] + nz + 1;
3169 	flev    = ajfill + ainew[row] + nz + 1;
3170 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3171 	fm      = row;
3172 	while (nnz-- > 0) {
3173 	  idx = *xi++;
3174 	  if (*flev + incrlev > levels) {
3175 	    flev++;
3176 	    continue;
3177 	  }
3178 	  do {
3179 	    m  = fm;
3180 	    fm = fill[m];
3181 	  } while (fm < idx);
3182 	  if (fm != idx) {
3183 	    im[idx]   = *flev + incrlev;
3184 	    fill[m]   = idx;
3185 	    fill[idx] = fm;
3186 	    fm        = idx;
3187 	    nzf++;
3188 	  } else {
3189 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3190 	  }
3191 	  flev++;
3192 	}
3193 	row = fill[row];
3194 	nzi++;
3195       }
3196       /* copy new filled row into permanent storage */
3197       ainew[prow+1] = ainew[prow] + nzf;
3198       if (ainew[prow+1] > jmax) {
3199 
3200 	/* estimate how much additional space we will need */
3201 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3202 	/* just double the memory each time */
3203 	PetscInt maxadd = jmax;
3204 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3205 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3206 	jmax += maxadd;
3207 
3208 	/* allocate a longer ajnew and ajfill */
3209 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr);
3210 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3211 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3212 	ajnew = xi;
3213 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr);
3214 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3215 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3216 	ajfill = xi;
3217 	reallocate++; /* count how many reallocations are needed */
3218       }
3219       xi          = ajnew + ainew[prow];
3220       flev        = ajfill + ainew[prow];
3221       dloc[prow]  = nzi;
3222       fm          = fill[n];
3223       while (nzf--) {
3224 	*xi++   = fm;
3225 	*flev++ = im[fm];
3226 	fm      = fill[fm];
3227       }
3228       /* make sure row has diagonal entry */
3229       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3230 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3231     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3232       }
3233     }
3234     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3235     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3236     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3237     ierr = PetscFree(fill);CHKERRQ(ierr);
3238     ierr = PetscFree(im);CHKERRQ(ierr);
3239 
3240 #if defined(PETSC_USE_INFO)
3241     {
3242       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3243       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3244       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3245       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3246       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3247       if (diagonal_fill) {
3248 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3249       }
3250     }
3251 #endif
3252 
3253     /* put together the new matrix */
3254     ierr = MatCreate(((PetscObject)A)->comm,fact);CHKERRQ(ierr);
3255     ierr = MatSetSizes(*fact,bs*n,bs*n,bs*n,bs*n);CHKERRQ(ierr);
3256     ierr = MatSetType(*fact,((PetscObject)A)->type_name);CHKERRQ(ierr);
3257     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3258     ierr = PetscLogObjectParent(*fact,isicol);CHKERRQ(ierr);
3259     b    = (Mat_SeqBAIJ*)(*fact)->data;
3260     b->free_a       = PETSC_TRUE;
3261     b->free_ij      = PETSC_TRUE;
3262     b->singlemalloc = PETSC_FALSE;
3263     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
3264     b->j          = ajnew;
3265     b->i          = ainew;
3266     for (i=0; i<n; i++) dloc[i] += ainew[i];
3267     b->diag       = dloc;
3268     b->ilen       = 0;
3269     b->imax       = 0;
3270     b->row        = isrow;
3271     b->col        = iscol;
3272     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3273     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3274     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3275     b->icol       = isicol;
3276     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3277     /* In b structure:  Free imax, ilen, old a, old j.
3278        Allocate dloc, solve_work, new a, new j */
3279     ierr = PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
3280     b->maxnz          = b->nz = ainew[n];
3281     (*fact)->factor   = FACTOR_LU;
3282 
3283     (*fact)->info.factor_mallocs    = reallocate;
3284     (*fact)->info.fill_ratio_given  = f;
3285     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3286   }
3287 
3288   if (row_identity && col_identity) {
3289     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
3290   }
3291   PetscFunctionReturn(0);
3292 }
3293 
3294 #undef __FUNCT__
3295 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3296 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3297 {
3298   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3299   /* int i,*AJ=a->j,nz=a->nz; */
3300   PetscFunctionBegin;
3301   /* Undo Column scaling */
3302 /*    while (nz--) { */
3303 /*      AJ[i] = AJ[i]/4; */
3304 /*    } */
3305   /* This should really invoke a push/pop logic, but we don't have that yet. */
3306   A->ops->setunfactored = PETSC_NULL;
3307   PetscFunctionReturn(0);
3308 }
3309 
3310 #undef __FUNCT__
3311 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3312 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3313 {
3314   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3315   PetscInt       *AJ=a->j,nz=a->nz;
3316   unsigned short *aj=(unsigned short *)AJ;
3317   PetscFunctionBegin;
3318   /* Is this really necessary? */
3319   while (nz--) {
3320     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3321   }
3322   A->ops->setunfactored = PETSC_NULL;
3323   PetscFunctionReturn(0);
3324 }
3325 
3326 #undef __FUNCT__
3327 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3328 PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3329 {
3330   PetscErrorCode ierr;
3331   /*
3332       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3333       with natural ordering
3334   */
3335   PetscFunctionBegin;
3336   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3337   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3338   switch (inA->rmap.bs) {
3339   case 1:
3340     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3341     ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=1\n");CHKERRQ(ierr);
3342     break;
3343   case 2:
3344     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3345     ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=2\n");CHKERRQ(ierr);
3346     break;
3347   case 3:
3348     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3349     ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=3\n");CHKERRQ(ierr);
3350     break;
3351   case 4:
3352 #if defined(PETSC_USE_MAT_SINGLE)
3353     {
3354       PetscTruth  sse_enabled_local;
3355       PetscErrorCode ierr;
3356       ierr = PetscSSEIsEnabled(((PetscObject)inA)->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3357       if (sse_enabled_local) {
3358 #  if defined(PETSC_HAVE_SSE)
3359         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3360         if (n==(unsigned short)n) {
3361           unsigned short *aj=(unsigned short *)AJ;
3362           for (i=0;i<nz;i++) {
3363             aj[i] = (unsigned short)AJ[i];
3364           }
3365           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3366           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3367           ierr = PetscInfo(inA,"Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");CHKERRQ(ierr);
3368         } else {
3369         /* Scale the column indices for easier indexing in MatSolve. */
3370 /*            for (i=0;i<nz;i++) { */
3371 /*              AJ[i] = AJ[i]*4; */
3372 /*            } */
3373           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3374           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3375           ierr = PetscInfo(inA,"Using special SSE, in-place natural ordering, int j index factor BS=4\n");CHKERRQ(ierr);
3376         }
3377 #  else
3378       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3379         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3380 #  endif
3381       } else {
3382         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3383         ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");CHKERRQ(ierr);
3384       }
3385     }
3386 #else
3387     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3388     ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");CHKERRQ(ierr);
3389 #endif
3390     break;
3391   case 5:
3392     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3393     ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=5\n");CHKERRQ(ierr);
3394     break;
3395   case 6:
3396     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3397     ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=6\n");CHKERRQ(ierr);
3398     break;
3399   case 7:
3400     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3401     ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=7\n");CHKERRQ(ierr);
3402     break;
3403   }
3404   PetscFunctionReturn(0);
3405 }
3406 
3407 #undef __FUNCT__
3408 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3409 PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3410 {
3411   /*
3412       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3413       with natural ordering
3414   */
3415   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
3416   IS             row = a->row, col = a->col;
3417   PetscTruth     row_identity, col_identity;
3418   PetscTruth     use_natural;
3419   PetscErrorCode ierr;
3420 
3421   PetscFunctionBegin;
3422 
3423   use_natural = PETSC_FALSE;
3424   if (row && col) {
3425     ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3426     ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3427 
3428     if (row_identity && col_identity) {
3429       use_natural = PETSC_TRUE;
3430     }
3431   } else {
3432     use_natural = PETSC_TRUE;
3433   }
3434 
3435   switch (A->rmap.bs) {
3436   case 1:
3437     if (use_natural) {
3438       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3439       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3440       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=1\n");CHKERRQ(ierr);
3441       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");CHKERRQ(ierr);
3442     } else {
3443       A->ops->solve           = MatSolve_SeqBAIJ_1;
3444       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3445     }
3446     break;
3447   case 2:
3448     if (use_natural) {
3449       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3450       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3451       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=2\n");CHKERRQ(ierr);
3452       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");CHKERRQ(ierr);
3453     } else {
3454       A->ops->solve           = MatSolve_SeqBAIJ_2;
3455       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3456     }
3457     break;
3458   case 3:
3459     if (use_natural) {
3460       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3461       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3462       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=3\n");CHKERRQ(ierr);
3463       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");CHKERRQ(ierr);
3464     } else {
3465       A->ops->solve           = MatSolve_SeqBAIJ_3;
3466       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3467     }
3468     break;
3469   case 4:
3470     {
3471       PetscTruth sse_enabled_local;
3472       ierr = PetscSSEIsEnabled(((PetscObject)A)->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3473       if (use_natural) {
3474 #if defined(PETSC_USE_MAT_SINGLE)
3475         if (sse_enabled_local) { /* Natural + Single + SSE */
3476 #  if defined(PETSC_HAVE_SSE)
3477           PetscInt n=a->mbs;
3478           if (n==(unsigned short)n) {
3479             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3480             ierr = PetscInfo(A,"Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");CHKERRQ(ierr);
3481           } else {
3482             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3483             ierr = PetscInfo(A,"Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");CHKERRQ(ierr);
3484           }
3485 #  else
3486           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3487           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3488 #  endif
3489         } else { /* Natural + Single */
3490           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3491           ierr = PetscInfo(A,"Using single precision, in-place, natural ordering solve BS=4\n");CHKERRQ(ierr);
3492         }
3493 #else
3494         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3495         ierr = PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");CHKERRQ(ierr);
3496 #endif
3497         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3498         ierr = PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");CHKERRQ(ierr);
3499       } else { /* Arbitrary ordering */
3500 #if defined(PETSC_USE_MAT_SINGLE)
3501         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3502 #  if defined(PETSC_HAVE_SSE)
3503           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3504           ierr = PetscInfo(A,"Using single precision, SSE solve BS=4\n");CHKERRQ(ierr);
3505 #  else
3506           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3507           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3508 #  endif
3509         } else { /* Arbitrary + Single */
3510           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3511           ierr = PetscInfo(A,"Using single precision solve BS=4\n");CHKERRQ(ierr);
3512         }
3513 #else
3514         A->ops->solve           = MatSolve_SeqBAIJ_4;
3515 #endif
3516         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3517       }
3518     }
3519     break;
3520   case 5:
3521     if (use_natural) {
3522       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3523       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3524       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");CHKERRQ(ierr);
3525       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");CHKERRQ(ierr);
3526     } else {
3527       A->ops->solve           = MatSolve_SeqBAIJ_5;
3528       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3529     }
3530     break;
3531   case 6:
3532     if (use_natural) {
3533       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3534       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3535       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");CHKERRQ(ierr);
3536       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");CHKERRQ(ierr);
3537     } else {
3538       A->ops->solve           = MatSolve_SeqBAIJ_6;
3539       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3540     }
3541     break;
3542   case 7:
3543     if (use_natural) {
3544       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3545       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3546       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");CHKERRQ(ierr);
3547       ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");CHKERRQ(ierr);
3548     } else {
3549       A->ops->solve           = MatSolve_SeqBAIJ_7;
3550       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3551     }
3552     break;
3553   default:
3554     A->ops->solve             = MatSolve_SeqBAIJ_N;
3555     break;
3556   }
3557   PetscFunctionReturn(0);
3558 }
3559 
3560 #undef __FUNCT__
3561 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3562 PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3563 {
3564   PetscErrorCode ierr;
3565 
3566   PetscFunctionBegin;
3567   ierr = MatSeqBAIJ_UpdateSolvers(A);
3568   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3569     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3570   } else {
3571     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3572   }
3573   PetscFunctionReturn(0);
3574 }
3575 
3576 #undef __FUNCT__
3577 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3578 PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3579 {
3580   PetscErrorCode ierr;
3581 
3582   PetscFunctionBegin;
3583   ierr = MatSeqBAIJ_UpdateSolvers(A);
3584   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3585   PetscFunctionReturn(0);
3586 }
3587 
3588 
3589