xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 6849ba73f22fecb8f92ef896a42e4e8bd4cd6965)
1 /*
2     Factorization code for BAIJ format.
3 */
4 
5 #include "src/mat/impls/baij/seq/baij.h"
6 #include "src/inline/ilu.h"
7 #include "src/inline/dot.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
11 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
12 {
13   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
14   PetscErrorCode ierr;
15   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
16   int             *diag = a->diag;
17   MatScalar       *aa=a->a,*v;
18   PetscScalar     s1,*x,*b;
19 
20   PetscFunctionBegin;
21   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
22   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24 
25   /* forward solve the U^T */
26   for (i=0; i<n; i++) {
27 
28     v     = aa + diag[i];
29     /* multiply by the inverse of the block diagonal */
30     s1    = (*v++)*x[i];
31     vi    = aj + diag[i] + 1;
32     nz    = ai[i+1] - diag[i] - 1;
33     while (nz--) {
34       x[*vi++]  -= (*v++)*s1;
35     }
36     x[i]   = s1;
37   }
38   /* backward solve the L^T */
39   for (i=n-1; i>=0; i--){
40     v    = aa + diag[i] - 1;
41     vi   = aj + diag[i] - 1;
42     nz   = diag[i] - ai[i];
43     s1   = x[i];
44     while (nz--) {
45       x[*vi--]   -=  (*v--)*s1;
46     }
47   }
48   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
50   PetscLogFlops(2*(a->nz) - A->n);
51   PetscFunctionReturn(0);
52 }
53 
54 #undef __FUNCT__
55 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
56 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57 {
58   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59   PetscErrorCode ierr;
60   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61   int             *diag = a->diag,oidx;
62   MatScalar       *aa=a->a,*v;
63   PetscScalar     s1,s2,x1,x2;
64   PetscScalar     *x,*b;
65 
66   PetscFunctionBegin;
67   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
68   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
69   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
70 
71   /* forward solve the U^T */
72   idx = 0;
73   for (i=0; i<n; i++) {
74 
75     v     = aa + 4*diag[i];
76     /* multiply by the inverse of the block diagonal */
77     x1 = x[idx];   x2 = x[1+idx];
78     s1 = v[0]*x1  +  v[1]*x2;
79     s2 = v[2]*x1  +  v[3]*x2;
80     v += 4;
81 
82     vi    = aj + diag[i] + 1;
83     nz    = ai[i+1] - diag[i] - 1;
84     while (nz--) {
85       oidx = 2*(*vi++);
86       x[oidx]   -= v[0]*s1  +  v[1]*s2;
87       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
88       v  += 4;
89     }
90     x[idx]   = s1;x[1+idx] = s2;
91     idx += 2;
92   }
93   /* backward solve the L^T */
94   for (i=n-1; i>=0; i--){
95     v    = aa + 4*diag[i] - 4;
96     vi   = aj + diag[i] - 1;
97     nz   = diag[i] - ai[i];
98     idt  = 2*i;
99     s1   = x[idt];  s2 = x[1+idt];
100     while (nz--) {
101       idx   = 2*(*vi--);
102       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104       v -= 4;
105     }
106   }
107   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110   PetscFunctionReturn(0);
111 }
112 
113 #undef __FUNCT__
114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
115 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116 {
117   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118   PetscErrorCode ierr;
119   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
120   int             *diag = a->diag,oidx;
121   MatScalar       *aa=a->a,*v;
122   PetscScalar     s1,s2,s3,x1,x2,x3;
123   PetscScalar     *x,*b;
124 
125   PetscFunctionBegin;
126   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
127   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
128   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
129 
130   /* forward solve the U^T */
131   idx = 0;
132   for (i=0; i<n; i++) {
133 
134     v     = aa + 9*diag[i];
135     /* multiply by the inverse of the block diagonal */
136     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
137     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
138     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
139     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
140     v += 9;
141 
142     vi    = aj + diag[i] + 1;
143     nz    = ai[i+1] - diag[i] - 1;
144     while (nz--) {
145       oidx = 3*(*vi++);
146       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
147       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
148       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
149       v  += 9;
150     }
151     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
152     idx += 3;
153   }
154   /* backward solve the L^T */
155   for (i=n-1; i>=0; i--){
156     v    = aa + 9*diag[i] - 9;
157     vi   = aj + diag[i] - 1;
158     nz   = diag[i] - ai[i];
159     idt  = 3*i;
160     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
161     while (nz--) {
162       idx   = 3*(*vi--);
163       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
164       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
165       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
166       v -= 9;
167     }
168   }
169   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
170   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
171   PetscLogFlops(2*9*(a->nz) - 3*A->n);
172   PetscFunctionReturn(0);
173 }
174 
175 #undef __FUNCT__
176 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
177 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
178 {
179   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
180   PetscErrorCode ierr;
181   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
182   int             *diag = a->diag,oidx;
183   MatScalar       *aa=a->a,*v;
184   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
185   PetscScalar     *x,*b;
186 
187   PetscFunctionBegin;
188   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
189   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
190   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
191 
192   /* forward solve the U^T */
193   idx = 0;
194   for (i=0; i<n; i++) {
195 
196     v     = aa + 16*diag[i];
197     /* multiply by the inverse of the block diagonal */
198     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
199     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
200     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
201     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
202     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
203     v += 16;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 4*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
210       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
211       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
212       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
213       v  += 16;
214     }
215     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
216     idx += 4;
217   }
218   /* backward solve the L^T */
219   for (i=n-1; i>=0; i--){
220     v    = aa + 16*diag[i] - 16;
221     vi   = aj + diag[i] - 1;
222     nz   = diag[i] - ai[i];
223     idt  = 4*i;
224     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
225     while (nz--) {
226       idx   = 4*(*vi--);
227       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
228       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
229       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
230       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
231       v -= 16;
232     }
233   }
234   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
235   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
236   PetscLogFlops(2*16*(a->nz) - 4*A->n);
237   PetscFunctionReturn(0);
238 }
239 
240 #undef __FUNCT__
241 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
242 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
243 {
244   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
245   PetscErrorCode ierr;
246   int  i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
247   int             *diag = a->diag,oidx;
248   MatScalar       *aa=a->a,*v;
249   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
250   PetscScalar     *x,*b;
251 
252   PetscFunctionBegin;
253   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
254   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
255   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
256 
257   /* forward solve the U^T */
258   idx = 0;
259   for (i=0; i<n; i++) {
260 
261     v     = aa + 25*diag[i];
262     /* multiply by the inverse of the block diagonal */
263     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
264     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
265     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
266     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
267     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
268     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
269     v += 25;
270 
271     vi    = aj + diag[i] + 1;
272     nz    = ai[i+1] - diag[i] - 1;
273     while (nz--) {
274       oidx = 5*(*vi++);
275       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
276       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
277       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
278       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
279       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
280       v  += 25;
281     }
282     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
283     idx += 5;
284   }
285   /* backward solve the L^T */
286   for (i=n-1; i>=0; i--){
287     v    = aa + 25*diag[i] - 25;
288     vi   = aj + diag[i] - 1;
289     nz   = diag[i] - ai[i];
290     idt  = 5*i;
291     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
292     while (nz--) {
293       idx   = 5*(*vi--);
294       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
295       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
296       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
297       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
298       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
299       v -= 25;
300     }
301   }
302   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
303   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
304   PetscLogFlops(2*25*(a->nz) - 5*A->n);
305   PetscFunctionReturn(0);
306 }
307 
308 #undef __FUNCT__
309 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
310 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
311 {
312   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
313   PetscErrorCode ierr;
314   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
315   int             *diag = a->diag,oidx;
316   MatScalar       *aa=a->a,*v;
317   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
318   PetscScalar     *x,*b;
319 
320   PetscFunctionBegin;
321   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
322   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
323   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
324 
325   /* forward solve the U^T */
326   idx = 0;
327   for (i=0; i<n; i++) {
328 
329     v     = aa + 36*diag[i];
330     /* multiply by the inverse of the block diagonal */
331     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
332     x6    = x[5+idx];
333     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
334     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
335     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
336     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
337     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
338     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
339     v += 36;
340 
341     vi    = aj + diag[i] + 1;
342     nz    = ai[i+1] - diag[i] - 1;
343     while (nz--) {
344       oidx = 6*(*vi++);
345       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
346       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
347       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
348       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
349       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
350       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
351       v  += 36;
352     }
353     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
354     x[5+idx] = s6;
355     idx += 6;
356   }
357   /* backward solve the L^T */
358   for (i=n-1; i>=0; i--){
359     v    = aa + 36*diag[i] - 36;
360     vi   = aj + diag[i] - 1;
361     nz   = diag[i] - ai[i];
362     idt  = 6*i;
363     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
364     s6 = x[5+idt];
365     while (nz--) {
366       idx   = 6*(*vi--);
367       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
368       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
369       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
370       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
371       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
372       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
373       v -= 36;
374     }
375   }
376   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
377   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
378   PetscLogFlops(2*36*(a->nz) - 6*A->n);
379   PetscFunctionReturn(0);
380 }
381 
382 #undef __FUNCT__
383 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
384 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
385 {
386   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
387   PetscErrorCode ierr;
388   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
389   int             *diag = a->diag,oidx;
390   MatScalar       *aa=a->a,*v;
391   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
392   PetscScalar     *x,*b;
393 
394   PetscFunctionBegin;
395   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
396   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
397   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
398 
399   /* forward solve the U^T */
400   idx = 0;
401   for (i=0; i<n; i++) {
402 
403     v     = aa + 49*diag[i];
404     /* multiply by the inverse of the block diagonal */
405     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
406     x6    = x[5+idx]; x7 = x[6+idx];
407     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
408     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
409     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
410     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
411     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
412     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
413     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
414     v += 49;
415 
416     vi    = aj + diag[i] + 1;
417     nz    = ai[i+1] - diag[i] - 1;
418     while (nz--) {
419       oidx = 7*(*vi++);
420       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
421       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
422       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
423       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
424       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
425       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
426       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
427       v  += 49;
428     }
429     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
430     x[5+idx] = s6;x[6+idx] = s7;
431     idx += 7;
432   }
433   /* backward solve the L^T */
434   for (i=n-1; i>=0; i--){
435     v    = aa + 49*diag[i] - 49;
436     vi   = aj + diag[i] - 1;
437     nz   = diag[i] - ai[i];
438     idt  = 7*i;
439     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
440     s6 = x[5+idt];s7 = x[6+idt];
441     while (nz--) {
442       idx   = 7*(*vi--);
443       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
444       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
445       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
446       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
447       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
448       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
449       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
450       v -= 49;
451     }
452   }
453   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
454   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
455   PetscLogFlops(2*49*(a->nz) - 7*A->n);
456   PetscFunctionReturn(0);
457 }
458 
459 /*---------------------------------------------------------------------------------------------*/
460 #undef __FUNCT__
461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
462 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
463 {
464   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
465   IS              iscol=a->col,isrow=a->row;
466   PetscErrorCode ierr;
467   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
468   int             *diag = a->diag;
469   MatScalar       *aa=a->a,*v;
470   PetscScalar     s1,*x,*b,*t;
471 
472   PetscFunctionBegin;
473   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
474   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
475   t  = a->solve_work;
476 
477   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
478   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
479 
480   /* copy the b into temp work space according to permutation */
481   for (i=0; i<n; i++) {
482     t[i] = b[c[i]];
483   }
484 
485   /* forward solve the U^T */
486   for (i=0; i<n; i++) {
487 
488     v     = aa + diag[i];
489     /* multiply by the inverse of the block diagonal */
490     s1    = (*v++)*t[i];
491     vi    = aj + diag[i] + 1;
492     nz    = ai[i+1] - diag[i] - 1;
493     while (nz--) {
494       t[*vi++]  -= (*v++)*s1;
495     }
496     t[i]   = s1;
497   }
498   /* backward solve the L^T */
499   for (i=n-1; i>=0; i--){
500     v    = aa + diag[i] - 1;
501     vi   = aj + diag[i] - 1;
502     nz   = diag[i] - ai[i];
503     s1   = t[i];
504     while (nz--) {
505       t[*vi--]   -=  (*v--)*s1;
506     }
507   }
508 
509   /* copy t into x according to permutation */
510   for (i=0; i<n; i++) {
511     x[r[i]]   = t[i];
512   }
513 
514   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
515   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
516   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
517   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
518   PetscLogFlops(2*(a->nz) - A->n);
519   PetscFunctionReturn(0);
520 }
521 
522 #undef __FUNCT__
523 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
524 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
525 {
526   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
527   IS              iscol=a->col,isrow=a->row;
528   PetscErrorCode ierr;
529   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
530   int             *diag = a->diag,ii,ic,ir,oidx;
531   MatScalar       *aa=a->a,*v;
532   PetscScalar     s1,s2,x1,x2;
533   PetscScalar     *x,*b,*t;
534 
535   PetscFunctionBegin;
536   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
537   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
538   t  = a->solve_work;
539 
540   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
541   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
542 
543   /* copy the b into temp work space according to permutation */
544   ii = 0;
545   for (i=0; i<n; i++) {
546     ic      = 2*c[i];
547     t[ii]   = b[ic];
548     t[ii+1] = b[ic+1];
549     ii += 2;
550   }
551 
552   /* forward solve the U^T */
553   idx = 0;
554   for (i=0; i<n; i++) {
555 
556     v     = aa + 4*diag[i];
557     /* multiply by the inverse of the block diagonal */
558     x1    = t[idx];   x2 = t[1+idx];
559     s1 = v[0]*x1  +  v[1]*x2;
560     s2 = v[2]*x1  +  v[3]*x2;
561     v += 4;
562 
563     vi    = aj + diag[i] + 1;
564     nz    = ai[i+1] - diag[i] - 1;
565     while (nz--) {
566       oidx = 2*(*vi++);
567       t[oidx]   -= v[0]*s1  +  v[1]*s2;
568       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
569       v  += 4;
570     }
571     t[idx]   = s1;t[1+idx] = s2;
572     idx += 2;
573   }
574   /* backward solve the L^T */
575   for (i=n-1; i>=0; i--){
576     v    = aa + 4*diag[i] - 4;
577     vi   = aj + diag[i] - 1;
578     nz   = diag[i] - ai[i];
579     idt  = 2*i;
580     s1 = t[idt];  s2 = t[1+idt];
581     while (nz--) {
582       idx   = 2*(*vi--);
583       t[idx]   -=  v[0]*s1 +  v[1]*s2;
584       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
585       v -= 4;
586     }
587   }
588 
589   /* copy t into x according to permutation */
590   ii = 0;
591   for (i=0; i<n; i++) {
592     ir      = 2*r[i];
593     x[ir]   = t[ii];
594     x[ir+1] = t[ii+1];
595     ii += 2;
596   }
597 
598   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
599   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
600   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
601   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
602   PetscLogFlops(2*4*(a->nz) - 2*A->n);
603   PetscFunctionReturn(0);
604 }
605 
606 #undef __FUNCT__
607 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
608 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
609 {
610   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
611   IS              iscol=a->col,isrow=a->row;
612   PetscErrorCode ierr;
613   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
614   int             *diag = a->diag,ii,ic,ir,oidx;
615   MatScalar       *aa=a->a,*v;
616   PetscScalar     s1,s2,s3,x1,x2,x3;
617   PetscScalar     *x,*b,*t;
618 
619   PetscFunctionBegin;
620   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
621   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
622   t  = a->solve_work;
623 
624   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
625   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
626 
627   /* copy the b into temp work space according to permutation */
628   ii = 0;
629   for (i=0; i<n; i++) {
630     ic      = 3*c[i];
631     t[ii]   = b[ic];
632     t[ii+1] = b[ic+1];
633     t[ii+2] = b[ic+2];
634     ii += 3;
635   }
636 
637   /* forward solve the U^T */
638   idx = 0;
639   for (i=0; i<n; i++) {
640 
641     v     = aa + 9*diag[i];
642     /* multiply by the inverse of the block diagonal */
643     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
644     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
645     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
646     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
647     v += 9;
648 
649     vi    = aj + diag[i] + 1;
650     nz    = ai[i+1] - diag[i] - 1;
651     while (nz--) {
652       oidx = 3*(*vi++);
653       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
654       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
655       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
656       v  += 9;
657     }
658     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
659     idx += 3;
660   }
661   /* backward solve the L^T */
662   for (i=n-1; i>=0; i--){
663     v    = aa + 9*diag[i] - 9;
664     vi   = aj + diag[i] - 1;
665     nz   = diag[i] - ai[i];
666     idt  = 3*i;
667     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
668     while (nz--) {
669       idx   = 3*(*vi--);
670       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
671       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
672       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
673       v -= 9;
674     }
675   }
676 
677   /* copy t into x according to permutation */
678   ii = 0;
679   for (i=0; i<n; i++) {
680     ir      = 3*r[i];
681     x[ir]   = t[ii];
682     x[ir+1] = t[ii+1];
683     x[ir+2] = t[ii+2];
684     ii += 3;
685   }
686 
687   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
688   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
689   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
690   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
691   PetscLogFlops(2*9*(a->nz) - 3*A->n);
692   PetscFunctionReturn(0);
693 }
694 
695 #undef __FUNCT__
696 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
697 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
698 {
699   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
700   IS              iscol=a->col,isrow=a->row;
701   PetscErrorCode ierr;
702   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
703   int             *diag = a->diag,ii,ic,ir,oidx;
704   MatScalar       *aa=a->a,*v;
705   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
706   PetscScalar     *x,*b,*t;
707 
708   PetscFunctionBegin;
709   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
710   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
711   t  = a->solve_work;
712 
713   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
714   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
715 
716   /* copy the b into temp work space according to permutation */
717   ii = 0;
718   for (i=0; i<n; i++) {
719     ic      = 4*c[i];
720     t[ii]   = b[ic];
721     t[ii+1] = b[ic+1];
722     t[ii+2] = b[ic+2];
723     t[ii+3] = b[ic+3];
724     ii += 4;
725   }
726 
727   /* forward solve the U^T */
728   idx = 0;
729   for (i=0; i<n; i++) {
730 
731     v     = aa + 16*diag[i];
732     /* multiply by the inverse of the block diagonal */
733     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
734     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
735     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
736     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
737     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
738     v += 16;
739 
740     vi    = aj + diag[i] + 1;
741     nz    = ai[i+1] - diag[i] - 1;
742     while (nz--) {
743       oidx = 4*(*vi++);
744       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
745       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
746       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
747       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
748       v  += 16;
749     }
750     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
751     idx += 4;
752   }
753   /* backward solve the L^T */
754   for (i=n-1; i>=0; i--){
755     v    = aa + 16*diag[i] - 16;
756     vi   = aj + diag[i] - 1;
757     nz   = diag[i] - ai[i];
758     idt  = 4*i;
759     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
760     while (nz--) {
761       idx   = 4*(*vi--);
762       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
763       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
764       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
765       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
766       v -= 16;
767     }
768   }
769 
770   /* copy t into x according to permutation */
771   ii = 0;
772   for (i=0; i<n; i++) {
773     ir      = 4*r[i];
774     x[ir]   = t[ii];
775     x[ir+1] = t[ii+1];
776     x[ir+2] = t[ii+2];
777     x[ir+3] = t[ii+3];
778     ii += 4;
779   }
780 
781   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
782   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
783   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
784   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
785   PetscLogFlops(2*16*(a->nz) - 4*A->n);
786   PetscFunctionReturn(0);
787 }
788 
789 #undef __FUNCT__
790 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
791 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
792 {
793   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
794   IS              iscol=a->col,isrow=a->row;
795   PetscErrorCode ierr;
796   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
797   int             *diag = a->diag,ii,ic,ir,oidx;
798   MatScalar       *aa=a->a,*v;
799   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
800   PetscScalar     *x,*b,*t;
801 
802   PetscFunctionBegin;
803   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
804   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
805   t  = a->solve_work;
806 
807   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
808   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
809 
810   /* copy the b into temp work space according to permutation */
811   ii = 0;
812   for (i=0; i<n; i++) {
813     ic      = 5*c[i];
814     t[ii]   = b[ic];
815     t[ii+1] = b[ic+1];
816     t[ii+2] = b[ic+2];
817     t[ii+3] = b[ic+3];
818     t[ii+4] = b[ic+4];
819     ii += 5;
820   }
821 
822   /* forward solve the U^T */
823   idx = 0;
824   for (i=0; i<n; i++) {
825 
826     v     = aa + 25*diag[i];
827     /* multiply by the inverse of the block diagonal */
828     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
829     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
830     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
831     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
832     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
833     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
834     v += 25;
835 
836     vi    = aj + diag[i] + 1;
837     nz    = ai[i+1] - diag[i] - 1;
838     while (nz--) {
839       oidx = 5*(*vi++);
840       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
841       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
842       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
843       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
844       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
845       v  += 25;
846     }
847     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
848     idx += 5;
849   }
850   /* backward solve the L^T */
851   for (i=n-1; i>=0; i--){
852     v    = aa + 25*diag[i] - 25;
853     vi   = aj + diag[i] - 1;
854     nz   = diag[i] - ai[i];
855     idt  = 5*i;
856     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
857     while (nz--) {
858       idx   = 5*(*vi--);
859       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
860       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
861       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
862       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
863       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
864       v -= 25;
865     }
866   }
867 
868   /* copy t into x according to permutation */
869   ii = 0;
870   for (i=0; i<n; i++) {
871     ir      = 5*r[i];
872     x[ir]   = t[ii];
873     x[ir+1] = t[ii+1];
874     x[ir+2] = t[ii+2];
875     x[ir+3] = t[ii+3];
876     x[ir+4] = t[ii+4];
877     ii += 5;
878   }
879 
880   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
881   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
882   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
883   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
884   PetscLogFlops(2*25*(a->nz) - 5*A->n);
885   PetscFunctionReturn(0);
886 }
887 
888 #undef __FUNCT__
889 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
890 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
891 {
892   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
893   IS              iscol=a->col,isrow=a->row;
894   PetscErrorCode ierr;
895   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
896   int             *diag = a->diag,ii,ic,ir,oidx;
897   MatScalar       *aa=a->a,*v;
898   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
899   PetscScalar     *x,*b,*t;
900 
901   PetscFunctionBegin;
902   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
903   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
904   t  = a->solve_work;
905 
906   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
907   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
908 
909   /* copy the b into temp work space according to permutation */
910   ii = 0;
911   for (i=0; i<n; i++) {
912     ic      = 6*c[i];
913     t[ii]   = b[ic];
914     t[ii+1] = b[ic+1];
915     t[ii+2] = b[ic+2];
916     t[ii+3] = b[ic+3];
917     t[ii+4] = b[ic+4];
918     t[ii+5] = b[ic+5];
919     ii += 6;
920   }
921 
922   /* forward solve the U^T */
923   idx = 0;
924   for (i=0; i<n; i++) {
925 
926     v     = aa + 36*diag[i];
927     /* multiply by the inverse of the block diagonal */
928     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
929     x6    = t[5+idx];
930     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
931     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
932     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
933     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
934     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
935     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
936     v += 36;
937 
938     vi    = aj + diag[i] + 1;
939     nz    = ai[i+1] - diag[i] - 1;
940     while (nz--) {
941       oidx = 6*(*vi++);
942       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
943       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
944       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
945       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
946       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
947       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
948       v  += 36;
949     }
950     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
951     t[5+idx] = s6;
952     idx += 6;
953   }
954   /* backward solve the L^T */
955   for (i=n-1; i>=0; i--){
956     v    = aa + 36*diag[i] - 36;
957     vi   = aj + diag[i] - 1;
958     nz   = diag[i] - ai[i];
959     idt  = 6*i;
960     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
961     s6 = t[5+idt];
962     while (nz--) {
963       idx   = 6*(*vi--);
964       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
965       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
966       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
967       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
968       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
969       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
970       v -= 36;
971     }
972   }
973 
974   /* copy t into x according to permutation */
975   ii = 0;
976   for (i=0; i<n; i++) {
977     ir      = 6*r[i];
978     x[ir]   = t[ii];
979     x[ir+1] = t[ii+1];
980     x[ir+2] = t[ii+2];
981     x[ir+3] = t[ii+3];
982     x[ir+4] = t[ii+4];
983     x[ir+5] = t[ii+5];
984     ii += 6;
985   }
986 
987   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
988   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
989   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
990   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
991   PetscLogFlops(2*36*(a->nz) - 6*A->n);
992   PetscFunctionReturn(0);
993 }
994 
995 #undef __FUNCT__
996 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
997 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
998 {
999   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1000   IS              iscol=a->col,isrow=a->row;
1001   PetscErrorCode ierr;
1002   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1003   int             *diag = a->diag,ii,ic,ir,oidx;
1004   MatScalar       *aa=a->a,*v;
1005   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1006   PetscScalar     *x,*b,*t;
1007 
1008   PetscFunctionBegin;
1009   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1010   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1011   t  = a->solve_work;
1012 
1013   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1014   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1015 
1016   /* copy the b into temp work space according to permutation */
1017   ii = 0;
1018   for (i=0; i<n; i++) {
1019     ic      = 7*c[i];
1020     t[ii]   = b[ic];
1021     t[ii+1] = b[ic+1];
1022     t[ii+2] = b[ic+2];
1023     t[ii+3] = b[ic+3];
1024     t[ii+4] = b[ic+4];
1025     t[ii+5] = b[ic+5];
1026     t[ii+6] = b[ic+6];
1027     ii += 7;
1028   }
1029 
1030   /* forward solve the U^T */
1031   idx = 0;
1032   for (i=0; i<n; i++) {
1033 
1034     v     = aa + 49*diag[i];
1035     /* multiply by the inverse of the block diagonal */
1036     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1037     x6    = t[5+idx]; x7 = t[6+idx];
1038     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1039     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1040     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1041     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1042     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1043     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1044     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1045     v += 49;
1046 
1047     vi    = aj + diag[i] + 1;
1048     nz    = ai[i+1] - diag[i] - 1;
1049     while (nz--) {
1050       oidx = 7*(*vi++);
1051       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1052       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1053       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1054       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1055       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1056       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1057       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1058       v  += 49;
1059     }
1060     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1061     t[5+idx] = s6;t[6+idx] = s7;
1062     idx += 7;
1063   }
1064   /* backward solve the L^T */
1065   for (i=n-1; i>=0; i--){
1066     v    = aa + 49*diag[i] - 49;
1067     vi   = aj + diag[i] - 1;
1068     nz   = diag[i] - ai[i];
1069     idt  = 7*i;
1070     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1071     s6 = t[5+idt];s7 = t[6+idt];
1072     while (nz--) {
1073       idx   = 7*(*vi--);
1074       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1075       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1076       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1077       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1078       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1079       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1080       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1081       v -= 49;
1082     }
1083   }
1084 
1085   /* copy t into x according to permutation */
1086   ii = 0;
1087   for (i=0; i<n; i++) {
1088     ir      = 7*r[i];
1089     x[ir]   = t[ii];
1090     x[ir+1] = t[ii+1];
1091     x[ir+2] = t[ii+2];
1092     x[ir+3] = t[ii+3];
1093     x[ir+4] = t[ii+4];
1094     x[ir+5] = t[ii+5];
1095     x[ir+6] = t[ii+6];
1096     ii += 7;
1097   }
1098 
1099   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1100   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1101   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1102   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1103   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1104   PetscFunctionReturn(0);
1105 }
1106 
1107 /* ----------------------------------------------------------- */
1108 #undef __FUNCT__
1109 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1110 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1111 {
1112   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1113   IS              iscol=a->col,isrow=a->row;
1114   PetscErrorCode ierr;
1115   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1116   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1117   MatScalar       *aa=a->a,*v;
1118   PetscScalar     *x,*b,*s,*t,*ls;
1119 
1120   PetscFunctionBegin;
1121   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1122   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1123   t  = a->solve_work;
1124 
1125   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1126   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1127 
1128   /* forward solve the lower triangular */
1129   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1130   for (i=1; i<n; i++) {
1131     v   = aa + bs2*ai[i];
1132     vi  = aj + ai[i];
1133     nz  = a->diag[i] - ai[i];
1134     s = t + bs*i;
1135     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1136     while (nz--) {
1137       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1138       v += bs2;
1139     }
1140   }
1141   /* backward solve the upper triangular */
1142   ls = a->solve_work + A->n;
1143   for (i=n-1; i>=0; i--){
1144     v   = aa + bs2*(a->diag[i] + 1);
1145     vi  = aj + a->diag[i] + 1;
1146     nz  = ai[i+1] - a->diag[i] - 1;
1147     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1148     while (nz--) {
1149       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1150       v += bs2;
1151     }
1152     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1153     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1154   }
1155 
1156   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1157   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1158   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1159   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1160   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1161   PetscFunctionReturn(0);
1162 }
1163 
1164 #undef __FUNCT__
1165 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1166 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1167 {
1168   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1169   IS              iscol=a->col,isrow=a->row;
1170   PetscErrorCode ierr;
1171   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1172   int             *diag = a->diag;
1173   MatScalar       *aa=a->a,*v;
1174   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1175   PetscScalar     *x,*b,*t;
1176 
1177   PetscFunctionBegin;
1178   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1179   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1180   t  = a->solve_work;
1181 
1182   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1183   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1184 
1185   /* forward solve the lower triangular */
1186   idx    = 7*(*r++);
1187   t[0] = b[idx];   t[1] = b[1+idx];
1188   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1189   t[5] = b[5+idx]; t[6] = b[6+idx];
1190 
1191   for (i=1; i<n; i++) {
1192     v     = aa + 49*ai[i];
1193     vi    = aj + ai[i];
1194     nz    = diag[i] - ai[i];
1195     idx   = 7*(*r++);
1196     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1197     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1198     while (nz--) {
1199       idx   = 7*(*vi++);
1200       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1201       x4    = t[3+idx];x5 = t[4+idx];
1202       x6    = t[5+idx];x7 = t[6+idx];
1203       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1204       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1205       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1206       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1207       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1208       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1209       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1210       v += 49;
1211     }
1212     idx = 7*i;
1213     t[idx]   = s1;t[1+idx] = s2;
1214     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1215     t[5+idx] = s6;t[6+idx] = s7;
1216   }
1217   /* backward solve the upper triangular */
1218   for (i=n-1; i>=0; i--){
1219     v    = aa + 49*diag[i] + 49;
1220     vi   = aj + diag[i] + 1;
1221     nz   = ai[i+1] - diag[i] - 1;
1222     idt  = 7*i;
1223     s1 = t[idt];  s2 = t[1+idt];
1224     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1225     s6 = t[5+idt];s7 = t[6+idt];
1226     while (nz--) {
1227       idx   = 7*(*vi++);
1228       x1    = t[idx];   x2 = t[1+idx];
1229       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1230       x6    = t[5+idx]; x7 = t[6+idx];
1231       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1232       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1233       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1234       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1235       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1236       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1237       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1238       v += 49;
1239     }
1240     idc = 7*(*c--);
1241     v   = aa + 49*diag[i];
1242     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1243                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1244     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1245                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1246     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1247                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1248     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1249                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1250     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1251                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1252     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1253                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1254     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1255                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1256   }
1257 
1258   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1259   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1260   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1261   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1262   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1263   PetscFunctionReturn(0);
1264 }
1265 
1266 #undef __FUNCT__
1267 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1268 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1269 {
1270   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1271   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1272   PetscErrorCode ierr;
1273   int *diag = a->diag,jdx;
1274   MatScalar       *aa=a->a,*v;
1275   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1276 
1277   PetscFunctionBegin;
1278   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1279   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1280   /* forward solve the lower triangular */
1281   idx    = 0;
1282   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1283   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1284   x[6] = b[6+idx];
1285   for (i=1; i<n; i++) {
1286     v     =  aa + 49*ai[i];
1287     vi    =  aj + ai[i];
1288     nz    =  diag[i] - ai[i];
1289     idx   =  7*i;
1290     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1291     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1292     s7  =  b[6+idx];
1293     while (nz--) {
1294       jdx   = 7*(*vi++);
1295       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1296       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1297       x7    = x[6+jdx];
1298       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1299       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1300       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1301       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1302       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1303       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1304       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1305       v += 49;
1306      }
1307     x[idx]   = s1;
1308     x[1+idx] = s2;
1309     x[2+idx] = s3;
1310     x[3+idx] = s4;
1311     x[4+idx] = s5;
1312     x[5+idx] = s6;
1313     x[6+idx] = s7;
1314   }
1315   /* backward solve the upper triangular */
1316   for (i=n-1; i>=0; i--){
1317     v    = aa + 49*diag[i] + 49;
1318     vi   = aj + diag[i] + 1;
1319     nz   = ai[i+1] - diag[i] - 1;
1320     idt  = 7*i;
1321     s1 = x[idt];   s2 = x[1+idt];
1322     s3 = x[2+idt]; s4 = x[3+idt];
1323     s5 = x[4+idt]; s6 = x[5+idt];
1324     s7 = x[6+idt];
1325     while (nz--) {
1326       idx   = 7*(*vi++);
1327       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1328       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1329       x7    = x[6+idx];
1330       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1331       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1332       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1333       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1334       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1335       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1336       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1337       v += 49;
1338     }
1339     v        = aa + 49*diag[i];
1340     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1341                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1342     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1343                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1344     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1345                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1346     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1347                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1348     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1349                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1350     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1351                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1352     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1353                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1354   }
1355 
1356   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1357   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1358   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 #undef __FUNCT__
1363 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1364 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1365 {
1366   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1367   IS              iscol=a->col,isrow=a->row;
1368   PetscErrorCode ierr;
1369   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1370   int             *diag = a->diag;
1371   MatScalar       *aa=a->a,*v;
1372   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1373 
1374   PetscFunctionBegin;
1375   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1376   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1377   t  = a->solve_work;
1378 
1379   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1380   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1381 
1382   /* forward solve the lower triangular */
1383   idx    = 6*(*r++);
1384   t[0] = b[idx];   t[1] = b[1+idx];
1385   t[2] = b[2+idx]; t[3] = b[3+idx];
1386   t[4] = b[4+idx]; t[5] = b[5+idx];
1387   for (i=1; i<n; i++) {
1388     v     = aa + 36*ai[i];
1389     vi    = aj + ai[i];
1390     nz    = diag[i] - ai[i];
1391     idx   = 6*(*r++);
1392     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1393     s5  = b[4+idx]; s6 = b[5+idx];
1394     while (nz--) {
1395       idx   = 6*(*vi++);
1396       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1397       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1398       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1399       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1400       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1401       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1402       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1403       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1404       v += 36;
1405     }
1406     idx = 6*i;
1407     t[idx]   = s1;t[1+idx] = s2;
1408     t[2+idx] = s3;t[3+idx] = s4;
1409     t[4+idx] = s5;t[5+idx] = s6;
1410   }
1411   /* backward solve the upper triangular */
1412   for (i=n-1; i>=0; i--){
1413     v    = aa + 36*diag[i] + 36;
1414     vi   = aj + diag[i] + 1;
1415     nz   = ai[i+1] - diag[i] - 1;
1416     idt  = 6*i;
1417     s1 = t[idt];  s2 = t[1+idt];
1418     s3 = t[2+idt];s4 = t[3+idt];
1419     s5 = t[4+idt];s6 = t[5+idt];
1420     while (nz--) {
1421       idx   = 6*(*vi++);
1422       x1    = t[idx];   x2 = t[1+idx];
1423       x3    = t[2+idx]; x4 = t[3+idx];
1424       x5    = t[4+idx]; x6 = t[5+idx];
1425       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1426       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1427       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1428       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1429       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1430       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1431       v += 36;
1432     }
1433     idc = 6*(*c--);
1434     v   = aa + 36*diag[i];
1435     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1436                                  v[18]*s4+v[24]*s5+v[30]*s6;
1437     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1438                                  v[19]*s4+v[25]*s5+v[31]*s6;
1439     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1440                                  v[20]*s4+v[26]*s5+v[32]*s6;
1441     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1442                                  v[21]*s4+v[27]*s5+v[33]*s6;
1443     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1444                                  v[22]*s4+v[28]*s5+v[34]*s6;
1445     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1446                                  v[23]*s4+v[29]*s5+v[35]*s6;
1447   }
1448 
1449   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1450   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1451   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1452   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1453   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1454   PetscFunctionReturn(0);
1455 }
1456 
1457 #undef __FUNCT__
1458 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1459 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1460 {
1461   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1462   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1463   PetscErrorCode ierr;
1464   int *diag = a->diag,jdx;
1465   MatScalar       *aa=a->a,*v;
1466   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1467 
1468   PetscFunctionBegin;
1469   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1470   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1471   /* forward solve the lower triangular */
1472   idx    = 0;
1473   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1474   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1475   for (i=1; i<n; i++) {
1476     v     =  aa + 36*ai[i];
1477     vi    =  aj + ai[i];
1478     nz    =  diag[i] - ai[i];
1479     idx   =  6*i;
1480     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1481     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1482     while (nz--) {
1483       jdx   = 6*(*vi++);
1484       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1485       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1486       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1487       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1488       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1489       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1490       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1491       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1492       v += 36;
1493      }
1494     x[idx]   = s1;
1495     x[1+idx] = s2;
1496     x[2+idx] = s3;
1497     x[3+idx] = s4;
1498     x[4+idx] = s5;
1499     x[5+idx] = s6;
1500   }
1501   /* backward solve the upper triangular */
1502   for (i=n-1; i>=0; i--){
1503     v    = aa + 36*diag[i] + 36;
1504     vi   = aj + diag[i] + 1;
1505     nz   = ai[i+1] - diag[i] - 1;
1506     idt  = 6*i;
1507     s1 = x[idt];   s2 = x[1+idt];
1508     s3 = x[2+idt]; s4 = x[3+idt];
1509     s5 = x[4+idt]; s6 = x[5+idt];
1510     while (nz--) {
1511       idx   = 6*(*vi++);
1512       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1513       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1514       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1515       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1516       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1517       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1518       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1519       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1520       v += 36;
1521     }
1522     v        = aa + 36*diag[i];
1523     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1524     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1525     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1526     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1527     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1528     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1529   }
1530 
1531   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1532   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1533   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1534   PetscFunctionReturn(0);
1535 }
1536 
1537 #undef __FUNCT__
1538 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1539 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1540 {
1541   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1542   IS              iscol=a->col,isrow=a->row;
1543   PetscErrorCode ierr;
1544   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1545   int             *diag = a->diag;
1546   MatScalar       *aa=a->a,*v;
1547   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1548 
1549   PetscFunctionBegin;
1550   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1551   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1552   t  = a->solve_work;
1553 
1554   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1555   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1556 
1557   /* forward solve the lower triangular */
1558   idx    = 5*(*r++);
1559   t[0] = b[idx];   t[1] = b[1+idx];
1560   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1561   for (i=1; i<n; i++) {
1562     v     = aa + 25*ai[i];
1563     vi    = aj + ai[i];
1564     nz    = diag[i] - ai[i];
1565     idx   = 5*(*r++);
1566     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1567     s5  = b[4+idx];
1568     while (nz--) {
1569       idx   = 5*(*vi++);
1570       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1571       x4    = t[3+idx];x5 = t[4+idx];
1572       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1573       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1574       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1575       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1576       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1577       v += 25;
1578     }
1579     idx = 5*i;
1580     t[idx]   = s1;t[1+idx] = s2;
1581     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1582   }
1583   /* backward solve the upper triangular */
1584   for (i=n-1; i>=0; i--){
1585     v    = aa + 25*diag[i] + 25;
1586     vi   = aj + diag[i] + 1;
1587     nz   = ai[i+1] - diag[i] - 1;
1588     idt  = 5*i;
1589     s1 = t[idt];  s2 = t[1+idt];
1590     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1591     while (nz--) {
1592       idx   = 5*(*vi++);
1593       x1    = t[idx];   x2 = t[1+idx];
1594       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1595       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1596       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1597       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1598       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1599       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1600       v += 25;
1601     }
1602     idc = 5*(*c--);
1603     v   = aa + 25*diag[i];
1604     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1605                                  v[15]*s4+v[20]*s5;
1606     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1607                                  v[16]*s4+v[21]*s5;
1608     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1609                                  v[17]*s4+v[22]*s5;
1610     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1611                                  v[18]*s4+v[23]*s5;
1612     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1613                                  v[19]*s4+v[24]*s5;
1614   }
1615 
1616   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1617   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1618   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1619   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1620   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1621   PetscFunctionReturn(0);
1622 }
1623 
1624 #undef __FUNCT__
1625 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1626 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1627 {
1628   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1629   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1630   PetscErrorCode ierr;
1631   int *diag = a->diag,jdx;
1632   MatScalar       *aa=a->a,*v;
1633   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1634 
1635   PetscFunctionBegin;
1636   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1637   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1638   /* forward solve the lower triangular */
1639   idx    = 0;
1640   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1641   for (i=1; i<n; i++) {
1642     v     =  aa + 25*ai[i];
1643     vi    =  aj + ai[i];
1644     nz    =  diag[i] - ai[i];
1645     idx   =  5*i;
1646     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1647     while (nz--) {
1648       jdx   = 5*(*vi++);
1649       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1650       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1651       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1652       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1653       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1654       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1655       v    += 25;
1656     }
1657     x[idx]   = s1;
1658     x[1+idx] = s2;
1659     x[2+idx] = s3;
1660     x[3+idx] = s4;
1661     x[4+idx] = s5;
1662   }
1663   /* backward solve the upper triangular */
1664   for (i=n-1; i>=0; i--){
1665     v    = aa + 25*diag[i] + 25;
1666     vi   = aj + diag[i] + 1;
1667     nz   = ai[i+1] - diag[i] - 1;
1668     idt  = 5*i;
1669     s1 = x[idt];  s2 = x[1+idt];
1670     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1671     while (nz--) {
1672       idx   = 5*(*vi++);
1673       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1674       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1675       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1676       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1677       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1678       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1679       v    += 25;
1680     }
1681     v        = aa + 25*diag[i];
1682     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1683     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1684     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1685     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1686     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1687   }
1688 
1689   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1690   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1691   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1692   PetscFunctionReturn(0);
1693 }
1694 
1695 #undef __FUNCT__
1696 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1697 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1698 {
1699   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1700   IS              iscol=a->col,isrow=a->row;
1701   PetscErrorCode ierr;
1702   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1703   int             *diag = a->diag;
1704   MatScalar       *aa=a->a,*v;
1705   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1706 
1707   PetscFunctionBegin;
1708   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1709   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1710   t  = a->solve_work;
1711 
1712   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1713   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1714 
1715   /* forward solve the lower triangular */
1716   idx    = 4*(*r++);
1717   t[0] = b[idx];   t[1] = b[1+idx];
1718   t[2] = b[2+idx]; t[3] = b[3+idx];
1719   for (i=1; i<n; i++) {
1720     v     = aa + 16*ai[i];
1721     vi    = aj + ai[i];
1722     nz    = diag[i] - ai[i];
1723     idx   = 4*(*r++);
1724     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1725     while (nz--) {
1726       idx   = 4*(*vi++);
1727       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1728       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1729       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1730       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1731       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1732       v    += 16;
1733     }
1734     idx        = 4*i;
1735     t[idx]   = s1;t[1+idx] = s2;
1736     t[2+idx] = s3;t[3+idx] = s4;
1737   }
1738   /* backward solve the upper triangular */
1739   for (i=n-1; i>=0; i--){
1740     v    = aa + 16*diag[i] + 16;
1741     vi   = aj + diag[i] + 1;
1742     nz   = ai[i+1] - diag[i] - 1;
1743     idt  = 4*i;
1744     s1 = t[idt];  s2 = t[1+idt];
1745     s3 = t[2+idt];s4 = t[3+idt];
1746     while (nz--) {
1747       idx   = 4*(*vi++);
1748       x1    = t[idx];   x2 = t[1+idx];
1749       x3    = t[2+idx]; x4 = t[3+idx];
1750       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1751       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1752       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1753       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1754       v += 16;
1755     }
1756     idc      = 4*(*c--);
1757     v        = aa + 16*diag[i];
1758     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1759     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1760     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1761     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1762   }
1763 
1764   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1765   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1766   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1767   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1768   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1769   PetscFunctionReturn(0);
1770 }
1771 
1772 #undef __FUNCT__
1773 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1774 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1775 {
1776   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1777   IS              iscol=a->col,isrow=a->row;
1778   PetscErrorCode ierr;
1779   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1780   int             *diag = a->diag;
1781   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1782   PetscScalar     *x,*b;
1783 
1784   PetscFunctionBegin;
1785   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1786   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1787   t  = (MatScalar *)a->solve_work;
1788 
1789   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1790   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1791 
1792   /* forward solve the lower triangular */
1793   idx    = 4*(*r++);
1794   t[0] = (MatScalar)b[idx];
1795   t[1] = (MatScalar)b[1+idx];
1796   t[2] = (MatScalar)b[2+idx];
1797   t[3] = (MatScalar)b[3+idx];
1798   for (i=1; i<n; i++) {
1799     v     = aa + 16*ai[i];
1800     vi    = aj + ai[i];
1801     nz    = diag[i] - ai[i];
1802     idx   = 4*(*r++);
1803     s1 = (MatScalar)b[idx];
1804     s2 = (MatScalar)b[1+idx];
1805     s3 = (MatScalar)b[2+idx];
1806     s4 = (MatScalar)b[3+idx];
1807     while (nz--) {
1808       idx   = 4*(*vi++);
1809       x1  = t[idx];
1810       x2  = t[1+idx];
1811       x3  = t[2+idx];
1812       x4  = t[3+idx];
1813       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1814       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1815       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1816       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1817       v    += 16;
1818     }
1819     idx        = 4*i;
1820     t[idx]   = s1;
1821     t[1+idx] = s2;
1822     t[2+idx] = s3;
1823     t[3+idx] = s4;
1824   }
1825   /* backward solve the upper triangular */
1826   for (i=n-1; i>=0; i--){
1827     v    = aa + 16*diag[i] + 16;
1828     vi   = aj + diag[i] + 1;
1829     nz   = ai[i+1] - diag[i] - 1;
1830     idt  = 4*i;
1831     s1 = t[idt];
1832     s2 = t[1+idt];
1833     s3 = t[2+idt];
1834     s4 = t[3+idt];
1835     while (nz--) {
1836       idx   = 4*(*vi++);
1837       x1  = t[idx];
1838       x2  = t[1+idx];
1839       x3  = t[2+idx];
1840       x4  = t[3+idx];
1841       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1842       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1843       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1844       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1845       v += 16;
1846     }
1847     idc      = 4*(*c--);
1848     v        = aa + 16*diag[i];
1849     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1850     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1851     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1852     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1853     x[idc]   = (PetscScalar)t[idt];
1854     x[1+idc] = (PetscScalar)t[1+idt];
1855     x[2+idc] = (PetscScalar)t[2+idt];
1856     x[3+idc] = (PetscScalar)t[3+idt];
1857  }
1858 
1859   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1860   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1861   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1862   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1863   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1864   PetscFunctionReturn(0);
1865 }
1866 
1867 #if defined (PETSC_HAVE_SSE)
1868 
1869 #include PETSC_HAVE_SSE
1870 
1871 #undef __FUNCT__
1872 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1873 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1874 {
1875   /*
1876      Note: This code uses demotion of double
1877      to float when performing the mixed-mode computation.
1878      This may not be numerically reasonable for all applications.
1879   */
1880   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1881   IS              iscol=a->col,isrow=a->row;
1882   PetscErrorCode ierr;
1883   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1884   int             *diag = a->diag,ai16;
1885   MatScalar       *aa=a->a,*v;
1886   PetscScalar     *x,*b,*t;
1887 
1888   /* Make space in temp stack for 16 Byte Aligned arrays */
1889   float           ssealignedspace[11],*tmps,*tmpx;
1890   unsigned long   offset;
1891 
1892   PetscFunctionBegin;
1893   SSE_SCOPE_BEGIN;
1894 
1895     offset = (unsigned long)ssealignedspace % 16;
1896     if (offset) offset = (16 - offset)/4;
1897     tmps = &ssealignedspace[offset];
1898     tmpx = &ssealignedspace[offset+4];
1899     PREFETCH_NTA(aa+16*ai[1]);
1900 
1901     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1902     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1903     t  = a->solve_work;
1904 
1905     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1906     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1907 
1908     /* forward solve the lower triangular */
1909     idx  = 4*(*r++);
1910     t[0] = b[idx];   t[1] = b[1+idx];
1911     t[2] = b[2+idx]; t[3] = b[3+idx];
1912     v    =  aa + 16*ai[1];
1913 
1914     for (i=1; i<n;) {
1915       PREFETCH_NTA(&v[8]);
1916       vi   =  aj      + ai[i];
1917       nz   =  diag[i] - ai[i];
1918       idx  =  4*(*r++);
1919 
1920       /* Demote sum from double to float */
1921       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1922       LOAD_PS(tmps,XMM7);
1923 
1924       while (nz--) {
1925         PREFETCH_NTA(&v[16]);
1926         idx = 4*(*vi++);
1927 
1928         /* Demote solution (so far) from double to float */
1929         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1930 
1931         /* 4x4 Matrix-Vector product with negative accumulation: */
1932         SSE_INLINE_BEGIN_2(tmpx,v)
1933           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1934 
1935           /* First Column */
1936           SSE_COPY_PS(XMM0,XMM6)
1937           SSE_SHUFFLE(XMM0,XMM0,0x00)
1938           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1939           SSE_SUB_PS(XMM7,XMM0)
1940 
1941           /* Second Column */
1942           SSE_COPY_PS(XMM1,XMM6)
1943           SSE_SHUFFLE(XMM1,XMM1,0x55)
1944           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1945           SSE_SUB_PS(XMM7,XMM1)
1946 
1947           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1948 
1949           /* Third Column */
1950           SSE_COPY_PS(XMM2,XMM6)
1951           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1952           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1953           SSE_SUB_PS(XMM7,XMM2)
1954 
1955           /* Fourth Column */
1956           SSE_COPY_PS(XMM3,XMM6)
1957           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1958           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1959           SSE_SUB_PS(XMM7,XMM3)
1960         SSE_INLINE_END_2
1961 
1962         v  += 16;
1963       }
1964       idx = 4*i;
1965       v   = aa + 16*ai[++i];
1966       PREFETCH_NTA(v);
1967       STORE_PS(tmps,XMM7);
1968 
1969       /* Promote result from float to double */
1970       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1971     }
1972     /* backward solve the upper triangular */
1973     idt  = 4*(n-1);
1974     ai16 = 16*diag[n-1];
1975     v    = aa + ai16 + 16;
1976     for (i=n-1; i>=0;){
1977       PREFETCH_NTA(&v[8]);
1978       vi = aj + diag[i] + 1;
1979       nz = ai[i+1] - diag[i] - 1;
1980 
1981       /* Demote accumulator from double to float */
1982       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1983       LOAD_PS(tmps,XMM7);
1984 
1985       while (nz--) {
1986         PREFETCH_NTA(&v[16]);
1987         idx = 4*(*vi++);
1988 
1989         /* Demote solution (so far) from double to float */
1990         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1991 
1992         /* 4x4 Matrix-Vector Product with negative accumulation: */
1993         SSE_INLINE_BEGIN_2(tmpx,v)
1994           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1995 
1996           /* First Column */
1997           SSE_COPY_PS(XMM0,XMM6)
1998           SSE_SHUFFLE(XMM0,XMM0,0x00)
1999           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2000           SSE_SUB_PS(XMM7,XMM0)
2001 
2002           /* Second Column */
2003           SSE_COPY_PS(XMM1,XMM6)
2004           SSE_SHUFFLE(XMM1,XMM1,0x55)
2005           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2006           SSE_SUB_PS(XMM7,XMM1)
2007 
2008           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2009 
2010           /* Third Column */
2011           SSE_COPY_PS(XMM2,XMM6)
2012           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2013           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2014           SSE_SUB_PS(XMM7,XMM2)
2015 
2016           /* Fourth Column */
2017           SSE_COPY_PS(XMM3,XMM6)
2018           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2019           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2020           SSE_SUB_PS(XMM7,XMM3)
2021         SSE_INLINE_END_2
2022         v  += 16;
2023       }
2024       v    = aa + ai16;
2025       ai16 = 16*diag[--i];
2026       PREFETCH_NTA(aa+ai16+16);
2027       /*
2028          Scale the result by the diagonal 4x4 block,
2029          which was inverted as part of the factorization
2030       */
2031       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2032         /* First Column */
2033         SSE_COPY_PS(XMM0,XMM7)
2034         SSE_SHUFFLE(XMM0,XMM0,0x00)
2035         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2036 
2037         /* Second Column */
2038         SSE_COPY_PS(XMM1,XMM7)
2039         SSE_SHUFFLE(XMM1,XMM1,0x55)
2040         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2041         SSE_ADD_PS(XMM0,XMM1)
2042 
2043         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2044 
2045         /* Third Column */
2046         SSE_COPY_PS(XMM2,XMM7)
2047         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2048         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2049         SSE_ADD_PS(XMM0,XMM2)
2050 
2051         /* Fourth Column */
2052         SSE_COPY_PS(XMM3,XMM7)
2053         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2054         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2055         SSE_ADD_PS(XMM0,XMM3)
2056 
2057         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2058       SSE_INLINE_END_3
2059 
2060       /* Promote solution from float to double */
2061       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2062 
2063       /* Apply reordering to t and stream into x.    */
2064       /* This way, x doesn't pollute the cache.      */
2065       /* Be careful with size: 2 doubles = 4 floats! */
2066       idc  = 4*(*c--);
2067       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2068         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2069         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2070         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2071         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2072         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2073         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2074       SSE_INLINE_END_2
2075       v    = aa + ai16 + 16;
2076       idt -= 4;
2077     }
2078 
2079     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2080     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2081     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2082     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2083     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2084   SSE_SCOPE_END;
2085   PetscFunctionReturn(0);
2086 }
2087 
2088 #endif
2089 
2090 
2091 /*
2092       Special case where the matrix was ILU(0) factored in the natural
2093    ordering. This eliminates the need for the column and row permutation.
2094 */
2095 #undef __FUNCT__
2096 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2097 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2098 {
2099   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2100   int             n=a->mbs,*ai=a->i,*aj=a->j;
2101   PetscErrorCode ierr;
2102   int  *diag = a->diag;
2103   MatScalar       *aa=a->a;
2104   PetscScalar     *x,*b;
2105 
2106   PetscFunctionBegin;
2107   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2108   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2109 
2110 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2111   {
2112     static PetscScalar w[2000]; /* very BAD need to fix */
2113     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2114   }
2115 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2116   {
2117     static PetscScalar w[2000]; /* very BAD need to fix */
2118     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2119   }
2120 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2121   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2122 #else
2123   {
2124     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2125     MatScalar    *v;
2126     int          jdx,idt,idx,nz,*vi,i,ai16;
2127 
2128   /* forward solve the lower triangular */
2129   idx    = 0;
2130   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2131   for (i=1; i<n; i++) {
2132     v     =  aa      + 16*ai[i];
2133     vi    =  aj      + ai[i];
2134     nz    =  diag[i] - ai[i];
2135     idx   +=  4;
2136     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2137     while (nz--) {
2138       jdx   = 4*(*vi++);
2139       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2140       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2141       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2142       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2143       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2144       v    += 16;
2145     }
2146     x[idx]   = s1;
2147     x[1+idx] = s2;
2148     x[2+idx] = s3;
2149     x[3+idx] = s4;
2150   }
2151   /* backward solve the upper triangular */
2152   idt = 4*(n-1);
2153   for (i=n-1; i>=0; i--){
2154     ai16 = 16*diag[i];
2155     v    = aa + ai16 + 16;
2156     vi   = aj + diag[i] + 1;
2157     nz   = ai[i+1] - diag[i] - 1;
2158     s1 = x[idt];  s2 = x[1+idt];
2159     s3 = x[2+idt];s4 = x[3+idt];
2160     while (nz--) {
2161       idx   = 4*(*vi++);
2162       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2163       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2164       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2165       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2166       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2167       v    += 16;
2168     }
2169     v        = aa + ai16;
2170     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2171     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2172     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2173     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2174     idt -= 4;
2175   }
2176   }
2177 #endif
2178 
2179   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2180   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2181   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2182   PetscFunctionReturn(0);
2183 }
2184 
2185 #undef __FUNCT__
2186 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2187 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2188 {
2189   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2190   int             n=a->mbs,*ai=a->i,*aj=a->j;
2191   PetscErrorCode ierr;
2192   int  *diag = a->diag;
2193   MatScalar       *aa=a->a;
2194   PetscScalar     *x,*b;
2195 
2196   PetscFunctionBegin;
2197   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2198   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2199 
2200   {
2201     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2202     MatScalar  *v,*t=(MatScalar *)x;
2203     int        jdx,idt,idx,nz,*vi,i,ai16;
2204 
2205     /* forward solve the lower triangular */
2206     idx  = 0;
2207     t[0] = (MatScalar)b[0];
2208     t[1] = (MatScalar)b[1];
2209     t[2] = (MatScalar)b[2];
2210     t[3] = (MatScalar)b[3];
2211     for (i=1; i<n; i++) {
2212       v     =  aa      + 16*ai[i];
2213       vi    =  aj      + ai[i];
2214       nz    =  diag[i] - ai[i];
2215       idx   +=  4;
2216       s1 = (MatScalar)b[idx];
2217       s2 = (MatScalar)b[1+idx];
2218       s3 = (MatScalar)b[2+idx];
2219       s4 = (MatScalar)b[3+idx];
2220       while (nz--) {
2221         jdx = 4*(*vi++);
2222         x1  = t[jdx];
2223         x2  = t[1+jdx];
2224         x3  = t[2+jdx];
2225         x4  = t[3+jdx];
2226         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2227         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2228         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2229         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2230         v    += 16;
2231       }
2232       t[idx]   = s1;
2233       t[1+idx] = s2;
2234       t[2+idx] = s3;
2235       t[3+idx] = s4;
2236     }
2237     /* backward solve the upper triangular */
2238     idt = 4*(n-1);
2239     for (i=n-1; i>=0; i--){
2240       ai16 = 16*diag[i];
2241       v    = aa + ai16 + 16;
2242       vi   = aj + diag[i] + 1;
2243       nz   = ai[i+1] - diag[i] - 1;
2244       s1   = t[idt];
2245       s2   = t[1+idt];
2246       s3   = t[2+idt];
2247       s4   = t[3+idt];
2248       while (nz--) {
2249         idx = 4*(*vi++);
2250         x1  = (MatScalar)x[idx];
2251         x2  = (MatScalar)x[1+idx];
2252         x3  = (MatScalar)x[2+idx];
2253         x4  = (MatScalar)x[3+idx];
2254         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2255         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2256         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2257         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2258         v    += 16;
2259       }
2260       v        = aa + ai16;
2261       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2262       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2263       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2264       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2265       idt -= 4;
2266     }
2267   }
2268 
2269   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2270   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2271   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2272   PetscFunctionReturn(0);
2273 }
2274 
2275 #if defined (PETSC_HAVE_SSE)
2276 
2277 #include PETSC_HAVE_SSE
2278 #undef __FUNCT__
2279 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2280 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2281 {
2282   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2283   unsigned short *aj=(unsigned short *)a->j;
2284   PetscErrorCode ierr;
2285   int  *ai=a->i,n=a->mbs,*diag = a->diag;
2286   MatScalar      *aa=a->a;
2287   PetscScalar    *x,*b;
2288 
2289   PetscFunctionBegin;
2290   SSE_SCOPE_BEGIN;
2291   /*
2292      Note: This code currently uses demotion of double
2293      to float when performing the mixed-mode computation.
2294      This may not be numerically reasonable for all applications.
2295   */
2296   PREFETCH_NTA(aa+16*ai[1]);
2297 
2298   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2299   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2300   {
2301     /* x will first be computed in single precision then promoted inplace to double */
2302     MatScalar      *v,*t=(MatScalar *)x;
2303     int            nz,i,idt,ai16;
2304     unsigned int   jdx,idx;
2305     unsigned short *vi;
2306     /* Forward solve the lower triangular factor. */
2307 
2308     /* First block is the identity. */
2309     idx  = 0;
2310     CONVERT_DOUBLE4_FLOAT4(t,b);
2311     v    =  aa + 16*((unsigned int)ai[1]);
2312 
2313     for (i=1; i<n;) {
2314       PREFETCH_NTA(&v[8]);
2315       vi   =  aj      + ai[i];
2316       nz   =  diag[i] - ai[i];
2317       idx +=  4;
2318 
2319       /* Demote RHS from double to float. */
2320       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2321       LOAD_PS(&t[idx],XMM7);
2322 
2323       while (nz--) {
2324         PREFETCH_NTA(&v[16]);
2325         jdx = 4*((unsigned int)(*vi++));
2326 
2327         /* 4x4 Matrix-Vector product with negative accumulation: */
2328         SSE_INLINE_BEGIN_2(&t[jdx],v)
2329           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2330 
2331           /* First Column */
2332           SSE_COPY_PS(XMM0,XMM6)
2333           SSE_SHUFFLE(XMM0,XMM0,0x00)
2334           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2335           SSE_SUB_PS(XMM7,XMM0)
2336 
2337           /* Second Column */
2338           SSE_COPY_PS(XMM1,XMM6)
2339           SSE_SHUFFLE(XMM1,XMM1,0x55)
2340           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2341           SSE_SUB_PS(XMM7,XMM1)
2342 
2343           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2344 
2345           /* Third Column */
2346           SSE_COPY_PS(XMM2,XMM6)
2347           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2348           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2349           SSE_SUB_PS(XMM7,XMM2)
2350 
2351           /* Fourth Column */
2352           SSE_COPY_PS(XMM3,XMM6)
2353           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2354           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2355           SSE_SUB_PS(XMM7,XMM3)
2356         SSE_INLINE_END_2
2357 
2358         v  += 16;
2359       }
2360       v    =  aa + 16*ai[++i];
2361       PREFETCH_NTA(v);
2362       STORE_PS(&t[idx],XMM7);
2363     }
2364 
2365     /* Backward solve the upper triangular factor.*/
2366 
2367     idt  = 4*(n-1);
2368     ai16 = 16*diag[n-1];
2369     v    = aa + ai16 + 16;
2370     for (i=n-1; i>=0;){
2371       PREFETCH_NTA(&v[8]);
2372       vi = aj + diag[i] + 1;
2373       nz = ai[i+1] - diag[i] - 1;
2374 
2375       LOAD_PS(&t[idt],XMM7);
2376 
2377       while (nz--) {
2378         PREFETCH_NTA(&v[16]);
2379         idx = 4*((unsigned int)(*vi++));
2380 
2381         /* 4x4 Matrix-Vector Product with negative accumulation: */
2382         SSE_INLINE_BEGIN_2(&t[idx],v)
2383           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2384 
2385           /* First Column */
2386           SSE_COPY_PS(XMM0,XMM6)
2387           SSE_SHUFFLE(XMM0,XMM0,0x00)
2388           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2389           SSE_SUB_PS(XMM7,XMM0)
2390 
2391           /* Second Column */
2392           SSE_COPY_PS(XMM1,XMM6)
2393           SSE_SHUFFLE(XMM1,XMM1,0x55)
2394           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2395           SSE_SUB_PS(XMM7,XMM1)
2396 
2397           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2398 
2399           /* Third Column */
2400           SSE_COPY_PS(XMM2,XMM6)
2401           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2402           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2403           SSE_SUB_PS(XMM7,XMM2)
2404 
2405           /* Fourth Column */
2406           SSE_COPY_PS(XMM3,XMM6)
2407           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2408           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2409           SSE_SUB_PS(XMM7,XMM3)
2410         SSE_INLINE_END_2
2411         v  += 16;
2412       }
2413       v    = aa + ai16;
2414       ai16 = 16*diag[--i];
2415       PREFETCH_NTA(aa+ai16+16);
2416       /*
2417          Scale the result by the diagonal 4x4 block,
2418          which was inverted as part of the factorization
2419       */
2420       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2421         /* First Column */
2422         SSE_COPY_PS(XMM0,XMM7)
2423         SSE_SHUFFLE(XMM0,XMM0,0x00)
2424         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2425 
2426         /* Second Column */
2427         SSE_COPY_PS(XMM1,XMM7)
2428         SSE_SHUFFLE(XMM1,XMM1,0x55)
2429         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2430         SSE_ADD_PS(XMM0,XMM1)
2431 
2432         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2433 
2434         /* Third Column */
2435         SSE_COPY_PS(XMM2,XMM7)
2436         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2437         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2438         SSE_ADD_PS(XMM0,XMM2)
2439 
2440         /* Fourth Column */
2441         SSE_COPY_PS(XMM3,XMM7)
2442         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2443         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2444         SSE_ADD_PS(XMM0,XMM3)
2445 
2446         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2447       SSE_INLINE_END_3
2448 
2449       v    = aa + ai16 + 16;
2450       idt -= 4;
2451     }
2452 
2453     /* Convert t from single precision back to double precision (inplace)*/
2454     idt = 4*(n-1);
2455     for (i=n-1;i>=0;i--) {
2456       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2457       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2458       PetscScalar *xtemp=&x[idt];
2459       MatScalar   *ttemp=&t[idt];
2460       xtemp[3] = (PetscScalar)ttemp[3];
2461       xtemp[2] = (PetscScalar)ttemp[2];
2462       xtemp[1] = (PetscScalar)ttemp[1];
2463       xtemp[0] = (PetscScalar)ttemp[0];
2464       idt -= 4;
2465     }
2466 
2467   } /* End of artificial scope. */
2468   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2469   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2470   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2471   SSE_SCOPE_END;
2472   PetscFunctionReturn(0);
2473 }
2474 
2475 #undef __FUNCT__
2476 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2477 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2478 {
2479   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2480   int            *aj=a->j;
2481   PetscErrorCode ierr;
2482   int *ai=a->i,n=a->mbs,*diag = a->diag;
2483   MatScalar      *aa=a->a;
2484   PetscScalar    *x,*b;
2485 
2486   PetscFunctionBegin;
2487   SSE_SCOPE_BEGIN;
2488   /*
2489      Note: This code currently uses demotion of double
2490      to float when performing the mixed-mode computation.
2491      This may not be numerically reasonable for all applications.
2492   */
2493   PREFETCH_NTA(aa+16*ai[1]);
2494 
2495   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2496   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2497   {
2498     /* x will first be computed in single precision then promoted inplace to double */
2499     MatScalar *v,*t=(MatScalar *)x;
2500     int       nz,i,idt,ai16;
2501     int       jdx,idx;
2502     int       *vi;
2503     /* Forward solve the lower triangular factor. */
2504 
2505     /* First block is the identity. */
2506     idx  = 0;
2507     CONVERT_DOUBLE4_FLOAT4(t,b);
2508     v    =  aa + 16*ai[1];
2509 
2510     for (i=1; i<n;) {
2511       PREFETCH_NTA(&v[8]);
2512       vi   =  aj      + ai[i];
2513       nz   =  diag[i] - ai[i];
2514       idx +=  4;
2515 
2516       /* Demote RHS from double to float. */
2517       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2518       LOAD_PS(&t[idx],XMM7);
2519 
2520       while (nz--) {
2521         PREFETCH_NTA(&v[16]);
2522         jdx = 4*(*vi++);
2523 /*          jdx = *vi++; */
2524 
2525         /* 4x4 Matrix-Vector product with negative accumulation: */
2526         SSE_INLINE_BEGIN_2(&t[jdx],v)
2527           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2528 
2529           /* First Column */
2530           SSE_COPY_PS(XMM0,XMM6)
2531           SSE_SHUFFLE(XMM0,XMM0,0x00)
2532           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2533           SSE_SUB_PS(XMM7,XMM0)
2534 
2535           /* Second Column */
2536           SSE_COPY_PS(XMM1,XMM6)
2537           SSE_SHUFFLE(XMM1,XMM1,0x55)
2538           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2539           SSE_SUB_PS(XMM7,XMM1)
2540 
2541           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2542 
2543           /* Third Column */
2544           SSE_COPY_PS(XMM2,XMM6)
2545           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2546           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2547           SSE_SUB_PS(XMM7,XMM2)
2548 
2549           /* Fourth Column */
2550           SSE_COPY_PS(XMM3,XMM6)
2551           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2552           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2553           SSE_SUB_PS(XMM7,XMM3)
2554         SSE_INLINE_END_2
2555 
2556         v  += 16;
2557       }
2558       v    =  aa + 16*ai[++i];
2559       PREFETCH_NTA(v);
2560       STORE_PS(&t[idx],XMM7);
2561     }
2562 
2563     /* Backward solve the upper triangular factor.*/
2564 
2565     idt  = 4*(n-1);
2566     ai16 = 16*diag[n-1];
2567     v    = aa + ai16 + 16;
2568     for (i=n-1; i>=0;){
2569       PREFETCH_NTA(&v[8]);
2570       vi = aj + diag[i] + 1;
2571       nz = ai[i+1] - diag[i] - 1;
2572 
2573       LOAD_PS(&t[idt],XMM7);
2574 
2575       while (nz--) {
2576         PREFETCH_NTA(&v[16]);
2577         idx = 4*(*vi++);
2578 /*          idx = *vi++; */
2579 
2580         /* 4x4 Matrix-Vector Product with negative accumulation: */
2581         SSE_INLINE_BEGIN_2(&t[idx],v)
2582           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2583 
2584           /* First Column */
2585           SSE_COPY_PS(XMM0,XMM6)
2586           SSE_SHUFFLE(XMM0,XMM0,0x00)
2587           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2588           SSE_SUB_PS(XMM7,XMM0)
2589 
2590           /* Second Column */
2591           SSE_COPY_PS(XMM1,XMM6)
2592           SSE_SHUFFLE(XMM1,XMM1,0x55)
2593           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2594           SSE_SUB_PS(XMM7,XMM1)
2595 
2596           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2597 
2598           /* Third Column */
2599           SSE_COPY_PS(XMM2,XMM6)
2600           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2601           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2602           SSE_SUB_PS(XMM7,XMM2)
2603 
2604           /* Fourth Column */
2605           SSE_COPY_PS(XMM3,XMM6)
2606           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2607           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2608           SSE_SUB_PS(XMM7,XMM3)
2609         SSE_INLINE_END_2
2610         v  += 16;
2611       }
2612       v    = aa + ai16;
2613       ai16 = 16*diag[--i];
2614       PREFETCH_NTA(aa+ai16+16);
2615       /*
2616          Scale the result by the diagonal 4x4 block,
2617          which was inverted as part of the factorization
2618       */
2619       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2620         /* First Column */
2621         SSE_COPY_PS(XMM0,XMM7)
2622         SSE_SHUFFLE(XMM0,XMM0,0x00)
2623         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2624 
2625         /* Second Column */
2626         SSE_COPY_PS(XMM1,XMM7)
2627         SSE_SHUFFLE(XMM1,XMM1,0x55)
2628         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2629         SSE_ADD_PS(XMM0,XMM1)
2630 
2631         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2632 
2633         /* Third Column */
2634         SSE_COPY_PS(XMM2,XMM7)
2635         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2636         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2637         SSE_ADD_PS(XMM0,XMM2)
2638 
2639         /* Fourth Column */
2640         SSE_COPY_PS(XMM3,XMM7)
2641         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2642         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2643         SSE_ADD_PS(XMM0,XMM3)
2644 
2645         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2646       SSE_INLINE_END_3
2647 
2648       v    = aa + ai16 + 16;
2649       idt -= 4;
2650     }
2651 
2652     /* Convert t from single precision back to double precision (inplace)*/
2653     idt = 4*(n-1);
2654     for (i=n-1;i>=0;i--) {
2655       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2656       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2657       PetscScalar *xtemp=&x[idt];
2658       MatScalar   *ttemp=&t[idt];
2659       xtemp[3] = (PetscScalar)ttemp[3];
2660       xtemp[2] = (PetscScalar)ttemp[2];
2661       xtemp[1] = (PetscScalar)ttemp[1];
2662       xtemp[0] = (PetscScalar)ttemp[0];
2663       idt -= 4;
2664     }
2665 
2666   } /* End of artificial scope. */
2667   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2668   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2669   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2670   SSE_SCOPE_END;
2671   PetscFunctionReturn(0);
2672 }
2673 
2674 #endif
2675 #undef __FUNCT__
2676 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2677 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2678 {
2679   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2680   IS              iscol=a->col,isrow=a->row;
2681   PetscErrorCode ierr;
2682   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2683   int             *diag = a->diag;
2684   MatScalar       *aa=a->a,*v;
2685   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
2686 
2687   PetscFunctionBegin;
2688   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2689   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2690   t  = a->solve_work;
2691 
2692   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2693   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2694 
2695   /* forward solve the lower triangular */
2696   idx    = 3*(*r++);
2697   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2698   for (i=1; i<n; i++) {
2699     v     = aa + 9*ai[i];
2700     vi    = aj + ai[i];
2701     nz    = diag[i] - ai[i];
2702     idx   = 3*(*r++);
2703     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2704     while (nz--) {
2705       idx   = 3*(*vi++);
2706       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2707       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2708       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2709       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2710       v += 9;
2711     }
2712     idx = 3*i;
2713     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2714   }
2715   /* backward solve the upper triangular */
2716   for (i=n-1; i>=0; i--){
2717     v    = aa + 9*diag[i] + 9;
2718     vi   = aj + diag[i] + 1;
2719     nz   = ai[i+1] - diag[i] - 1;
2720     idt  = 3*i;
2721     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2722     while (nz--) {
2723       idx   = 3*(*vi++);
2724       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2725       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2726       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2727       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2728       v += 9;
2729     }
2730     idc = 3*(*c--);
2731     v   = aa + 9*diag[i];
2732     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2733     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2734     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2735   }
2736   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2737   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2738   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2739   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2740   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2741   PetscFunctionReturn(0);
2742 }
2743 
2744 /*
2745       Special case where the matrix was ILU(0) factored in the natural
2746    ordering. This eliminates the need for the column and row permutation.
2747 */
2748 #undef __FUNCT__
2749 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2750 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2751 {
2752   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2753   int             n=a->mbs,*ai=a->i,*aj=a->j;
2754   PetscErrorCode ierr;
2755   int *diag = a->diag;
2756   MatScalar       *aa=a->a,*v;
2757   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2758   int             jdx,idt,idx,nz,*vi,i;
2759 
2760   PetscFunctionBegin;
2761   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2763 
2764 
2765   /* forward solve the lower triangular */
2766   idx    = 0;
2767   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2768   for (i=1; i<n; i++) {
2769     v     =  aa      + 9*ai[i];
2770     vi    =  aj      + ai[i];
2771     nz    =  diag[i] - ai[i];
2772     idx   +=  3;
2773     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2774     while (nz--) {
2775       jdx   = 3*(*vi++);
2776       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2777       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2778       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2779       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2780       v    += 9;
2781     }
2782     x[idx]   = s1;
2783     x[1+idx] = s2;
2784     x[2+idx] = s3;
2785   }
2786   /* backward solve the upper triangular */
2787   for (i=n-1; i>=0; i--){
2788     v    = aa + 9*diag[i] + 9;
2789     vi   = aj + diag[i] + 1;
2790     nz   = ai[i+1] - diag[i] - 1;
2791     idt  = 3*i;
2792     s1 = x[idt];  s2 = x[1+idt];
2793     s3 = x[2+idt];
2794     while (nz--) {
2795       idx   = 3*(*vi++);
2796       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2797       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2798       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2799       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2800       v    += 9;
2801     }
2802     v        = aa +  9*diag[i];
2803     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2804     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2805     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2806   }
2807 
2808   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2809   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2810   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2811   PetscFunctionReturn(0);
2812 }
2813 
2814 #undef __FUNCT__
2815 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2816 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2817 {
2818   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2819   IS              iscol=a->col,isrow=a->row;
2820   PetscErrorCode ierr;
2821   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2822   int             *diag = a->diag;
2823   MatScalar       *aa=a->a,*v;
2824   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
2825 
2826   PetscFunctionBegin;
2827   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2828   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2829   t  = a->solve_work;
2830 
2831   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2832   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2833 
2834   /* forward solve the lower triangular */
2835   idx    = 2*(*r++);
2836   t[0] = b[idx]; t[1] = b[1+idx];
2837   for (i=1; i<n; i++) {
2838     v     = aa + 4*ai[i];
2839     vi    = aj + ai[i];
2840     nz    = diag[i] - ai[i];
2841     idx   = 2*(*r++);
2842     s1  = b[idx]; s2 = b[1+idx];
2843     while (nz--) {
2844       idx   = 2*(*vi++);
2845       x1    = t[idx]; x2 = t[1+idx];
2846       s1 -= v[0]*x1 + v[2]*x2;
2847       s2 -= v[1]*x1 + v[3]*x2;
2848       v += 4;
2849     }
2850     idx = 2*i;
2851     t[idx] = s1; t[1+idx] = s2;
2852   }
2853   /* backward solve the upper triangular */
2854   for (i=n-1; i>=0; i--){
2855     v    = aa + 4*diag[i] + 4;
2856     vi   = aj + diag[i] + 1;
2857     nz   = ai[i+1] - diag[i] - 1;
2858     idt  = 2*i;
2859     s1 = t[idt]; s2 = t[1+idt];
2860     while (nz--) {
2861       idx   = 2*(*vi++);
2862       x1    = t[idx]; x2 = t[1+idx];
2863       s1 -= v[0]*x1 + v[2]*x2;
2864       s2 -= v[1]*x1 + v[3]*x2;
2865       v += 4;
2866     }
2867     idc = 2*(*c--);
2868     v   = aa + 4*diag[i];
2869     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2870     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2871   }
2872   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2873   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2874   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2875   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2876   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2877   PetscFunctionReturn(0);
2878 }
2879 
2880 /*
2881       Special case where the matrix was ILU(0) factored in the natural
2882    ordering. This eliminates the need for the column and row permutation.
2883 */
2884 #undef __FUNCT__
2885 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2886 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2887 {
2888   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2889   int             n=a->mbs,*ai=a->i,*aj=a->j;
2890   PetscErrorCode ierr;
2891   int  *diag = a->diag;
2892   MatScalar       *aa=a->a,*v;
2893   PetscScalar     *x,*b,s1,s2,x1,x2;
2894   int             jdx,idt,idx,nz,*vi,i;
2895 
2896   PetscFunctionBegin;
2897   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2898   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2899 
2900   /* forward solve the lower triangular */
2901   idx    = 0;
2902   x[0]   = b[0]; x[1] = b[1];
2903   for (i=1; i<n; i++) {
2904     v     =  aa      + 4*ai[i];
2905     vi    =  aj      + ai[i];
2906     nz    =  diag[i] - ai[i];
2907     idx   +=  2;
2908     s1  =  b[idx];s2 = b[1+idx];
2909     while (nz--) {
2910       jdx   = 2*(*vi++);
2911       x1    = x[jdx];x2 = x[1+jdx];
2912       s1 -= v[0]*x1 + v[2]*x2;
2913       s2 -= v[1]*x1 + v[3]*x2;
2914       v    += 4;
2915     }
2916     x[idx]   = s1;
2917     x[1+idx] = s2;
2918   }
2919   /* backward solve the upper triangular */
2920   for (i=n-1; i>=0; i--){
2921     v    = aa + 4*diag[i] + 4;
2922     vi   = aj + diag[i] + 1;
2923     nz   = ai[i+1] - diag[i] - 1;
2924     idt  = 2*i;
2925     s1 = x[idt];  s2 = x[1+idt];
2926     while (nz--) {
2927       idx   = 2*(*vi++);
2928       x1    = x[idx];   x2 = x[1+idx];
2929       s1 -= v[0]*x1 + v[2]*x2;
2930       s2 -= v[1]*x1 + v[3]*x2;
2931       v    += 4;
2932     }
2933     v        = aa +  4*diag[i];
2934     x[idt]   = v[0]*s1 + v[2]*s2;
2935     x[1+idt] = v[1]*s1 + v[3]*s2;
2936   }
2937 
2938   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2939   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2940   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2941   PetscFunctionReturn(0);
2942 }
2943 
2944 #undef __FUNCT__
2945 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2946 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2947 {
2948   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2949   IS              iscol=a->col,isrow=a->row;
2950   PetscErrorCode ierr;
2951   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2952   int             *diag = a->diag;
2953   MatScalar       *aa=a->a,*v;
2954   PetscScalar     *x,*b,s1,*t;
2955 
2956   PetscFunctionBegin;
2957   if (!n) PetscFunctionReturn(0);
2958 
2959   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2960   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2961   t  = a->solve_work;
2962 
2963   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2964   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2965 
2966   /* forward solve the lower triangular */
2967   t[0] = b[*r++];
2968   for (i=1; i<n; i++) {
2969     v     = aa + ai[i];
2970     vi    = aj + ai[i];
2971     nz    = diag[i] - ai[i];
2972     s1  = b[*r++];
2973     while (nz--) {
2974       s1 -= (*v++)*t[*vi++];
2975     }
2976     t[i] = s1;
2977   }
2978   /* backward solve the upper triangular */
2979   for (i=n-1; i>=0; i--){
2980     v    = aa + diag[i] + 1;
2981     vi   = aj + diag[i] + 1;
2982     nz   = ai[i+1] - diag[i] - 1;
2983     s1 = t[i];
2984     while (nz--) {
2985       s1 -= (*v++)*t[*vi++];
2986     }
2987     x[*c--] = t[i] = aa[diag[i]]*s1;
2988   }
2989 
2990   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2991   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2992   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2993   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2994   PetscLogFlops(2*1*(a->nz) - A->n);
2995   PetscFunctionReturn(0);
2996 }
2997 /*
2998       Special case where the matrix was ILU(0) factored in the natural
2999    ordering. This eliminates the need for the column and row permutation.
3000 */
3001 #undef __FUNCT__
3002 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3003 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3004 {
3005   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
3006   int             n=a->mbs,*ai=a->i,*aj=a->j;
3007   PetscErrorCode ierr;
3008   int        *diag = a->diag;
3009   MatScalar       *aa=a->a;
3010   PetscScalar     *x,*b;
3011   PetscScalar     s1,x1;
3012   MatScalar       *v;
3013   int             jdx,idt,idx,nz,*vi,i;
3014 
3015   PetscFunctionBegin;
3016   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3017   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3018 
3019   /* forward solve the lower triangular */
3020   idx    = 0;
3021   x[0]   = b[0];
3022   for (i=1; i<n; i++) {
3023     v     =  aa      + ai[i];
3024     vi    =  aj      + ai[i];
3025     nz    =  diag[i] - ai[i];
3026     idx   +=  1;
3027     s1  =  b[idx];
3028     while (nz--) {
3029       jdx   = *vi++;
3030       x1    = x[jdx];
3031       s1 -= v[0]*x1;
3032       v    += 1;
3033     }
3034     x[idx]   = s1;
3035   }
3036   /* backward solve the upper triangular */
3037   for (i=n-1; i>=0; i--){
3038     v    = aa + diag[i] + 1;
3039     vi   = aj + diag[i] + 1;
3040     nz   = ai[i+1] - diag[i] - 1;
3041     idt  = i;
3042     s1 = x[idt];
3043     while (nz--) {
3044       idx   = *vi++;
3045       x1    = x[idx];
3046       s1 -= v[0]*x1;
3047       v    += 1;
3048     }
3049     v        = aa +  diag[i];
3050     x[idt]   = v[0]*s1;
3051   }
3052   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3053   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3054   PetscLogFlops(2*(a->nz) - A->n);
3055   PetscFunctionReturn(0);
3056 }
3057 
3058 /* ----------------------------------------------------------------*/
3059 /*
3060      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3061    except that the data structure of Mat_SeqAIJ is slightly different.
3062    Not a good example of code reuse.
3063 */
3064 EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat);
3065 
3066 #undef __FUNCT__
3067 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3068 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3069 {
3070   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3071   IS          isicol;
3072   PetscErrorCode ierr;
3073   int         *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3074   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3075   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3076   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
3077   PetscTruth  col_identity,row_identity;
3078   PetscReal   f;
3079 
3080   PetscFunctionBegin;
3081   f             = info->fill;
3082   levels        = (int)info->levels;
3083   diagonal_fill = (int)info->diagonal_fill;
3084   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3085   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3086   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3087 
3088   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3089     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3090     (*fact)->factor = FACTOR_LU;
3091     b               = (Mat_SeqBAIJ*)(*fact)->data;
3092     if (!b->diag) {
3093       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3094     }
3095     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3096     b->row        = isrow;
3097     b->col        = iscol;
3098     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3099     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3100     b->icol       = isicol;
3101     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3102     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3103   } else { /* general case perform the symbolic factorization */
3104     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3105     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3106 
3107     /* get new row pointers */
3108     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
3109     ainew[0] = 0;
3110     /* don't know how many column pointers are needed so estimate */
3111     jmax = (int)(f*ai[n] + 1);
3112     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
3113     /* ajfill is level of fill for each fill entry */
3114     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
3115     /* fill is a linked list of nonzeros in active row */
3116     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
3117     /* im is level for each filled value */
3118     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
3119     /* dloc is location of diagonal in factor */
3120     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
3121     dloc[0]  = 0;
3122     for (prow=0; prow<n; prow++) {
3123 
3124       /* copy prow into linked list */
3125       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3126       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3127       xi         = aj + ai[r[prow]];
3128       fill[n]    = n;
3129       fill[prow] = -1; /* marker for diagonal entry */
3130       while (nz--) {
3131 	fm  = n;
3132 	idx = ic[*xi++];
3133 	do {
3134 	  m  = fm;
3135 	  fm = fill[m];
3136 	} while (fm < idx);
3137 	fill[m]   = idx;
3138 	fill[idx] = fm;
3139 	im[idx]   = 0;
3140       }
3141 
3142       /* make sure diagonal entry is included */
3143       if (diagonal_fill && fill[prow] == -1) {
3144 	fm = n;
3145 	while (fill[fm] < prow) fm = fill[fm];
3146 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3147 	fill[fm]   = prow;
3148 	im[prow]   = 0;
3149 	nzf++;
3150 	dcount++;
3151       }
3152 
3153       nzi = 0;
3154       row = fill[n];
3155       while (row < prow) {
3156 	incrlev = im[row] + 1;
3157 	nz      = dloc[row];
3158 	xi      = ajnew  + ainew[row] + nz + 1;
3159 	flev    = ajfill + ainew[row] + nz + 1;
3160 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3161 	fm      = row;
3162 	while (nnz-- > 0) {
3163 	  idx = *xi++;
3164 	  if (*flev + incrlev > levels) {
3165 	    flev++;
3166 	    continue;
3167 	  }
3168 	  do {
3169 	    m  = fm;
3170 	    fm = fill[m];
3171 	  } while (fm < idx);
3172 	  if (fm != idx) {
3173 	    im[idx]   = *flev + incrlev;
3174 	    fill[m]   = idx;
3175 	    fill[idx] = fm;
3176 	    fm        = idx;
3177 	    nzf++;
3178 	  } else {
3179 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3180 	  }
3181 	  flev++;
3182 	}
3183 	row = fill[row];
3184 	nzi++;
3185       }
3186       /* copy new filled row into permanent storage */
3187       ainew[prow+1] = ainew[prow] + nzf;
3188       if (ainew[prow+1] > jmax) {
3189 
3190 	/* estimate how much additional space we will need */
3191 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3192 	/* just double the memory each time */
3193 	int maxadd = jmax;
3194 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3195 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3196 	jmax += maxadd;
3197 
3198 	/* allocate a longer ajnew and ajfill */
3199 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3200 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3201 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3202 	ajnew = xi;
3203 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3204 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3205 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3206 	ajfill = xi;
3207 	reallocate++; /* count how many reallocations are needed */
3208       }
3209       xi          = ajnew + ainew[prow];
3210       flev        = ajfill + ainew[prow];
3211       dloc[prow]  = nzi;
3212       fm          = fill[n];
3213       while (nzf--) {
3214 	*xi++   = fm;
3215 	*flev++ = im[fm];
3216 	fm      = fill[fm];
3217       }
3218       /* make sure row has diagonal entry */
3219       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3220 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3221     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3222       }
3223     }
3224     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3225     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3226     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3227     ierr = PetscFree(fill);CHKERRQ(ierr);
3228     ierr = PetscFree(im);CHKERRQ(ierr);
3229 
3230     {
3231       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3232       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3233       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3234       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3235       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3236       if (diagonal_fill) {
3237 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3238       }
3239     }
3240 
3241     /* put together the new matrix */
3242     ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr);
3243     ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr);
3244     ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr);
3245     PetscLogObjectParent(*fact,isicol);
3246     b = (Mat_SeqBAIJ*)(*fact)->data;
3247     ierr = PetscFree(b->imax);CHKERRQ(ierr);
3248     b->singlemalloc = PETSC_FALSE;
3249     len = bs2*ainew[n]*sizeof(MatScalar);
3250     /* the next line frees the default space generated by the Create() */
3251     ierr = PetscFree(b->a);CHKERRQ(ierr);
3252     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
3253     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
3254     b->j          = ajnew;
3255     b->i          = ainew;
3256     for (i=0; i<n; i++) dloc[i] += ainew[i];
3257     b->diag       = dloc;
3258     b->ilen       = 0;
3259     b->imax       = 0;
3260     b->row        = isrow;
3261     b->col        = iscol;
3262     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3263     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3264     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3265     b->icol       = isicol;
3266     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3267     /* In b structure:  Free imax, ilen, old a, old j.
3268        Allocate dloc, solve_work, new a, new j */
3269     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3270     b->maxnz          = b->nz = ainew[n];
3271     (*fact)->factor   = FACTOR_LU;
3272 
3273     (*fact)->info.factor_mallocs    = reallocate;
3274     (*fact)->info.fill_ratio_given  = f;
3275     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3276   }
3277 
3278   if (row_identity && col_identity) {
3279     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
3280   }
3281   PetscFunctionReturn(0);
3282 }
3283 
3284 #undef __FUNCT__
3285 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3286 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3287 {
3288   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3289   /* int i,*AJ=a->j,nz=a->nz; */
3290   PetscFunctionBegin;
3291   /* Undo Column scaling */
3292 /*    while (nz--) { */
3293 /*      AJ[i] = AJ[i]/4; */
3294 /*    } */
3295   /* This should really invoke a push/pop logic, but we don't have that yet. */
3296   A->ops->setunfactored = PETSC_NULL;
3297   PetscFunctionReturn(0);
3298 }
3299 
3300 #undef __FUNCT__
3301 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3302 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3303 {
3304   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3305   int *AJ=a->j,nz=a->nz;
3306   unsigned short *aj=(unsigned short *)AJ;
3307   PetscFunctionBegin;
3308   /* Is this really necessary? */
3309   while (nz--) {
3310     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3311   }
3312   A->ops->setunfactored = PETSC_NULL;
3313   PetscFunctionReturn(0);
3314 }
3315 
3316 #undef __FUNCT__
3317 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3318 PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3319 {
3320   /*
3321       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3322       with natural ordering
3323   */
3324   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3325 
3326   PetscFunctionBegin;
3327   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3328   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3329   switch (a->bs) {
3330   case 1:
3331     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3332     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3333     break;
3334   case 2:
3335     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3336     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3337     break;
3338   case 3:
3339     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3340     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3341     break;
3342   case 4:
3343 #if defined(PETSC_USE_MAT_SINGLE)
3344     {
3345       PetscTruth  sse_enabled_local;
3346       PetscErrorCode ierr;
3347       ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3348       if (sse_enabled_local) {
3349 #  if defined(PETSC_HAVE_SSE)
3350         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3351         if (n==(unsigned short)n) {
3352           unsigned short *aj=(unsigned short *)AJ;
3353           for (i=0;i<nz;i++) {
3354             aj[i] = (unsigned short)AJ[i];
3355           }
3356           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3357           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3358           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3359         } else {
3360         /* Scale the column indices for easier indexing in MatSolve. */
3361 /*            for (i=0;i<nz;i++) { */
3362 /*              AJ[i] = AJ[i]*4; */
3363 /*            } */
3364           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3365           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3366           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3367         }
3368 #  else
3369       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3370         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3371 #  endif
3372       } else {
3373         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3374         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3375       }
3376     }
3377 #else
3378     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3379     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3380 #endif
3381     break;
3382   case 5:
3383     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3384     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3385     break;
3386   case 6:
3387     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3388     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3389     break;
3390   case 7:
3391     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3392     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3393     break;
3394   }
3395   PetscFunctionReturn(0);
3396 }
3397 
3398 #undef __FUNCT__
3399 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3400 PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3401 {
3402   /*
3403       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3404       with natural ordering
3405   */
3406   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3407   IS          row = a->row, col = a->col;
3408   PetscTruth  row_identity, col_identity;
3409   PetscTruth  use_natural;
3410   PetscErrorCode ierr;
3411 
3412   PetscFunctionBegin;
3413 
3414   use_natural = PETSC_FALSE;
3415   if (row && col) {
3416     ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3417     ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3418 
3419     if (row_identity && col_identity) {
3420       use_natural = PETSC_TRUE;
3421     }
3422   } else {
3423     use_natural = PETSC_TRUE;
3424   }
3425 
3426   switch (a->bs) {
3427   case 1:
3428     if (use_natural) {
3429       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3430       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3431       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3432       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3433     } else {
3434       A->ops->solve           = MatSolve_SeqBAIJ_1;
3435       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3436     }
3437     break;
3438   case 2:
3439     if (use_natural) {
3440       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3441       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3442       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3443       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3444     } else {
3445       A->ops->solve           = MatSolve_SeqBAIJ_2;
3446       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3447     }
3448     break;
3449   case 3:
3450     if (use_natural) {
3451       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3452       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3453       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3454       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3455     } else {
3456       A->ops->solve           = MatSolve_SeqBAIJ_3;
3457       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3458     }
3459     break;
3460   case 4:
3461     {
3462       PetscTruth sse_enabled_local;
3463       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3464       if (use_natural) {
3465 #if defined(PETSC_USE_MAT_SINGLE)
3466         if (sse_enabled_local) { /* Natural + Single + SSE */
3467 #  if defined(PETSC_HAVE_SSE)
3468           int n=a->mbs;
3469           if (n==(unsigned short)n) {
3470             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3471             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3472           } else {
3473             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3474             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3475           }
3476 #  else
3477           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3478           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3479 #  endif
3480         } else { /* Natural + Single */
3481           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3482           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3483         }
3484 #else
3485         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3486         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3487 #endif
3488         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3489         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3490       } else { /* Arbitrary ordering */
3491 #if defined(PETSC_USE_MAT_SINGLE)
3492         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3493 #  if defined(PETSC_HAVE_SSE)
3494           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3495           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3496 #  else
3497           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3498           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3499 #  endif
3500         } else { /* Arbitrary + Single */
3501           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3502           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3503         }
3504 #else
3505         A->ops->solve           = MatSolve_SeqBAIJ_4;
3506 #endif
3507         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3508       }
3509     }
3510     break;
3511   case 5:
3512     if (use_natural) {
3513       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3514       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3515       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3516       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3517     } else {
3518       A->ops->solve           = MatSolve_SeqBAIJ_5;
3519       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3520     }
3521     break;
3522   case 6:
3523     if (use_natural) {
3524       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3525       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3526       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3527       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3528     } else {
3529       A->ops->solve           = MatSolve_SeqBAIJ_6;
3530       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3531     }
3532     break;
3533   case 7:
3534     if (use_natural) {
3535       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3536       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3537       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3538       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3539     } else {
3540       A->ops->solve           = MatSolve_SeqBAIJ_7;
3541       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3542     }
3543     break;
3544   default:
3545     A->ops->solve             = MatSolve_SeqBAIJ_N;
3546     break;
3547   }
3548   PetscFunctionReturn(0);
3549 }
3550 
3551 #undef __FUNCT__
3552 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3553 PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3554   PetscErrorCode ierr;
3555 
3556   PetscFunctionBegin;
3557   ierr = MatSeqBAIJ_UpdateSolvers(A);
3558   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3559     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3560   } else {
3561     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3562   }
3563   PetscFunctionReturn(0);
3564 }
3565 
3566 #undef __FUNCT__
3567 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3568 PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3569   PetscErrorCode ierr;
3570 
3571   PetscFunctionBegin;
3572   ierr = MatSeqBAIJ_UpdateSolvers(A);
3573   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3574   PetscFunctionReturn(0);
3575 }
3576 
3577 
3578