xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision ef66eb6987ddfdf4e414d6b820cbc8d8d7d17bc2)
1 /*$Id: baijfact2.c,v 1.70 2001/08/07 03:02:55 balay Exp bsmith $*/
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17   int             *diag = a->diag;
18   MatScalar       *aa=a->a,*v;
19   PetscScalar     s1,*x,*b;
20 
21   PetscFunctionBegin;
22   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
23   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25 
26   /* forward solve the U^T */
27   for (i=0; i<n; i++) {
28 
29     v     = aa + diag[i];
30     /* multiply by the inverse of the block diagonal */
31     s1    = (*v++)*x[i];
32     vi    = aj + diag[i] + 1;
33     nz    = ai[i+1] - diag[i] - 1;
34     while (nz--) {
35       x[*vi++]  -= (*v++)*s1;
36     }
37     x[i]   = s1;
38   }
39   /* backward solve the L^T */
40   for (i=n-1; i>=0; i--){
41     v    = aa + diag[i] - 1;
42     vi   = aj + diag[i] - 1;
43     nz   = diag[i] - ai[i];
44     s1   = x[i];
45     while (nz--) {
46       x[*vi--]   -=  (*v--)*s1;
47     }
48   }
49   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51   PetscLogFlops(2*(a->nz) - A->n);
52   PetscFunctionReturn(0);
53 }
54 
55 #undef __FUNCT__
56 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
57 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58 {
59   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
60   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61   int             *diag = a->diag,oidx;
62   MatScalar       *aa=a->a,*v;
63   PetscScalar     s1,s2,x1,x2;
64   PetscScalar     *x,*b;
65 
66   PetscFunctionBegin;
67   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
68   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
69   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
70 
71   /* forward solve the U^T */
72   idx = 0;
73   for (i=0; i<n; i++) {
74 
75     v     = aa + 4*diag[i];
76     /* multiply by the inverse of the block diagonal */
77     x1 = x[idx];   x2 = x[1+idx];
78     s1 = v[0]*x1  +  v[1]*x2;
79     s2 = v[2]*x1  +  v[3]*x2;
80     v += 4;
81 
82     vi    = aj + diag[i] + 1;
83     nz    = ai[i+1] - diag[i] - 1;
84     while (nz--) {
85       oidx = 2*(*vi++);
86       x[oidx]   -= v[0]*s1  +  v[1]*s2;
87       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
88       v  += 4;
89     }
90     x[idx]   = s1;x[1+idx] = s2;
91     idx += 2;
92   }
93   /* backward solve the L^T */
94   for (i=n-1; i>=0; i--){
95     v    = aa + 4*diag[i] - 4;
96     vi   = aj + diag[i] - 1;
97     nz   = diag[i] - ai[i];
98     idt  = 2*i;
99     s1   = x[idt];  s2 = x[1+idt];
100     while (nz--) {
101       idx   = 2*(*vi--);
102       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104       v -= 4;
105     }
106   }
107   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110   PetscFunctionReturn(0);
111 }
112 
113 #undef __FUNCT__
114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
115 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116 {
117   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
119   int             *diag = a->diag,oidx;
120   MatScalar       *aa=a->a,*v;
121   PetscScalar     s1,s2,s3,x1,x2,x3;
122   PetscScalar     *x,*b;
123 
124   PetscFunctionBegin;
125   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
126   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
127   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
128 
129   /* forward solve the U^T */
130   idx = 0;
131   for (i=0; i<n; i++) {
132 
133     v     = aa + 9*diag[i];
134     /* multiply by the inverse of the block diagonal */
135     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
136     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
137     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
138     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
139     v += 9;
140 
141     vi    = aj + diag[i] + 1;
142     nz    = ai[i+1] - diag[i] - 1;
143     while (nz--) {
144       oidx = 3*(*vi++);
145       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
146       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
147       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
148       v  += 9;
149     }
150     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
151     idx += 3;
152   }
153   /* backward solve the L^T */
154   for (i=n-1; i>=0; i--){
155     v    = aa + 9*diag[i] - 9;
156     vi   = aj + diag[i] - 1;
157     nz   = diag[i] - ai[i];
158     idt  = 3*i;
159     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
160     while (nz--) {
161       idx   = 3*(*vi--);
162       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
163       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
164       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
165       v -= 9;
166     }
167   }
168   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
170   PetscLogFlops(2*9*(a->nz) - 3*A->n);
171   PetscFunctionReturn(0);
172 }
173 
174 #undef __FUNCT__
175 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
176 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
177 {
178   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
179   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180   int             *diag = a->diag,oidx;
181   MatScalar       *aa=a->a,*v;
182   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
183   PetscScalar     *x,*b;
184 
185   PetscFunctionBegin;
186   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
187   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
188   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
189 
190   /* forward solve the U^T */
191   idx = 0;
192   for (i=0; i<n; i++) {
193 
194     v     = aa + 16*diag[i];
195     /* multiply by the inverse of the block diagonal */
196     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
197     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
198     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
199     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
200     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
201     v += 16;
202 
203     vi    = aj + diag[i] + 1;
204     nz    = ai[i+1] - diag[i] - 1;
205     while (nz--) {
206       oidx = 4*(*vi++);
207       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
208       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
209       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
210       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
211       v  += 16;
212     }
213     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
214     idx += 4;
215   }
216   /* backward solve the L^T */
217   for (i=n-1; i>=0; i--){
218     v    = aa + 16*diag[i] - 16;
219     vi   = aj + diag[i] - 1;
220     nz   = diag[i] - ai[i];
221     idt  = 4*i;
222     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
223     while (nz--) {
224       idx   = 4*(*vi--);
225       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
226       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
227       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
228       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
229       v -= 16;
230     }
231   }
232   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   PetscLogFlops(2*16*(a->nz) - 4*A->n);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
240 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
243   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
244   int             *diag = a->diag,oidx;
245   MatScalar       *aa=a->a,*v;
246   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
247   PetscScalar     *x,*b;
248 
249   PetscFunctionBegin;
250   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
251   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
252   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
253 
254   /* forward solve the U^T */
255   idx = 0;
256   for (i=0; i<n; i++) {
257 
258     v     = aa + 25*diag[i];
259     /* multiply by the inverse of the block diagonal */
260     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
261     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
262     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
263     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
264     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
265     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
266     v += 25;
267 
268     vi    = aj + diag[i] + 1;
269     nz    = ai[i+1] - diag[i] - 1;
270     while (nz--) {
271       oidx = 5*(*vi++);
272       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
273       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
274       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
275       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
276       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
277       v  += 25;
278     }
279     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
280     idx += 5;
281   }
282   /* backward solve the L^T */
283   for (i=n-1; i>=0; i--){
284     v    = aa + 25*diag[i] - 25;
285     vi   = aj + diag[i] - 1;
286     nz   = diag[i] - ai[i];
287     idt  = 5*i;
288     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
289     while (nz--) {
290       idx   = 5*(*vi--);
291       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
292       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
293       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
294       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
295       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
296       v -= 25;
297     }
298   }
299   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
300   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
301   PetscLogFlops(2*25*(a->nz) - 5*A->n);
302   PetscFunctionReturn(0);
303 }
304 
305 #undef __FUNCT__
306 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
307 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
308 {
309   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
310   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
311   int             *diag = a->diag,oidx;
312   MatScalar       *aa=a->a,*v;
313   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
314   PetscScalar     *x,*b;
315 
316   PetscFunctionBegin;
317   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
318   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
319   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
320 
321   /* forward solve the U^T */
322   idx = 0;
323   for (i=0; i<n; i++) {
324 
325     v     = aa + 36*diag[i];
326     /* multiply by the inverse of the block diagonal */
327     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
328     x6    = x[5+idx];
329     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
330     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
331     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
332     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
333     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
334     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
335     v += 36;
336 
337     vi    = aj + diag[i] + 1;
338     nz    = ai[i+1] - diag[i] - 1;
339     while (nz--) {
340       oidx = 6*(*vi++);
341       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
342       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
343       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
344       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
345       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
346       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
347       v  += 36;
348     }
349     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
350     x[5+idx] = s6;
351     idx += 6;
352   }
353   /* backward solve the L^T */
354   for (i=n-1; i>=0; i--){
355     v    = aa + 36*diag[i] - 36;
356     vi   = aj + diag[i] - 1;
357     nz   = diag[i] - ai[i];
358     idt  = 6*i;
359     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360     s6 = x[5+idt];
361     while (nz--) {
362       idx   = 6*(*vi--);
363       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
364       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
365       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
366       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
367       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
368       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
369       v -= 36;
370     }
371   }
372   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
373   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
374   PetscLogFlops(2*36*(a->nz) - 6*A->n);
375   PetscFunctionReturn(0);
376 }
377 
378 #undef __FUNCT__
379 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
380 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
381 {
382   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
383   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
384   int             *diag = a->diag,oidx;
385   MatScalar       *aa=a->a,*v;
386   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
387   PetscScalar     *x,*b;
388 
389   PetscFunctionBegin;
390   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
391   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
392   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
393 
394   /* forward solve the U^T */
395   idx = 0;
396   for (i=0; i<n; i++) {
397 
398     v     = aa + 49*diag[i];
399     /* multiply by the inverse of the block diagonal */
400     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
401     x6    = x[5+idx]; x7 = x[6+idx];
402     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
403     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
404     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
405     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
406     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
407     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
408     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
409     v += 49;
410 
411     vi    = aj + diag[i] + 1;
412     nz    = ai[i+1] - diag[i] - 1;
413     while (nz--) {
414       oidx = 7*(*vi++);
415       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
416       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
417       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
418       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
419       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
420       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
421       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
422       v  += 49;
423     }
424     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
425     x[5+idx] = s6;x[6+idx] = s7;
426     idx += 7;
427   }
428   /* backward solve the L^T */
429   for (i=n-1; i>=0; i--){
430     v    = aa + 49*diag[i] - 49;
431     vi   = aj + diag[i] - 1;
432     nz   = diag[i] - ai[i];
433     idt  = 7*i;
434     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
435     s6 = x[5+idt];s7 = x[6+idt];
436     while (nz--) {
437       idx   = 7*(*vi--);
438       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
439       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
440       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
441       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
442       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
443       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
444       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
445       v -= 49;
446     }
447   }
448   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
449   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
450   PetscLogFlops(2*49*(a->nz) - 7*A->n);
451   PetscFunctionReturn(0);
452 }
453 
454 /*---------------------------------------------------------------------------------------------*/
455 #undef __FUNCT__
456 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
457 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
458 {
459   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
460   IS              iscol=a->col,isrow=a->row;
461   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
462   int             *diag = a->diag;
463   MatScalar       *aa=a->a,*v;
464   PetscScalar     s1,*x,*b,*t;
465 
466   PetscFunctionBegin;
467   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
468   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
469   t  = a->solve_work;
470 
471   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
472   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
473 
474   /* copy the b into temp work space according to permutation */
475   for (i=0; i<n; i++) {
476     t[i] = b[c[i]];
477   }
478 
479   /* forward solve the U^T */
480   for (i=0; i<n; i++) {
481 
482     v     = aa + diag[i];
483     /* multiply by the inverse of the block diagonal */
484     s1    = (*v++)*t[i];
485     vi    = aj + diag[i] + 1;
486     nz    = ai[i+1] - diag[i] - 1;
487     while (nz--) {
488       t[*vi++]  -= (*v++)*s1;
489     }
490     t[i]   = s1;
491   }
492   /* backward solve the L^T */
493   for (i=n-1; i>=0; i--){
494     v    = aa + diag[i] - 1;
495     vi   = aj + diag[i] - 1;
496     nz   = diag[i] - ai[i];
497     s1   = t[i];
498     while (nz--) {
499       t[*vi--]   -=  (*v--)*s1;
500     }
501   }
502 
503   /* copy t into x according to permutation */
504   for (i=0; i<n; i++) {
505     x[r[i]]   = t[i];
506   }
507 
508   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
509   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
510   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
512   PetscLogFlops(2*(a->nz) - A->n);
513   PetscFunctionReturn(0);
514 }
515 
516 #undef __FUNCT__
517 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
518 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
519 {
520   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
521   IS              iscol=a->col,isrow=a->row;
522   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
523   int             *diag = a->diag,ii,ic,ir,oidx;
524   MatScalar       *aa=a->a,*v;
525   PetscScalar     s1,s2,x1,x2;
526   PetscScalar     *x,*b,*t;
527 
528   PetscFunctionBegin;
529   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
530   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
531   t  = a->solve_work;
532 
533   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
534   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
535 
536   /* copy the b into temp work space according to permutation */
537   ii = 0;
538   for (i=0; i<n; i++) {
539     ic      = 2*c[i];
540     t[ii]   = b[ic];
541     t[ii+1] = b[ic+1];
542     ii += 2;
543   }
544 
545   /* forward solve the U^T */
546   idx = 0;
547   for (i=0; i<n; i++) {
548 
549     v     = aa + 4*diag[i];
550     /* multiply by the inverse of the block diagonal */
551     x1    = t[idx];   x2 = t[1+idx];
552     s1 = v[0]*x1  +  v[1]*x2;
553     s2 = v[2]*x1  +  v[3]*x2;
554     v += 4;
555 
556     vi    = aj + diag[i] + 1;
557     nz    = ai[i+1] - diag[i] - 1;
558     while (nz--) {
559       oidx = 2*(*vi++);
560       t[oidx]   -= v[0]*s1  +  v[1]*s2;
561       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
562       v  += 4;
563     }
564     t[idx]   = s1;t[1+idx] = s2;
565     idx += 2;
566   }
567   /* backward solve the L^T */
568   for (i=n-1; i>=0; i--){
569     v    = aa + 4*diag[i] - 4;
570     vi   = aj + diag[i] - 1;
571     nz   = diag[i] - ai[i];
572     idt  = 2*i;
573     s1 = t[idt];  s2 = t[1+idt];
574     while (nz--) {
575       idx   = 2*(*vi--);
576       t[idx]   -=  v[0]*s1 +  v[1]*s2;
577       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
578       v -= 4;
579     }
580   }
581 
582   /* copy t into x according to permutation */
583   ii = 0;
584   for (i=0; i<n; i++) {
585     ir      = 2*r[i];
586     x[ir]   = t[ii];
587     x[ir+1] = t[ii+1];
588     ii += 2;
589   }
590 
591   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
592   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
593   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
595   PetscLogFlops(2*4*(a->nz) - 2*A->n);
596   PetscFunctionReturn(0);
597 }
598 
599 #undef __FUNCT__
600 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
601 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
602 {
603   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
604   IS              iscol=a->col,isrow=a->row;
605   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
606   int             *diag = a->diag,ii,ic,ir,oidx;
607   MatScalar       *aa=a->a,*v;
608   PetscScalar     s1,s2,s3,x1,x2,x3;
609   PetscScalar     *x,*b,*t;
610 
611   PetscFunctionBegin;
612   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
613   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
614   t  = a->solve_work;
615 
616   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
617   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
618 
619   /* copy the b into temp work space according to permutation */
620   ii = 0;
621   for (i=0; i<n; i++) {
622     ic      = 3*c[i];
623     t[ii]   = b[ic];
624     t[ii+1] = b[ic+1];
625     t[ii+2] = b[ic+2];
626     ii += 3;
627   }
628 
629   /* forward solve the U^T */
630   idx = 0;
631   for (i=0; i<n; i++) {
632 
633     v     = aa + 9*diag[i];
634     /* multiply by the inverse of the block diagonal */
635     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
636     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
637     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
638     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
639     v += 9;
640 
641     vi    = aj + diag[i] + 1;
642     nz    = ai[i+1] - diag[i] - 1;
643     while (nz--) {
644       oidx = 3*(*vi++);
645       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
646       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
647       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
648       v  += 9;
649     }
650     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
651     idx += 3;
652   }
653   /* backward solve the L^T */
654   for (i=n-1; i>=0; i--){
655     v    = aa + 9*diag[i] - 9;
656     vi   = aj + diag[i] - 1;
657     nz   = diag[i] - ai[i];
658     idt  = 3*i;
659     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
660     while (nz--) {
661       idx   = 3*(*vi--);
662       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
663       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
664       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
665       v -= 9;
666     }
667   }
668 
669   /* copy t into x according to permutation */
670   ii = 0;
671   for (i=0; i<n; i++) {
672     ir      = 3*r[i];
673     x[ir]   = t[ii];
674     x[ir+1] = t[ii+1];
675     x[ir+2] = t[ii+2];
676     ii += 3;
677   }
678 
679   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
680   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
681   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
682   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
683   PetscLogFlops(2*9*(a->nz) - 3*A->n);
684   PetscFunctionReturn(0);
685 }
686 
687 #undef __FUNCT__
688 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
689 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
690 {
691   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
692   IS              iscol=a->col,isrow=a->row;
693   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
694   int             *diag = a->diag,ii,ic,ir,oidx;
695   MatScalar       *aa=a->a,*v;
696   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
697   PetscScalar     *x,*b,*t;
698 
699   PetscFunctionBegin;
700   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
701   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
702   t  = a->solve_work;
703 
704   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
705   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
706 
707   /* copy the b into temp work space according to permutation */
708   ii = 0;
709   for (i=0; i<n; i++) {
710     ic      = 4*c[i];
711     t[ii]   = b[ic];
712     t[ii+1] = b[ic+1];
713     t[ii+2] = b[ic+2];
714     t[ii+3] = b[ic+3];
715     ii += 4;
716   }
717 
718   /* forward solve the U^T */
719   idx = 0;
720   for (i=0; i<n; i++) {
721 
722     v     = aa + 16*diag[i];
723     /* multiply by the inverse of the block diagonal */
724     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
725     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
726     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
727     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
728     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
729     v += 16;
730 
731     vi    = aj + diag[i] + 1;
732     nz    = ai[i+1] - diag[i] - 1;
733     while (nz--) {
734       oidx = 4*(*vi++);
735       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
736       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
737       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
738       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
739       v  += 16;
740     }
741     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
742     idx += 4;
743   }
744   /* backward solve the L^T */
745   for (i=n-1; i>=0; i--){
746     v    = aa + 16*diag[i] - 16;
747     vi   = aj + diag[i] - 1;
748     nz   = diag[i] - ai[i];
749     idt  = 4*i;
750     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
751     while (nz--) {
752       idx   = 4*(*vi--);
753       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
754       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
755       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
756       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
757       v -= 16;
758     }
759   }
760 
761   /* copy t into x according to permutation */
762   ii = 0;
763   for (i=0; i<n; i++) {
764     ir      = 4*r[i];
765     x[ir]   = t[ii];
766     x[ir+1] = t[ii+1];
767     x[ir+2] = t[ii+2];
768     x[ir+3] = t[ii+3];
769     ii += 4;
770   }
771 
772   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
773   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
774   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
775   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
776   PetscLogFlops(2*16*(a->nz) - 4*A->n);
777   PetscFunctionReturn(0);
778 }
779 
780 #undef __FUNCT__
781 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
782 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
783 {
784   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
785   IS              iscol=a->col,isrow=a->row;
786   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
787   int             *diag = a->diag,ii,ic,ir,oidx;
788   MatScalar       *aa=a->a,*v;
789   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
790   PetscScalar     *x,*b,*t;
791 
792   PetscFunctionBegin;
793   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
794   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795   t  = a->solve_work;
796 
797   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
798   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
799 
800   /* copy the b into temp work space according to permutation */
801   ii = 0;
802   for (i=0; i<n; i++) {
803     ic      = 5*c[i];
804     t[ii]   = b[ic];
805     t[ii+1] = b[ic+1];
806     t[ii+2] = b[ic+2];
807     t[ii+3] = b[ic+3];
808     t[ii+4] = b[ic+4];
809     ii += 5;
810   }
811 
812   /* forward solve the U^T */
813   idx = 0;
814   for (i=0; i<n; i++) {
815 
816     v     = aa + 25*diag[i];
817     /* multiply by the inverse of the block diagonal */
818     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
819     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
820     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
821     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
822     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
823     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
824     v += 25;
825 
826     vi    = aj + diag[i] + 1;
827     nz    = ai[i+1] - diag[i] - 1;
828     while (nz--) {
829       oidx = 5*(*vi++);
830       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
831       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
832       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
833       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
834       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
835       v  += 25;
836     }
837     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
838     idx += 5;
839   }
840   /* backward solve the L^T */
841   for (i=n-1; i>=0; i--){
842     v    = aa + 25*diag[i] - 25;
843     vi   = aj + diag[i] - 1;
844     nz   = diag[i] - ai[i];
845     idt  = 5*i;
846     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
847     while (nz--) {
848       idx   = 5*(*vi--);
849       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v -= 25;
855     }
856   }
857 
858   /* copy t into x according to permutation */
859   ii = 0;
860   for (i=0; i<n; i++) {
861     ir      = 5*r[i];
862     x[ir]   = t[ii];
863     x[ir+1] = t[ii+1];
864     x[ir+2] = t[ii+2];
865     x[ir+3] = t[ii+3];
866     x[ir+4] = t[ii+4];
867     ii += 5;
868   }
869 
870   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
871   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
872   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
873   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
874   PetscLogFlops(2*25*(a->nz) - 5*A->n);
875   PetscFunctionReturn(0);
876 }
877 
878 #undef __FUNCT__
879 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
880 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
881 {
882   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
883   IS              iscol=a->col,isrow=a->row;
884   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
885   int             *diag = a->diag,ii,ic,ir,oidx;
886   MatScalar       *aa=a->a,*v;
887   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
888   PetscScalar     *x,*b,*t;
889 
890   PetscFunctionBegin;
891   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
893   t  = a->solve_work;
894 
895   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
896   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
897 
898   /* copy the b into temp work space according to permutation */
899   ii = 0;
900   for (i=0; i<n; i++) {
901     ic      = 6*c[i];
902     t[ii]   = b[ic];
903     t[ii+1] = b[ic+1];
904     t[ii+2] = b[ic+2];
905     t[ii+3] = b[ic+3];
906     t[ii+4] = b[ic+4];
907     t[ii+5] = b[ic+5];
908     ii += 6;
909   }
910 
911   /* forward solve the U^T */
912   idx = 0;
913   for (i=0; i<n; i++) {
914 
915     v     = aa + 36*diag[i];
916     /* multiply by the inverse of the block diagonal */
917     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
918     x6    = t[5+idx];
919     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
920     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
921     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
922     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
923     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
924     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
925     v += 36;
926 
927     vi    = aj + diag[i] + 1;
928     nz    = ai[i+1] - diag[i] - 1;
929     while (nz--) {
930       oidx = 6*(*vi++);
931       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
932       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
933       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
934       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
935       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
936       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
937       v  += 36;
938     }
939     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
940     t[5+idx] = s6;
941     idx += 6;
942   }
943   /* backward solve the L^T */
944   for (i=n-1; i>=0; i--){
945     v    = aa + 36*diag[i] - 36;
946     vi   = aj + diag[i] - 1;
947     nz   = diag[i] - ai[i];
948     idt  = 6*i;
949     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
950     s6 = t[5+idt];
951     while (nz--) {
952       idx   = 6*(*vi--);
953       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
954       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
955       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
956       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
957       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
958       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
959       v -= 36;
960     }
961   }
962 
963   /* copy t into x according to permutation */
964   ii = 0;
965   for (i=0; i<n; i++) {
966     ir      = 6*r[i];
967     x[ir]   = t[ii];
968     x[ir+1] = t[ii+1];
969     x[ir+2] = t[ii+2];
970     x[ir+3] = t[ii+3];
971     x[ir+4] = t[ii+4];
972     x[ir+5] = t[ii+5];
973     ii += 6;
974   }
975 
976   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
977   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
978   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
979   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
980   PetscLogFlops(2*36*(a->nz) - 6*A->n);
981   PetscFunctionReturn(0);
982 }
983 
984 #undef __FUNCT__
985 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
986 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
987 {
988   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
989   IS              iscol=a->col,isrow=a->row;
990   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
991   int             *diag = a->diag,ii,ic,ir,oidx;
992   MatScalar       *aa=a->a,*v;
993   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
994   PetscScalar     *x,*b,*t;
995 
996   PetscFunctionBegin;
997   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
998   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
999   t  = a->solve_work;
1000 
1001   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1002   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1003 
1004   /* copy the b into temp work space according to permutation */
1005   ii = 0;
1006   for (i=0; i<n; i++) {
1007     ic      = 7*c[i];
1008     t[ii]   = b[ic];
1009     t[ii+1] = b[ic+1];
1010     t[ii+2] = b[ic+2];
1011     t[ii+3] = b[ic+3];
1012     t[ii+4] = b[ic+4];
1013     t[ii+5] = b[ic+5];
1014     t[ii+6] = b[ic+6];
1015     ii += 7;
1016   }
1017 
1018   /* forward solve the U^T */
1019   idx = 0;
1020   for (i=0; i<n; i++) {
1021 
1022     v     = aa + 49*diag[i];
1023     /* multiply by the inverse of the block diagonal */
1024     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1025     x6    = t[5+idx]; x7 = t[6+idx];
1026     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1027     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1028     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1029     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1030     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1031     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1032     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1033     v += 49;
1034 
1035     vi    = aj + diag[i] + 1;
1036     nz    = ai[i+1] - diag[i] - 1;
1037     while (nz--) {
1038       oidx = 7*(*vi++);
1039       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1040       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1041       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1042       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1043       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1044       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1045       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1046       v  += 49;
1047     }
1048     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1049     t[5+idx] = s6;t[6+idx] = s7;
1050     idx += 7;
1051   }
1052   /* backward solve the L^T */
1053   for (i=n-1; i>=0; i--){
1054     v    = aa + 49*diag[i] - 49;
1055     vi   = aj + diag[i] - 1;
1056     nz   = diag[i] - ai[i];
1057     idt  = 7*i;
1058     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1059     s6 = t[5+idt];s7 = t[6+idt];
1060     while (nz--) {
1061       idx   = 7*(*vi--);
1062       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v -= 49;
1070     }
1071   }
1072 
1073   /* copy t into x according to permutation */
1074   ii = 0;
1075   for (i=0; i<n; i++) {
1076     ir      = 7*r[i];
1077     x[ir]   = t[ii];
1078     x[ir+1] = t[ii+1];
1079     x[ir+2] = t[ii+2];
1080     x[ir+3] = t[ii+3];
1081     x[ir+4] = t[ii+4];
1082     x[ir+5] = t[ii+5];
1083     x[ir+6] = t[ii+6];
1084     ii += 7;
1085   }
1086 
1087   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1088   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1089   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 /* ----------------------------------------------------------- */
1096 #undef __FUNCT__
1097 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1098 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1099 {
1100   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1101   IS              iscol=a->col,isrow=a->row;
1102   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1103   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1104   MatScalar       *aa=a->a,*v;
1105   PetscScalar     *x,*b,*s,*t,*ls;
1106 
1107   PetscFunctionBegin;
1108   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1109   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1110   t  = a->solve_work;
1111 
1112   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1113   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1114 
1115   /* forward solve the lower triangular */
1116   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1117   for (i=1; i<n; i++) {
1118     v   = aa + bs2*ai[i];
1119     vi  = aj + ai[i];
1120     nz  = a->diag[i] - ai[i];
1121     s = t + bs*i;
1122     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1123     while (nz--) {
1124       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1125       v += bs2;
1126     }
1127   }
1128   /* backward solve the upper triangular */
1129   ls = a->solve_work + A->n;
1130   for (i=n-1; i>=0; i--){
1131     v   = aa + bs2*(a->diag[i] + 1);
1132     vi  = aj + a->diag[i] + 1;
1133     nz  = ai[i+1] - a->diag[i] - 1;
1134     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1135     while (nz--) {
1136       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1137       v += bs2;
1138     }
1139     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1140     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1141   }
1142 
1143   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1144   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1145   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1146   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1147   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1148   PetscFunctionReturn(0);
1149 }
1150 
1151 #undef __FUNCT__
1152 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1153 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1154 {
1155   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1156   IS              iscol=a->col,isrow=a->row;
1157   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1158   int             *diag = a->diag;
1159   MatScalar       *aa=a->a,*v;
1160   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1161   PetscScalar     *x,*b,*t;
1162 
1163   PetscFunctionBegin;
1164   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1165   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1166   t  = a->solve_work;
1167 
1168   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1169   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1170 
1171   /* forward solve the lower triangular */
1172   idx    = 7*(*r++);
1173   t[0] = b[idx];   t[1] = b[1+idx];
1174   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1175   t[5] = b[5+idx]; t[6] = b[6+idx];
1176 
1177   for (i=1; i<n; i++) {
1178     v     = aa + 49*ai[i];
1179     vi    = aj + ai[i];
1180     nz    = diag[i] - ai[i];
1181     idx   = 7*(*r++);
1182     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1183     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1184     while (nz--) {
1185       idx   = 7*(*vi++);
1186       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1187       x4    = t[3+idx];x5 = t[4+idx];
1188       x6    = t[5+idx];x7 = t[6+idx];
1189       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1196       v += 49;
1197     }
1198     idx = 7*i;
1199     t[idx]   = s1;t[1+idx] = s2;
1200     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1201     t[5+idx] = s6;t[6+idx] = s7;
1202   }
1203   /* backward solve the upper triangular */
1204   for (i=n-1; i>=0; i--){
1205     v    = aa + 49*diag[i] + 49;
1206     vi   = aj + diag[i] + 1;
1207     nz   = ai[i+1] - diag[i] - 1;
1208     idt  = 7*i;
1209     s1 = t[idt];  s2 = t[1+idt];
1210     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1211     s6 = t[5+idt];s7 = t[6+idt];
1212     while (nz--) {
1213       idx   = 7*(*vi++);
1214       x1    = t[idx];   x2 = t[1+idx];
1215       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1216       x6    = t[5+idx]; x7 = t[6+idx];
1217       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1218       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1219       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1220       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1221       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1222       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1223       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1224       v += 49;
1225     }
1226     idc = 7*(*c--);
1227     v   = aa + 49*diag[i];
1228     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1229                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1230     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1231                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1232     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1233                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1234     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1235                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1236     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1237                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1238     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1239                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1240     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1241                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1242   }
1243 
1244   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1245   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1246   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1247   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1248   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1249   PetscFunctionReturn(0);
1250 }
1251 
1252 #undef __FUNCT__
1253 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1254 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1255 {
1256   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1257   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1258   int             ierr,*diag = a->diag,jdx;
1259   MatScalar       *aa=a->a,*v;
1260   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1261 
1262   PetscFunctionBegin;
1263   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1264   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1265   /* forward solve the lower triangular */
1266   idx    = 0;
1267   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1268   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1269   x[6] = b[6+idx];
1270   for (i=1; i<n; i++) {
1271     v     =  aa + 49*ai[i];
1272     vi    =  aj + ai[i];
1273     nz    =  diag[i] - ai[i];
1274     idx   =  7*i;
1275     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1276     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1277     s7  =  b[6+idx];
1278     while (nz--) {
1279       jdx   = 7*(*vi++);
1280       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1281       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1282       x7    = x[6+jdx];
1283       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1284       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1285       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1286       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1287       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1288       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1289       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1290       v += 49;
1291      }
1292     x[idx]   = s1;
1293     x[1+idx] = s2;
1294     x[2+idx] = s3;
1295     x[3+idx] = s4;
1296     x[4+idx] = s5;
1297     x[5+idx] = s6;
1298     x[6+idx] = s7;
1299   }
1300   /* backward solve the upper triangular */
1301   for (i=n-1; i>=0; i--){
1302     v    = aa + 49*diag[i] + 49;
1303     vi   = aj + diag[i] + 1;
1304     nz   = ai[i+1] - diag[i] - 1;
1305     idt  = 7*i;
1306     s1 = x[idt];   s2 = x[1+idt];
1307     s3 = x[2+idt]; s4 = x[3+idt];
1308     s5 = x[4+idt]; s6 = x[5+idt];
1309     s7 = x[6+idt];
1310     while (nz--) {
1311       idx   = 7*(*vi++);
1312       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1313       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1314       x7    = x[6+idx];
1315       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1316       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1317       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1318       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1319       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1320       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1321       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1322       v += 49;
1323     }
1324     v        = aa + 49*diag[i];
1325     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1326                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1327     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1328                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1329     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1330                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1331     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1332                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1333     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1334                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1335     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1336                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1337     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1338                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1339   }
1340 
1341   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1342   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1343   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 #undef __FUNCT__
1348 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1349 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1350 {
1351   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1352   IS              iscol=a->col,isrow=a->row;
1353   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1354   int             *diag = a->diag;
1355   MatScalar       *aa=a->a,*v;
1356   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1357 
1358   PetscFunctionBegin;
1359   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1360   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1361   t  = a->solve_work;
1362 
1363   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1364   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1365 
1366   /* forward solve the lower triangular */
1367   idx    = 6*(*r++);
1368   t[0] = b[idx];   t[1] = b[1+idx];
1369   t[2] = b[2+idx]; t[3] = b[3+idx];
1370   t[4] = b[4+idx]; t[5] = b[5+idx];
1371   for (i=1; i<n; i++) {
1372     v     = aa + 36*ai[i];
1373     vi    = aj + ai[i];
1374     nz    = diag[i] - ai[i];
1375     idx   = 6*(*r++);
1376     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1377     s5  = b[4+idx]; s6 = b[5+idx];
1378     while (nz--) {
1379       idx   = 6*(*vi++);
1380       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1381       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1382       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1383       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1384       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1385       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1386       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1387       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1388       v += 36;
1389     }
1390     idx = 6*i;
1391     t[idx]   = s1;t[1+idx] = s2;
1392     t[2+idx] = s3;t[3+idx] = s4;
1393     t[4+idx] = s5;t[5+idx] = s6;
1394   }
1395   /* backward solve the upper triangular */
1396   for (i=n-1; i>=0; i--){
1397     v    = aa + 36*diag[i] + 36;
1398     vi   = aj + diag[i] + 1;
1399     nz   = ai[i+1] - diag[i] - 1;
1400     idt  = 6*i;
1401     s1 = t[idt];  s2 = t[1+idt];
1402     s3 = t[2+idt];s4 = t[3+idt];
1403     s5 = t[4+idt];s6 = t[5+idt];
1404     while (nz--) {
1405       idx   = 6*(*vi++);
1406       x1    = t[idx];   x2 = t[1+idx];
1407       x3    = t[2+idx]; x4 = t[3+idx];
1408       x5    = t[4+idx]; x6 = t[5+idx];
1409       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415       v += 36;
1416     }
1417     idc = 6*(*c--);
1418     v   = aa + 36*diag[i];
1419     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1420                                  v[18]*s4+v[24]*s5+v[30]*s6;
1421     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1422                                  v[19]*s4+v[25]*s5+v[31]*s6;
1423     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1424                                  v[20]*s4+v[26]*s5+v[32]*s6;
1425     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1426                                  v[21]*s4+v[27]*s5+v[33]*s6;
1427     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1428                                  v[22]*s4+v[28]*s5+v[34]*s6;
1429     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1430                                  v[23]*s4+v[29]*s5+v[35]*s6;
1431   }
1432 
1433   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1434   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1435   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1436   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1437   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 #undef __FUNCT__
1442 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1443 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1444 {
1445   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1446   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1447   int             ierr,*diag = a->diag,jdx;
1448   MatScalar       *aa=a->a,*v;
1449   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1450 
1451   PetscFunctionBegin;
1452   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1453   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1454   /* forward solve the lower triangular */
1455   idx    = 0;
1456   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1457   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1458   for (i=1; i<n; i++) {
1459     v     =  aa + 36*ai[i];
1460     vi    =  aj + ai[i];
1461     nz    =  diag[i] - ai[i];
1462     idx   =  6*i;
1463     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1464     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1465     while (nz--) {
1466       jdx   = 6*(*vi++);
1467       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1468       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1469       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1470       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1471       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1472       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1473       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1474       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1475       v += 36;
1476      }
1477     x[idx]   = s1;
1478     x[1+idx] = s2;
1479     x[2+idx] = s3;
1480     x[3+idx] = s4;
1481     x[4+idx] = s5;
1482     x[5+idx] = s6;
1483   }
1484   /* backward solve the upper triangular */
1485   for (i=n-1; i>=0; i--){
1486     v    = aa + 36*diag[i] + 36;
1487     vi   = aj + diag[i] + 1;
1488     nz   = ai[i+1] - diag[i] - 1;
1489     idt  = 6*i;
1490     s1 = x[idt];   s2 = x[1+idt];
1491     s3 = x[2+idt]; s4 = x[3+idt];
1492     s5 = x[4+idt]; s6 = x[5+idt];
1493     while (nz--) {
1494       idx   = 6*(*vi++);
1495       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1496       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1497       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1503       v += 36;
1504     }
1505     v        = aa + 36*diag[i];
1506     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1507     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1508     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1509     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1510     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1511     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1512   }
1513 
1514   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1516   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 #undef __FUNCT__
1521 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1522 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1523 {
1524   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1525   IS              iscol=a->col,isrow=a->row;
1526   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1527   int             *diag = a->diag;
1528   MatScalar       *aa=a->a,*v;
1529   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1530 
1531   PetscFunctionBegin;
1532   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1533   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534   t  = a->solve_work;
1535 
1536   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1537   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1538 
1539   /* forward solve the lower triangular */
1540   idx    = 5*(*r++);
1541   t[0] = b[idx];   t[1] = b[1+idx];
1542   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1543   for (i=1; i<n; i++) {
1544     v     = aa + 25*ai[i];
1545     vi    = aj + ai[i];
1546     nz    = diag[i] - ai[i];
1547     idx   = 5*(*r++);
1548     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1549     s5  = b[4+idx];
1550     while (nz--) {
1551       idx   = 5*(*vi++);
1552       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1553       x4    = t[3+idx];x5 = t[4+idx];
1554       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1555       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1556       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1557       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1558       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1559       v += 25;
1560     }
1561     idx = 5*i;
1562     t[idx]   = s1;t[1+idx] = s2;
1563     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1564   }
1565   /* backward solve the upper triangular */
1566   for (i=n-1; i>=0; i--){
1567     v    = aa + 25*diag[i] + 25;
1568     vi   = aj + diag[i] + 1;
1569     nz   = ai[i+1] - diag[i] - 1;
1570     idt  = 5*i;
1571     s1 = t[idt];  s2 = t[1+idt];
1572     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1573     while (nz--) {
1574       idx   = 5*(*vi++);
1575       x1    = t[idx];   x2 = t[1+idx];
1576       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1577       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582       v += 25;
1583     }
1584     idc = 5*(*c--);
1585     v   = aa + 25*diag[i];
1586     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1587                                  v[15]*s4+v[20]*s5;
1588     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1589                                  v[16]*s4+v[21]*s5;
1590     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1591                                  v[17]*s4+v[22]*s5;
1592     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1593                                  v[18]*s4+v[23]*s5;
1594     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1595                                  v[19]*s4+v[24]*s5;
1596   }
1597 
1598   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1599   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1600   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1601   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1602   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1603   PetscFunctionReturn(0);
1604 }
1605 
1606 #undef __FUNCT__
1607 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1608 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1609 {
1610   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1611   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1612   int             ierr,*diag = a->diag,jdx;
1613   MatScalar       *aa=a->a,*v;
1614   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1615 
1616   PetscFunctionBegin;
1617   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1618   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1619   /* forward solve the lower triangular */
1620   idx    = 0;
1621   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1622   for (i=1; i<n; i++) {
1623     v     =  aa + 25*ai[i];
1624     vi    =  aj + ai[i];
1625     nz    =  diag[i] - ai[i];
1626     idx   =  5*i;
1627     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1628     while (nz--) {
1629       jdx   = 5*(*vi++);
1630       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1631       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1632       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1633       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1634       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1635       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1636       v    += 25;
1637     }
1638     x[idx]   = s1;
1639     x[1+idx] = s2;
1640     x[2+idx] = s3;
1641     x[3+idx] = s4;
1642     x[4+idx] = s5;
1643   }
1644   /* backward solve the upper triangular */
1645   for (i=n-1; i>=0; i--){
1646     v    = aa + 25*diag[i] + 25;
1647     vi   = aj + diag[i] + 1;
1648     nz   = ai[i+1] - diag[i] - 1;
1649     idt  = 5*i;
1650     s1 = x[idt];  s2 = x[1+idt];
1651     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1652     while (nz--) {
1653       idx   = 5*(*vi++);
1654       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1655       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1656       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1657       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1658       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1659       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1660       v    += 25;
1661     }
1662     v        = aa + 25*diag[i];
1663     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1664     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1665     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1666     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1667     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1668   }
1669 
1670   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1671   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1672   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1673   PetscFunctionReturn(0);
1674 }
1675 
1676 #undef __FUNCT__
1677 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1678 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1679 {
1680   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1681   IS              iscol=a->col,isrow=a->row;
1682   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1683   int             *diag = a->diag;
1684   MatScalar       *aa=a->a,*v;
1685   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1686 
1687   PetscFunctionBegin;
1688   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1689   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1690   t  = a->solve_work;
1691 
1692   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1693   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1694 
1695   /* forward solve the lower triangular */
1696   idx    = 4*(*r++);
1697   t[0] = b[idx];   t[1] = b[1+idx];
1698   t[2] = b[2+idx]; t[3] = b[3+idx];
1699   for (i=1; i<n; i++) {
1700     v     = aa + 16*ai[i];
1701     vi    = aj + ai[i];
1702     nz    = diag[i] - ai[i];
1703     idx   = 4*(*r++);
1704     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1705     while (nz--) {
1706       idx   = 4*(*vi++);
1707       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1708       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1709       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1710       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1711       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1712       v    += 16;
1713     }
1714     idx        = 4*i;
1715     t[idx]   = s1;t[1+idx] = s2;
1716     t[2+idx] = s3;t[3+idx] = s4;
1717   }
1718   /* backward solve the upper triangular */
1719   for (i=n-1; i>=0; i--){
1720     v    = aa + 16*diag[i] + 16;
1721     vi   = aj + diag[i] + 1;
1722     nz   = ai[i+1] - diag[i] - 1;
1723     idt  = 4*i;
1724     s1 = t[idt];  s2 = t[1+idt];
1725     s3 = t[2+idt];s4 = t[3+idt];
1726     while (nz--) {
1727       idx   = 4*(*vi++);
1728       x1    = t[idx];   x2 = t[1+idx];
1729       x3    = t[2+idx]; x4 = t[3+idx];
1730       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1731       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1732       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1733       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1734       v += 16;
1735     }
1736     idc      = 4*(*c--);
1737     v        = aa + 16*diag[i];
1738     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1739     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1740     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1741     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1742   }
1743 
1744   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1745   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1746   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1747   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1748   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1749   PetscFunctionReturn(0);
1750 }
1751 #if defined (PETSC_HAVE_SSE)
1752 
1753 #include PETSC_HAVE_SSE
1754 
1755 #undef __FUNCT__
1756 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1757 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1758 {
1759   /*
1760      Note: This code uses demotion of double
1761      to float when performing the mixed-mode computation.
1762      This may not be numerically reasonable for all applications.
1763   */
1764   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1765   IS              iscol=a->col,isrow=a->row;
1766   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1767   int             *diag = a->diag,ai16;
1768   MatScalar       *aa=a->a,*v;
1769   PetscScalar     *x,*b,*t;
1770 
1771   /* Make space in temp stack for 16 Byte Aligned arrays */
1772   float           ssealignedspace[11],*tmps,*tmpx;
1773   unsigned long   offset;
1774 
1775   PetscFunctionBegin;
1776   SSE_SCOPE_BEGIN;
1777 
1778     offset = (unsigned long)ssealignedspace % 16;
1779     if (offset) offset = (16 - offset)/4;
1780     tmps = &ssealignedspace[offset];
1781     tmpx = &ssealignedspace[offset+4];
1782     PREFETCH_NTA(aa+16*ai[1]);
1783 
1784     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1785     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1786     t  = a->solve_work;
1787 
1788     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1789     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1790 
1791     /* forward solve the lower triangular */
1792     idx  = 4*(*r++);
1793     t[0] = b[idx];   t[1] = b[1+idx];
1794     t[2] = b[2+idx]; t[3] = b[3+idx];
1795     v    =  aa + 16*ai[1];
1796 
1797     for (i=1; i<n;) {
1798       PREFETCH_NTA(&v[8]);
1799       vi   =  aj      + ai[i];
1800       nz   =  diag[i] - ai[i];
1801       idx  =  4*(*r++);
1802 
1803       /* Demote sum from double to float */
1804       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1805       LOAD_PS(tmps,XMM7);
1806 
1807       while (nz--) {
1808         PREFETCH_NTA(&v[16]);
1809         idx = 4*(*vi++);
1810 
1811         /* Demote solution (so far) from double to float */
1812         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1813 
1814         /* 4x4 Matrix-Vector product with negative accumulation: */
1815         SSE_INLINE_BEGIN_2(tmpx,v)
1816           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1817 
1818           /* First Column */
1819           SSE_COPY_PS(XMM0,XMM6)
1820           SSE_SHUFFLE(XMM0,XMM0,0x00)
1821           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1822           SSE_SUB_PS(XMM7,XMM0)
1823 
1824           /* Second Column */
1825           SSE_COPY_PS(XMM1,XMM6)
1826           SSE_SHUFFLE(XMM1,XMM1,0x55)
1827           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1828           SSE_SUB_PS(XMM7,XMM1)
1829 
1830           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1831 
1832           /* Third Column */
1833           SSE_COPY_PS(XMM2,XMM6)
1834           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1835           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1836           SSE_SUB_PS(XMM7,XMM2)
1837 
1838           /* Fourth Column */
1839           SSE_COPY_PS(XMM3,XMM6)
1840           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1841           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1842           SSE_SUB_PS(XMM7,XMM3)
1843         SSE_INLINE_END_2
1844 
1845         v  += 16;
1846       }
1847       idx = 4*i;
1848       v   = aa + 16*ai[++i];
1849       PREFETCH_NTA(v);
1850       STORE_PS(tmps,XMM7);
1851 
1852       /* Promote result from float to double */
1853       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1854     }
1855     /* backward solve the upper triangular */
1856     idt  = 4*(n-1);
1857     ai16 = 16*diag[n-1];
1858     v    = aa + ai16 + 16;
1859     for (i=n-1; i>=0;){
1860       PREFETCH_NTA(&v[8]);
1861       vi = aj + diag[i] + 1;
1862       nz = ai[i+1] - diag[i] - 1;
1863 
1864       /* Demote accumulator from double to float */
1865       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1866       LOAD_PS(tmps,XMM7);
1867 
1868       while (nz--) {
1869         PREFETCH_NTA(&v[16]);
1870         idx = 4*(*vi++);
1871 
1872         /* Demote solution (so far) from double to float */
1873         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1874 
1875         /* 4x4 Matrix-Vector Product with negative accumulation: */
1876         SSE_INLINE_BEGIN_2(tmpx,v)
1877           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1878 
1879           /* First Column */
1880           SSE_COPY_PS(XMM0,XMM6)
1881           SSE_SHUFFLE(XMM0,XMM0,0x00)
1882           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1883           SSE_SUB_PS(XMM7,XMM0)
1884 
1885           /* Second Column */
1886           SSE_COPY_PS(XMM1,XMM6)
1887           SSE_SHUFFLE(XMM1,XMM1,0x55)
1888           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1889           SSE_SUB_PS(XMM7,XMM1)
1890 
1891           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1892 
1893           /* Third Column */
1894           SSE_COPY_PS(XMM2,XMM6)
1895           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1896           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1897           SSE_SUB_PS(XMM7,XMM2)
1898 
1899           /* Fourth Column */
1900           SSE_COPY_PS(XMM3,XMM6)
1901           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1902           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1903           SSE_SUB_PS(XMM7,XMM3)
1904         SSE_INLINE_END_2
1905         v  += 16;
1906       }
1907       v    = aa + ai16;
1908       ai16 = 16*diag[--i];
1909       PREFETCH_NTA(aa+ai16+16);
1910       /*
1911          Scale the result by the diagonal 4x4 block,
1912          which was inverted as part of the factorization
1913       */
1914       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1915         /* First Column */
1916         SSE_COPY_PS(XMM0,XMM7)
1917         SSE_SHUFFLE(XMM0,XMM0,0x00)
1918         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1919 
1920         /* Second Column */
1921         SSE_COPY_PS(XMM1,XMM7)
1922         SSE_SHUFFLE(XMM1,XMM1,0x55)
1923         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1924         SSE_ADD_PS(XMM0,XMM1)
1925 
1926         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1927 
1928         /* Third Column */
1929         SSE_COPY_PS(XMM2,XMM7)
1930         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1931         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1932         SSE_ADD_PS(XMM0,XMM2)
1933 
1934         /* Fourth Column */
1935         SSE_COPY_PS(XMM3,XMM7)
1936         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1937         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1938         SSE_ADD_PS(XMM0,XMM3)
1939 
1940         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1941       SSE_INLINE_END_3
1942 
1943       /* Promote solution from float to double */
1944       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1945 
1946       /* Apply reordering to t and stream into x.    */
1947       /* This way, x doesn't pollute the cache.      */
1948       /* Be careful with size: 2 doubles = 4 floats! */
1949       idc  = 4*(*c--);
1950       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
1951         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1952         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1953         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1954         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1955         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1956         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1957       SSE_INLINE_END_2
1958       v    = aa + ai16 + 16;
1959       idt -= 4;
1960     }
1961 
1962     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1963     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1964     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1965     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1966     PetscLogFlops(2*16*(a->nz) - 4*A->n);
1967   SSE_SCOPE_END;
1968   PetscFunctionReturn(0);
1969 }
1970 
1971 #endif
1972 
1973 
1974 /*
1975       Special case where the matrix was ILU(0) factored in the natural
1976    ordering. This eliminates the need for the column and row permutation.
1977 */
1978 #undef __FUNCT__
1979 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
1980 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1981 {
1982   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1983   int             n=a->mbs,*ai=a->i,*aj=a->j;
1984   int             ierr,*diag = a->diag;
1985   MatScalar       *aa=a->a;
1986   PetscScalar     *x,*b;
1987 
1988   PetscFunctionBegin;
1989   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1990   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1991 
1992 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1993   {
1994     static PetscScalar w[2000]; /* very BAD need to fix */
1995     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1996   }
1997 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1998   {
1999     static PetscScalar w[2000]; /* very BAD need to fix */
2000     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2001   }
2002 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2003   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2004 #else
2005   {
2006     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2007     MatScalar    *v;
2008     int          jdx,idt,idx,nz,*vi,i,ai16;
2009 
2010   /* forward solve the lower triangular */
2011   idx    = 0;
2012   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2013   for (i=1; i<n; i++) {
2014     v     =  aa      + 16*ai[i];
2015     vi    =  aj      + ai[i];
2016     nz    =  diag[i] - ai[i];
2017     idx   +=  4;
2018     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2019     while (nz--) {
2020       jdx   = 4*(*vi++);
2021       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2022       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2023       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2024       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2025       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2026       v    += 16;
2027     }
2028     x[idx]   = s1;
2029     x[1+idx] = s2;
2030     x[2+idx] = s3;
2031     x[3+idx] = s4;
2032   }
2033   /* backward solve the upper triangular */
2034   idt = 4*(n-1);
2035   for (i=n-1; i>=0; i--){
2036     ai16 = 16*diag[i];
2037     v    = aa + ai16 + 16;
2038     vi   = aj + diag[i] + 1;
2039     nz   = ai[i+1] - diag[i] - 1;
2040     s1 = x[idt];  s2 = x[1+idt];
2041     s3 = x[2+idt];s4 = x[3+idt];
2042     while (nz--) {
2043       idx   = 4*(*vi++);
2044       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2045       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2046       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2047       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2048       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2049       v    += 16;
2050     }
2051     v        = aa + ai16;
2052     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2053     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2054     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2055     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2056     idt -= 4;
2057   }
2058   }
2059 #endif
2060 
2061   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2062   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2063   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2064   PetscFunctionReturn(0);
2065 }
2066 
2067 #if defined (PETSC_HAVE_SSE)
2068 
2069 #include PETSC_HAVE_SSE
2070 
2071 #undef __FUNCT__
2072 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2073 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2074 {
2075   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2076   int             n=a->mbs,*ai=a->i,*aj=a->j;
2077   int             ierr,*diag = a->diag;
2078   MatScalar       *aa=a->a;
2079   PetscScalar     *x,*b;
2080 
2081   PetscFunctionBegin;
2082   SSE_SCOPE_BEGIN;
2083   /*
2084      Note: This code currently uses demotion of double
2085      to float when performing the mixed-mode computation.
2086      This may not be numerically reasonable for all applications.
2087   */
2088   PREFETCH_NTA(aa+16*ai[1]);
2089 
2090   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2091   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2092   {
2093     MatScalar     *v;
2094     int           jdx,idt,idx,nz,*vi,i,ai16;
2095 
2096     /* Make space in temp stack for 16 Byte Aligned arrays */
2097     float         ssealignedspace[11],*tmps,*tmpx;
2098     unsigned long offset = (unsigned long)ssealignedspace % 16;
2099     if (offset) offset = (16 - offset)/4;
2100     tmps = &ssealignedspace[offset];
2101     tmpx = &ssealignedspace[offset+4];
2102 
2103   /* forward solve the lower triangular */
2104     idx  = 0;
2105     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2106     v    =  aa + 16*ai[1];
2107 
2108     for (i=1; i<n;) {
2109       PREFETCH_NTA(&v[8]);
2110       vi   =  aj      + ai[i];
2111       nz   =  diag[i] - ai[i];
2112       idx +=  4;
2113 
2114     /* Demote sum from double to float */
2115       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2116       LOAD_PS(tmps,XMM7);
2117 
2118       while (nz--) {
2119         PREFETCH_NTA(&v[16]);
2120         jdx = 4*(*vi++);
2121 
2122         /* Demote solution (so far) from double to float */
2123         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]);
2124 
2125         /* 4x4 Matrix-Vector product with negative accumulation: */
2126         SSE_INLINE_BEGIN_2(tmpx,v)
2127           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2128 
2129           /* First Column */
2130           SSE_COPY_PS(XMM0,XMM6)
2131           SSE_SHUFFLE(XMM0,XMM0,0x00)
2132           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2133           SSE_SUB_PS(XMM7,XMM0)
2134 
2135           /* Second Column */
2136           SSE_COPY_PS(XMM1,XMM6)
2137           SSE_SHUFFLE(XMM1,XMM1,0x55)
2138           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2139           SSE_SUB_PS(XMM7,XMM1)
2140 
2141           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2142 
2143           /* Third Column */
2144           SSE_COPY_PS(XMM2,XMM6)
2145           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2146           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2147           SSE_SUB_PS(XMM7,XMM2)
2148 
2149           /* Fourth Column */
2150           SSE_COPY_PS(XMM3,XMM6)
2151           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2152           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2153           SSE_SUB_PS(XMM7,XMM3)
2154         SSE_INLINE_END_2
2155 
2156         v  += 16;
2157       }
2158       v    =  aa + 16*ai[++i];
2159       PREFETCH_NTA(v);
2160       STORE_PS(tmps,XMM7);
2161 
2162       /* Promote result from float to double */
2163       CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps);
2164     }
2165     /* backward solve the upper triangular */
2166     idt  = 4*(n-1);
2167     ai16 = 16*diag[n-1];
2168     v    = aa + ai16 + 16;
2169     for (i=n-1; i>=0;){
2170       PREFETCH_NTA(&v[8]);
2171       vi = aj + diag[i] + 1;
2172       nz = ai[i+1] - diag[i] - 1;
2173 
2174       /* Demote accumulator from double to float */
2175       CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]);
2176       LOAD_PS(tmps,XMM7);
2177 
2178       while (nz--) {
2179         PREFETCH_NTA(&v[16]);
2180         idx = 4*(*vi++);
2181 
2182         /* Demote solution (so far) from double to float */
2183         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2184 
2185         /* 4x4 Matrix-Vector Product with negative accumulation: */
2186         SSE_INLINE_BEGIN_2(tmpx,v)
2187           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2188 
2189           /* First Column */
2190           SSE_COPY_PS(XMM0,XMM6)
2191           SSE_SHUFFLE(XMM0,XMM0,0x00)
2192           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2193           SSE_SUB_PS(XMM7,XMM0)
2194 
2195           /* Second Column */
2196           SSE_COPY_PS(XMM1,XMM6)
2197           SSE_SHUFFLE(XMM1,XMM1,0x55)
2198           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2199           SSE_SUB_PS(XMM7,XMM1)
2200 
2201           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2202 
2203           /* Third Column */
2204           SSE_COPY_PS(XMM2,XMM6)
2205           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2206           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2207           SSE_SUB_PS(XMM7,XMM2)
2208 
2209           /* Fourth Column */
2210           SSE_COPY_PS(XMM3,XMM6)
2211           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2212           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2213           SSE_SUB_PS(XMM7,XMM3)
2214         SSE_INLINE_END_2
2215         v  += 16;
2216       }
2217       v    = aa + ai16;
2218       ai16 = 16*diag[--i];
2219       PREFETCH_NTA(aa+ai16+16);
2220       /*
2221          Scale the result by the diagonal 4x4 block,
2222          which was inverted as part of the factorization
2223       */
2224       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2225         /* First Column */
2226         SSE_COPY_PS(XMM0,XMM7)
2227         SSE_SHUFFLE(XMM0,XMM0,0x00)
2228         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2229 
2230         /* Second Column */
2231         SSE_COPY_PS(XMM1,XMM7)
2232         SSE_SHUFFLE(XMM1,XMM1,0x55)
2233         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2234         SSE_ADD_PS(XMM0,XMM1)
2235 
2236         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2237 
2238         /* Third Column */
2239         SSE_COPY_PS(XMM2,XMM7)
2240         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2241         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2242         SSE_ADD_PS(XMM0,XMM2)
2243 
2244         /* Fourth Column */
2245         SSE_COPY_PS(XMM3,XMM7)
2246         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2247         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2248         SSE_ADD_PS(XMM0,XMM3)
2249 
2250         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2251       SSE_INLINE_END_3
2252 
2253       /* Promote solution from float to double */
2254       CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps);
2255 
2256       v    = aa + ai16 + 16;
2257       idt -= 4;
2258     }
2259   }
2260   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2261   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2262   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2263   SSE_SCOPE_END;
2264   PetscFunctionReturn(0);
2265 }
2266 
2267 #endif
2268 #undef __FUNCT__
2269 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2270 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2271 {
2272   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2273   IS              iscol=a->col,isrow=a->row;
2274   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2275   int             *diag = a->diag;
2276   MatScalar       *aa=a->a,*v;
2277   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
2278 
2279   PetscFunctionBegin;
2280   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2281   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2282   t  = a->solve_work;
2283 
2284   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2285   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2286 
2287   /* forward solve the lower triangular */
2288   idx    = 3*(*r++);
2289   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2290   for (i=1; i<n; i++) {
2291     v     = aa + 9*ai[i];
2292     vi    = aj + ai[i];
2293     nz    = diag[i] - ai[i];
2294     idx   = 3*(*r++);
2295     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2296     while (nz--) {
2297       idx   = 3*(*vi++);
2298       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2299       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2300       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2301       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2302       v += 9;
2303     }
2304     idx = 3*i;
2305     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2306   }
2307   /* backward solve the upper triangular */
2308   for (i=n-1; i>=0; i--){
2309     v    = aa + 9*diag[i] + 9;
2310     vi   = aj + diag[i] + 1;
2311     nz   = ai[i+1] - diag[i] - 1;
2312     idt  = 3*i;
2313     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2314     while (nz--) {
2315       idx   = 3*(*vi++);
2316       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2317       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2318       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2319       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2320       v += 9;
2321     }
2322     idc = 3*(*c--);
2323     v   = aa + 9*diag[i];
2324     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2325     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2326     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2327   }
2328   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2329   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2330   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2331   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2332   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2333   PetscFunctionReturn(0);
2334 }
2335 
2336 /*
2337       Special case where the matrix was ILU(0) factored in the natural
2338    ordering. This eliminates the need for the column and row permutation.
2339 */
2340 #undef __FUNCT__
2341 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2342 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2343 {
2344   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2345   int             n=a->mbs,*ai=a->i,*aj=a->j;
2346   int             ierr,*diag = a->diag;
2347   MatScalar       *aa=a->a,*v;
2348   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2349   int             jdx,idt,idx,nz,*vi,i;
2350 
2351   PetscFunctionBegin;
2352   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2354 
2355 
2356   /* forward solve the lower triangular */
2357   idx    = 0;
2358   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2359   for (i=1; i<n; i++) {
2360     v     =  aa      + 9*ai[i];
2361     vi    =  aj      + ai[i];
2362     nz    =  diag[i] - ai[i];
2363     idx   +=  3;
2364     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2365     while (nz--) {
2366       jdx   = 3*(*vi++);
2367       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2368       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2369       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2370       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2371       v    += 9;
2372     }
2373     x[idx]   = s1;
2374     x[1+idx] = s2;
2375     x[2+idx] = s3;
2376   }
2377   /* backward solve the upper triangular */
2378   for (i=n-1; i>=0; i--){
2379     v    = aa + 9*diag[i] + 9;
2380     vi   = aj + diag[i] + 1;
2381     nz   = ai[i+1] - diag[i] - 1;
2382     idt  = 3*i;
2383     s1 = x[idt];  s2 = x[1+idt];
2384     s3 = x[2+idt];
2385     while (nz--) {
2386       idx   = 3*(*vi++);
2387       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2388       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2389       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2390       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2391       v    += 9;
2392     }
2393     v        = aa +  9*diag[i];
2394     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2395     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2396     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2397   }
2398 
2399   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2400   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2401   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2402   PetscFunctionReturn(0);
2403 }
2404 
2405 #undef __FUNCT__
2406 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2407 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2408 {
2409   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2410   IS              iscol=a->col,isrow=a->row;
2411   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2412   int             *diag = a->diag;
2413   MatScalar       *aa=a->a,*v;
2414   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
2415 
2416   PetscFunctionBegin;
2417   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2418   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2419   t  = a->solve_work;
2420 
2421   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2422   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2423 
2424   /* forward solve the lower triangular */
2425   idx    = 2*(*r++);
2426   t[0] = b[idx]; t[1] = b[1+idx];
2427   for (i=1; i<n; i++) {
2428     v     = aa + 4*ai[i];
2429     vi    = aj + ai[i];
2430     nz    = diag[i] - ai[i];
2431     idx   = 2*(*r++);
2432     s1  = b[idx]; s2 = b[1+idx];
2433     while (nz--) {
2434       idx   = 2*(*vi++);
2435       x1    = t[idx]; x2 = t[1+idx];
2436       s1 -= v[0]*x1 + v[2]*x2;
2437       s2 -= v[1]*x1 + v[3]*x2;
2438       v += 4;
2439     }
2440     idx = 2*i;
2441     t[idx] = s1; t[1+idx] = s2;
2442   }
2443   /* backward solve the upper triangular */
2444   for (i=n-1; i>=0; i--){
2445     v    = aa + 4*diag[i] + 4;
2446     vi   = aj + diag[i] + 1;
2447     nz   = ai[i+1] - diag[i] - 1;
2448     idt  = 2*i;
2449     s1 = t[idt]; s2 = t[1+idt];
2450     while (nz--) {
2451       idx   = 2*(*vi++);
2452       x1    = t[idx]; x2 = t[1+idx];
2453       s1 -= v[0]*x1 + v[2]*x2;
2454       s2 -= v[1]*x1 + v[3]*x2;
2455       v += 4;
2456     }
2457     idc = 2*(*c--);
2458     v   = aa + 4*diag[i];
2459     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2460     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2461   }
2462   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2463   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2464   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2465   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2466   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2467   PetscFunctionReturn(0);
2468 }
2469 
2470 /*
2471       Special case where the matrix was ILU(0) factored in the natural
2472    ordering. This eliminates the need for the column and row permutation.
2473 */
2474 #undef __FUNCT__
2475 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2476 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2477 {
2478   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2479   int             n=a->mbs,*ai=a->i,*aj=a->j;
2480   int             ierr,*diag = a->diag;
2481   MatScalar       *aa=a->a,*v;
2482   PetscScalar     *x,*b,s1,s2,x1,x2;
2483   int             jdx,idt,idx,nz,*vi,i;
2484 
2485   PetscFunctionBegin;
2486   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2487   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2488 
2489   /* forward solve the lower triangular */
2490   idx    = 0;
2491   x[0]   = b[0]; x[1] = b[1];
2492   for (i=1; i<n; i++) {
2493     v     =  aa      + 4*ai[i];
2494     vi    =  aj      + ai[i];
2495     nz    =  diag[i] - ai[i];
2496     idx   +=  2;
2497     s1  =  b[idx];s2 = b[1+idx];
2498     while (nz--) {
2499       jdx   = 2*(*vi++);
2500       x1    = x[jdx];x2 = x[1+jdx];
2501       s1 -= v[0]*x1 + v[2]*x2;
2502       s2 -= v[1]*x1 + v[3]*x2;
2503       v    += 4;
2504     }
2505     x[idx]   = s1;
2506     x[1+idx] = s2;
2507   }
2508   /* backward solve the upper triangular */
2509   for (i=n-1; i>=0; i--){
2510     v    = aa + 4*diag[i] + 4;
2511     vi   = aj + diag[i] + 1;
2512     nz   = ai[i+1] - diag[i] - 1;
2513     idt  = 2*i;
2514     s1 = x[idt];  s2 = x[1+idt];
2515     while (nz--) {
2516       idx   = 2*(*vi++);
2517       x1    = x[idx];   x2 = x[1+idx];
2518       s1 -= v[0]*x1 + v[2]*x2;
2519       s2 -= v[1]*x1 + v[3]*x2;
2520       v    += 4;
2521     }
2522     v        = aa +  4*diag[i];
2523     x[idt]   = v[0]*s1 + v[2]*s2;
2524     x[1+idt] = v[1]*s1 + v[3]*s2;
2525   }
2526 
2527   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2528   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2529   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2530   PetscFunctionReturn(0);
2531 }
2532 
2533 #undef __FUNCT__
2534 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2535 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2536 {
2537   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2538   IS              iscol=a->col,isrow=a->row;
2539   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2540   int             *diag = a->diag;
2541   MatScalar       *aa=a->a,*v;
2542   PetscScalar     *x,*b,s1,*t;
2543 
2544   PetscFunctionBegin;
2545   if (!n) PetscFunctionReturn(0);
2546 
2547   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2548   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2549   t  = a->solve_work;
2550 
2551   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2552   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2553 
2554   /* forward solve the lower triangular */
2555   t[0] = b[*r++];
2556   for (i=1; i<n; i++) {
2557     v     = aa + ai[i];
2558     vi    = aj + ai[i];
2559     nz    = diag[i] - ai[i];
2560     s1  = b[*r++];
2561     while (nz--) {
2562       s1 -= (*v++)*t[*vi++];
2563     }
2564     t[i] = s1;
2565   }
2566   /* backward solve the upper triangular */
2567   for (i=n-1; i>=0; i--){
2568     v    = aa + diag[i] + 1;
2569     vi   = aj + diag[i] + 1;
2570     nz   = ai[i+1] - diag[i] - 1;
2571     s1 = t[i];
2572     while (nz--) {
2573       s1 -= (*v++)*t[*vi++];
2574     }
2575     x[*c--] = t[i] = aa[diag[i]]*s1;
2576   }
2577 
2578   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2579   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2580   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2581   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2582   PetscLogFlops(2*1*(a->nz) - A->n);
2583   PetscFunctionReturn(0);
2584 }
2585 /*
2586       Special case where the matrix was ILU(0) factored in the natural
2587    ordering. This eliminates the need for the column and row permutation.
2588 */
2589 #undef __FUNCT__
2590 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2591 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2592 {
2593   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2594   int             n=a->mbs,*ai=a->i,*aj=a->j;
2595   int             ierr,*diag = a->diag;
2596   MatScalar       *aa=a->a;
2597   PetscScalar     *x,*b;
2598   PetscScalar     s1,x1;
2599   MatScalar       *v;
2600   int             jdx,idt,idx,nz,*vi,i;
2601 
2602   PetscFunctionBegin;
2603   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2604   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2605 
2606   /* forward solve the lower triangular */
2607   idx    = 0;
2608   x[0]   = b[0];
2609   for (i=1; i<n; i++) {
2610     v     =  aa      + ai[i];
2611     vi    =  aj      + ai[i];
2612     nz    =  diag[i] - ai[i];
2613     idx   +=  1;
2614     s1  =  b[idx];
2615     while (nz--) {
2616       jdx   = *vi++;
2617       x1    = x[jdx];
2618       s1 -= v[0]*x1;
2619       v    += 1;
2620     }
2621     x[idx]   = s1;
2622   }
2623   /* backward solve the upper triangular */
2624   for (i=n-1; i>=0; i--){
2625     v    = aa + diag[i] + 1;
2626     vi   = aj + diag[i] + 1;
2627     nz   = ai[i+1] - diag[i] - 1;
2628     idt  = i;
2629     s1 = x[idt];
2630     while (nz--) {
2631       idx   = *vi++;
2632       x1    = x[idx];
2633       s1 -= v[0]*x1;
2634       v    += 1;
2635     }
2636     v        = aa +  diag[i];
2637     x[idt]   = v[0]*s1;
2638   }
2639   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2640   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2641   PetscLogFlops(2*(a->nz) - A->n);
2642   PetscFunctionReturn(0);
2643 }
2644 
2645 /* ----------------------------------------------------------------*/
2646 /*
2647      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2648    except that the data structure of Mat_SeqAIJ is slightly different.
2649    Not a good example of code reuse.
2650 */
2651 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
2652 
2653 #undef __FUNCT__
2654 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
2655 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2656 {
2657   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
2658   IS          isicol;
2659   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
2660   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2661   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2662   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
2663   PetscTruth  col_identity,row_identity;
2664   PetscReal   f;
2665 
2666   PetscFunctionBegin;
2667   if (info) {
2668     f             = info->fill;
2669     levels        = (int)info->levels;
2670     diagonal_fill = (int)info->diagonal_fill;
2671   } else {
2672     f             = 1.0;
2673     levels        = 0;
2674     diagonal_fill = 0;
2675   }
2676   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2677   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
2678   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
2679 
2680   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2681     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
2682     (*fact)->factor = FACTOR_LU;
2683     b               = (Mat_SeqBAIJ*)(*fact)->data;
2684     if (!b->diag) {
2685       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2686     }
2687     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2688     b->row        = isrow;
2689     b->col        = iscol;
2690     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2691     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2692     b->icol       = isicol;
2693     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2694   } else { /* general case perform the symbolic factorization */
2695     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
2696     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
2697 
2698     /* get new row pointers */
2699     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
2700     ainew[0] = 0;
2701     /* don't know how many column pointers are needed so estimate */
2702     jmax = (int)(f*ai[n] + 1);
2703     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
2704     /* ajfill is level of fill for each fill entry */
2705     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
2706     /* fill is a linked list of nonzeros in active row */
2707     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
2708     /* im is level for each filled value */
2709     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
2710     /* dloc is location of diagonal in factor */
2711     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
2712     dloc[0]  = 0;
2713     for (prow=0; prow<n; prow++) {
2714 
2715       /* copy prow into linked list */
2716       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
2717       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
2718       xi         = aj + ai[r[prow]];
2719       fill[n]    = n;
2720       fill[prow] = -1; /* marker for diagonal entry */
2721       while (nz--) {
2722 	fm  = n;
2723 	idx = ic[*xi++];
2724 	do {
2725 	  m  = fm;
2726 	  fm = fill[m];
2727 	} while (fm < idx);
2728 	fill[m]   = idx;
2729 	fill[idx] = fm;
2730 	im[idx]   = 0;
2731       }
2732 
2733       /* make sure diagonal entry is included */
2734       if (diagonal_fill && fill[prow] == -1) {
2735 	fm = n;
2736 	while (fill[fm] < prow) fm = fill[fm];
2737 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
2738 	fill[fm]   = prow;
2739 	im[prow]   = 0;
2740 	nzf++;
2741 	dcount++;
2742       }
2743 
2744       nzi = 0;
2745       row = fill[n];
2746       while (row < prow) {
2747 	incrlev = im[row] + 1;
2748 	nz      = dloc[row];
2749 	xi      = ajnew  + ainew[row] + nz + 1;
2750 	flev    = ajfill + ainew[row] + nz + 1;
2751 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
2752 	fm      = row;
2753 	while (nnz-- > 0) {
2754 	  idx = *xi++;
2755 	  if (*flev + incrlev > levels) {
2756 	    flev++;
2757 	    continue;
2758 	  }
2759 	  do {
2760 	    m  = fm;
2761 	    fm = fill[m];
2762 	  } while (fm < idx);
2763 	  if (fm != idx) {
2764 	    im[idx]   = *flev + incrlev;
2765 	    fill[m]   = idx;
2766 	    fill[idx] = fm;
2767 	    fm        = idx;
2768 	    nzf++;
2769 	  } else {
2770 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2771 	  }
2772 	  flev++;
2773 	}
2774 	row = fill[row];
2775 	nzi++;
2776       }
2777       /* copy new filled row into permanent storage */
2778       ainew[prow+1] = ainew[prow] + nzf;
2779       if (ainew[prow+1] > jmax) {
2780 
2781 	/* estimate how much additional space we will need */
2782 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2783 	/* just double the memory each time */
2784 	int maxadd = jmax;
2785 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
2786 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2787 	jmax += maxadd;
2788 
2789 	/* allocate a longer ajnew and ajfill */
2790 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2791 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2792 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
2793 	ajnew = xi;
2794 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2795 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2796 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
2797 	ajfill = xi;
2798 	realloc++; /* count how many reallocations are needed */
2799       }
2800       xi          = ajnew + ainew[prow];
2801       flev        = ajfill + ainew[prow];
2802       dloc[prow]  = nzi;
2803       fm          = fill[n];
2804       while (nzf--) {
2805 	*xi++   = fm;
2806 	*flev++ = im[fm];
2807 	fm      = fill[fm];
2808       }
2809       /* make sure row has diagonal entry */
2810       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2811 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
2812     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2813       }
2814     }
2815     ierr = PetscFree(ajfill);CHKERRQ(ierr);
2816     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2817     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2818     ierr = PetscFree(fill);CHKERRQ(ierr);
2819     ierr = PetscFree(im);CHKERRQ(ierr);
2820 
2821     {
2822       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
2823       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
2824       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
2825       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
2826       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
2827       if (diagonal_fill) {
2828 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount);
2829       }
2830     }
2831 
2832     /* put together the new matrix */
2833     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
2834     PetscLogObjectParent(*fact,isicol);
2835     b = (Mat_SeqBAIJ*)(*fact)->data;
2836     ierr = PetscFree(b->imax);CHKERRQ(ierr);
2837     b->singlemalloc = PETSC_FALSE;
2838     len = bs2*ainew[n]*sizeof(MatScalar);
2839     /* the next line frees the default space generated by the Create() */
2840     ierr = PetscFree(b->a);CHKERRQ(ierr);
2841     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
2842     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
2843     b->j          = ajnew;
2844     b->i          = ainew;
2845     for (i=0; i<n; i++) dloc[i] += ainew[i];
2846     b->diag       = dloc;
2847     b->ilen       = 0;
2848     b->imax       = 0;
2849     b->row        = isrow;
2850     b->col        = iscol;
2851     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2852     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2853     b->icol       = isicol;
2854     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2855     /* In b structure:  Free imax, ilen, old a, old j.
2856        Allocate dloc, solve_work, new a, new j */
2857     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
2858     b->maxnz          = b->nz = ainew[n];
2859     (*fact)->factor   = FACTOR_LU;
2860 
2861     (*fact)->info.factor_mallocs    = realloc;
2862     (*fact)->info.fill_ratio_given  = f;
2863     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
2864   }
2865 
2866   if (row_identity && col_identity) {
2867     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
2868   }
2869   PetscFunctionReturn(0);
2870 }
2871 
2872 #undef __FUNCT__
2873 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
2874 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
2875 {
2876   /*
2877       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2878       with natural ordering
2879   */
2880   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2881   PetscTruth  sse_enabled_local,sse_enabled_global;
2882   int         ierr;
2883 
2884   PetscFunctionBegin;
2885   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
2886   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
2887   switch (a->bs) {
2888   case 1:
2889     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2890     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
2891     break;
2892   case 2:
2893     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2894     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
2895     break;
2896   case 3:
2897     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2898     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
2899     break;
2900   case 4:
2901     ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr);
2902     if (sse_enabled_local) {
2903 #if defined(PETSC_HAVE_SSE)
2904       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
2905       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n");
2906 #else
2907       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
2908       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2909 #endif
2910     } else {
2911       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2912       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
2913     }
2914     break;
2915   case 5:
2916     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2917     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
2918     break;
2919   case 6:
2920     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2921     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
2922     break;
2923   case 7:
2924     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2925     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
2926     break;
2927   }
2928   PetscFunctionReturn(0);
2929 }
2930 
2931 #undef __FUNCT__
2932 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
2933 int MatSeqBAIJ_UpdateSolvers(Mat A)
2934 {
2935   /*
2936       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2937       with natural ordering
2938   */
2939   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2940   IS          row = a->row, col = a->col;
2941   PetscTruth  row_identity, col_identity;
2942   PetscTruth  sse_enabled_local, sse_enabled_global;
2943   PetscTruth  use_single, use_natural;
2944   int         ierr;
2945 
2946   PetscFunctionBegin;
2947 
2948   use_single  = PETSC_FALSE;
2949   use_natural = PETSC_FALSE;
2950 
2951   ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
2952   ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
2953 
2954   if (row_identity && col_identity) {
2955     use_natural = PETSC_TRUE;
2956   } else {
2957     use_natural = PETSC_FALSE;
2958   }
2959   switch (a->bs) {
2960   case 1:
2961     if (use_natural) {
2962       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
2963       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
2964       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
2965       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2966     } else {
2967       A->ops->solve           = MatSolve_SeqBAIJ_1;
2968       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
2969     }
2970     break;
2971   case 2:
2972     if (use_natural) {
2973       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
2974       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
2975       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
2976       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2977     } else {
2978       A->ops->solve           = MatSolve_SeqBAIJ_2;
2979       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
2980     }
2981     break;
2982   case 3:
2983     if (use_natural) {
2984       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
2985       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
2986       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
2987       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2988     } else {
2989       A->ops->solve           = MatSolve_SeqBAIJ_3;
2990       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
2991     }
2992     break;
2993   case 4:
2994     ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr);
2995     if (sse_enabled_local) {
2996       PetscTruth single_prec,flg;
2997       single_prec = flg = PETSC_FALSE;
2998       ierr = PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);CHKERRQ(ierr);
2999       if (flg) {
3000         a->single_precision_solves = single_prec;
3001       }
3002       if (a->single_precision_solves) {
3003         use_single = PETSC_TRUE;
3004       }
3005     } else {
3006       use_single = PETSC_FALSE;
3007     }
3008     if (use_natural) {
3009       if (use_single) {
3010 #if defined(PETSC_HAVE_SSE)
3011         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3012         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4\n");
3013 #else
3014       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3015       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3016 #endif
3017       } else {
3018         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3019         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3020       }
3021       A->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3022       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3023     } else {
3024       if (use_single) {
3025 #if defined(PETSC_HAVE_SSE)
3026         A->ops->solve           = MatSolve_SeqBAIJ_4_SSE_Demotion;
3027         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3028 #else
3029       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3030       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3031 #endif
3032       } else {
3033         A->ops->solve           = MatSolve_SeqBAIJ_4;
3034       }
3035       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3036     }
3037     break;
3038   case 5:
3039     if (use_natural) {
3040       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3041       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3042       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3043       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3044     } else {
3045       A->ops->solve           = MatSolve_SeqBAIJ_5;
3046       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3047     }
3048     break;
3049   case 6:
3050     if (use_natural) {
3051       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3052       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3053       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3054       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3055     } else {
3056       A->ops->solve           = MatSolve_SeqBAIJ_6;
3057       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3058     }
3059     break;
3060   case 7:
3061     if (use_natural) {
3062       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3063       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3064       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3065       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3066     } else {
3067       A->ops->solve           = MatSolve_SeqBAIJ_7;
3068       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3069     }
3070     break;
3071   default:
3072     A->ops->solve             = MatSolve_SeqBAIJ_N;
3073     break;
3074   }
3075   PetscFunctionReturn(0);
3076 }
3077 
3078 #undef __FUNCT__
3079 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3080 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3081   int ierr;
3082 
3083   PetscFunctionBegin;
3084   ierr = MatSeqBAIJ_UpdateSolvers(A);
3085   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3086     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3087   } else {
3088     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3089   }
3090   PetscFunctionReturn(0);
3091 }
3092 
3093 #undef __FUNCT__
3094 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3095 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3096   int ierr;
3097 
3098   PetscFunctionBegin;
3099   ierr = MatSeqBAIJ_UpdateSolvers(A);
3100   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3101   PetscFunctionReturn(0);
3102 }
3103 
3104 
3105