xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 4482741e5b2e2bbc854fb1f8dba65221386520f2)
1 /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/inline/ilu.h"
8 #include "src/inline/dot.h"
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
12 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
13 {
14   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
15   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
16   int             *diag = a->diag;
17   MatScalar       *aa=a->a,*v;
18   PetscScalar     s1,*x,*b;
19 
20   PetscFunctionBegin;
21   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
22   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24 
25   /* forward solve the U^T */
26   for (i=0; i<n; i++) {
27 
28     v     = aa + diag[i];
29     /* multiply by the inverse of the block diagonal */
30     s1    = (*v++)*x[i];
31     vi    = aj + diag[i] + 1;
32     nz    = ai[i+1] - diag[i] - 1;
33     while (nz--) {
34       x[*vi++]  -= (*v++)*s1;
35     }
36     x[i]   = s1;
37   }
38   /* backward solve the L^T */
39   for (i=n-1; i>=0; i--){
40     v    = aa + diag[i] - 1;
41     vi   = aj + diag[i] - 1;
42     nz   = diag[i] - ai[i];
43     s1   = x[i];
44     while (nz--) {
45       x[*vi--]   -=  (*v--)*s1;
46     }
47   }
48   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
50   PetscLogFlops(2*(a->nz) - A->n);
51   PetscFunctionReturn(0);
52 }
53 
54 #undef __FUNCT__
55 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
56 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57 {
58   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60   int             *diag = a->diag,oidx;
61   MatScalar       *aa=a->a,*v;
62   PetscScalar     s1,s2,x1,x2;
63   PetscScalar     *x,*b;
64 
65   PetscFunctionBegin;
66   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
67   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
68   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
69 
70   /* forward solve the U^T */
71   idx = 0;
72   for (i=0; i<n; i++) {
73 
74     v     = aa + 4*diag[i];
75     /* multiply by the inverse of the block diagonal */
76     x1 = x[idx];   x2 = x[1+idx];
77     s1 = v[0]*x1  +  v[1]*x2;
78     s2 = v[2]*x1  +  v[3]*x2;
79     v += 4;
80 
81     vi    = aj + diag[i] + 1;
82     nz    = ai[i+1] - diag[i] - 1;
83     while (nz--) {
84       oidx = 2*(*vi++);
85       x[oidx]   -= v[0]*s1  +  v[1]*s2;
86       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
87       v  += 4;
88     }
89     x[idx]   = s1;x[1+idx] = s2;
90     idx += 2;
91   }
92   /* backward solve the L^T */
93   for (i=n-1; i>=0; i--){
94     v    = aa + 4*diag[i] - 4;
95     vi   = aj + diag[i] - 1;
96     nz   = diag[i] - ai[i];
97     idt  = 2*i;
98     s1   = x[idt];  s2 = x[1+idt];
99     while (nz--) {
100       idx   = 2*(*vi--);
101       x[idx]   -=  v[0]*s1 +  v[1]*s2;
102       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
103       v -= 4;
104     }
105   }
106   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
107   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
108   PetscLogFlops(2*4*(a->nz) - 2*A->n);
109   PetscFunctionReturn(0);
110 }
111 
112 #undef __FUNCT__
113 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
114 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
115 {
116   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
117   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
118   int             *diag = a->diag,oidx;
119   MatScalar       *aa=a->a,*v;
120   PetscScalar     s1,s2,s3,x1,x2,x3;
121   PetscScalar     *x,*b;
122 
123   PetscFunctionBegin;
124   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
125   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
126   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
127 
128   /* forward solve the U^T */
129   idx = 0;
130   for (i=0; i<n; i++) {
131 
132     v     = aa + 9*diag[i];
133     /* multiply by the inverse of the block diagonal */
134     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
135     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
136     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
137     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
138     v += 9;
139 
140     vi    = aj + diag[i] + 1;
141     nz    = ai[i+1] - diag[i] - 1;
142     while (nz--) {
143       oidx = 3*(*vi++);
144       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
145       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
146       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
147       v  += 9;
148     }
149     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
150     idx += 3;
151   }
152   /* backward solve the L^T */
153   for (i=n-1; i>=0; i--){
154     v    = aa + 9*diag[i] - 9;
155     vi   = aj + diag[i] - 1;
156     nz   = diag[i] - ai[i];
157     idt  = 3*i;
158     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
159     while (nz--) {
160       idx   = 3*(*vi--);
161       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
162       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
163       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
164       v -= 9;
165     }
166   }
167   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
168   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
169   PetscLogFlops(2*9*(a->nz) - 3*A->n);
170   PetscFunctionReturn(0);
171 }
172 
173 #undef __FUNCT__
174 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
175 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
176 {
177   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
178   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
179   int             *diag = a->diag,oidx;
180   MatScalar       *aa=a->a,*v;
181   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
182   PetscScalar     *x,*b;
183 
184   PetscFunctionBegin;
185   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
186   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
187   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
188 
189   /* forward solve the U^T */
190   idx = 0;
191   for (i=0; i<n; i++) {
192 
193     v     = aa + 16*diag[i];
194     /* multiply by the inverse of the block diagonal */
195     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
196     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
197     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
198     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
199     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
200     v += 16;
201 
202     vi    = aj + diag[i] + 1;
203     nz    = ai[i+1] - diag[i] - 1;
204     while (nz--) {
205       oidx = 4*(*vi++);
206       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
207       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
208       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
209       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
210       v  += 16;
211     }
212     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
213     idx += 4;
214   }
215   /* backward solve the L^T */
216   for (i=n-1; i>=0; i--){
217     v    = aa + 16*diag[i] - 16;
218     vi   = aj + diag[i] - 1;
219     nz   = diag[i] - ai[i];
220     idt  = 4*i;
221     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
222     while (nz--) {
223       idx   = 4*(*vi--);
224       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
225       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
226       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
227       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
228       v -= 16;
229     }
230   }
231   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
232   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
233   PetscLogFlops(2*16*(a->nz) - 4*A->n);
234   PetscFunctionReturn(0);
235 }
236 
237 #undef __FUNCT__
238 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
239 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
240 {
241   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
242   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
243   int             *diag = a->diag,oidx;
244   MatScalar       *aa=a->a,*v;
245   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
246   PetscScalar     *x,*b;
247 
248   PetscFunctionBegin;
249   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
250   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
251   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
252 
253   /* forward solve the U^T */
254   idx = 0;
255   for (i=0; i<n; i++) {
256 
257     v     = aa + 25*diag[i];
258     /* multiply by the inverse of the block diagonal */
259     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
260     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
261     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
262     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
263     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
264     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
265     v += 25;
266 
267     vi    = aj + diag[i] + 1;
268     nz    = ai[i+1] - diag[i] - 1;
269     while (nz--) {
270       oidx = 5*(*vi++);
271       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
272       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
273       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
274       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
275       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
276       v  += 25;
277     }
278     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
279     idx += 5;
280   }
281   /* backward solve the L^T */
282   for (i=n-1; i>=0; i--){
283     v    = aa + 25*diag[i] - 25;
284     vi   = aj + diag[i] - 1;
285     nz   = diag[i] - ai[i];
286     idt  = 5*i;
287     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
288     while (nz--) {
289       idx   = 5*(*vi--);
290       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
291       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
292       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
293       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
294       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
295       v -= 25;
296     }
297   }
298   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
299   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
300   PetscLogFlops(2*25*(a->nz) - 5*A->n);
301   PetscFunctionReturn(0);
302 }
303 
304 #undef __FUNCT__
305 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
306 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
307 {
308   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
309   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
310   int             *diag = a->diag,oidx;
311   MatScalar       *aa=a->a,*v;
312   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
313   PetscScalar     *x,*b;
314 
315   PetscFunctionBegin;
316   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
317   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
318   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
319 
320   /* forward solve the U^T */
321   idx = 0;
322   for (i=0; i<n; i++) {
323 
324     v     = aa + 36*diag[i];
325     /* multiply by the inverse of the block diagonal */
326     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
327     x6    = x[5+idx];
328     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
329     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
330     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
331     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
332     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
333     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
334     v += 36;
335 
336     vi    = aj + diag[i] + 1;
337     nz    = ai[i+1] - diag[i] - 1;
338     while (nz--) {
339       oidx = 6*(*vi++);
340       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
341       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
342       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
343       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
344       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
345       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
346       v  += 36;
347     }
348     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
349     x[5+idx] = s6;
350     idx += 6;
351   }
352   /* backward solve the L^T */
353   for (i=n-1; i>=0; i--){
354     v    = aa + 36*diag[i] - 36;
355     vi   = aj + diag[i] - 1;
356     nz   = diag[i] - ai[i];
357     idt  = 6*i;
358     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
359     s6 = x[5+idt];
360     while (nz--) {
361       idx   = 6*(*vi--);
362       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
363       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
364       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
365       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
366       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
367       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
368       v -= 36;
369     }
370   }
371   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
372   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
373   PetscLogFlops(2*36*(a->nz) - 6*A->n);
374   PetscFunctionReturn(0);
375 }
376 
377 #undef __FUNCT__
378 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
379 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
380 {
381   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
382   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
383   int             *diag = a->diag,oidx;
384   MatScalar       *aa=a->a,*v;
385   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
386   PetscScalar     *x,*b;
387 
388   PetscFunctionBegin;
389   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
390   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
391   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
392 
393   /* forward solve the U^T */
394   idx = 0;
395   for (i=0; i<n; i++) {
396 
397     v     = aa + 49*diag[i];
398     /* multiply by the inverse of the block diagonal */
399     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
400     x6    = x[5+idx]; x7 = x[6+idx];
401     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
402     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
403     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
404     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
405     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
406     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
407     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
408     v += 49;
409 
410     vi    = aj + diag[i] + 1;
411     nz    = ai[i+1] - diag[i] - 1;
412     while (nz--) {
413       oidx = 7*(*vi++);
414       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
415       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
416       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
417       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
418       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
419       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
420       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
421       v  += 49;
422     }
423     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
424     x[5+idx] = s6;x[6+idx] = s7;
425     idx += 7;
426   }
427   /* backward solve the L^T */
428   for (i=n-1; i>=0; i--){
429     v    = aa + 49*diag[i] - 49;
430     vi   = aj + diag[i] - 1;
431     nz   = diag[i] - ai[i];
432     idt  = 7*i;
433     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
434     s6 = x[5+idt];s7 = x[6+idt];
435     while (nz--) {
436       idx   = 7*(*vi--);
437       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
438       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
439       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
440       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
441       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
442       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
443       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
444       v -= 49;
445     }
446   }
447   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
448   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
449   PetscLogFlops(2*49*(a->nz) - 7*A->n);
450   PetscFunctionReturn(0);
451 }
452 
453 /*---------------------------------------------------------------------------------------------*/
454 #undef __FUNCT__
455 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
456 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
457 {
458   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
459   IS              iscol=a->col,isrow=a->row;
460   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
461   int             *diag = a->diag;
462   MatScalar       *aa=a->a,*v;
463   PetscScalar     s1,*x,*b,*t;
464 
465   PetscFunctionBegin;
466   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
467   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
468   t  = a->solve_work;
469 
470   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
471   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
472 
473   /* copy the b into temp work space according to permutation */
474   for (i=0; i<n; i++) {
475     t[i] = b[c[i]];
476   }
477 
478   /* forward solve the U^T */
479   for (i=0; i<n; i++) {
480 
481     v     = aa + diag[i];
482     /* multiply by the inverse of the block diagonal */
483     s1    = (*v++)*t[i];
484     vi    = aj + diag[i] + 1;
485     nz    = ai[i+1] - diag[i] - 1;
486     while (nz--) {
487       t[*vi++]  -= (*v++)*s1;
488     }
489     t[i]   = s1;
490   }
491   /* backward solve the L^T */
492   for (i=n-1; i>=0; i--){
493     v    = aa + diag[i] - 1;
494     vi   = aj + diag[i] - 1;
495     nz   = diag[i] - ai[i];
496     s1   = t[i];
497     while (nz--) {
498       t[*vi--]   -=  (*v--)*s1;
499     }
500   }
501 
502   /* copy t into x according to permutation */
503   for (i=0; i<n; i++) {
504     x[r[i]]   = t[i];
505   }
506 
507   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
508   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
509   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
510   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
511   PetscLogFlops(2*(a->nz) - A->n);
512   PetscFunctionReturn(0);
513 }
514 
515 #undef __FUNCT__
516 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
517 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
518 {
519   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
520   IS              iscol=a->col,isrow=a->row;
521   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
522   int             *diag = a->diag,ii,ic,ir,oidx;
523   MatScalar       *aa=a->a,*v;
524   PetscScalar     s1,s2,x1,x2;
525   PetscScalar     *x,*b,*t;
526 
527   PetscFunctionBegin;
528   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
529   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
530   t  = a->solve_work;
531 
532   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
533   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
534 
535   /* copy the b into temp work space according to permutation */
536   ii = 0;
537   for (i=0; i<n; i++) {
538     ic      = 2*c[i];
539     t[ii]   = b[ic];
540     t[ii+1] = b[ic+1];
541     ii += 2;
542   }
543 
544   /* forward solve the U^T */
545   idx = 0;
546   for (i=0; i<n; i++) {
547 
548     v     = aa + 4*diag[i];
549     /* multiply by the inverse of the block diagonal */
550     x1    = t[idx];   x2 = t[1+idx];
551     s1 = v[0]*x1  +  v[1]*x2;
552     s2 = v[2]*x1  +  v[3]*x2;
553     v += 4;
554 
555     vi    = aj + diag[i] + 1;
556     nz    = ai[i+1] - diag[i] - 1;
557     while (nz--) {
558       oidx = 2*(*vi++);
559       t[oidx]   -= v[0]*s1  +  v[1]*s2;
560       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
561       v  += 4;
562     }
563     t[idx]   = s1;t[1+idx] = s2;
564     idx += 2;
565   }
566   /* backward solve the L^T */
567   for (i=n-1; i>=0; i--){
568     v    = aa + 4*diag[i] - 4;
569     vi   = aj + diag[i] - 1;
570     nz   = diag[i] - ai[i];
571     idt  = 2*i;
572     s1 = t[idt];  s2 = t[1+idt];
573     while (nz--) {
574       idx   = 2*(*vi--);
575       t[idx]   -=  v[0]*s1 +  v[1]*s2;
576       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
577       v -= 4;
578     }
579   }
580 
581   /* copy t into x according to permutation */
582   ii = 0;
583   for (i=0; i<n; i++) {
584     ir      = 2*r[i];
585     x[ir]   = t[ii];
586     x[ir+1] = t[ii+1];
587     ii += 2;
588   }
589 
590   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
591   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
592   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
593   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
594   PetscLogFlops(2*4*(a->nz) - 2*A->n);
595   PetscFunctionReturn(0);
596 }
597 
598 #undef __FUNCT__
599 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
600 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
601 {
602   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
603   IS              iscol=a->col,isrow=a->row;
604   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
605   int             *diag = a->diag,ii,ic,ir,oidx;
606   MatScalar       *aa=a->a,*v;
607   PetscScalar     s1,s2,s3,x1,x2,x3;
608   PetscScalar     *x,*b,*t;
609 
610   PetscFunctionBegin;
611   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
612   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
613   t  = a->solve_work;
614 
615   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
616   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
617 
618   /* copy the b into temp work space according to permutation */
619   ii = 0;
620   for (i=0; i<n; i++) {
621     ic      = 3*c[i];
622     t[ii]   = b[ic];
623     t[ii+1] = b[ic+1];
624     t[ii+2] = b[ic+2];
625     ii += 3;
626   }
627 
628   /* forward solve the U^T */
629   idx = 0;
630   for (i=0; i<n; i++) {
631 
632     v     = aa + 9*diag[i];
633     /* multiply by the inverse of the block diagonal */
634     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
635     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
636     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
637     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
638     v += 9;
639 
640     vi    = aj + diag[i] + 1;
641     nz    = ai[i+1] - diag[i] - 1;
642     while (nz--) {
643       oidx = 3*(*vi++);
644       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
645       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
646       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
647       v  += 9;
648     }
649     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
650     idx += 3;
651   }
652   /* backward solve the L^T */
653   for (i=n-1; i>=0; i--){
654     v    = aa + 9*diag[i] - 9;
655     vi   = aj + diag[i] - 1;
656     nz   = diag[i] - ai[i];
657     idt  = 3*i;
658     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
659     while (nz--) {
660       idx   = 3*(*vi--);
661       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
662       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
663       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
664       v -= 9;
665     }
666   }
667 
668   /* copy t into x according to permutation */
669   ii = 0;
670   for (i=0; i<n; i++) {
671     ir      = 3*r[i];
672     x[ir]   = t[ii];
673     x[ir+1] = t[ii+1];
674     x[ir+2] = t[ii+2];
675     ii += 3;
676   }
677 
678   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
679   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
680   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
681   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
682   PetscLogFlops(2*9*(a->nz) - 3*A->n);
683   PetscFunctionReturn(0);
684 }
685 
686 #undef __FUNCT__
687 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
688 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
689 {
690   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
691   IS              iscol=a->col,isrow=a->row;
692   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
693   int             *diag = a->diag,ii,ic,ir,oidx;
694   MatScalar       *aa=a->a,*v;
695   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
696   PetscScalar     *x,*b,*t;
697 
698   PetscFunctionBegin;
699   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
700   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
701   t  = a->solve_work;
702 
703   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
704   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
705 
706   /* copy the b into temp work space according to permutation */
707   ii = 0;
708   for (i=0; i<n; i++) {
709     ic      = 4*c[i];
710     t[ii]   = b[ic];
711     t[ii+1] = b[ic+1];
712     t[ii+2] = b[ic+2];
713     t[ii+3] = b[ic+3];
714     ii += 4;
715   }
716 
717   /* forward solve the U^T */
718   idx = 0;
719   for (i=0; i<n; i++) {
720 
721     v     = aa + 16*diag[i];
722     /* multiply by the inverse of the block diagonal */
723     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
724     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
725     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
726     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
727     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
728     v += 16;
729 
730     vi    = aj + diag[i] + 1;
731     nz    = ai[i+1] - diag[i] - 1;
732     while (nz--) {
733       oidx = 4*(*vi++);
734       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
735       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
736       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
737       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
738       v  += 16;
739     }
740     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
741     idx += 4;
742   }
743   /* backward solve the L^T */
744   for (i=n-1; i>=0; i--){
745     v    = aa + 16*diag[i] - 16;
746     vi   = aj + diag[i] - 1;
747     nz   = diag[i] - ai[i];
748     idt  = 4*i;
749     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
750     while (nz--) {
751       idx   = 4*(*vi--);
752       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756       v -= 16;
757     }
758   }
759 
760   /* copy t into x according to permutation */
761   ii = 0;
762   for (i=0; i<n; i++) {
763     ir      = 4*r[i];
764     x[ir]   = t[ii];
765     x[ir+1] = t[ii+1];
766     x[ir+2] = t[ii+2];
767     x[ir+3] = t[ii+3];
768     ii += 4;
769   }
770 
771   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
772   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
773   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
774   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
775   PetscLogFlops(2*16*(a->nz) - 4*A->n);
776   PetscFunctionReturn(0);
777 }
778 
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
781 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
784   IS              iscol=a->col,isrow=a->row;
785   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
786   int             *diag = a->diag,ii,ic,ir,oidx;
787   MatScalar       *aa=a->a,*v;
788   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
789   PetscScalar     *x,*b,*t;
790 
791   PetscFunctionBegin;
792   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
793   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
794   t  = a->solve_work;
795 
796   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
797   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
798 
799   /* copy the b into temp work space according to permutation */
800   ii = 0;
801   for (i=0; i<n; i++) {
802     ic      = 5*c[i];
803     t[ii]   = b[ic];
804     t[ii+1] = b[ic+1];
805     t[ii+2] = b[ic+2];
806     t[ii+3] = b[ic+3];
807     t[ii+4] = b[ic+4];
808     ii += 5;
809   }
810 
811   /* forward solve the U^T */
812   idx = 0;
813   for (i=0; i<n; i++) {
814 
815     v     = aa + 25*diag[i];
816     /* multiply by the inverse of the block diagonal */
817     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
818     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
819     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
820     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
821     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
822     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
823     v += 25;
824 
825     vi    = aj + diag[i] + 1;
826     nz    = ai[i+1] - diag[i] - 1;
827     while (nz--) {
828       oidx = 5*(*vi++);
829       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
830       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
831       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
832       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
833       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
834       v  += 25;
835     }
836     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
837     idx += 5;
838   }
839   /* backward solve the L^T */
840   for (i=n-1; i>=0; i--){
841     v    = aa + 25*diag[i] - 25;
842     vi   = aj + diag[i] - 1;
843     nz   = diag[i] - ai[i];
844     idt  = 5*i;
845     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
846     while (nz--) {
847       idx   = 5*(*vi--);
848       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
849       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
850       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
851       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
852       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
853       v -= 25;
854     }
855   }
856 
857   /* copy t into x according to permutation */
858   ii = 0;
859   for (i=0; i<n; i++) {
860     ir      = 5*r[i];
861     x[ir]   = t[ii];
862     x[ir+1] = t[ii+1];
863     x[ir+2] = t[ii+2];
864     x[ir+3] = t[ii+3];
865     x[ir+4] = t[ii+4];
866     ii += 5;
867   }
868 
869   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
870   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
871   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
872   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
873   PetscLogFlops(2*25*(a->nz) - 5*A->n);
874   PetscFunctionReturn(0);
875 }
876 
877 #undef __FUNCT__
878 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
879 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
880 {
881   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
882   IS              iscol=a->col,isrow=a->row;
883   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
884   int             *diag = a->diag,ii,ic,ir,oidx;
885   MatScalar       *aa=a->a,*v;
886   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
887   PetscScalar     *x,*b,*t;
888 
889   PetscFunctionBegin;
890   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
891   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
892   t  = a->solve_work;
893 
894   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
895   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
896 
897   /* copy the b into temp work space according to permutation */
898   ii = 0;
899   for (i=0; i<n; i++) {
900     ic      = 6*c[i];
901     t[ii]   = b[ic];
902     t[ii+1] = b[ic+1];
903     t[ii+2] = b[ic+2];
904     t[ii+3] = b[ic+3];
905     t[ii+4] = b[ic+4];
906     t[ii+5] = b[ic+5];
907     ii += 6;
908   }
909 
910   /* forward solve the U^T */
911   idx = 0;
912   for (i=0; i<n; i++) {
913 
914     v     = aa + 36*diag[i];
915     /* multiply by the inverse of the block diagonal */
916     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
917     x6    = t[5+idx];
918     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
919     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
920     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
921     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
922     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
923     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
924     v += 36;
925 
926     vi    = aj + diag[i] + 1;
927     nz    = ai[i+1] - diag[i] - 1;
928     while (nz--) {
929       oidx = 6*(*vi++);
930       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
931       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
932       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
933       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
934       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
935       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
936       v  += 36;
937     }
938     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
939     t[5+idx] = s6;
940     idx += 6;
941   }
942   /* backward solve the L^T */
943   for (i=n-1; i>=0; i--){
944     v    = aa + 36*diag[i] - 36;
945     vi   = aj + diag[i] - 1;
946     nz   = diag[i] - ai[i];
947     idt  = 6*i;
948     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
949     s6 = t[5+idt];
950     while (nz--) {
951       idx   = 6*(*vi--);
952       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958       v -= 36;
959     }
960   }
961 
962   /* copy t into x according to permutation */
963   ii = 0;
964   for (i=0; i<n; i++) {
965     ir      = 6*r[i];
966     x[ir]   = t[ii];
967     x[ir+1] = t[ii+1];
968     x[ir+2] = t[ii+2];
969     x[ir+3] = t[ii+3];
970     x[ir+4] = t[ii+4];
971     x[ir+5] = t[ii+5];
972     ii += 6;
973   }
974 
975   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
976   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
977   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
978   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
979   PetscLogFlops(2*36*(a->nz) - 6*A->n);
980   PetscFunctionReturn(0);
981 }
982 
983 #undef __FUNCT__
984 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
985 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
986 {
987   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
988   IS              iscol=a->col,isrow=a->row;
989   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
990   int             *diag = a->diag,ii,ic,ir,oidx;
991   MatScalar       *aa=a->a,*v;
992   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
993   PetscScalar     *x,*b,*t;
994 
995   PetscFunctionBegin;
996   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
997   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
998   t  = a->solve_work;
999 
1000   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1001   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1002 
1003   /* copy the b into temp work space according to permutation */
1004   ii = 0;
1005   for (i=0; i<n; i++) {
1006     ic      = 7*c[i];
1007     t[ii]   = b[ic];
1008     t[ii+1] = b[ic+1];
1009     t[ii+2] = b[ic+2];
1010     t[ii+3] = b[ic+3];
1011     t[ii+4] = b[ic+4];
1012     t[ii+5] = b[ic+5];
1013     t[ii+6] = b[ic+6];
1014     ii += 7;
1015   }
1016 
1017   /* forward solve the U^T */
1018   idx = 0;
1019   for (i=0; i<n; i++) {
1020 
1021     v     = aa + 49*diag[i];
1022     /* multiply by the inverse of the block diagonal */
1023     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1024     x6    = t[5+idx]; x7 = t[6+idx];
1025     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1026     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1027     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1028     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1029     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1030     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1031     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1032     v += 49;
1033 
1034     vi    = aj + diag[i] + 1;
1035     nz    = ai[i+1] - diag[i] - 1;
1036     while (nz--) {
1037       oidx = 7*(*vi++);
1038       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1039       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1040       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1041       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1042       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1043       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1044       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1045       v  += 49;
1046     }
1047     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1048     t[5+idx] = s6;t[6+idx] = s7;
1049     idx += 7;
1050   }
1051   /* backward solve the L^T */
1052   for (i=n-1; i>=0; i--){
1053     v    = aa + 49*diag[i] - 49;
1054     vi   = aj + diag[i] - 1;
1055     nz   = diag[i] - ai[i];
1056     idt  = 7*i;
1057     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1058     s6 = t[5+idt];s7 = t[6+idt];
1059     while (nz--) {
1060       idx   = 7*(*vi--);
1061       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1062       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1063       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1064       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1065       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1066       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1067       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1068       v -= 49;
1069     }
1070   }
1071 
1072   /* copy t into x according to permutation */
1073   ii = 0;
1074   for (i=0; i<n; i++) {
1075     ir      = 7*r[i];
1076     x[ir]   = t[ii];
1077     x[ir+1] = t[ii+1];
1078     x[ir+2] = t[ii+2];
1079     x[ir+3] = t[ii+3];
1080     x[ir+4] = t[ii+4];
1081     x[ir+5] = t[ii+5];
1082     x[ir+6] = t[ii+6];
1083     ii += 7;
1084   }
1085 
1086   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1087   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1088   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1089   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1090   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 /* ----------------------------------------------------------- */
1095 #undef __FUNCT__
1096 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1097 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1098 {
1099   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1100   IS              iscol=a->col,isrow=a->row;
1101   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1102   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1103   MatScalar       *aa=a->a,*v;
1104   PetscScalar     *x,*b,*s,*t,*ls;
1105 
1106   PetscFunctionBegin;
1107   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1108   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1109   t  = a->solve_work;
1110 
1111   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1112   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1113 
1114   /* forward solve the lower triangular */
1115   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1116   for (i=1; i<n; i++) {
1117     v   = aa + bs2*ai[i];
1118     vi  = aj + ai[i];
1119     nz  = a->diag[i] - ai[i];
1120     s = t + bs*i;
1121     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1122     while (nz--) {
1123       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1124       v += bs2;
1125     }
1126   }
1127   /* backward solve the upper triangular */
1128   ls = a->solve_work + A->n;
1129   for (i=n-1; i>=0; i--){
1130     v   = aa + bs2*(a->diag[i] + 1);
1131     vi  = aj + a->diag[i] + 1;
1132     nz  = ai[i+1] - a->diag[i] - 1;
1133     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1134     while (nz--) {
1135       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1136       v += bs2;
1137     }
1138     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1139     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1140   }
1141 
1142   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1143   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1144   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1145   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1146   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 #undef __FUNCT__
1151 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1152 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1153 {
1154   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1155   IS              iscol=a->col,isrow=a->row;
1156   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1157   int             *diag = a->diag;
1158   MatScalar       *aa=a->a,*v;
1159   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1160   PetscScalar     *x,*b,*t;
1161 
1162   PetscFunctionBegin;
1163   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1164   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1165   t  = a->solve_work;
1166 
1167   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1168   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1169 
1170   /* forward solve the lower triangular */
1171   idx    = 7*(*r++);
1172   t[0] = b[idx];   t[1] = b[1+idx];
1173   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1174   t[5] = b[5+idx]; t[6] = b[6+idx];
1175 
1176   for (i=1; i<n; i++) {
1177     v     = aa + 49*ai[i];
1178     vi    = aj + ai[i];
1179     nz    = diag[i] - ai[i];
1180     idx   = 7*(*r++);
1181     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1182     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1183     while (nz--) {
1184       idx   = 7*(*vi++);
1185       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1186       x4    = t[3+idx];x5 = t[4+idx];
1187       x6    = t[5+idx];x7 = t[6+idx];
1188       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1189       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1190       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1191       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1192       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1193       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1194       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1195       v += 49;
1196     }
1197     idx = 7*i;
1198     t[idx]   = s1;t[1+idx] = s2;
1199     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1200     t[5+idx] = s6;t[6+idx] = s7;
1201   }
1202   /* backward solve the upper triangular */
1203   for (i=n-1; i>=0; i--){
1204     v    = aa + 49*diag[i] + 49;
1205     vi   = aj + diag[i] + 1;
1206     nz   = ai[i+1] - diag[i] - 1;
1207     idt  = 7*i;
1208     s1 = t[idt];  s2 = t[1+idt];
1209     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1210     s6 = t[5+idt];s7 = t[6+idt];
1211     while (nz--) {
1212       idx   = 7*(*vi++);
1213       x1    = t[idx];   x2 = t[1+idx];
1214       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1215       x6    = t[5+idx]; x7 = t[6+idx];
1216       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1217       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1218       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1219       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1220       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1221       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1222       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1223       v += 49;
1224     }
1225     idc = 7*(*c--);
1226     v   = aa + 49*diag[i];
1227     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1228                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1229     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1230                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1231     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1232                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1233     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1234                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1235     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1236                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1237     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1238                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1239     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1240                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1241   }
1242 
1243   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1244   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1245   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1246   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1247   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 #undef __FUNCT__
1252 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1253 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1254 {
1255   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1256   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1257   int             ierr,*diag = a->diag,jdx;
1258   MatScalar       *aa=a->a,*v;
1259   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1260 
1261   PetscFunctionBegin;
1262   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1263   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1264   /* forward solve the lower triangular */
1265   idx    = 0;
1266   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1267   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1268   x[6] = b[6+idx];
1269   for (i=1; i<n; i++) {
1270     v     =  aa + 49*ai[i];
1271     vi    =  aj + ai[i];
1272     nz    =  diag[i] - ai[i];
1273     idx   =  7*i;
1274     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1275     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1276     s7  =  b[6+idx];
1277     while (nz--) {
1278       jdx   = 7*(*vi++);
1279       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1280       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1281       x7    = x[6+jdx];
1282       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1283       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1284       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1285       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1286       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1287       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1288       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1289       v += 49;
1290      }
1291     x[idx]   = s1;
1292     x[1+idx] = s2;
1293     x[2+idx] = s3;
1294     x[3+idx] = s4;
1295     x[4+idx] = s5;
1296     x[5+idx] = s6;
1297     x[6+idx] = s7;
1298   }
1299   /* backward solve the upper triangular */
1300   for (i=n-1; i>=0; i--){
1301     v    = aa + 49*diag[i] + 49;
1302     vi   = aj + diag[i] + 1;
1303     nz   = ai[i+1] - diag[i] - 1;
1304     idt  = 7*i;
1305     s1 = x[idt];   s2 = x[1+idt];
1306     s3 = x[2+idt]; s4 = x[3+idt];
1307     s5 = x[4+idt]; s6 = x[5+idt];
1308     s7 = x[6+idt];
1309     while (nz--) {
1310       idx   = 7*(*vi++);
1311       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1312       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1313       x7    = x[6+idx];
1314       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1315       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1316       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1317       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1318       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1319       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1320       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1321       v += 49;
1322     }
1323     v        = aa + 49*diag[i];
1324     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1325                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1326     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1327                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1328     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1329                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1330     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1331                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1332     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1333                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1334     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1335                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1336     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1337                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1338   }
1339 
1340   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1341   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1342   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1343   PetscFunctionReturn(0);
1344 }
1345 
1346 #undef __FUNCT__
1347 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1348 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1349 {
1350   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1351   IS              iscol=a->col,isrow=a->row;
1352   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1353   int             *diag = a->diag;
1354   MatScalar       *aa=a->a,*v;
1355   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1356 
1357   PetscFunctionBegin;
1358   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1359   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1360   t  = a->solve_work;
1361 
1362   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1363   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1364 
1365   /* forward solve the lower triangular */
1366   idx    = 6*(*r++);
1367   t[0] = b[idx];   t[1] = b[1+idx];
1368   t[2] = b[2+idx]; t[3] = b[3+idx];
1369   t[4] = b[4+idx]; t[5] = b[5+idx];
1370   for (i=1; i<n; i++) {
1371     v     = aa + 36*ai[i];
1372     vi    = aj + ai[i];
1373     nz    = diag[i] - ai[i];
1374     idx   = 6*(*r++);
1375     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1376     s5  = b[4+idx]; s6 = b[5+idx];
1377     while (nz--) {
1378       idx   = 6*(*vi++);
1379       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1380       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1381       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1382       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1383       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1384       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1385       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1386       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1387       v += 36;
1388     }
1389     idx = 6*i;
1390     t[idx]   = s1;t[1+idx] = s2;
1391     t[2+idx] = s3;t[3+idx] = s4;
1392     t[4+idx] = s5;t[5+idx] = s6;
1393   }
1394   /* backward solve the upper triangular */
1395   for (i=n-1; i>=0; i--){
1396     v    = aa + 36*diag[i] + 36;
1397     vi   = aj + diag[i] + 1;
1398     nz   = ai[i+1] - diag[i] - 1;
1399     idt  = 6*i;
1400     s1 = t[idt];  s2 = t[1+idt];
1401     s3 = t[2+idt];s4 = t[3+idt];
1402     s5 = t[4+idt];s6 = t[5+idt];
1403     while (nz--) {
1404       idx   = 6*(*vi++);
1405       x1    = t[idx];   x2 = t[1+idx];
1406       x3    = t[2+idx]; x4 = t[3+idx];
1407       x5    = t[4+idx]; x6 = t[5+idx];
1408       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1409       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1410       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1411       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1412       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1413       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1414       v += 36;
1415     }
1416     idc = 6*(*c--);
1417     v   = aa + 36*diag[i];
1418     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1419                                  v[18]*s4+v[24]*s5+v[30]*s6;
1420     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1421                                  v[19]*s4+v[25]*s5+v[31]*s6;
1422     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1423                                  v[20]*s4+v[26]*s5+v[32]*s6;
1424     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1425                                  v[21]*s4+v[27]*s5+v[33]*s6;
1426     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1427                                  v[22]*s4+v[28]*s5+v[34]*s6;
1428     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1429                                  v[23]*s4+v[29]*s5+v[35]*s6;
1430   }
1431 
1432   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1433   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1434   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1435   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1436   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1437   PetscFunctionReturn(0);
1438 }
1439 
1440 #undef __FUNCT__
1441 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1442 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1443 {
1444   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1445   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1446   int             ierr,*diag = a->diag,jdx;
1447   MatScalar       *aa=a->a,*v;
1448   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1449 
1450   PetscFunctionBegin;
1451   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1452   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1453   /* forward solve the lower triangular */
1454   idx    = 0;
1455   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1456   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1457   for (i=1; i<n; i++) {
1458     v     =  aa + 36*ai[i];
1459     vi    =  aj + ai[i];
1460     nz    =  diag[i] - ai[i];
1461     idx   =  6*i;
1462     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1463     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1464     while (nz--) {
1465       jdx   = 6*(*vi++);
1466       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1467       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1468       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1469       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1470       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1471       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1472       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1473       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1474       v += 36;
1475      }
1476     x[idx]   = s1;
1477     x[1+idx] = s2;
1478     x[2+idx] = s3;
1479     x[3+idx] = s4;
1480     x[4+idx] = s5;
1481     x[5+idx] = s6;
1482   }
1483   /* backward solve the upper triangular */
1484   for (i=n-1; i>=0; i--){
1485     v    = aa + 36*diag[i] + 36;
1486     vi   = aj + diag[i] + 1;
1487     nz   = ai[i+1] - diag[i] - 1;
1488     idt  = 6*i;
1489     s1 = x[idt];   s2 = x[1+idt];
1490     s3 = x[2+idt]; s4 = x[3+idt];
1491     s5 = x[4+idt]; s6 = x[5+idt];
1492     while (nz--) {
1493       idx   = 6*(*vi++);
1494       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1495       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1496       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1497       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1498       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1499       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1500       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1501       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1502       v += 36;
1503     }
1504     v        = aa + 36*diag[i];
1505     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1506     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1507     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1508     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1509     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1510     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1511   }
1512 
1513   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1514   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1515   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1516   PetscFunctionReturn(0);
1517 }
1518 
1519 #undef __FUNCT__
1520 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1521 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1522 {
1523   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1524   IS              iscol=a->col,isrow=a->row;
1525   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1526   int             *diag = a->diag;
1527   MatScalar       *aa=a->a,*v;
1528   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1529 
1530   PetscFunctionBegin;
1531   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1532   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1533   t  = a->solve_work;
1534 
1535   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1536   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1537 
1538   /* forward solve the lower triangular */
1539   idx    = 5*(*r++);
1540   t[0] = b[idx];   t[1] = b[1+idx];
1541   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1542   for (i=1; i<n; i++) {
1543     v     = aa + 25*ai[i];
1544     vi    = aj + ai[i];
1545     nz    = diag[i] - ai[i];
1546     idx   = 5*(*r++);
1547     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1548     s5  = b[4+idx];
1549     while (nz--) {
1550       idx   = 5*(*vi++);
1551       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1552       x4    = t[3+idx];x5 = t[4+idx];
1553       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1554       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1555       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1556       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1557       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1558       v += 25;
1559     }
1560     idx = 5*i;
1561     t[idx]   = s1;t[1+idx] = s2;
1562     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1563   }
1564   /* backward solve the upper triangular */
1565   for (i=n-1; i>=0; i--){
1566     v    = aa + 25*diag[i] + 25;
1567     vi   = aj + diag[i] + 1;
1568     nz   = ai[i+1] - diag[i] - 1;
1569     idt  = 5*i;
1570     s1 = t[idt];  s2 = t[1+idt];
1571     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1572     while (nz--) {
1573       idx   = 5*(*vi++);
1574       x1    = t[idx];   x2 = t[1+idx];
1575       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1576       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1577       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1578       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1579       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1580       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1581       v += 25;
1582     }
1583     idc = 5*(*c--);
1584     v   = aa + 25*diag[i];
1585     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1586                                  v[15]*s4+v[20]*s5;
1587     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1588                                  v[16]*s4+v[21]*s5;
1589     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1590                                  v[17]*s4+v[22]*s5;
1591     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1592                                  v[18]*s4+v[23]*s5;
1593     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1594                                  v[19]*s4+v[24]*s5;
1595   }
1596 
1597   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1598   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1599   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1600   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1601   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1602   PetscFunctionReturn(0);
1603 }
1604 
1605 #undef __FUNCT__
1606 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1607 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1608 {
1609   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1610   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1611   int             ierr,*diag = a->diag,jdx;
1612   MatScalar       *aa=a->a,*v;
1613   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1614 
1615   PetscFunctionBegin;
1616   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1617   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1618   /* forward solve the lower triangular */
1619   idx    = 0;
1620   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1621   for (i=1; i<n; i++) {
1622     v     =  aa + 25*ai[i];
1623     vi    =  aj + ai[i];
1624     nz    =  diag[i] - ai[i];
1625     idx   =  5*i;
1626     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1627     while (nz--) {
1628       jdx   = 5*(*vi++);
1629       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1630       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1631       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1632       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1633       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1634       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1635       v    += 25;
1636     }
1637     x[idx]   = s1;
1638     x[1+idx] = s2;
1639     x[2+idx] = s3;
1640     x[3+idx] = s4;
1641     x[4+idx] = s5;
1642   }
1643   /* backward solve the upper triangular */
1644   for (i=n-1; i>=0; i--){
1645     v    = aa + 25*diag[i] + 25;
1646     vi   = aj + diag[i] + 1;
1647     nz   = ai[i+1] - diag[i] - 1;
1648     idt  = 5*i;
1649     s1 = x[idt];  s2 = x[1+idt];
1650     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1651     while (nz--) {
1652       idx   = 5*(*vi++);
1653       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1654       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1655       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1656       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1657       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1658       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1659       v    += 25;
1660     }
1661     v        = aa + 25*diag[i];
1662     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1663     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1664     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1665     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1666     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1667   }
1668 
1669   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1670   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1671   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1672   PetscFunctionReturn(0);
1673 }
1674 
1675 #undef __FUNCT__
1676 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1677 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1678 {
1679   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1680   IS              iscol=a->col,isrow=a->row;
1681   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1682   int             *diag = a->diag;
1683   MatScalar       *aa=a->a,*v;
1684   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1685 
1686   PetscFunctionBegin;
1687   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1688   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1689   t  = a->solve_work;
1690 
1691   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1692   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1693 
1694   /* forward solve the lower triangular */
1695   idx    = 4*(*r++);
1696   t[0] = b[idx];   t[1] = b[1+idx];
1697   t[2] = b[2+idx]; t[3] = b[3+idx];
1698   for (i=1; i<n; i++) {
1699     v     = aa + 16*ai[i];
1700     vi    = aj + ai[i];
1701     nz    = diag[i] - ai[i];
1702     idx   = 4*(*r++);
1703     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1704     while (nz--) {
1705       idx   = 4*(*vi++);
1706       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1707       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1708       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1709       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1710       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1711       v    += 16;
1712     }
1713     idx        = 4*i;
1714     t[idx]   = s1;t[1+idx] = s2;
1715     t[2+idx] = s3;t[3+idx] = s4;
1716   }
1717   /* backward solve the upper triangular */
1718   for (i=n-1; i>=0; i--){
1719     v    = aa + 16*diag[i] + 16;
1720     vi   = aj + diag[i] + 1;
1721     nz   = ai[i+1] - diag[i] - 1;
1722     idt  = 4*i;
1723     s1 = t[idt];  s2 = t[1+idt];
1724     s3 = t[2+idt];s4 = t[3+idt];
1725     while (nz--) {
1726       idx   = 4*(*vi++);
1727       x1    = t[idx];   x2 = t[1+idx];
1728       x3    = t[2+idx]; x4 = t[3+idx];
1729       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1730       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1731       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1732       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1733       v += 16;
1734     }
1735     idc      = 4*(*c--);
1736     v        = aa + 16*diag[i];
1737     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1738     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1739     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1740     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1741   }
1742 
1743   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1744   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1745   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1746   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1747   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1748   PetscFunctionReturn(0);
1749 }
1750 
1751 #undef __FUNCT__
1752 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1753 int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1754 {
1755   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1756   IS              iscol=a->col,isrow=a->row;
1757   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1758   int             *diag = a->diag;
1759   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1760   PetscScalar     *x,*b;
1761 
1762   PetscFunctionBegin;
1763   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1764   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1765   t  = (MatScalar *)a->solve_work;
1766 
1767   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1768   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1769 
1770   /* forward solve the lower triangular */
1771   idx    = 4*(*r++);
1772   t[0] = (MatScalar)b[idx];
1773   t[1] = (MatScalar)b[1+idx];
1774   t[2] = (MatScalar)b[2+idx];
1775   t[3] = (MatScalar)b[3+idx];
1776   for (i=1; i<n; i++) {
1777     v     = aa + 16*ai[i];
1778     vi    = aj + ai[i];
1779     nz    = diag[i] - ai[i];
1780     idx   = 4*(*r++);
1781     s1 = (MatScalar)b[idx];
1782     s2 = (MatScalar)b[1+idx];
1783     s3 = (MatScalar)b[2+idx];
1784     s4 = (MatScalar)b[3+idx];
1785     while (nz--) {
1786       idx   = 4*(*vi++);
1787       x1  = t[idx];
1788       x2  = t[1+idx];
1789       x3  = t[2+idx];
1790       x4  = t[3+idx];
1791       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1792       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1793       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1794       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1795       v    += 16;
1796     }
1797     idx        = 4*i;
1798     t[idx]   = s1;
1799     t[1+idx] = s2;
1800     t[2+idx] = s3;
1801     t[3+idx] = s4;
1802   }
1803   /* backward solve the upper triangular */
1804   for (i=n-1; i>=0; i--){
1805     v    = aa + 16*diag[i] + 16;
1806     vi   = aj + diag[i] + 1;
1807     nz   = ai[i+1] - diag[i] - 1;
1808     idt  = 4*i;
1809     s1 = t[idt];
1810     s2 = t[1+idt];
1811     s3 = t[2+idt];
1812     s4 = t[3+idt];
1813     while (nz--) {
1814       idx   = 4*(*vi++);
1815       x1  = t[idx];
1816       x2  = t[1+idx];
1817       x3  = t[2+idx];
1818       x4  = t[3+idx];
1819       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1820       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1821       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1822       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1823       v += 16;
1824     }
1825     idc      = 4*(*c--);
1826     v        = aa + 16*diag[i];
1827     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1828     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1829     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1830     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1831     x[idc]   = (PetscScalar)t[idt];
1832     x[1+idc] = (PetscScalar)t[1+idt];
1833     x[2+idc] = (PetscScalar)t[2+idt];
1834     x[3+idc] = (PetscScalar)t[3+idt];
1835  }
1836 
1837   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1838   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1839   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1840   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1841   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1842   PetscFunctionReturn(0);
1843 }
1844 
1845 #if defined (PETSC_HAVE_SSE)
1846 
1847 #include PETSC_HAVE_SSE
1848 
1849 #undef __FUNCT__
1850 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1851 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1852 {
1853   /*
1854      Note: This code uses demotion of double
1855      to float when performing the mixed-mode computation.
1856      This may not be numerically reasonable for all applications.
1857   */
1858   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1859   IS              iscol=a->col,isrow=a->row;
1860   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1861   int             *diag = a->diag,ai16;
1862   MatScalar       *aa=a->a,*v;
1863   PetscScalar     *x,*b,*t;
1864 
1865   /* Make space in temp stack for 16 Byte Aligned arrays */
1866   float           ssealignedspace[11],*tmps,*tmpx;
1867   unsigned long   offset;
1868 
1869   PetscFunctionBegin;
1870   SSE_SCOPE_BEGIN;
1871 
1872     offset = (unsigned long)ssealignedspace % 16;
1873     if (offset) offset = (16 - offset)/4;
1874     tmps = &ssealignedspace[offset];
1875     tmpx = &ssealignedspace[offset+4];
1876     PREFETCH_NTA(aa+16*ai[1]);
1877 
1878     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1879     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1880     t  = a->solve_work;
1881 
1882     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1883     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1884 
1885     /* forward solve the lower triangular */
1886     idx  = 4*(*r++);
1887     t[0] = b[idx];   t[1] = b[1+idx];
1888     t[2] = b[2+idx]; t[3] = b[3+idx];
1889     v    =  aa + 16*ai[1];
1890 
1891     for (i=1; i<n;) {
1892       PREFETCH_NTA(&v[8]);
1893       vi   =  aj      + ai[i];
1894       nz   =  diag[i] - ai[i];
1895       idx  =  4*(*r++);
1896 
1897       /* Demote sum from double to float */
1898       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1899       LOAD_PS(tmps,XMM7);
1900 
1901       while (nz--) {
1902         PREFETCH_NTA(&v[16]);
1903         idx = 4*(*vi++);
1904 
1905         /* Demote solution (so far) from double to float */
1906         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1907 
1908         /* 4x4 Matrix-Vector product with negative accumulation: */
1909         SSE_INLINE_BEGIN_2(tmpx,v)
1910           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1911 
1912           /* First Column */
1913           SSE_COPY_PS(XMM0,XMM6)
1914           SSE_SHUFFLE(XMM0,XMM0,0x00)
1915           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1916           SSE_SUB_PS(XMM7,XMM0)
1917 
1918           /* Second Column */
1919           SSE_COPY_PS(XMM1,XMM6)
1920           SSE_SHUFFLE(XMM1,XMM1,0x55)
1921           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1922           SSE_SUB_PS(XMM7,XMM1)
1923 
1924           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1925 
1926           /* Third Column */
1927           SSE_COPY_PS(XMM2,XMM6)
1928           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1929           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1930           SSE_SUB_PS(XMM7,XMM2)
1931 
1932           /* Fourth Column */
1933           SSE_COPY_PS(XMM3,XMM6)
1934           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1935           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1936           SSE_SUB_PS(XMM7,XMM3)
1937         SSE_INLINE_END_2
1938 
1939         v  += 16;
1940       }
1941       idx = 4*i;
1942       v   = aa + 16*ai[++i];
1943       PREFETCH_NTA(v);
1944       STORE_PS(tmps,XMM7);
1945 
1946       /* Promote result from float to double */
1947       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1948     }
1949     /* backward solve the upper triangular */
1950     idt  = 4*(n-1);
1951     ai16 = 16*diag[n-1];
1952     v    = aa + ai16 + 16;
1953     for (i=n-1; i>=0;){
1954       PREFETCH_NTA(&v[8]);
1955       vi = aj + diag[i] + 1;
1956       nz = ai[i+1] - diag[i] - 1;
1957 
1958       /* Demote accumulator from double to float */
1959       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1960       LOAD_PS(tmps,XMM7);
1961 
1962       while (nz--) {
1963         PREFETCH_NTA(&v[16]);
1964         idx = 4*(*vi++);
1965 
1966         /* Demote solution (so far) from double to float */
1967         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1968 
1969         /* 4x4 Matrix-Vector Product with negative accumulation: */
1970         SSE_INLINE_BEGIN_2(tmpx,v)
1971           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1972 
1973           /* First Column */
1974           SSE_COPY_PS(XMM0,XMM6)
1975           SSE_SHUFFLE(XMM0,XMM0,0x00)
1976           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1977           SSE_SUB_PS(XMM7,XMM0)
1978 
1979           /* Second Column */
1980           SSE_COPY_PS(XMM1,XMM6)
1981           SSE_SHUFFLE(XMM1,XMM1,0x55)
1982           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1983           SSE_SUB_PS(XMM7,XMM1)
1984 
1985           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1986 
1987           /* Third Column */
1988           SSE_COPY_PS(XMM2,XMM6)
1989           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1990           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1991           SSE_SUB_PS(XMM7,XMM2)
1992 
1993           /* Fourth Column */
1994           SSE_COPY_PS(XMM3,XMM6)
1995           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1996           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1997           SSE_SUB_PS(XMM7,XMM3)
1998         SSE_INLINE_END_2
1999         v  += 16;
2000       }
2001       v    = aa + ai16;
2002       ai16 = 16*diag[--i];
2003       PREFETCH_NTA(aa+ai16+16);
2004       /*
2005          Scale the result by the diagonal 4x4 block,
2006          which was inverted as part of the factorization
2007       */
2008       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2009         /* First Column */
2010         SSE_COPY_PS(XMM0,XMM7)
2011         SSE_SHUFFLE(XMM0,XMM0,0x00)
2012         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2013 
2014         /* Second Column */
2015         SSE_COPY_PS(XMM1,XMM7)
2016         SSE_SHUFFLE(XMM1,XMM1,0x55)
2017         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2018         SSE_ADD_PS(XMM0,XMM1)
2019 
2020         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2021 
2022         /* Third Column */
2023         SSE_COPY_PS(XMM2,XMM7)
2024         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2025         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2026         SSE_ADD_PS(XMM0,XMM2)
2027 
2028         /* Fourth Column */
2029         SSE_COPY_PS(XMM3,XMM7)
2030         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2031         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2032         SSE_ADD_PS(XMM0,XMM3)
2033 
2034         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2035       SSE_INLINE_END_3
2036 
2037       /* Promote solution from float to double */
2038       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2039 
2040       /* Apply reordering to t and stream into x.    */
2041       /* This way, x doesn't pollute the cache.      */
2042       /* Be careful with size: 2 doubles = 4 floats! */
2043       idc  = 4*(*c--);
2044       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2045         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2046         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2047         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2048         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2049         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2050         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2051       SSE_INLINE_END_2
2052       v    = aa + ai16 + 16;
2053       idt -= 4;
2054     }
2055 
2056     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2057     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2058     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2059     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2060     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2061   SSE_SCOPE_END;
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 #endif
2066 
2067 
2068 /*
2069       Special case where the matrix was ILU(0) factored in the natural
2070    ordering. This eliminates the need for the column and row permutation.
2071 */
2072 #undef __FUNCT__
2073 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2074 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2075 {
2076   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2077   int             n=a->mbs,*ai=a->i,*aj=a->j;
2078   int             ierr,*diag = a->diag;
2079   MatScalar       *aa=a->a;
2080   PetscScalar     *x,*b;
2081 
2082   PetscFunctionBegin;
2083   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2084   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2085 
2086 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2087   {
2088     static PetscScalar w[2000]; /* very BAD need to fix */
2089     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2090   }
2091 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2092   {
2093     static PetscScalar w[2000]; /* very BAD need to fix */
2094     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2095   }
2096 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2097   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2098 #else
2099   {
2100     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2101     MatScalar    *v;
2102     int          jdx,idt,idx,nz,*vi,i,ai16;
2103 
2104   /* forward solve the lower triangular */
2105   idx    = 0;
2106   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2107   for (i=1; i<n; i++) {
2108     v     =  aa      + 16*ai[i];
2109     vi    =  aj      + ai[i];
2110     nz    =  diag[i] - ai[i];
2111     idx   +=  4;
2112     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2113     while (nz--) {
2114       jdx   = 4*(*vi++);
2115       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2116       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2117       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2118       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2119       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2120       v    += 16;
2121     }
2122     x[idx]   = s1;
2123     x[1+idx] = s2;
2124     x[2+idx] = s3;
2125     x[3+idx] = s4;
2126   }
2127   /* backward solve the upper triangular */
2128   idt = 4*(n-1);
2129   for (i=n-1; i>=0; i--){
2130     ai16 = 16*diag[i];
2131     v    = aa + ai16 + 16;
2132     vi   = aj + diag[i] + 1;
2133     nz   = ai[i+1] - diag[i] - 1;
2134     s1 = x[idt];  s2 = x[1+idt];
2135     s3 = x[2+idt];s4 = x[3+idt];
2136     while (nz--) {
2137       idx   = 4*(*vi++);
2138       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2139       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2140       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2141       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2142       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2143       v    += 16;
2144     }
2145     v        = aa + ai16;
2146     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2147     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2148     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2149     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2150     idt -= 4;
2151   }
2152   }
2153 #endif
2154 
2155   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2156   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2157   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2158   PetscFunctionReturn(0);
2159 }
2160 
2161 #undef __FUNCT__
2162 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2163 int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2164 {
2165   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2166   int             n=a->mbs,*ai=a->i,*aj=a->j;
2167   int             ierr,*diag = a->diag;
2168   MatScalar       *aa=a->a;
2169   PetscScalar     *x,*b;
2170 
2171   PetscFunctionBegin;
2172   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2173   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2174 
2175   {
2176     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2177     MatScalar  *v,*t=(MatScalar *)x;
2178     int        jdx,idt,idx,nz,*vi,i,ai16;
2179 
2180     /* forward solve the lower triangular */
2181     idx  = 0;
2182     t[0] = (MatScalar)b[0];
2183     t[1] = (MatScalar)b[1];
2184     t[2] = (MatScalar)b[2];
2185     t[3] = (MatScalar)b[3];
2186     for (i=1; i<n; i++) {
2187       v     =  aa      + 16*ai[i];
2188       vi    =  aj      + ai[i];
2189       nz    =  diag[i] - ai[i];
2190       idx   +=  4;
2191       s1 = (MatScalar)b[idx];
2192       s2 = (MatScalar)b[1+idx];
2193       s3 = (MatScalar)b[2+idx];
2194       s4 = (MatScalar)b[3+idx];
2195       while (nz--) {
2196         jdx = 4*(*vi++);
2197         x1  = t[jdx];
2198         x2  = t[1+jdx];
2199         x3  = t[2+jdx];
2200         x4  = t[3+jdx];
2201         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2202         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2203         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2204         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2205         v    += 16;
2206       }
2207       t[idx]   = s1;
2208       t[1+idx] = s2;
2209       t[2+idx] = s3;
2210       t[3+idx] = s4;
2211     }
2212     /* backward solve the upper triangular */
2213     idt = 4*(n-1);
2214     for (i=n-1; i>=0; i--){
2215       ai16 = 16*diag[i];
2216       v    = aa + ai16 + 16;
2217       vi   = aj + diag[i] + 1;
2218       nz   = ai[i+1] - diag[i] - 1;
2219       s1   = t[idt];
2220       s2   = t[1+idt];
2221       s3   = t[2+idt];
2222       s4   = t[3+idt];
2223       while (nz--) {
2224         idx = 4*(*vi++);
2225         x1  = (MatScalar)x[idx];
2226         x2  = (MatScalar)x[1+idx];
2227         x3  = (MatScalar)x[2+idx];
2228         x4  = (MatScalar)x[3+idx];
2229         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2230         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2231         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2232         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2233         v    += 16;
2234       }
2235       v        = aa + ai16;
2236       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2237       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2238       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2239       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2240       idt -= 4;
2241     }
2242   }
2243 
2244   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2245   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2246   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2247   PetscFunctionReturn(0);
2248 }
2249 
2250 #if defined (PETSC_HAVE_SSE)
2251 
2252 #include PETSC_HAVE_SSE
2253 #undef __FUNCT__
2254 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2255 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2256 {
2257   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2258   unsigned short *aj=(unsigned short *)a->j;
2259   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2260   MatScalar      *aa=a->a;
2261   PetscScalar    *x,*b;
2262 
2263   PetscFunctionBegin;
2264   SSE_SCOPE_BEGIN;
2265   /*
2266      Note: This code currently uses demotion of double
2267      to float when performing the mixed-mode computation.
2268      This may not be numerically reasonable for all applications.
2269   */
2270   PREFETCH_NTA(aa+16*ai[1]);
2271 
2272   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2273   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2274   {
2275     /* x will first be computed in single precision then promoted inplace to double */
2276     MatScalar      *v,*t=(MatScalar *)x;
2277     int            nz,i,idt,ai16;
2278     unsigned int   jdx,idx;
2279     unsigned short *vi;
2280     /* Forward solve the lower triangular factor. */
2281 
2282     /* First block is the identity. */
2283     idx  = 0;
2284     CONVERT_DOUBLE4_FLOAT4(t,b);
2285     v    =  aa + 16*((unsigned int)ai[1]);
2286 
2287     for (i=1; i<n;) {
2288       PREFETCH_NTA(&v[8]);
2289       vi   =  aj      + ai[i];
2290       nz   =  diag[i] - ai[i];
2291       idx +=  4;
2292 
2293       /* Demote RHS from double to float. */
2294       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2295       LOAD_PS(&t[idx],XMM7);
2296 
2297       while (nz--) {
2298         PREFETCH_NTA(&v[16]);
2299         jdx = 4*((unsigned int)(*vi++));
2300 
2301         /* 4x4 Matrix-Vector product with negative accumulation: */
2302         SSE_INLINE_BEGIN_2(&t[jdx],v)
2303           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2304 
2305           /* First Column */
2306           SSE_COPY_PS(XMM0,XMM6)
2307           SSE_SHUFFLE(XMM0,XMM0,0x00)
2308           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2309           SSE_SUB_PS(XMM7,XMM0)
2310 
2311           /* Second Column */
2312           SSE_COPY_PS(XMM1,XMM6)
2313           SSE_SHUFFLE(XMM1,XMM1,0x55)
2314           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2315           SSE_SUB_PS(XMM7,XMM1)
2316 
2317           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2318 
2319           /* Third Column */
2320           SSE_COPY_PS(XMM2,XMM6)
2321           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2322           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2323           SSE_SUB_PS(XMM7,XMM2)
2324 
2325           /* Fourth Column */
2326           SSE_COPY_PS(XMM3,XMM6)
2327           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2328           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2329           SSE_SUB_PS(XMM7,XMM3)
2330         SSE_INLINE_END_2
2331 
2332         v  += 16;
2333       }
2334       v    =  aa + 16*ai[++i];
2335       PREFETCH_NTA(v);
2336       STORE_PS(&t[idx],XMM7);
2337     }
2338 
2339     /* Backward solve the upper triangular factor.*/
2340 
2341     idt  = 4*(n-1);
2342     ai16 = 16*diag[n-1];
2343     v    = aa + ai16 + 16;
2344     for (i=n-1; i>=0;){
2345       PREFETCH_NTA(&v[8]);
2346       vi = aj + diag[i] + 1;
2347       nz = ai[i+1] - diag[i] - 1;
2348 
2349       LOAD_PS(&t[idt],XMM7);
2350 
2351       while (nz--) {
2352         PREFETCH_NTA(&v[16]);
2353         idx = 4*((unsigned int)(*vi++));
2354 
2355         /* 4x4 Matrix-Vector Product with negative accumulation: */
2356         SSE_INLINE_BEGIN_2(&t[idx],v)
2357           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2358 
2359           /* First Column */
2360           SSE_COPY_PS(XMM0,XMM6)
2361           SSE_SHUFFLE(XMM0,XMM0,0x00)
2362           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2363           SSE_SUB_PS(XMM7,XMM0)
2364 
2365           /* Second Column */
2366           SSE_COPY_PS(XMM1,XMM6)
2367           SSE_SHUFFLE(XMM1,XMM1,0x55)
2368           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2369           SSE_SUB_PS(XMM7,XMM1)
2370 
2371           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2372 
2373           /* Third Column */
2374           SSE_COPY_PS(XMM2,XMM6)
2375           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2376           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2377           SSE_SUB_PS(XMM7,XMM2)
2378 
2379           /* Fourth Column */
2380           SSE_COPY_PS(XMM3,XMM6)
2381           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2382           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2383           SSE_SUB_PS(XMM7,XMM3)
2384         SSE_INLINE_END_2
2385         v  += 16;
2386       }
2387       v    = aa + ai16;
2388       ai16 = 16*diag[--i];
2389       PREFETCH_NTA(aa+ai16+16);
2390       /*
2391          Scale the result by the diagonal 4x4 block,
2392          which was inverted as part of the factorization
2393       */
2394       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2395         /* First Column */
2396         SSE_COPY_PS(XMM0,XMM7)
2397         SSE_SHUFFLE(XMM0,XMM0,0x00)
2398         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2399 
2400         /* Second Column */
2401         SSE_COPY_PS(XMM1,XMM7)
2402         SSE_SHUFFLE(XMM1,XMM1,0x55)
2403         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2404         SSE_ADD_PS(XMM0,XMM1)
2405 
2406         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2407 
2408         /* Third Column */
2409         SSE_COPY_PS(XMM2,XMM7)
2410         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2411         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2412         SSE_ADD_PS(XMM0,XMM2)
2413 
2414         /* Fourth Column */
2415         SSE_COPY_PS(XMM3,XMM7)
2416         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2417         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2418         SSE_ADD_PS(XMM0,XMM3)
2419 
2420         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2421       SSE_INLINE_END_3
2422 
2423       v    = aa + ai16 + 16;
2424       idt -= 4;
2425     }
2426 
2427     /* Convert t from single precision back to double precision (inplace)*/
2428     idt = 4*(n-1);
2429     for (i=n-1;i>=0;i--) {
2430       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2431       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2432       PetscScalar *xtemp=&x[idt];
2433       MatScalar   *ttemp=&t[idt];
2434       xtemp[3] = (PetscScalar)ttemp[3];
2435       xtemp[2] = (PetscScalar)ttemp[2];
2436       xtemp[1] = (PetscScalar)ttemp[1];
2437       xtemp[0] = (PetscScalar)ttemp[0];
2438       idt -= 4;
2439     }
2440 
2441   } /* End of artificial scope. */
2442   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2444   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2445   SSE_SCOPE_END;
2446   PetscFunctionReturn(0);
2447 }
2448 
2449 #undef __FUNCT__
2450 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2451 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2452 {
2453   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2454   int            *aj=a->j;
2455   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2456   MatScalar      *aa=a->a;
2457   PetscScalar    *x,*b;
2458 
2459   PetscFunctionBegin;
2460   SSE_SCOPE_BEGIN;
2461   /*
2462      Note: This code currently uses demotion of double
2463      to float when performing the mixed-mode computation.
2464      This may not be numerically reasonable for all applications.
2465   */
2466   PREFETCH_NTA(aa+16*ai[1]);
2467 
2468   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2469   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2470   {
2471     /* x will first be computed in single precision then promoted inplace to double */
2472     MatScalar *v,*t=(MatScalar *)x;
2473     int       nz,i,idt,ai16;
2474     int       jdx,idx;
2475     int       *vi;
2476     /* Forward solve the lower triangular factor. */
2477 
2478     /* First block is the identity. */
2479     idx  = 0;
2480     CONVERT_DOUBLE4_FLOAT4(t,b);
2481     v    =  aa + 16*ai[1];
2482 
2483     for (i=1; i<n;) {
2484       PREFETCH_NTA(&v[8]);
2485       vi   =  aj      + ai[i];
2486       nz   =  diag[i] - ai[i];
2487       idx +=  4;
2488 
2489       /* Demote RHS from double to float. */
2490       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2491       LOAD_PS(&t[idx],XMM7);
2492 
2493       while (nz--) {
2494         PREFETCH_NTA(&v[16]);
2495         jdx = 4*(*vi++);
2496 /*          jdx = *vi++; */
2497 
2498         /* 4x4 Matrix-Vector product with negative accumulation: */
2499         SSE_INLINE_BEGIN_2(&t[jdx],v)
2500           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2501 
2502           /* First Column */
2503           SSE_COPY_PS(XMM0,XMM6)
2504           SSE_SHUFFLE(XMM0,XMM0,0x00)
2505           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2506           SSE_SUB_PS(XMM7,XMM0)
2507 
2508           /* Second Column */
2509           SSE_COPY_PS(XMM1,XMM6)
2510           SSE_SHUFFLE(XMM1,XMM1,0x55)
2511           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2512           SSE_SUB_PS(XMM7,XMM1)
2513 
2514           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2515 
2516           /* Third Column */
2517           SSE_COPY_PS(XMM2,XMM6)
2518           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2519           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2520           SSE_SUB_PS(XMM7,XMM2)
2521 
2522           /* Fourth Column */
2523           SSE_COPY_PS(XMM3,XMM6)
2524           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2525           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2526           SSE_SUB_PS(XMM7,XMM3)
2527         SSE_INLINE_END_2
2528 
2529         v  += 16;
2530       }
2531       v    =  aa + 16*ai[++i];
2532       PREFETCH_NTA(v);
2533       STORE_PS(&t[idx],XMM7);
2534     }
2535 
2536     /* Backward solve the upper triangular factor.*/
2537 
2538     idt  = 4*(n-1);
2539     ai16 = 16*diag[n-1];
2540     v    = aa + ai16 + 16;
2541     for (i=n-1; i>=0;){
2542       PREFETCH_NTA(&v[8]);
2543       vi = aj + diag[i] + 1;
2544       nz = ai[i+1] - diag[i] - 1;
2545 
2546       LOAD_PS(&t[idt],XMM7);
2547 
2548       while (nz--) {
2549         PREFETCH_NTA(&v[16]);
2550         idx = 4*(*vi++);
2551 /*          idx = *vi++; */
2552 
2553         /* 4x4 Matrix-Vector Product with negative accumulation: */
2554         SSE_INLINE_BEGIN_2(&t[idx],v)
2555           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2556 
2557           /* First Column */
2558           SSE_COPY_PS(XMM0,XMM6)
2559           SSE_SHUFFLE(XMM0,XMM0,0x00)
2560           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2561           SSE_SUB_PS(XMM7,XMM0)
2562 
2563           /* Second Column */
2564           SSE_COPY_PS(XMM1,XMM6)
2565           SSE_SHUFFLE(XMM1,XMM1,0x55)
2566           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2567           SSE_SUB_PS(XMM7,XMM1)
2568 
2569           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2570 
2571           /* Third Column */
2572           SSE_COPY_PS(XMM2,XMM6)
2573           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2574           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2575           SSE_SUB_PS(XMM7,XMM2)
2576 
2577           /* Fourth Column */
2578           SSE_COPY_PS(XMM3,XMM6)
2579           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2580           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2581           SSE_SUB_PS(XMM7,XMM3)
2582         SSE_INLINE_END_2
2583         v  += 16;
2584       }
2585       v    = aa + ai16;
2586       ai16 = 16*diag[--i];
2587       PREFETCH_NTA(aa+ai16+16);
2588       /*
2589          Scale the result by the diagonal 4x4 block,
2590          which was inverted as part of the factorization
2591       */
2592       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2593         /* First Column */
2594         SSE_COPY_PS(XMM0,XMM7)
2595         SSE_SHUFFLE(XMM0,XMM0,0x00)
2596         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2597 
2598         /* Second Column */
2599         SSE_COPY_PS(XMM1,XMM7)
2600         SSE_SHUFFLE(XMM1,XMM1,0x55)
2601         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2602         SSE_ADD_PS(XMM0,XMM1)
2603 
2604         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2605 
2606         /* Third Column */
2607         SSE_COPY_PS(XMM2,XMM7)
2608         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2609         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2610         SSE_ADD_PS(XMM0,XMM2)
2611 
2612         /* Fourth Column */
2613         SSE_COPY_PS(XMM3,XMM7)
2614         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2615         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2616         SSE_ADD_PS(XMM0,XMM3)
2617 
2618         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2619       SSE_INLINE_END_3
2620 
2621       v    = aa + ai16 + 16;
2622       idt -= 4;
2623     }
2624 
2625     /* Convert t from single precision back to double precision (inplace)*/
2626     idt = 4*(n-1);
2627     for (i=n-1;i>=0;i--) {
2628       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2629       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2630       PetscScalar *xtemp=&x[idt];
2631       MatScalar   *ttemp=&t[idt];
2632       xtemp[3] = (PetscScalar)ttemp[3];
2633       xtemp[2] = (PetscScalar)ttemp[2];
2634       xtemp[1] = (PetscScalar)ttemp[1];
2635       xtemp[0] = (PetscScalar)ttemp[0];
2636       idt -= 4;
2637     }
2638 
2639   } /* End of artificial scope. */
2640   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2641   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2642   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2643   SSE_SCOPE_END;
2644   PetscFunctionReturn(0);
2645 }
2646 
2647 #endif
2648 #undef __FUNCT__
2649 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2650 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2651 {
2652   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2653   IS              iscol=a->col,isrow=a->row;
2654   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2655   int             *diag = a->diag;
2656   MatScalar       *aa=a->a,*v;
2657   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
2658 
2659   PetscFunctionBegin;
2660   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2661   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2662   t  = a->solve_work;
2663 
2664   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2665   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2666 
2667   /* forward solve the lower triangular */
2668   idx    = 3*(*r++);
2669   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2670   for (i=1; i<n; i++) {
2671     v     = aa + 9*ai[i];
2672     vi    = aj + ai[i];
2673     nz    = diag[i] - ai[i];
2674     idx   = 3*(*r++);
2675     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2676     while (nz--) {
2677       idx   = 3*(*vi++);
2678       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2679       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2680       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2681       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2682       v += 9;
2683     }
2684     idx = 3*i;
2685     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2686   }
2687   /* backward solve the upper triangular */
2688   for (i=n-1; i>=0; i--){
2689     v    = aa + 9*diag[i] + 9;
2690     vi   = aj + diag[i] + 1;
2691     nz   = ai[i+1] - diag[i] - 1;
2692     idt  = 3*i;
2693     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2694     while (nz--) {
2695       idx   = 3*(*vi++);
2696       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2697       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2698       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2699       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2700       v += 9;
2701     }
2702     idc = 3*(*c--);
2703     v   = aa + 9*diag[i];
2704     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2705     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2706     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2707   }
2708   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2709   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2710   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2711   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2712   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2713   PetscFunctionReturn(0);
2714 }
2715 
2716 /*
2717       Special case where the matrix was ILU(0) factored in the natural
2718    ordering. This eliminates the need for the column and row permutation.
2719 */
2720 #undef __FUNCT__
2721 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2722 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2723 {
2724   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2725   int             n=a->mbs,*ai=a->i,*aj=a->j;
2726   int             ierr,*diag = a->diag;
2727   MatScalar       *aa=a->a,*v;
2728   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2729   int             jdx,idt,idx,nz,*vi,i;
2730 
2731   PetscFunctionBegin;
2732   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2733   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2734 
2735 
2736   /* forward solve the lower triangular */
2737   idx    = 0;
2738   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2739   for (i=1; i<n; i++) {
2740     v     =  aa      + 9*ai[i];
2741     vi    =  aj      + ai[i];
2742     nz    =  diag[i] - ai[i];
2743     idx   +=  3;
2744     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2745     while (nz--) {
2746       jdx   = 3*(*vi++);
2747       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2748       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2749       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2750       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2751       v    += 9;
2752     }
2753     x[idx]   = s1;
2754     x[1+idx] = s2;
2755     x[2+idx] = s3;
2756   }
2757   /* backward solve the upper triangular */
2758   for (i=n-1; i>=0; i--){
2759     v    = aa + 9*diag[i] + 9;
2760     vi   = aj + diag[i] + 1;
2761     nz   = ai[i+1] - diag[i] - 1;
2762     idt  = 3*i;
2763     s1 = x[idt];  s2 = x[1+idt];
2764     s3 = x[2+idt];
2765     while (nz--) {
2766       idx   = 3*(*vi++);
2767       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2768       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2769       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2770       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2771       v    += 9;
2772     }
2773     v        = aa +  9*diag[i];
2774     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2775     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2776     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2777   }
2778 
2779   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2780   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2781   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2782   PetscFunctionReturn(0);
2783 }
2784 
2785 #undef __FUNCT__
2786 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2787 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2788 {
2789   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2790   IS              iscol=a->col,isrow=a->row;
2791   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2792   int             *diag = a->diag;
2793   MatScalar       *aa=a->a,*v;
2794   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
2795 
2796   PetscFunctionBegin;
2797   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2799   t  = a->solve_work;
2800 
2801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2803 
2804   /* forward solve the lower triangular */
2805   idx    = 2*(*r++);
2806   t[0] = b[idx]; t[1] = b[1+idx];
2807   for (i=1; i<n; i++) {
2808     v     = aa + 4*ai[i];
2809     vi    = aj + ai[i];
2810     nz    = diag[i] - ai[i];
2811     idx   = 2*(*r++);
2812     s1  = b[idx]; s2 = b[1+idx];
2813     while (nz--) {
2814       idx   = 2*(*vi++);
2815       x1    = t[idx]; x2 = t[1+idx];
2816       s1 -= v[0]*x1 + v[2]*x2;
2817       s2 -= v[1]*x1 + v[3]*x2;
2818       v += 4;
2819     }
2820     idx = 2*i;
2821     t[idx] = s1; t[1+idx] = s2;
2822   }
2823   /* backward solve the upper triangular */
2824   for (i=n-1; i>=0; i--){
2825     v    = aa + 4*diag[i] + 4;
2826     vi   = aj + diag[i] + 1;
2827     nz   = ai[i+1] - diag[i] - 1;
2828     idt  = 2*i;
2829     s1 = t[idt]; s2 = t[1+idt];
2830     while (nz--) {
2831       idx   = 2*(*vi++);
2832       x1    = t[idx]; x2 = t[1+idx];
2833       s1 -= v[0]*x1 + v[2]*x2;
2834       s2 -= v[1]*x1 + v[3]*x2;
2835       v += 4;
2836     }
2837     idc = 2*(*c--);
2838     v   = aa + 4*diag[i];
2839     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2840     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2841   }
2842   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2843   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2844   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2845   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2846   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2847   PetscFunctionReturn(0);
2848 }
2849 
2850 /*
2851       Special case where the matrix was ILU(0) factored in the natural
2852    ordering. This eliminates the need for the column and row permutation.
2853 */
2854 #undef __FUNCT__
2855 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2856 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2857 {
2858   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2859   int             n=a->mbs,*ai=a->i,*aj=a->j;
2860   int             ierr,*diag = a->diag;
2861   MatScalar       *aa=a->a,*v;
2862   PetscScalar     *x,*b,s1,s2,x1,x2;
2863   int             jdx,idt,idx,nz,*vi,i;
2864 
2865   PetscFunctionBegin;
2866   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2867   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2868 
2869   /* forward solve the lower triangular */
2870   idx    = 0;
2871   x[0]   = b[0]; x[1] = b[1];
2872   for (i=1; i<n; i++) {
2873     v     =  aa      + 4*ai[i];
2874     vi    =  aj      + ai[i];
2875     nz    =  diag[i] - ai[i];
2876     idx   +=  2;
2877     s1  =  b[idx];s2 = b[1+idx];
2878     while (nz--) {
2879       jdx   = 2*(*vi++);
2880       x1    = x[jdx];x2 = x[1+jdx];
2881       s1 -= v[0]*x1 + v[2]*x2;
2882       s2 -= v[1]*x1 + v[3]*x2;
2883       v    += 4;
2884     }
2885     x[idx]   = s1;
2886     x[1+idx] = s2;
2887   }
2888   /* backward solve the upper triangular */
2889   for (i=n-1; i>=0; i--){
2890     v    = aa + 4*diag[i] + 4;
2891     vi   = aj + diag[i] + 1;
2892     nz   = ai[i+1] - diag[i] - 1;
2893     idt  = 2*i;
2894     s1 = x[idt];  s2 = x[1+idt];
2895     while (nz--) {
2896       idx   = 2*(*vi++);
2897       x1    = x[idx];   x2 = x[1+idx];
2898       s1 -= v[0]*x1 + v[2]*x2;
2899       s2 -= v[1]*x1 + v[3]*x2;
2900       v    += 4;
2901     }
2902     v        = aa +  4*diag[i];
2903     x[idt]   = v[0]*s1 + v[2]*s2;
2904     x[1+idt] = v[1]*s1 + v[3]*s2;
2905   }
2906 
2907   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2908   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2909   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2910   PetscFunctionReturn(0);
2911 }
2912 
2913 #undef __FUNCT__
2914 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2915 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2916 {
2917   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2918   IS              iscol=a->col,isrow=a->row;
2919   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2920   int             *diag = a->diag;
2921   MatScalar       *aa=a->a,*v;
2922   PetscScalar     *x,*b,s1,*t;
2923 
2924   PetscFunctionBegin;
2925   if (!n) PetscFunctionReturn(0);
2926 
2927   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2928   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2929   t  = a->solve_work;
2930 
2931   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2932   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2933 
2934   /* forward solve the lower triangular */
2935   t[0] = b[*r++];
2936   for (i=1; i<n; i++) {
2937     v     = aa + ai[i];
2938     vi    = aj + ai[i];
2939     nz    = diag[i] - ai[i];
2940     s1  = b[*r++];
2941     while (nz--) {
2942       s1 -= (*v++)*t[*vi++];
2943     }
2944     t[i] = s1;
2945   }
2946   /* backward solve the upper triangular */
2947   for (i=n-1; i>=0; i--){
2948     v    = aa + diag[i] + 1;
2949     vi   = aj + diag[i] + 1;
2950     nz   = ai[i+1] - diag[i] - 1;
2951     s1 = t[i];
2952     while (nz--) {
2953       s1 -= (*v++)*t[*vi++];
2954     }
2955     x[*c--] = t[i] = aa[diag[i]]*s1;
2956   }
2957 
2958   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2959   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2960   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2961   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2962   PetscLogFlops(2*1*(a->nz) - A->n);
2963   PetscFunctionReturn(0);
2964 }
2965 /*
2966       Special case where the matrix was ILU(0) factored in the natural
2967    ordering. This eliminates the need for the column and row permutation.
2968 */
2969 #undef __FUNCT__
2970 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2971 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2972 {
2973   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2974   int             n=a->mbs,*ai=a->i,*aj=a->j;
2975   int             ierr,*diag = a->diag;
2976   MatScalar       *aa=a->a;
2977   PetscScalar     *x,*b;
2978   PetscScalar     s1,x1;
2979   MatScalar       *v;
2980   int             jdx,idt,idx,nz,*vi,i;
2981 
2982   PetscFunctionBegin;
2983   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2984   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2985 
2986   /* forward solve the lower triangular */
2987   idx    = 0;
2988   x[0]   = b[0];
2989   for (i=1; i<n; i++) {
2990     v     =  aa      + ai[i];
2991     vi    =  aj      + ai[i];
2992     nz    =  diag[i] - ai[i];
2993     idx   +=  1;
2994     s1  =  b[idx];
2995     while (nz--) {
2996       jdx   = *vi++;
2997       x1    = x[jdx];
2998       s1 -= v[0]*x1;
2999       v    += 1;
3000     }
3001     x[idx]   = s1;
3002   }
3003   /* backward solve the upper triangular */
3004   for (i=n-1; i>=0; i--){
3005     v    = aa + diag[i] + 1;
3006     vi   = aj + diag[i] + 1;
3007     nz   = ai[i+1] - diag[i] - 1;
3008     idt  = i;
3009     s1 = x[idt];
3010     while (nz--) {
3011       idx   = *vi++;
3012       x1    = x[idx];
3013       s1 -= v[0]*x1;
3014       v    += 1;
3015     }
3016     v        = aa +  diag[i];
3017     x[idt]   = v[0]*s1;
3018   }
3019   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3020   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3021   PetscLogFlops(2*(a->nz) - A->n);
3022   PetscFunctionReturn(0);
3023 }
3024 
3025 /* ----------------------------------------------------------------*/
3026 /*
3027      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3028    except that the data structure of Mat_SeqAIJ is slightly different.
3029    Not a good example of code reuse.
3030 */
3031 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
3032 
3033 #undef __FUNCT__
3034 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3035 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3036 {
3037   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3038   IS          isicol;
3039   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3040   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3041   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3042   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
3043   PetscTruth  col_identity,row_identity;
3044   PetscReal   f;
3045 
3046   PetscFunctionBegin;
3047   f             = info->fill;
3048   levels        = (int)info->levels;
3049   diagonal_fill = (int)info->diagonal_fill;
3050   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3051   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3052   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3053 
3054   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3055     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3056     (*fact)->factor = FACTOR_LU;
3057     b               = (Mat_SeqBAIJ*)(*fact)->data;
3058     if (!b->diag) {
3059       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3060     }
3061     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3062     b->row        = isrow;
3063     b->col        = iscol;
3064     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3065     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3066     b->icol       = isicol;
3067     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3068     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3069   } else { /* general case perform the symbolic factorization */
3070     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3071     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3072 
3073     /* get new row pointers */
3074     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
3075     ainew[0] = 0;
3076     /* don't know how many column pointers are needed so estimate */
3077     jmax = (int)(f*ai[n] + 1);
3078     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
3079     /* ajfill is level of fill for each fill entry */
3080     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
3081     /* fill is a linked list of nonzeros in active row */
3082     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
3083     /* im is level for each filled value */
3084     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
3085     /* dloc is location of diagonal in factor */
3086     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
3087     dloc[0]  = 0;
3088     for (prow=0; prow<n; prow++) {
3089 
3090       /* copy prow into linked list */
3091       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3092       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3093       xi         = aj + ai[r[prow]];
3094       fill[n]    = n;
3095       fill[prow] = -1; /* marker for diagonal entry */
3096       while (nz--) {
3097 	fm  = n;
3098 	idx = ic[*xi++];
3099 	do {
3100 	  m  = fm;
3101 	  fm = fill[m];
3102 	} while (fm < idx);
3103 	fill[m]   = idx;
3104 	fill[idx] = fm;
3105 	im[idx]   = 0;
3106       }
3107 
3108       /* make sure diagonal entry is included */
3109       if (diagonal_fill && fill[prow] == -1) {
3110 	fm = n;
3111 	while (fill[fm] < prow) fm = fill[fm];
3112 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3113 	fill[fm]   = prow;
3114 	im[prow]   = 0;
3115 	nzf++;
3116 	dcount++;
3117       }
3118 
3119       nzi = 0;
3120       row = fill[n];
3121       while (row < prow) {
3122 	incrlev = im[row] + 1;
3123 	nz      = dloc[row];
3124 	xi      = ajnew  + ainew[row] + nz + 1;
3125 	flev    = ajfill + ainew[row] + nz + 1;
3126 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3127 	fm      = row;
3128 	while (nnz-- > 0) {
3129 	  idx = *xi++;
3130 	  if (*flev + incrlev > levels) {
3131 	    flev++;
3132 	    continue;
3133 	  }
3134 	  do {
3135 	    m  = fm;
3136 	    fm = fill[m];
3137 	  } while (fm < idx);
3138 	  if (fm != idx) {
3139 	    im[idx]   = *flev + incrlev;
3140 	    fill[m]   = idx;
3141 	    fill[idx] = fm;
3142 	    fm        = idx;
3143 	    nzf++;
3144 	  } else {
3145 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3146 	  }
3147 	  flev++;
3148 	}
3149 	row = fill[row];
3150 	nzi++;
3151       }
3152       /* copy new filled row into permanent storage */
3153       ainew[prow+1] = ainew[prow] + nzf;
3154       if (ainew[prow+1] > jmax) {
3155 
3156 	/* estimate how much additional space we will need */
3157 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3158 	/* just double the memory each time */
3159 	int maxadd = jmax;
3160 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3161 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3162 	jmax += maxadd;
3163 
3164 	/* allocate a longer ajnew and ajfill */
3165 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3166 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3167 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3168 	ajnew = xi;
3169 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3170 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3171 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3172 	ajfill = xi;
3173 	reallocate++; /* count how many reallocations are needed */
3174       }
3175       xi          = ajnew + ainew[prow];
3176       flev        = ajfill + ainew[prow];
3177       dloc[prow]  = nzi;
3178       fm          = fill[n];
3179       while (nzf--) {
3180 	*xi++   = fm;
3181 	*flev++ = im[fm];
3182 	fm      = fill[fm];
3183       }
3184       /* make sure row has diagonal entry */
3185       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3186 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3187     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3188       }
3189     }
3190     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3191     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3192     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3193     ierr = PetscFree(fill);CHKERRQ(ierr);
3194     ierr = PetscFree(im);CHKERRQ(ierr);
3195 
3196     {
3197       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3198       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3199       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3200       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3201       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3202       if (diagonal_fill) {
3203 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3204       }
3205     }
3206 
3207     /* put together the new matrix */
3208     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
3209     PetscLogObjectParent(*fact,isicol);
3210     b = (Mat_SeqBAIJ*)(*fact)->data;
3211     ierr = PetscFree(b->imax);CHKERRQ(ierr);
3212     b->singlemalloc = PETSC_FALSE;
3213     len = bs2*ainew[n]*sizeof(MatScalar);
3214     /* the next line frees the default space generated by the Create() */
3215     ierr = PetscFree(b->a);CHKERRQ(ierr);
3216     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
3217     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
3218     b->j          = ajnew;
3219     b->i          = ainew;
3220     for (i=0; i<n; i++) dloc[i] += ainew[i];
3221     b->diag       = dloc;
3222     b->ilen       = 0;
3223     b->imax       = 0;
3224     b->row        = isrow;
3225     b->col        = iscol;
3226     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3227     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3228     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3229     b->icol       = isicol;
3230     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3231     /* In b structure:  Free imax, ilen, old a, old j.
3232        Allocate dloc, solve_work, new a, new j */
3233     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3234     b->maxnz          = b->nz = ainew[n];
3235     (*fact)->factor   = FACTOR_LU;
3236 
3237     (*fact)->info.factor_mallocs    = reallocate;
3238     (*fact)->info.fill_ratio_given  = f;
3239     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3240   }
3241 
3242   if (row_identity && col_identity) {
3243     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
3244   }
3245   PetscFunctionReturn(0);
3246 }
3247 
3248 #undef __FUNCT__
3249 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3250 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3251 {
3252   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3253   /* int i,*AJ=a->j,nz=a->nz; */
3254   PetscFunctionBegin;
3255   /* Undo Column scaling */
3256 /*    while (nz--) { */
3257 /*      AJ[i] = AJ[i]/4; */
3258 /*    } */
3259   /* This should really invoke a push/pop logic, but we don't have that yet. */
3260   A->ops->setunfactored = PETSC_NULL;
3261   PetscFunctionReturn(0);
3262 }
3263 
3264 #undef __FUNCT__
3265 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3266 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3267 {
3268   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3269   int *AJ=a->j,nz=a->nz;
3270   unsigned short *aj=(unsigned short *)AJ;
3271   PetscFunctionBegin;
3272   /* Is this really necessary? */
3273   while (nz--) {
3274     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3275   }
3276   A->ops->setunfactored = PETSC_NULL;
3277   PetscFunctionReturn(0);
3278 }
3279 
3280 #undef __FUNCT__
3281 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3282 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3283 {
3284   /*
3285       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3286       with natural ordering
3287   */
3288   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3289 
3290   PetscFunctionBegin;
3291   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3292   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3293   switch (a->bs) {
3294   case 1:
3295     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3296     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3297     break;
3298   case 2:
3299     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3300     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3301     break;
3302   case 3:
3303     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3304     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3305     break;
3306   case 4:
3307 #if defined(PETSC_USE_MAT_SINGLE)
3308     {
3309       PetscTruth  sse_enabled_local;
3310       int         ierr;
3311       ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3312       if (sse_enabled_local) {
3313 #  if defined(PETSC_HAVE_SSE)
3314         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3315         if (n==(unsigned short)n) {
3316           unsigned short *aj=(unsigned short *)AJ;
3317           for (i=0;i<nz;i++) {
3318             aj[i] = (unsigned short)AJ[i];
3319           }
3320           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3321           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3322           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3323         } else {
3324         /* Scale the column indices for easier indexing in MatSolve. */
3325 /*            for (i=0;i<nz;i++) { */
3326 /*              AJ[i] = AJ[i]*4; */
3327 /*            } */
3328           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3329           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3330           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3331         }
3332 #  else
3333       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3334         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3335 #  endif
3336       } else {
3337         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3338         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3339       }
3340     }
3341 #else
3342     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3343     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3344 #endif
3345     break;
3346   case 5:
3347     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3348     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3349     break;
3350   case 6:
3351     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3352     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3353     break;
3354   case 7:
3355     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3356     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3357     break;
3358   }
3359   PetscFunctionReturn(0);
3360 }
3361 
3362 #undef __FUNCT__
3363 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3364 int MatSeqBAIJ_UpdateSolvers(Mat A)
3365 {
3366   /*
3367       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3368       with natural ordering
3369   */
3370   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3371   IS          row = a->row, col = a->col;
3372   PetscTruth  row_identity, col_identity;
3373   PetscTruth  use_natural;
3374   int         ierr;
3375 
3376   PetscFunctionBegin;
3377 
3378   use_natural = PETSC_FALSE;
3379   if (row && col) {
3380     ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3381     ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3382 
3383     if (row_identity && col_identity) {
3384       use_natural = PETSC_TRUE;
3385     }
3386   } else {
3387     use_natural = PETSC_TRUE;
3388   }
3389 
3390   switch (a->bs) {
3391   case 1:
3392     if (use_natural) {
3393       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3394       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3395       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3396       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3397     } else {
3398       A->ops->solve           = MatSolve_SeqBAIJ_1;
3399       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3400     }
3401     break;
3402   case 2:
3403     if (use_natural) {
3404       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3405       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3406       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3407       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3408     } else {
3409       A->ops->solve           = MatSolve_SeqBAIJ_2;
3410       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3411     }
3412     break;
3413   case 3:
3414     if (use_natural) {
3415       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3416       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3417       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3418       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3419     } else {
3420       A->ops->solve           = MatSolve_SeqBAIJ_3;
3421       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3422     }
3423     break;
3424   case 4:
3425     {
3426       PetscTruth sse_enabled_local;
3427       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3428       if (use_natural) {
3429 #if defined(PETSC_USE_MAT_SINGLE)
3430         if (sse_enabled_local) { /* Natural + Single + SSE */
3431 #  if defined(PETSC_HAVE_SSE)
3432           int n=a->mbs;
3433           if (n==(unsigned short)n) {
3434             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3435             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3436           } else {
3437             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3438             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3439           }
3440 #  else
3441           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3442           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3443 #  endif
3444         } else { /* Natural + Single */
3445           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3446           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3447         }
3448 #else
3449         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3450         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3451 #endif
3452         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3453         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3454       } else { /* Arbitrary ordering */
3455 #if defined(PETSC_USE_MAT_SINGLE)
3456         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3457 #  if defined(PETSC_HAVE_SSE)
3458           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3459           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3460 #  else
3461           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3462           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3463 #  endif
3464         } else { /* Arbitrary + Single */
3465           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3466           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3467         }
3468 #else
3469         A->ops->solve           = MatSolve_SeqBAIJ_4;
3470 #endif
3471         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3472       }
3473     }
3474     break;
3475   case 5:
3476     if (use_natural) {
3477       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3478       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3479       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3480       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3481     } else {
3482       A->ops->solve           = MatSolve_SeqBAIJ_5;
3483       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3484     }
3485     break;
3486   case 6:
3487     if (use_natural) {
3488       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3489       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3490       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3491       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3492     } else {
3493       A->ops->solve           = MatSolve_SeqBAIJ_6;
3494       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3495     }
3496     break;
3497   case 7:
3498     if (use_natural) {
3499       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3500       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3501       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3502       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3503     } else {
3504       A->ops->solve           = MatSolve_SeqBAIJ_7;
3505       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3506     }
3507     break;
3508   default:
3509     A->ops->solve             = MatSolve_SeqBAIJ_N;
3510     break;
3511   }
3512   PetscFunctionReturn(0);
3513 }
3514 
3515 #undef __FUNCT__
3516 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3517 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3518   int ierr;
3519 
3520   PetscFunctionBegin;
3521   ierr = MatSeqBAIJ_UpdateSolvers(A);
3522   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3523     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3524   } else {
3525     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3526   }
3527   PetscFunctionReturn(0);
3528 }
3529 
3530 #undef __FUNCT__
3531 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3532 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3533   int ierr;
3534 
3535   PetscFunctionBegin;
3536   ierr = MatSeqBAIJ_UpdateSolvers(A);
3537   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3538   PetscFunctionReturn(0);
3539 }
3540 
3541 
3542