xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 63bb2aa60f4717cbfb86ebed81540aa885ea575c)
1 /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17   int             *diag = a->diag;
18   MatScalar       *aa=a->a,*v;
19   PetscScalar     s1,*x,*b;
20 
21   PetscFunctionBegin;
22   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
23   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25 
26   /* forward solve the U^T */
27   for (i=0; i<n; i++) {
28 
29     v     = aa + diag[i];
30     /* multiply by the inverse of the block diagonal */
31     s1    = (*v++)*x[i];
32     vi    = aj + diag[i] + 1;
33     nz    = ai[i+1] - diag[i] - 1;
34     while (nz--) {
35       x[*vi++]  -= (*v++)*s1;
36     }
37     x[i]   = s1;
38   }
39   /* backward solve the L^T */
40   for (i=n-1; i>=0; i--){
41     v    = aa + diag[i] - 1;
42     vi   = aj + diag[i] - 1;
43     nz   = diag[i] - ai[i];
44     s1   = x[i];
45     while (nz--) {
46       x[*vi--]   -=  (*v--)*s1;
47     }
48   }
49   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51   PetscLogFlops(2*(a->nz) - A->n);
52   PetscFunctionReturn(0);
53 }
54 
55 #undef __FUNCT__
56 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
57 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58 {
59   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
60   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61   int             *diag = a->diag,oidx;
62   MatScalar       *aa=a->a,*v;
63   PetscScalar     s1,s2,x1,x2;
64   PetscScalar     *x,*b;
65 
66   PetscFunctionBegin;
67   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
68   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
69   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
70 
71   /* forward solve the U^T */
72   idx = 0;
73   for (i=0; i<n; i++) {
74 
75     v     = aa + 4*diag[i];
76     /* multiply by the inverse of the block diagonal */
77     x1 = x[idx];   x2 = x[1+idx];
78     s1 = v[0]*x1  +  v[1]*x2;
79     s2 = v[2]*x1  +  v[3]*x2;
80     v += 4;
81 
82     vi    = aj + diag[i] + 1;
83     nz    = ai[i+1] - diag[i] - 1;
84     while (nz--) {
85       oidx = 2*(*vi++);
86       x[oidx]   -= v[0]*s1  +  v[1]*s2;
87       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
88       v  += 4;
89     }
90     x[idx]   = s1;x[1+idx] = s2;
91     idx += 2;
92   }
93   /* backward solve the L^T */
94   for (i=n-1; i>=0; i--){
95     v    = aa + 4*diag[i] - 4;
96     vi   = aj + diag[i] - 1;
97     nz   = diag[i] - ai[i];
98     idt  = 2*i;
99     s1   = x[idt];  s2 = x[1+idt];
100     while (nz--) {
101       idx   = 2*(*vi--);
102       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104       v -= 4;
105     }
106   }
107   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110   PetscFunctionReturn(0);
111 }
112 
113 #undef __FUNCT__
114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
115 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116 {
117   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
119   int             *diag = a->diag,oidx;
120   MatScalar       *aa=a->a,*v;
121   PetscScalar     s1,s2,s3,x1,x2,x3;
122   PetscScalar     *x,*b;
123 
124   PetscFunctionBegin;
125   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
126   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
127   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
128 
129   /* forward solve the U^T */
130   idx = 0;
131   for (i=0; i<n; i++) {
132 
133     v     = aa + 9*diag[i];
134     /* multiply by the inverse of the block diagonal */
135     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
136     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
137     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
138     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
139     v += 9;
140 
141     vi    = aj + diag[i] + 1;
142     nz    = ai[i+1] - diag[i] - 1;
143     while (nz--) {
144       oidx = 3*(*vi++);
145       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
146       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
147       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
148       v  += 9;
149     }
150     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
151     idx += 3;
152   }
153   /* backward solve the L^T */
154   for (i=n-1; i>=0; i--){
155     v    = aa + 9*diag[i] - 9;
156     vi   = aj + diag[i] - 1;
157     nz   = diag[i] - ai[i];
158     idt  = 3*i;
159     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
160     while (nz--) {
161       idx   = 3*(*vi--);
162       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
163       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
164       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
165       v -= 9;
166     }
167   }
168   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
170   PetscLogFlops(2*9*(a->nz) - 3*A->n);
171   PetscFunctionReturn(0);
172 }
173 
174 #undef __FUNCT__
175 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
176 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
177 {
178   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
179   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180   int             *diag = a->diag,oidx;
181   MatScalar       *aa=a->a,*v;
182   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
183   PetscScalar     *x,*b;
184 
185   PetscFunctionBegin;
186   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
187   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
188   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
189 
190   /* forward solve the U^T */
191   idx = 0;
192   for (i=0; i<n; i++) {
193 
194     v     = aa + 16*diag[i];
195     /* multiply by the inverse of the block diagonal */
196     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
197     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
198     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
199     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
200     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
201     v += 16;
202 
203     vi    = aj + diag[i] + 1;
204     nz    = ai[i+1] - diag[i] - 1;
205     while (nz--) {
206       oidx = 4*(*vi++);
207       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
208       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
209       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
210       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
211       v  += 16;
212     }
213     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
214     idx += 4;
215   }
216   /* backward solve the L^T */
217   for (i=n-1; i>=0; i--){
218     v    = aa + 16*diag[i] - 16;
219     vi   = aj + diag[i] - 1;
220     nz   = diag[i] - ai[i];
221     idt  = 4*i;
222     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
223     while (nz--) {
224       idx   = 4*(*vi--);
225       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
226       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
227       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
228       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
229       v -= 16;
230     }
231   }
232   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   PetscLogFlops(2*16*(a->nz) - 4*A->n);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
240 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
243   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
244   int             *diag = a->diag,oidx;
245   MatScalar       *aa=a->a,*v;
246   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
247   PetscScalar     *x,*b;
248 
249   PetscFunctionBegin;
250   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
251   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
252   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
253 
254   /* forward solve the U^T */
255   idx = 0;
256   for (i=0; i<n; i++) {
257 
258     v     = aa + 25*diag[i];
259     /* multiply by the inverse of the block diagonal */
260     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
261     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
262     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
263     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
264     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
265     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
266     v += 25;
267 
268     vi    = aj + diag[i] + 1;
269     nz    = ai[i+1] - diag[i] - 1;
270     while (nz--) {
271       oidx = 5*(*vi++);
272       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
273       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
274       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
275       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
276       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
277       v  += 25;
278     }
279     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
280     idx += 5;
281   }
282   /* backward solve the L^T */
283   for (i=n-1; i>=0; i--){
284     v    = aa + 25*diag[i] - 25;
285     vi   = aj + diag[i] - 1;
286     nz   = diag[i] - ai[i];
287     idt  = 5*i;
288     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
289     while (nz--) {
290       idx   = 5*(*vi--);
291       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
292       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
293       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
294       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
295       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
296       v -= 25;
297     }
298   }
299   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
300   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
301   PetscLogFlops(2*25*(a->nz) - 5*A->n);
302   PetscFunctionReturn(0);
303 }
304 
305 #undef __FUNCT__
306 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
307 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
308 {
309   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
310   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
311   int             *diag = a->diag,oidx;
312   MatScalar       *aa=a->a,*v;
313   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
314   PetscScalar     *x,*b;
315 
316   PetscFunctionBegin;
317   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
318   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
319   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
320 
321   /* forward solve the U^T */
322   idx = 0;
323   for (i=0; i<n; i++) {
324 
325     v     = aa + 36*diag[i];
326     /* multiply by the inverse of the block diagonal */
327     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
328     x6    = x[5+idx];
329     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
330     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
331     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
332     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
333     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
334     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
335     v += 36;
336 
337     vi    = aj + diag[i] + 1;
338     nz    = ai[i+1] - diag[i] - 1;
339     while (nz--) {
340       oidx = 6*(*vi++);
341       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
342       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
343       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
344       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
345       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
346       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
347       v  += 36;
348     }
349     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
350     x[5+idx] = s6;
351     idx += 6;
352   }
353   /* backward solve the L^T */
354   for (i=n-1; i>=0; i--){
355     v    = aa + 36*diag[i] - 36;
356     vi   = aj + diag[i] - 1;
357     nz   = diag[i] - ai[i];
358     idt  = 6*i;
359     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360     s6 = x[5+idt];
361     while (nz--) {
362       idx   = 6*(*vi--);
363       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
364       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
365       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
366       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
367       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
368       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
369       v -= 36;
370     }
371   }
372   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
373   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
374   PetscLogFlops(2*36*(a->nz) - 6*A->n);
375   PetscFunctionReturn(0);
376 }
377 
378 #undef __FUNCT__
379 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
380 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
381 {
382   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
383   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
384   int             *diag = a->diag,oidx;
385   MatScalar       *aa=a->a,*v;
386   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
387   PetscScalar     *x,*b;
388 
389   PetscFunctionBegin;
390   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
391   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
392   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
393 
394   /* forward solve the U^T */
395   idx = 0;
396   for (i=0; i<n; i++) {
397 
398     v     = aa + 49*diag[i];
399     /* multiply by the inverse of the block diagonal */
400     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
401     x6    = x[5+idx]; x7 = x[6+idx];
402     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
403     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
404     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
405     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
406     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
407     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
408     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
409     v += 49;
410 
411     vi    = aj + diag[i] + 1;
412     nz    = ai[i+1] - diag[i] - 1;
413     while (nz--) {
414       oidx = 7*(*vi++);
415       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
416       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
417       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
418       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
419       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
420       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
421       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
422       v  += 49;
423     }
424     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
425     x[5+idx] = s6;x[6+idx] = s7;
426     idx += 7;
427   }
428   /* backward solve the L^T */
429   for (i=n-1; i>=0; i--){
430     v    = aa + 49*diag[i] - 49;
431     vi   = aj + diag[i] - 1;
432     nz   = diag[i] - ai[i];
433     idt  = 7*i;
434     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
435     s6 = x[5+idt];s7 = x[6+idt];
436     while (nz--) {
437       idx   = 7*(*vi--);
438       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
439       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
440       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
441       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
442       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
443       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
444       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
445       v -= 49;
446     }
447   }
448   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
449   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
450   PetscLogFlops(2*49*(a->nz) - 7*A->n);
451   PetscFunctionReturn(0);
452 }
453 
454 /*---------------------------------------------------------------------------------------------*/
455 #undef __FUNCT__
456 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
457 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
458 {
459   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
460   IS              iscol=a->col,isrow=a->row;
461   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
462   int             *diag = a->diag;
463   MatScalar       *aa=a->a,*v;
464   PetscScalar     s1,*x,*b,*t;
465 
466   PetscFunctionBegin;
467   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
468   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
469   t  = a->solve_work;
470 
471   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
472   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
473 
474   /* copy the b into temp work space according to permutation */
475   for (i=0; i<n; i++) {
476     t[i] = b[c[i]];
477   }
478 
479   /* forward solve the U^T */
480   for (i=0; i<n; i++) {
481 
482     v     = aa + diag[i];
483     /* multiply by the inverse of the block diagonal */
484     s1    = (*v++)*t[i];
485     vi    = aj + diag[i] + 1;
486     nz    = ai[i+1] - diag[i] - 1;
487     while (nz--) {
488       t[*vi++]  -= (*v++)*s1;
489     }
490     t[i]   = s1;
491   }
492   /* backward solve the L^T */
493   for (i=n-1; i>=0; i--){
494     v    = aa + diag[i] - 1;
495     vi   = aj + diag[i] - 1;
496     nz   = diag[i] - ai[i];
497     s1   = t[i];
498     while (nz--) {
499       t[*vi--]   -=  (*v--)*s1;
500     }
501   }
502 
503   /* copy t into x according to permutation */
504   for (i=0; i<n; i++) {
505     x[r[i]]   = t[i];
506   }
507 
508   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
509   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
510   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
512   PetscLogFlops(2*(a->nz) - A->n);
513   PetscFunctionReturn(0);
514 }
515 
516 #undef __FUNCT__
517 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
518 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
519 {
520   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
521   IS              iscol=a->col,isrow=a->row;
522   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
523   int             *diag = a->diag,ii,ic,ir,oidx;
524   MatScalar       *aa=a->a,*v;
525   PetscScalar     s1,s2,x1,x2;
526   PetscScalar     *x,*b,*t;
527 
528   PetscFunctionBegin;
529   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
530   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
531   t  = a->solve_work;
532 
533   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
534   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
535 
536   /* copy the b into temp work space according to permutation */
537   ii = 0;
538   for (i=0; i<n; i++) {
539     ic      = 2*c[i];
540     t[ii]   = b[ic];
541     t[ii+1] = b[ic+1];
542     ii += 2;
543   }
544 
545   /* forward solve the U^T */
546   idx = 0;
547   for (i=0; i<n; i++) {
548 
549     v     = aa + 4*diag[i];
550     /* multiply by the inverse of the block diagonal */
551     x1    = t[idx];   x2 = t[1+idx];
552     s1 = v[0]*x1  +  v[1]*x2;
553     s2 = v[2]*x1  +  v[3]*x2;
554     v += 4;
555 
556     vi    = aj + diag[i] + 1;
557     nz    = ai[i+1] - diag[i] - 1;
558     while (nz--) {
559       oidx = 2*(*vi++);
560       t[oidx]   -= v[0]*s1  +  v[1]*s2;
561       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
562       v  += 4;
563     }
564     t[idx]   = s1;t[1+idx] = s2;
565     idx += 2;
566   }
567   /* backward solve the L^T */
568   for (i=n-1; i>=0; i--){
569     v    = aa + 4*diag[i] - 4;
570     vi   = aj + diag[i] - 1;
571     nz   = diag[i] - ai[i];
572     idt  = 2*i;
573     s1 = t[idt];  s2 = t[1+idt];
574     while (nz--) {
575       idx   = 2*(*vi--);
576       t[idx]   -=  v[0]*s1 +  v[1]*s2;
577       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
578       v -= 4;
579     }
580   }
581 
582   /* copy t into x according to permutation */
583   ii = 0;
584   for (i=0; i<n; i++) {
585     ir      = 2*r[i];
586     x[ir]   = t[ii];
587     x[ir+1] = t[ii+1];
588     ii += 2;
589   }
590 
591   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
592   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
593   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
595   PetscLogFlops(2*4*(a->nz) - 2*A->n);
596   PetscFunctionReturn(0);
597 }
598 
599 #undef __FUNCT__
600 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
601 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
602 {
603   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
604   IS              iscol=a->col,isrow=a->row;
605   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
606   int             *diag = a->diag,ii,ic,ir,oidx;
607   MatScalar       *aa=a->a,*v;
608   PetscScalar     s1,s2,s3,x1,x2,x3;
609   PetscScalar     *x,*b,*t;
610 
611   PetscFunctionBegin;
612   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
613   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
614   t  = a->solve_work;
615 
616   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
617   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
618 
619   /* copy the b into temp work space according to permutation */
620   ii = 0;
621   for (i=0; i<n; i++) {
622     ic      = 3*c[i];
623     t[ii]   = b[ic];
624     t[ii+1] = b[ic+1];
625     t[ii+2] = b[ic+2];
626     ii += 3;
627   }
628 
629   /* forward solve the U^T */
630   idx = 0;
631   for (i=0; i<n; i++) {
632 
633     v     = aa + 9*diag[i];
634     /* multiply by the inverse of the block diagonal */
635     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
636     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
637     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
638     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
639     v += 9;
640 
641     vi    = aj + diag[i] + 1;
642     nz    = ai[i+1] - diag[i] - 1;
643     while (nz--) {
644       oidx = 3*(*vi++);
645       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
646       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
647       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
648       v  += 9;
649     }
650     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
651     idx += 3;
652   }
653   /* backward solve the L^T */
654   for (i=n-1; i>=0; i--){
655     v    = aa + 9*diag[i] - 9;
656     vi   = aj + diag[i] - 1;
657     nz   = diag[i] - ai[i];
658     idt  = 3*i;
659     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
660     while (nz--) {
661       idx   = 3*(*vi--);
662       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
663       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
664       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
665       v -= 9;
666     }
667   }
668 
669   /* copy t into x according to permutation */
670   ii = 0;
671   for (i=0; i<n; i++) {
672     ir      = 3*r[i];
673     x[ir]   = t[ii];
674     x[ir+1] = t[ii+1];
675     x[ir+2] = t[ii+2];
676     ii += 3;
677   }
678 
679   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
680   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
681   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
682   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
683   PetscLogFlops(2*9*(a->nz) - 3*A->n);
684   PetscFunctionReturn(0);
685 }
686 
687 #undef __FUNCT__
688 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
689 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
690 {
691   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
692   IS              iscol=a->col,isrow=a->row;
693   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
694   int             *diag = a->diag,ii,ic,ir,oidx;
695   MatScalar       *aa=a->a,*v;
696   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
697   PetscScalar     *x,*b,*t;
698 
699   PetscFunctionBegin;
700   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
701   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
702   t  = a->solve_work;
703 
704   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
705   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
706 
707   /* copy the b into temp work space according to permutation */
708   ii = 0;
709   for (i=0; i<n; i++) {
710     ic      = 4*c[i];
711     t[ii]   = b[ic];
712     t[ii+1] = b[ic+1];
713     t[ii+2] = b[ic+2];
714     t[ii+3] = b[ic+3];
715     ii += 4;
716   }
717 
718   /* forward solve the U^T */
719   idx = 0;
720   for (i=0; i<n; i++) {
721 
722     v     = aa + 16*diag[i];
723     /* multiply by the inverse of the block diagonal */
724     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
725     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
726     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
727     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
728     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
729     v += 16;
730 
731     vi    = aj + diag[i] + 1;
732     nz    = ai[i+1] - diag[i] - 1;
733     while (nz--) {
734       oidx = 4*(*vi++);
735       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
736       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
737       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
738       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
739       v  += 16;
740     }
741     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
742     idx += 4;
743   }
744   /* backward solve the L^T */
745   for (i=n-1; i>=0; i--){
746     v    = aa + 16*diag[i] - 16;
747     vi   = aj + diag[i] - 1;
748     nz   = diag[i] - ai[i];
749     idt  = 4*i;
750     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
751     while (nz--) {
752       idx   = 4*(*vi--);
753       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
754       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
755       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
756       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
757       v -= 16;
758     }
759   }
760 
761   /* copy t into x according to permutation */
762   ii = 0;
763   for (i=0; i<n; i++) {
764     ir      = 4*r[i];
765     x[ir]   = t[ii];
766     x[ir+1] = t[ii+1];
767     x[ir+2] = t[ii+2];
768     x[ir+3] = t[ii+3];
769     ii += 4;
770   }
771 
772   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
773   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
774   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
775   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
776   PetscLogFlops(2*16*(a->nz) - 4*A->n);
777   PetscFunctionReturn(0);
778 }
779 
780 #undef __FUNCT__
781 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
782 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
783 {
784   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
785   IS              iscol=a->col,isrow=a->row;
786   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
787   int             *diag = a->diag,ii,ic,ir,oidx;
788   MatScalar       *aa=a->a,*v;
789   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
790   PetscScalar     *x,*b,*t;
791 
792   PetscFunctionBegin;
793   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
794   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795   t  = a->solve_work;
796 
797   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
798   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
799 
800   /* copy the b into temp work space according to permutation */
801   ii = 0;
802   for (i=0; i<n; i++) {
803     ic      = 5*c[i];
804     t[ii]   = b[ic];
805     t[ii+1] = b[ic+1];
806     t[ii+2] = b[ic+2];
807     t[ii+3] = b[ic+3];
808     t[ii+4] = b[ic+4];
809     ii += 5;
810   }
811 
812   /* forward solve the U^T */
813   idx = 0;
814   for (i=0; i<n; i++) {
815 
816     v     = aa + 25*diag[i];
817     /* multiply by the inverse of the block diagonal */
818     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
819     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
820     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
821     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
822     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
823     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
824     v += 25;
825 
826     vi    = aj + diag[i] + 1;
827     nz    = ai[i+1] - diag[i] - 1;
828     while (nz--) {
829       oidx = 5*(*vi++);
830       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
831       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
832       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
833       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
834       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
835       v  += 25;
836     }
837     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
838     idx += 5;
839   }
840   /* backward solve the L^T */
841   for (i=n-1; i>=0; i--){
842     v    = aa + 25*diag[i] - 25;
843     vi   = aj + diag[i] - 1;
844     nz   = diag[i] - ai[i];
845     idt  = 5*i;
846     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
847     while (nz--) {
848       idx   = 5*(*vi--);
849       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v -= 25;
855     }
856   }
857 
858   /* copy t into x according to permutation */
859   ii = 0;
860   for (i=0; i<n; i++) {
861     ir      = 5*r[i];
862     x[ir]   = t[ii];
863     x[ir+1] = t[ii+1];
864     x[ir+2] = t[ii+2];
865     x[ir+3] = t[ii+3];
866     x[ir+4] = t[ii+4];
867     ii += 5;
868   }
869 
870   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
871   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
872   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
873   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
874   PetscLogFlops(2*25*(a->nz) - 5*A->n);
875   PetscFunctionReturn(0);
876 }
877 
878 #undef __FUNCT__
879 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
880 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
881 {
882   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
883   IS              iscol=a->col,isrow=a->row;
884   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
885   int             *diag = a->diag,ii,ic,ir,oidx;
886   MatScalar       *aa=a->a,*v;
887   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
888   PetscScalar     *x,*b,*t;
889 
890   PetscFunctionBegin;
891   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
893   t  = a->solve_work;
894 
895   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
896   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
897 
898   /* copy the b into temp work space according to permutation */
899   ii = 0;
900   for (i=0; i<n; i++) {
901     ic      = 6*c[i];
902     t[ii]   = b[ic];
903     t[ii+1] = b[ic+1];
904     t[ii+2] = b[ic+2];
905     t[ii+3] = b[ic+3];
906     t[ii+4] = b[ic+4];
907     t[ii+5] = b[ic+5];
908     ii += 6;
909   }
910 
911   /* forward solve the U^T */
912   idx = 0;
913   for (i=0; i<n; i++) {
914 
915     v     = aa + 36*diag[i];
916     /* multiply by the inverse of the block diagonal */
917     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
918     x6    = t[5+idx];
919     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
920     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
921     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
922     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
923     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
924     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
925     v += 36;
926 
927     vi    = aj + diag[i] + 1;
928     nz    = ai[i+1] - diag[i] - 1;
929     while (nz--) {
930       oidx = 6*(*vi++);
931       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
932       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
933       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
934       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
935       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
936       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
937       v  += 36;
938     }
939     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
940     t[5+idx] = s6;
941     idx += 6;
942   }
943   /* backward solve the L^T */
944   for (i=n-1; i>=0; i--){
945     v    = aa + 36*diag[i] - 36;
946     vi   = aj + diag[i] - 1;
947     nz   = diag[i] - ai[i];
948     idt  = 6*i;
949     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
950     s6 = t[5+idt];
951     while (nz--) {
952       idx   = 6*(*vi--);
953       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
954       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
955       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
956       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
957       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
958       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
959       v -= 36;
960     }
961   }
962 
963   /* copy t into x according to permutation */
964   ii = 0;
965   for (i=0; i<n; i++) {
966     ir      = 6*r[i];
967     x[ir]   = t[ii];
968     x[ir+1] = t[ii+1];
969     x[ir+2] = t[ii+2];
970     x[ir+3] = t[ii+3];
971     x[ir+4] = t[ii+4];
972     x[ir+5] = t[ii+5];
973     ii += 6;
974   }
975 
976   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
977   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
978   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
979   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
980   PetscLogFlops(2*36*(a->nz) - 6*A->n);
981   PetscFunctionReturn(0);
982 }
983 
984 #undef __FUNCT__
985 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
986 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
987 {
988   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
989   IS              iscol=a->col,isrow=a->row;
990   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
991   int             *diag = a->diag,ii,ic,ir,oidx;
992   MatScalar       *aa=a->a,*v;
993   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
994   PetscScalar     *x,*b,*t;
995 
996   PetscFunctionBegin;
997   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
998   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
999   t  = a->solve_work;
1000 
1001   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1002   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1003 
1004   /* copy the b into temp work space according to permutation */
1005   ii = 0;
1006   for (i=0; i<n; i++) {
1007     ic      = 7*c[i];
1008     t[ii]   = b[ic];
1009     t[ii+1] = b[ic+1];
1010     t[ii+2] = b[ic+2];
1011     t[ii+3] = b[ic+3];
1012     t[ii+4] = b[ic+4];
1013     t[ii+5] = b[ic+5];
1014     t[ii+6] = b[ic+6];
1015     ii += 7;
1016   }
1017 
1018   /* forward solve the U^T */
1019   idx = 0;
1020   for (i=0; i<n; i++) {
1021 
1022     v     = aa + 49*diag[i];
1023     /* multiply by the inverse of the block diagonal */
1024     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1025     x6    = t[5+idx]; x7 = t[6+idx];
1026     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1027     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1028     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1029     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1030     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1031     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1032     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1033     v += 49;
1034 
1035     vi    = aj + diag[i] + 1;
1036     nz    = ai[i+1] - diag[i] - 1;
1037     while (nz--) {
1038       oidx = 7*(*vi++);
1039       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1040       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1041       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1042       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1043       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1044       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1045       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1046       v  += 49;
1047     }
1048     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1049     t[5+idx] = s6;t[6+idx] = s7;
1050     idx += 7;
1051   }
1052   /* backward solve the L^T */
1053   for (i=n-1; i>=0; i--){
1054     v    = aa + 49*diag[i] - 49;
1055     vi   = aj + diag[i] - 1;
1056     nz   = diag[i] - ai[i];
1057     idt  = 7*i;
1058     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1059     s6 = t[5+idt];s7 = t[6+idt];
1060     while (nz--) {
1061       idx   = 7*(*vi--);
1062       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v -= 49;
1070     }
1071   }
1072 
1073   /* copy t into x according to permutation */
1074   ii = 0;
1075   for (i=0; i<n; i++) {
1076     ir      = 7*r[i];
1077     x[ir]   = t[ii];
1078     x[ir+1] = t[ii+1];
1079     x[ir+2] = t[ii+2];
1080     x[ir+3] = t[ii+3];
1081     x[ir+4] = t[ii+4];
1082     x[ir+5] = t[ii+5];
1083     x[ir+6] = t[ii+6];
1084     ii += 7;
1085   }
1086 
1087   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1088   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1089   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 /* ----------------------------------------------------------- */
1096 #undef __FUNCT__
1097 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1098 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1099 {
1100   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1101   IS              iscol=a->col,isrow=a->row;
1102   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1103   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1104   MatScalar       *aa=a->a,*v;
1105   PetscScalar     *x,*b,*s,*t,*ls;
1106 
1107   PetscFunctionBegin;
1108   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1109   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1110   t  = a->solve_work;
1111 
1112   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1113   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1114 
1115   /* forward solve the lower triangular */
1116   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1117   for (i=1; i<n; i++) {
1118     v   = aa + bs2*ai[i];
1119     vi  = aj + ai[i];
1120     nz  = a->diag[i] - ai[i];
1121     s = t + bs*i;
1122     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1123     while (nz--) {
1124       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1125       v += bs2;
1126     }
1127   }
1128   /* backward solve the upper triangular */
1129   ls = a->solve_work + A->n;
1130   for (i=n-1; i>=0; i--){
1131     v   = aa + bs2*(a->diag[i] + 1);
1132     vi  = aj + a->diag[i] + 1;
1133     nz  = ai[i+1] - a->diag[i] - 1;
1134     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1135     while (nz--) {
1136       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1137       v += bs2;
1138     }
1139     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1140     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1141   }
1142 
1143   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1144   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1145   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1146   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1147   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1148   PetscFunctionReturn(0);
1149 }
1150 
1151 #undef __FUNCT__
1152 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1153 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1154 {
1155   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1156   IS              iscol=a->col,isrow=a->row;
1157   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1158   int             *diag = a->diag;
1159   MatScalar       *aa=a->a,*v;
1160   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1161   PetscScalar     *x,*b,*t;
1162 
1163   PetscFunctionBegin;
1164   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1165   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1166   t  = a->solve_work;
1167 
1168   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1169   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1170 
1171   /* forward solve the lower triangular */
1172   idx    = 7*(*r++);
1173   t[0] = b[idx];   t[1] = b[1+idx];
1174   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1175   t[5] = b[5+idx]; t[6] = b[6+idx];
1176 
1177   for (i=1; i<n; i++) {
1178     v     = aa + 49*ai[i];
1179     vi    = aj + ai[i];
1180     nz    = diag[i] - ai[i];
1181     idx   = 7*(*r++);
1182     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1183     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1184     while (nz--) {
1185       idx   = 7*(*vi++);
1186       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1187       x4    = t[3+idx];x5 = t[4+idx];
1188       x6    = t[5+idx];x7 = t[6+idx];
1189       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1196       v += 49;
1197     }
1198     idx = 7*i;
1199     t[idx]   = s1;t[1+idx] = s2;
1200     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1201     t[5+idx] = s6;t[6+idx] = s7;
1202   }
1203   /* backward solve the upper triangular */
1204   for (i=n-1; i>=0; i--){
1205     v    = aa + 49*diag[i] + 49;
1206     vi   = aj + diag[i] + 1;
1207     nz   = ai[i+1] - diag[i] - 1;
1208     idt  = 7*i;
1209     s1 = t[idt];  s2 = t[1+idt];
1210     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1211     s6 = t[5+idt];s7 = t[6+idt];
1212     while (nz--) {
1213       idx   = 7*(*vi++);
1214       x1    = t[idx];   x2 = t[1+idx];
1215       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1216       x6    = t[5+idx]; x7 = t[6+idx];
1217       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1218       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1219       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1220       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1221       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1222       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1223       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1224       v += 49;
1225     }
1226     idc = 7*(*c--);
1227     v   = aa + 49*diag[i];
1228     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1229                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1230     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1231                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1232     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1233                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1234     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1235                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1236     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1237                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1238     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1239                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1240     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1241                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1242   }
1243 
1244   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1245   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1246   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1247   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1248   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1249   PetscFunctionReturn(0);
1250 }
1251 
1252 #undef __FUNCT__
1253 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1254 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1255 {
1256   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1257   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1258   int             ierr,*diag = a->diag,jdx;
1259   MatScalar       *aa=a->a,*v;
1260   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1261 
1262   PetscFunctionBegin;
1263   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1264   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1265   /* forward solve the lower triangular */
1266   idx    = 0;
1267   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1268   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1269   x[6] = b[6+idx];
1270   for (i=1; i<n; i++) {
1271     v     =  aa + 49*ai[i];
1272     vi    =  aj + ai[i];
1273     nz    =  diag[i] - ai[i];
1274     idx   =  7*i;
1275     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1276     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1277     s7  =  b[6+idx];
1278     while (nz--) {
1279       jdx   = 7*(*vi++);
1280       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1281       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1282       x7    = x[6+jdx];
1283       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1284       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1285       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1286       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1287       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1288       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1289       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1290       v += 49;
1291      }
1292     x[idx]   = s1;
1293     x[1+idx] = s2;
1294     x[2+idx] = s3;
1295     x[3+idx] = s4;
1296     x[4+idx] = s5;
1297     x[5+idx] = s6;
1298     x[6+idx] = s7;
1299   }
1300   /* backward solve the upper triangular */
1301   for (i=n-1; i>=0; i--){
1302     v    = aa + 49*diag[i] + 49;
1303     vi   = aj + diag[i] + 1;
1304     nz   = ai[i+1] - diag[i] - 1;
1305     idt  = 7*i;
1306     s1 = x[idt];   s2 = x[1+idt];
1307     s3 = x[2+idt]; s4 = x[3+idt];
1308     s5 = x[4+idt]; s6 = x[5+idt];
1309     s7 = x[6+idt];
1310     while (nz--) {
1311       idx   = 7*(*vi++);
1312       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1313       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1314       x7    = x[6+idx];
1315       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1316       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1317       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1318       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1319       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1320       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1321       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1322       v += 49;
1323     }
1324     v        = aa + 49*diag[i];
1325     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1326                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1327     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1328                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1329     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1330                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1331     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1332                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1333     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1334                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1335     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1336                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1337     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1338                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1339   }
1340 
1341   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1342   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1343   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 #undef __FUNCT__
1348 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1349 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1350 {
1351   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1352   IS              iscol=a->col,isrow=a->row;
1353   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1354   int             *diag = a->diag;
1355   MatScalar       *aa=a->a,*v;
1356   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1357 
1358   PetscFunctionBegin;
1359   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1360   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1361   t  = a->solve_work;
1362 
1363   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1364   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1365 
1366   /* forward solve the lower triangular */
1367   idx    = 6*(*r++);
1368   t[0] = b[idx];   t[1] = b[1+idx];
1369   t[2] = b[2+idx]; t[3] = b[3+idx];
1370   t[4] = b[4+idx]; t[5] = b[5+idx];
1371   for (i=1; i<n; i++) {
1372     v     = aa + 36*ai[i];
1373     vi    = aj + ai[i];
1374     nz    = diag[i] - ai[i];
1375     idx   = 6*(*r++);
1376     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1377     s5  = b[4+idx]; s6 = b[5+idx];
1378     while (nz--) {
1379       idx   = 6*(*vi++);
1380       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1381       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1382       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1383       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1384       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1385       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1386       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1387       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1388       v += 36;
1389     }
1390     idx = 6*i;
1391     t[idx]   = s1;t[1+idx] = s2;
1392     t[2+idx] = s3;t[3+idx] = s4;
1393     t[4+idx] = s5;t[5+idx] = s6;
1394   }
1395   /* backward solve the upper triangular */
1396   for (i=n-1; i>=0; i--){
1397     v    = aa + 36*diag[i] + 36;
1398     vi   = aj + diag[i] + 1;
1399     nz   = ai[i+1] - diag[i] - 1;
1400     idt  = 6*i;
1401     s1 = t[idt];  s2 = t[1+idt];
1402     s3 = t[2+idt];s4 = t[3+idt];
1403     s5 = t[4+idt];s6 = t[5+idt];
1404     while (nz--) {
1405       idx   = 6*(*vi++);
1406       x1    = t[idx];   x2 = t[1+idx];
1407       x3    = t[2+idx]; x4 = t[3+idx];
1408       x5    = t[4+idx]; x6 = t[5+idx];
1409       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415       v += 36;
1416     }
1417     idc = 6*(*c--);
1418     v   = aa + 36*diag[i];
1419     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1420                                  v[18]*s4+v[24]*s5+v[30]*s6;
1421     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1422                                  v[19]*s4+v[25]*s5+v[31]*s6;
1423     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1424                                  v[20]*s4+v[26]*s5+v[32]*s6;
1425     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1426                                  v[21]*s4+v[27]*s5+v[33]*s6;
1427     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1428                                  v[22]*s4+v[28]*s5+v[34]*s6;
1429     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1430                                  v[23]*s4+v[29]*s5+v[35]*s6;
1431   }
1432 
1433   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1434   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1435   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1436   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1437   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 #undef __FUNCT__
1442 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1443 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1444 {
1445   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1446   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1447   int             ierr,*diag = a->diag,jdx;
1448   MatScalar       *aa=a->a,*v;
1449   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1450 
1451   PetscFunctionBegin;
1452   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1453   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1454   /* forward solve the lower triangular */
1455   idx    = 0;
1456   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1457   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1458   for (i=1; i<n; i++) {
1459     v     =  aa + 36*ai[i];
1460     vi    =  aj + ai[i];
1461     nz    =  diag[i] - ai[i];
1462     idx   =  6*i;
1463     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1464     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1465     while (nz--) {
1466       jdx   = 6*(*vi++);
1467       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1468       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1469       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1470       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1471       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1472       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1473       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1474       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1475       v += 36;
1476      }
1477     x[idx]   = s1;
1478     x[1+idx] = s2;
1479     x[2+idx] = s3;
1480     x[3+idx] = s4;
1481     x[4+idx] = s5;
1482     x[5+idx] = s6;
1483   }
1484   /* backward solve the upper triangular */
1485   for (i=n-1; i>=0; i--){
1486     v    = aa + 36*diag[i] + 36;
1487     vi   = aj + diag[i] + 1;
1488     nz   = ai[i+1] - diag[i] - 1;
1489     idt  = 6*i;
1490     s1 = x[idt];   s2 = x[1+idt];
1491     s3 = x[2+idt]; s4 = x[3+idt];
1492     s5 = x[4+idt]; s6 = x[5+idt];
1493     while (nz--) {
1494       idx   = 6*(*vi++);
1495       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1496       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1497       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1503       v += 36;
1504     }
1505     v        = aa + 36*diag[i];
1506     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1507     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1508     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1509     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1510     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1511     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1512   }
1513 
1514   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1516   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 #undef __FUNCT__
1521 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1522 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1523 {
1524   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1525   IS              iscol=a->col,isrow=a->row;
1526   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1527   int             *diag = a->diag;
1528   MatScalar       *aa=a->a,*v;
1529   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1530 
1531   PetscFunctionBegin;
1532   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1533   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534   t  = a->solve_work;
1535 
1536   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1537   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1538 
1539   /* forward solve the lower triangular */
1540   idx    = 5*(*r++);
1541   t[0] = b[idx];   t[1] = b[1+idx];
1542   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1543   for (i=1; i<n; i++) {
1544     v     = aa + 25*ai[i];
1545     vi    = aj + ai[i];
1546     nz    = diag[i] - ai[i];
1547     idx   = 5*(*r++);
1548     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1549     s5  = b[4+idx];
1550     while (nz--) {
1551       idx   = 5*(*vi++);
1552       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1553       x4    = t[3+idx];x5 = t[4+idx];
1554       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1555       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1556       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1557       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1558       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1559       v += 25;
1560     }
1561     idx = 5*i;
1562     t[idx]   = s1;t[1+idx] = s2;
1563     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1564   }
1565   /* backward solve the upper triangular */
1566   for (i=n-1; i>=0; i--){
1567     v    = aa + 25*diag[i] + 25;
1568     vi   = aj + diag[i] + 1;
1569     nz   = ai[i+1] - diag[i] - 1;
1570     idt  = 5*i;
1571     s1 = t[idt];  s2 = t[1+idt];
1572     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1573     while (nz--) {
1574       idx   = 5*(*vi++);
1575       x1    = t[idx];   x2 = t[1+idx];
1576       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1577       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582       v += 25;
1583     }
1584     idc = 5*(*c--);
1585     v   = aa + 25*diag[i];
1586     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1587                                  v[15]*s4+v[20]*s5;
1588     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1589                                  v[16]*s4+v[21]*s5;
1590     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1591                                  v[17]*s4+v[22]*s5;
1592     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1593                                  v[18]*s4+v[23]*s5;
1594     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1595                                  v[19]*s4+v[24]*s5;
1596   }
1597 
1598   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1599   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1600   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1601   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1602   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1603   PetscFunctionReturn(0);
1604 }
1605 
1606 #undef __FUNCT__
1607 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1608 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1609 {
1610   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1611   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1612   int             ierr,*diag = a->diag,jdx;
1613   MatScalar       *aa=a->a,*v;
1614   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1615 
1616   PetscFunctionBegin;
1617   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1618   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1619   /* forward solve the lower triangular */
1620   idx    = 0;
1621   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1622   for (i=1; i<n; i++) {
1623     v     =  aa + 25*ai[i];
1624     vi    =  aj + ai[i];
1625     nz    =  diag[i] - ai[i];
1626     idx   =  5*i;
1627     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1628     while (nz--) {
1629       jdx   = 5*(*vi++);
1630       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1631       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1632       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1633       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1634       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1635       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1636       v    += 25;
1637     }
1638     x[idx]   = s1;
1639     x[1+idx] = s2;
1640     x[2+idx] = s3;
1641     x[3+idx] = s4;
1642     x[4+idx] = s5;
1643   }
1644   /* backward solve the upper triangular */
1645   for (i=n-1; i>=0; i--){
1646     v    = aa + 25*diag[i] + 25;
1647     vi   = aj + diag[i] + 1;
1648     nz   = ai[i+1] - diag[i] - 1;
1649     idt  = 5*i;
1650     s1 = x[idt];  s2 = x[1+idt];
1651     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1652     while (nz--) {
1653       idx   = 5*(*vi++);
1654       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1655       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1656       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1657       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1658       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1659       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1660       v    += 25;
1661     }
1662     v        = aa + 25*diag[i];
1663     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1664     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1665     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1666     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1667     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1668   }
1669 
1670   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1671   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1672   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1673   PetscFunctionReturn(0);
1674 }
1675 
1676 #undef __FUNCT__
1677 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1678 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1679 {
1680   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1681   IS              iscol=a->col,isrow=a->row;
1682   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1683   int             *diag = a->diag;
1684   MatScalar       *aa=a->a,*v;
1685   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1686 
1687   PetscFunctionBegin;
1688   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1689   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1690   t  = a->solve_work;
1691 
1692   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1693   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1694 
1695   /* forward solve the lower triangular */
1696   idx    = 4*(*r++);
1697   t[0] = b[idx];   t[1] = b[1+idx];
1698   t[2] = b[2+idx]; t[3] = b[3+idx];
1699   for (i=1; i<n; i++) {
1700     v     = aa + 16*ai[i];
1701     vi    = aj + ai[i];
1702     nz    = diag[i] - ai[i];
1703     idx   = 4*(*r++);
1704     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1705     while (nz--) {
1706       idx   = 4*(*vi++);
1707       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1708       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1709       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1710       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1711       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1712       v    += 16;
1713     }
1714     idx        = 4*i;
1715     t[idx]   = s1;t[1+idx] = s2;
1716     t[2+idx] = s3;t[3+idx] = s4;
1717   }
1718   /* backward solve the upper triangular */
1719   for (i=n-1; i>=0; i--){
1720     v    = aa + 16*diag[i] + 16;
1721     vi   = aj + diag[i] + 1;
1722     nz   = ai[i+1] - diag[i] - 1;
1723     idt  = 4*i;
1724     s1 = t[idt];  s2 = t[1+idt];
1725     s3 = t[2+idt];s4 = t[3+idt];
1726     while (nz--) {
1727       idx   = 4*(*vi++);
1728       x1    = t[idx];   x2 = t[1+idx];
1729       x3    = t[2+idx]; x4 = t[3+idx];
1730       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1731       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1732       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1733       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1734       v += 16;
1735     }
1736     idc      = 4*(*c--);
1737     v        = aa + 16*diag[i];
1738     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1739     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1740     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1741     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1742   }
1743 
1744   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1745   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1746   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1747   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1748   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1749   PetscFunctionReturn(0);
1750 }
1751 
1752 #undef __FUNCT__
1753 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1754 int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1755 {
1756   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1757   IS              iscol=a->col,isrow=a->row;
1758   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1759   int             *diag = a->diag;
1760   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1761   PetscScalar     *x,*b;
1762 
1763   PetscFunctionBegin;
1764   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1765   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1766   t  = (MatScalar *)a->solve_work;
1767 
1768   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1769   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1770 
1771   /* forward solve the lower triangular */
1772   idx    = 4*(*r++);
1773   t[0] = (MatScalar)b[idx];
1774   t[1] = (MatScalar)b[1+idx];
1775   t[2] = (MatScalar)b[2+idx];
1776   t[3] = (MatScalar)b[3+idx];
1777   for (i=1; i<n; i++) {
1778     v     = aa + 16*ai[i];
1779     vi    = aj + ai[i];
1780     nz    = diag[i] - ai[i];
1781     idx   = 4*(*r++);
1782     s1 = (MatScalar)b[idx];
1783     s2 = (MatScalar)b[1+idx];
1784     s3 = (MatScalar)b[2+idx];
1785     s4 = (MatScalar)b[3+idx];
1786     while (nz--) {
1787       idx   = 4*(*vi++);
1788       x1  = t[idx];
1789       x2  = t[1+idx];
1790       x3  = t[2+idx];
1791       x4  = t[3+idx];
1792       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1793       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1794       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1795       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1796       v    += 16;
1797     }
1798     idx        = 4*i;
1799     t[idx]   = s1;
1800     t[1+idx] = s2;
1801     t[2+idx] = s3;
1802     t[3+idx] = s4;
1803   }
1804   /* backward solve the upper triangular */
1805   for (i=n-1; i>=0; i--){
1806     v    = aa + 16*diag[i] + 16;
1807     vi   = aj + diag[i] + 1;
1808     nz   = ai[i+1] - diag[i] - 1;
1809     idt  = 4*i;
1810     s1 = t[idt];
1811     s2 = t[1+idt];
1812     s3 = t[2+idt];
1813     s4 = t[3+idt];
1814     while (nz--) {
1815       idx   = 4*(*vi++);
1816       x1  = t[idx];
1817       x2  = t[1+idx];
1818       x3  = t[2+idx];
1819       x4  = t[3+idx];
1820       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1821       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1822       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1823       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1824       v += 16;
1825     }
1826     idc      = 4*(*c--);
1827     v        = aa + 16*diag[i];
1828     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1829     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1830     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1831     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1832     x[idc]   = (PetscScalar)t[idt];
1833     x[1+idc] = (PetscScalar)t[1+idt];
1834     x[2+idc] = (PetscScalar)t[2+idt];
1835     x[3+idc] = (PetscScalar)t[3+idt];
1836  }
1837 
1838   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1839   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1840   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1841   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1842   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1843   PetscFunctionReturn(0);
1844 }
1845 
1846 #if defined (PETSC_HAVE_SSE)
1847 
1848 #include PETSC_HAVE_SSE
1849 
1850 #undef __FUNCT__
1851 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1852 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1853 {
1854   /*
1855      Note: This code uses demotion of double
1856      to float when performing the mixed-mode computation.
1857      This may not be numerically reasonable for all applications.
1858   */
1859   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1860   IS              iscol=a->col,isrow=a->row;
1861   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1862   int             *diag = a->diag,ai16;
1863   MatScalar       *aa=a->a,*v;
1864   PetscScalar     *x,*b,*t;
1865 
1866   /* Make space in temp stack for 16 Byte Aligned arrays */
1867   float           ssealignedspace[11],*tmps,*tmpx;
1868   unsigned long   offset;
1869 
1870   PetscFunctionBegin;
1871   SSE_SCOPE_BEGIN;
1872 
1873     offset = (unsigned long)ssealignedspace % 16;
1874     if (offset) offset = (16 - offset)/4;
1875     tmps = &ssealignedspace[offset];
1876     tmpx = &ssealignedspace[offset+4];
1877     PREFETCH_NTA(aa+16*ai[1]);
1878 
1879     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1880     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1881     t  = a->solve_work;
1882 
1883     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1884     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1885 
1886     /* forward solve the lower triangular */
1887     idx  = 4*(*r++);
1888     t[0] = b[idx];   t[1] = b[1+idx];
1889     t[2] = b[2+idx]; t[3] = b[3+idx];
1890     v    =  aa + 16*ai[1];
1891 
1892     for (i=1; i<n;) {
1893       PREFETCH_NTA(&v[8]);
1894       vi   =  aj      + ai[i];
1895       nz   =  diag[i] - ai[i];
1896       idx  =  4*(*r++);
1897 
1898       /* Demote sum from double to float */
1899       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1900       LOAD_PS(tmps,XMM7);
1901 
1902       while (nz--) {
1903         PREFETCH_NTA(&v[16]);
1904         idx = 4*(*vi++);
1905 
1906         /* Demote solution (so far) from double to float */
1907         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1908 
1909         /* 4x4 Matrix-Vector product with negative accumulation: */
1910         SSE_INLINE_BEGIN_2(tmpx,v)
1911           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1912 
1913           /* First Column */
1914           SSE_COPY_PS(XMM0,XMM6)
1915           SSE_SHUFFLE(XMM0,XMM0,0x00)
1916           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1917           SSE_SUB_PS(XMM7,XMM0)
1918 
1919           /* Second Column */
1920           SSE_COPY_PS(XMM1,XMM6)
1921           SSE_SHUFFLE(XMM1,XMM1,0x55)
1922           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1923           SSE_SUB_PS(XMM7,XMM1)
1924 
1925           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1926 
1927           /* Third Column */
1928           SSE_COPY_PS(XMM2,XMM6)
1929           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1930           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1931           SSE_SUB_PS(XMM7,XMM2)
1932 
1933           /* Fourth Column */
1934           SSE_COPY_PS(XMM3,XMM6)
1935           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1936           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1937           SSE_SUB_PS(XMM7,XMM3)
1938         SSE_INLINE_END_2
1939 
1940         v  += 16;
1941       }
1942       idx = 4*i;
1943       v   = aa + 16*ai[++i];
1944       PREFETCH_NTA(v);
1945       STORE_PS(tmps,XMM7);
1946 
1947       /* Promote result from float to double */
1948       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1949     }
1950     /* backward solve the upper triangular */
1951     idt  = 4*(n-1);
1952     ai16 = 16*diag[n-1];
1953     v    = aa + ai16 + 16;
1954     for (i=n-1; i>=0;){
1955       PREFETCH_NTA(&v[8]);
1956       vi = aj + diag[i] + 1;
1957       nz = ai[i+1] - diag[i] - 1;
1958 
1959       /* Demote accumulator from double to float */
1960       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1961       LOAD_PS(tmps,XMM7);
1962 
1963       while (nz--) {
1964         PREFETCH_NTA(&v[16]);
1965         idx = 4*(*vi++);
1966 
1967         /* Demote solution (so far) from double to float */
1968         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1969 
1970         /* 4x4 Matrix-Vector Product with negative accumulation: */
1971         SSE_INLINE_BEGIN_2(tmpx,v)
1972           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1973 
1974           /* First Column */
1975           SSE_COPY_PS(XMM0,XMM6)
1976           SSE_SHUFFLE(XMM0,XMM0,0x00)
1977           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1978           SSE_SUB_PS(XMM7,XMM0)
1979 
1980           /* Second Column */
1981           SSE_COPY_PS(XMM1,XMM6)
1982           SSE_SHUFFLE(XMM1,XMM1,0x55)
1983           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1984           SSE_SUB_PS(XMM7,XMM1)
1985 
1986           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1987 
1988           /* Third Column */
1989           SSE_COPY_PS(XMM2,XMM6)
1990           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1991           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1992           SSE_SUB_PS(XMM7,XMM2)
1993 
1994           /* Fourth Column */
1995           SSE_COPY_PS(XMM3,XMM6)
1996           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1997           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1998           SSE_SUB_PS(XMM7,XMM3)
1999         SSE_INLINE_END_2
2000         v  += 16;
2001       }
2002       v    = aa + ai16;
2003       ai16 = 16*diag[--i];
2004       PREFETCH_NTA(aa+ai16+16);
2005       /*
2006          Scale the result by the diagonal 4x4 block,
2007          which was inverted as part of the factorization
2008       */
2009       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2010         /* First Column */
2011         SSE_COPY_PS(XMM0,XMM7)
2012         SSE_SHUFFLE(XMM0,XMM0,0x00)
2013         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2014 
2015         /* Second Column */
2016         SSE_COPY_PS(XMM1,XMM7)
2017         SSE_SHUFFLE(XMM1,XMM1,0x55)
2018         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2019         SSE_ADD_PS(XMM0,XMM1)
2020 
2021         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2022 
2023         /* Third Column */
2024         SSE_COPY_PS(XMM2,XMM7)
2025         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2026         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2027         SSE_ADD_PS(XMM0,XMM2)
2028 
2029         /* Fourth Column */
2030         SSE_COPY_PS(XMM3,XMM7)
2031         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2032         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2033         SSE_ADD_PS(XMM0,XMM3)
2034 
2035         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2036       SSE_INLINE_END_3
2037 
2038       /* Promote solution from float to double */
2039       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2040 
2041       /* Apply reordering to t and stream into x.    */
2042       /* This way, x doesn't pollute the cache.      */
2043       /* Be careful with size: 2 doubles = 4 floats! */
2044       idc  = 4*(*c--);
2045       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2046         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2047         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2048         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2049         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2050         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2051         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2052       SSE_INLINE_END_2
2053       v    = aa + ai16 + 16;
2054       idt -= 4;
2055     }
2056 
2057     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2058     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2059     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2060     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2061     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2062   SSE_SCOPE_END;
2063   PetscFunctionReturn(0);
2064 }
2065 
2066 #endif
2067 
2068 
2069 /*
2070       Special case where the matrix was ILU(0) factored in the natural
2071    ordering. This eliminates the need for the column and row permutation.
2072 */
2073 #undef __FUNCT__
2074 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2075 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2076 {
2077   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2078   int             n=a->mbs,*ai=a->i,*aj=a->j;
2079   int             ierr,*diag = a->diag;
2080   MatScalar       *aa=a->a;
2081   PetscScalar     *x,*b;
2082 
2083   PetscFunctionBegin;
2084   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2085   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2086 
2087 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2088   {
2089     static PetscScalar w[2000]; /* very BAD need to fix */
2090     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2091   }
2092 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2093   {
2094     static PetscScalar w[2000]; /* very BAD need to fix */
2095     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2096   }
2097 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2098   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2099 #else
2100   {
2101     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2102     MatScalar    *v;
2103     int          jdx,idt,idx,nz,*vi,i,ai16;
2104 
2105   /* forward solve the lower triangular */
2106   idx    = 0;
2107   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2108   for (i=1; i<n; i++) {
2109     v     =  aa      + 16*ai[i];
2110     vi    =  aj      + ai[i];
2111     nz    =  diag[i] - ai[i];
2112     idx   +=  4;
2113     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2114     while (nz--) {
2115       jdx   = 4*(*vi++);
2116       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2117       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2118       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2119       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2120       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2121       v    += 16;
2122     }
2123     x[idx]   = s1;
2124     x[1+idx] = s2;
2125     x[2+idx] = s3;
2126     x[3+idx] = s4;
2127   }
2128   /* backward solve the upper triangular */
2129   idt = 4*(n-1);
2130   for (i=n-1; i>=0; i--){
2131     ai16 = 16*diag[i];
2132     v    = aa + ai16 + 16;
2133     vi   = aj + diag[i] + 1;
2134     nz   = ai[i+1] - diag[i] - 1;
2135     s1 = x[idt];  s2 = x[1+idt];
2136     s3 = x[2+idt];s4 = x[3+idt];
2137     while (nz--) {
2138       idx   = 4*(*vi++);
2139       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2140       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2141       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2142       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2143       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2144       v    += 16;
2145     }
2146     v        = aa + ai16;
2147     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2148     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2149     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2150     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2151     idt -= 4;
2152   }
2153   }
2154 #endif
2155 
2156   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2157   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2158   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2159   PetscFunctionReturn(0);
2160 }
2161 
2162 #undef __FUNCT__
2163 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2164 int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2165 {
2166   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2167   int             n=a->mbs,*ai=a->i,*aj=a->j;
2168   int             ierr,*diag = a->diag;
2169   MatScalar       *aa=a->a;
2170   PetscScalar     *x,*b;
2171 
2172   PetscFunctionBegin;
2173   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2174   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2175 
2176   {
2177     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2178     MatScalar  *v,*t=(MatScalar *)x;
2179     int        jdx,idt,idx,nz,*vi,i,ai16;
2180 
2181     /* forward solve the lower triangular */
2182     idx  = 0;
2183     t[0] = (MatScalar)b[0];
2184     t[1] = (MatScalar)b[1];
2185     t[2] = (MatScalar)b[2];
2186     t[3] = (MatScalar)b[3];
2187     for (i=1; i<n; i++) {
2188       v     =  aa      + 16*ai[i];
2189       vi    =  aj      + ai[i];
2190       nz    =  diag[i] - ai[i];
2191       idx   +=  4;
2192       s1 = (MatScalar)b[idx];
2193       s2 = (MatScalar)b[1+idx];
2194       s3 = (MatScalar)b[2+idx];
2195       s4 = (MatScalar)b[3+idx];
2196       while (nz--) {
2197         jdx = 4*(*vi++);
2198         x1  = t[jdx];
2199         x2  = t[1+jdx];
2200         x3  = t[2+jdx];
2201         x4  = t[3+jdx];
2202         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2203         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2204         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2205         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2206         v    += 16;
2207       }
2208       t[idx]   = s1;
2209       t[1+idx] = s2;
2210       t[2+idx] = s3;
2211       t[3+idx] = s4;
2212     }
2213     /* backward solve the upper triangular */
2214     idt = 4*(n-1);
2215     for (i=n-1; i>=0; i--){
2216       ai16 = 16*diag[i];
2217       v    = aa + ai16 + 16;
2218       vi   = aj + diag[i] + 1;
2219       nz   = ai[i+1] - diag[i] - 1;
2220       s1   = t[idt];
2221       s2   = t[1+idt];
2222       s3   = t[2+idt];
2223       s4   = t[3+idt];
2224       while (nz--) {
2225         idx = 4*(*vi++);
2226         x1  = (MatScalar)x[idx];
2227         x2  = (MatScalar)x[1+idx];
2228         x3  = (MatScalar)x[2+idx];
2229         x4  = (MatScalar)x[3+idx];
2230         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2231         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2232         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2233         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2234         v    += 16;
2235       }
2236       v        = aa + ai16;
2237       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2238       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2239       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2240       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2241       idt -= 4;
2242     }
2243   }
2244 
2245   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2246   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2247   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2248   PetscFunctionReturn(0);
2249 }
2250 
2251 #if defined (PETSC_HAVE_SSE)
2252 
2253 #include PETSC_HAVE_SSE
2254 
2255 #undef __FUNCT__
2256 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2257 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2258 {
2259   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2260   int             n=a->mbs,*ai=a->i,*aj=a->j;
2261   int             ierr,*diag = a->diag;
2262   MatScalar       *aa=a->a;
2263   PetscScalar     *x,*b;
2264 
2265   PetscFunctionBegin;
2266   SSE_SCOPE_BEGIN;
2267   /*
2268      Note: This code currently uses demotion of double
2269      to float when performing the mixed-mode computation.
2270      This may not be numerically reasonable for all applications.
2271   */
2272   PREFETCH_NTA(aa+16*ai[1]);
2273 
2274   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2275   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2276   {
2277     MatScalar     *v;
2278     int           jdx,idt,idx,nz,*vi,i,ai16;
2279 
2280     /* Make space in temp stack for 16 Byte Aligned arrays */
2281     float         ssealignedspace[11],*tmps,*tmpx;
2282     unsigned long offset = (unsigned long)ssealignedspace % 16;
2283     if (offset) offset = (16 - offset)/4;
2284     tmps = &ssealignedspace[offset];
2285     tmpx = &ssealignedspace[offset+4];
2286 
2287   /* forward solve the lower triangular */
2288     idx  = 0;
2289     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2290     v    =  aa + 16*ai[1];
2291 
2292     for (i=1; i<n;) {
2293       PREFETCH_NTA(&v[8]);
2294       vi   =  aj      + ai[i];
2295       nz   =  diag[i] - ai[i];
2296       idx +=  4;
2297 
2298     /* Demote sum from double to float */
2299       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2300       LOAD_PS(tmps,XMM7);
2301 
2302       while (nz--) {
2303         PREFETCH_NTA(&v[16]);
2304         jdx = 4*(*vi++);
2305 
2306         /* Demote solution (so far) from double to float */
2307         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]);
2308 
2309         /* 4x4 Matrix-Vector product with negative accumulation: */
2310         SSE_INLINE_BEGIN_2(tmpx,v)
2311           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2312 
2313           /* First Column */
2314           SSE_COPY_PS(XMM0,XMM6)
2315           SSE_SHUFFLE(XMM0,XMM0,0x00)
2316           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2317           SSE_SUB_PS(XMM7,XMM0)
2318 
2319           /* Second Column */
2320           SSE_COPY_PS(XMM1,XMM6)
2321           SSE_SHUFFLE(XMM1,XMM1,0x55)
2322           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2323           SSE_SUB_PS(XMM7,XMM1)
2324 
2325           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2326 
2327           /* Third Column */
2328           SSE_COPY_PS(XMM2,XMM6)
2329           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2330           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2331           SSE_SUB_PS(XMM7,XMM2)
2332 
2333           /* Fourth Column */
2334           SSE_COPY_PS(XMM3,XMM6)
2335           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2336           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2337           SSE_SUB_PS(XMM7,XMM3)
2338         SSE_INLINE_END_2
2339 
2340         v  += 16;
2341       }
2342       v    =  aa + 16*ai[++i];
2343       PREFETCH_NTA(v);
2344       STORE_PS(tmps,XMM7);
2345 
2346       /* Promote result from float to double */
2347       CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps);
2348     }
2349     /* backward solve the upper triangular */
2350     idt  = 4*(n-1);
2351     ai16 = 16*diag[n-1];
2352     v    = aa + ai16 + 16;
2353     for (i=n-1; i>=0;){
2354       PREFETCH_NTA(&v[8]);
2355       vi = aj + diag[i] + 1;
2356       nz = ai[i+1] - diag[i] - 1;
2357 
2358       /* Demote accumulator from double to float */
2359       CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]);
2360       LOAD_PS(tmps,XMM7);
2361 
2362       while (nz--) {
2363         PREFETCH_NTA(&v[16]);
2364         idx = 4*(*vi++);
2365 
2366         /* Demote solution (so far) from double to float */
2367         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2368 
2369         /* 4x4 Matrix-Vector Product with negative accumulation: */
2370         SSE_INLINE_BEGIN_2(tmpx,v)
2371           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2372 
2373           /* First Column */
2374           SSE_COPY_PS(XMM0,XMM6)
2375           SSE_SHUFFLE(XMM0,XMM0,0x00)
2376           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2377           SSE_SUB_PS(XMM7,XMM0)
2378 
2379           /* Second Column */
2380           SSE_COPY_PS(XMM1,XMM6)
2381           SSE_SHUFFLE(XMM1,XMM1,0x55)
2382           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2383           SSE_SUB_PS(XMM7,XMM1)
2384 
2385           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2386 
2387           /* Third Column */
2388           SSE_COPY_PS(XMM2,XMM6)
2389           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2390           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2391           SSE_SUB_PS(XMM7,XMM2)
2392 
2393           /* Fourth Column */
2394           SSE_COPY_PS(XMM3,XMM6)
2395           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2396           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2397           SSE_SUB_PS(XMM7,XMM3)
2398         SSE_INLINE_END_2
2399         v  += 16;
2400       }
2401       v    = aa + ai16;
2402       ai16 = 16*diag[--i];
2403       PREFETCH_NTA(aa+ai16+16);
2404       /*
2405          Scale the result by the diagonal 4x4 block,
2406          which was inverted as part of the factorization
2407       */
2408       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2409         /* First Column */
2410         SSE_COPY_PS(XMM0,XMM7)
2411         SSE_SHUFFLE(XMM0,XMM0,0x00)
2412         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2413 
2414         /* Second Column */
2415         SSE_COPY_PS(XMM1,XMM7)
2416         SSE_SHUFFLE(XMM1,XMM1,0x55)
2417         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2418         SSE_ADD_PS(XMM0,XMM1)
2419 
2420         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2421 
2422         /* Third Column */
2423         SSE_COPY_PS(XMM2,XMM7)
2424         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2425         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2426         SSE_ADD_PS(XMM0,XMM2)
2427 
2428         /* Fourth Column */
2429         SSE_COPY_PS(XMM3,XMM7)
2430         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2431         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2432         SSE_ADD_PS(XMM0,XMM3)
2433 
2434         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2435       SSE_INLINE_END_3
2436 
2437       /* Promote solution from float to double */
2438       CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps);
2439 
2440       v    = aa + ai16 + 16;
2441       idt -= 4;
2442     }
2443   }
2444   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2445   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2446   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2447   SSE_SCOPE_END;
2448   PetscFunctionReturn(0);
2449 }
2450 
2451 #endif
2452 #undef __FUNCT__
2453 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2454 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2455 {
2456   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2457   IS              iscol=a->col,isrow=a->row;
2458   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2459   int             *diag = a->diag;
2460   MatScalar       *aa=a->a,*v;
2461   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
2462 
2463   PetscFunctionBegin;
2464   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2465   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2466   t  = a->solve_work;
2467 
2468   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2469   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2470 
2471   /* forward solve the lower triangular */
2472   idx    = 3*(*r++);
2473   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2474   for (i=1; i<n; i++) {
2475     v     = aa + 9*ai[i];
2476     vi    = aj + ai[i];
2477     nz    = diag[i] - ai[i];
2478     idx   = 3*(*r++);
2479     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2480     while (nz--) {
2481       idx   = 3*(*vi++);
2482       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2483       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2484       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2485       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2486       v += 9;
2487     }
2488     idx = 3*i;
2489     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2490   }
2491   /* backward solve the upper triangular */
2492   for (i=n-1; i>=0; i--){
2493     v    = aa + 9*diag[i] + 9;
2494     vi   = aj + diag[i] + 1;
2495     nz   = ai[i+1] - diag[i] - 1;
2496     idt  = 3*i;
2497     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2498     while (nz--) {
2499       idx   = 3*(*vi++);
2500       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2501       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2502       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2503       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2504       v += 9;
2505     }
2506     idc = 3*(*c--);
2507     v   = aa + 9*diag[i];
2508     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2509     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2510     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2511   }
2512   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2513   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2514   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2516   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2517   PetscFunctionReturn(0);
2518 }
2519 
2520 /*
2521       Special case where the matrix was ILU(0) factored in the natural
2522    ordering. This eliminates the need for the column and row permutation.
2523 */
2524 #undef __FUNCT__
2525 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2526 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2527 {
2528   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2529   int             n=a->mbs,*ai=a->i,*aj=a->j;
2530   int             ierr,*diag = a->diag;
2531   MatScalar       *aa=a->a,*v;
2532   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2533   int             jdx,idt,idx,nz,*vi,i;
2534 
2535   PetscFunctionBegin;
2536   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2537   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2538 
2539 
2540   /* forward solve the lower triangular */
2541   idx    = 0;
2542   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2543   for (i=1; i<n; i++) {
2544     v     =  aa      + 9*ai[i];
2545     vi    =  aj      + ai[i];
2546     nz    =  diag[i] - ai[i];
2547     idx   +=  3;
2548     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2549     while (nz--) {
2550       jdx   = 3*(*vi++);
2551       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2552       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2553       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2554       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2555       v    += 9;
2556     }
2557     x[idx]   = s1;
2558     x[1+idx] = s2;
2559     x[2+idx] = s3;
2560   }
2561   /* backward solve the upper triangular */
2562   for (i=n-1; i>=0; i--){
2563     v    = aa + 9*diag[i] + 9;
2564     vi   = aj + diag[i] + 1;
2565     nz   = ai[i+1] - diag[i] - 1;
2566     idt  = 3*i;
2567     s1 = x[idt];  s2 = x[1+idt];
2568     s3 = x[2+idt];
2569     while (nz--) {
2570       idx   = 3*(*vi++);
2571       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2572       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2573       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2574       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2575       v    += 9;
2576     }
2577     v        = aa +  9*diag[i];
2578     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2579     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2580     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2581   }
2582 
2583   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2584   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2585   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2586   PetscFunctionReturn(0);
2587 }
2588 
2589 #undef __FUNCT__
2590 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2591 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2592 {
2593   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2594   IS              iscol=a->col,isrow=a->row;
2595   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2596   int             *diag = a->diag;
2597   MatScalar       *aa=a->a,*v;
2598   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
2599 
2600   PetscFunctionBegin;
2601   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2602   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2603   t  = a->solve_work;
2604 
2605   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2606   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2607 
2608   /* forward solve the lower triangular */
2609   idx    = 2*(*r++);
2610   t[0] = b[idx]; t[1] = b[1+idx];
2611   for (i=1; i<n; i++) {
2612     v     = aa + 4*ai[i];
2613     vi    = aj + ai[i];
2614     nz    = diag[i] - ai[i];
2615     idx   = 2*(*r++);
2616     s1  = b[idx]; s2 = b[1+idx];
2617     while (nz--) {
2618       idx   = 2*(*vi++);
2619       x1    = t[idx]; x2 = t[1+idx];
2620       s1 -= v[0]*x1 + v[2]*x2;
2621       s2 -= v[1]*x1 + v[3]*x2;
2622       v += 4;
2623     }
2624     idx = 2*i;
2625     t[idx] = s1; t[1+idx] = s2;
2626   }
2627   /* backward solve the upper triangular */
2628   for (i=n-1; i>=0; i--){
2629     v    = aa + 4*diag[i] + 4;
2630     vi   = aj + diag[i] + 1;
2631     nz   = ai[i+1] - diag[i] - 1;
2632     idt  = 2*i;
2633     s1 = t[idt]; s2 = t[1+idt];
2634     while (nz--) {
2635       idx   = 2*(*vi++);
2636       x1    = t[idx]; x2 = t[1+idx];
2637       s1 -= v[0]*x1 + v[2]*x2;
2638       s2 -= v[1]*x1 + v[3]*x2;
2639       v += 4;
2640     }
2641     idc = 2*(*c--);
2642     v   = aa + 4*diag[i];
2643     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2644     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2645   }
2646   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2647   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2648   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2649   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2650   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2651   PetscFunctionReturn(0);
2652 }
2653 
2654 /*
2655       Special case where the matrix was ILU(0) factored in the natural
2656    ordering. This eliminates the need for the column and row permutation.
2657 */
2658 #undef __FUNCT__
2659 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2660 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2661 {
2662   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2663   int             n=a->mbs,*ai=a->i,*aj=a->j;
2664   int             ierr,*diag = a->diag;
2665   MatScalar       *aa=a->a,*v;
2666   PetscScalar     *x,*b,s1,s2,x1,x2;
2667   int             jdx,idt,idx,nz,*vi,i;
2668 
2669   PetscFunctionBegin;
2670   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2671   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2672 
2673   /* forward solve the lower triangular */
2674   idx    = 0;
2675   x[0]   = b[0]; x[1] = b[1];
2676   for (i=1; i<n; i++) {
2677     v     =  aa      + 4*ai[i];
2678     vi    =  aj      + ai[i];
2679     nz    =  diag[i] - ai[i];
2680     idx   +=  2;
2681     s1  =  b[idx];s2 = b[1+idx];
2682     while (nz--) {
2683       jdx   = 2*(*vi++);
2684       x1    = x[jdx];x2 = x[1+jdx];
2685       s1 -= v[0]*x1 + v[2]*x2;
2686       s2 -= v[1]*x1 + v[3]*x2;
2687       v    += 4;
2688     }
2689     x[idx]   = s1;
2690     x[1+idx] = s2;
2691   }
2692   /* backward solve the upper triangular */
2693   for (i=n-1; i>=0; i--){
2694     v    = aa + 4*diag[i] + 4;
2695     vi   = aj + diag[i] + 1;
2696     nz   = ai[i+1] - diag[i] - 1;
2697     idt  = 2*i;
2698     s1 = x[idt];  s2 = x[1+idt];
2699     while (nz--) {
2700       idx   = 2*(*vi++);
2701       x1    = x[idx];   x2 = x[1+idx];
2702       s1 -= v[0]*x1 + v[2]*x2;
2703       s2 -= v[1]*x1 + v[3]*x2;
2704       v    += 4;
2705     }
2706     v        = aa +  4*diag[i];
2707     x[idt]   = v[0]*s1 + v[2]*s2;
2708     x[1+idt] = v[1]*s1 + v[3]*s2;
2709   }
2710 
2711   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2712   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2713   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2714   PetscFunctionReturn(0);
2715 }
2716 
2717 #undef __FUNCT__
2718 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2719 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2720 {
2721   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2722   IS              iscol=a->col,isrow=a->row;
2723   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2724   int             *diag = a->diag;
2725   MatScalar       *aa=a->a,*v;
2726   PetscScalar     *x,*b,s1,*t;
2727 
2728   PetscFunctionBegin;
2729   if (!n) PetscFunctionReturn(0);
2730 
2731   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2732   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2733   t  = a->solve_work;
2734 
2735   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2736   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2737 
2738   /* forward solve the lower triangular */
2739   t[0] = b[*r++];
2740   for (i=1; i<n; i++) {
2741     v     = aa + ai[i];
2742     vi    = aj + ai[i];
2743     nz    = diag[i] - ai[i];
2744     s1  = b[*r++];
2745     while (nz--) {
2746       s1 -= (*v++)*t[*vi++];
2747     }
2748     t[i] = s1;
2749   }
2750   /* backward solve the upper triangular */
2751   for (i=n-1; i>=0; i--){
2752     v    = aa + diag[i] + 1;
2753     vi   = aj + diag[i] + 1;
2754     nz   = ai[i+1] - diag[i] - 1;
2755     s1 = t[i];
2756     while (nz--) {
2757       s1 -= (*v++)*t[*vi++];
2758     }
2759     x[*c--] = t[i] = aa[diag[i]]*s1;
2760   }
2761 
2762   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2763   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2764   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2765   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2766   PetscLogFlops(2*1*(a->nz) - A->n);
2767   PetscFunctionReturn(0);
2768 }
2769 /*
2770       Special case where the matrix was ILU(0) factored in the natural
2771    ordering. This eliminates the need for the column and row permutation.
2772 */
2773 #undef __FUNCT__
2774 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2775 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2776 {
2777   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2778   int             n=a->mbs,*ai=a->i,*aj=a->j;
2779   int             ierr,*diag = a->diag;
2780   MatScalar       *aa=a->a;
2781   PetscScalar     *x,*b;
2782   PetscScalar     s1,x1;
2783   MatScalar       *v;
2784   int             jdx,idt,idx,nz,*vi,i;
2785 
2786   PetscFunctionBegin;
2787   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2788   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2789 
2790   /* forward solve the lower triangular */
2791   idx    = 0;
2792   x[0]   = b[0];
2793   for (i=1; i<n; i++) {
2794     v     =  aa      + ai[i];
2795     vi    =  aj      + ai[i];
2796     nz    =  diag[i] - ai[i];
2797     idx   +=  1;
2798     s1  =  b[idx];
2799     while (nz--) {
2800       jdx   = *vi++;
2801       x1    = x[jdx];
2802       s1 -= v[0]*x1;
2803       v    += 1;
2804     }
2805     x[idx]   = s1;
2806   }
2807   /* backward solve the upper triangular */
2808   for (i=n-1; i>=0; i--){
2809     v    = aa + diag[i] + 1;
2810     vi   = aj + diag[i] + 1;
2811     nz   = ai[i+1] - diag[i] - 1;
2812     idt  = i;
2813     s1 = x[idt];
2814     while (nz--) {
2815       idx   = *vi++;
2816       x1    = x[idx];
2817       s1 -= v[0]*x1;
2818       v    += 1;
2819     }
2820     v        = aa +  diag[i];
2821     x[idt]   = v[0]*s1;
2822   }
2823   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2824   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2825   PetscLogFlops(2*(a->nz) - A->n);
2826   PetscFunctionReturn(0);
2827 }
2828 
2829 /* ----------------------------------------------------------------*/
2830 /*
2831      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2832    except that the data structure of Mat_SeqAIJ is slightly different.
2833    Not a good example of code reuse.
2834 */
2835 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
2836 
2837 #undef __FUNCT__
2838 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
2839 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2840 {
2841   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
2842   IS          isicol;
2843   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
2844   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2845   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2846   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
2847   PetscTruth  col_identity,row_identity;
2848   PetscReal   f;
2849 
2850   PetscFunctionBegin;
2851   f             = info->fill;
2852   levels        = (int)info->levels;
2853   diagonal_fill = (int)info->diagonal_fill;
2854   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2855   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
2856   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
2857 
2858   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2859     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
2860     (*fact)->factor = FACTOR_LU;
2861     b               = (Mat_SeqBAIJ*)(*fact)->data;
2862     if (!b->diag) {
2863       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2864     }
2865     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2866     b->row        = isrow;
2867     b->col        = iscol;
2868     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2869     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2870     b->icol       = isicol;
2871     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
2872     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2873   } else { /* general case perform the symbolic factorization */
2874     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
2875     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
2876 
2877     /* get new row pointers */
2878     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
2879     ainew[0] = 0;
2880     /* don't know how many column pointers are needed so estimate */
2881     jmax = (int)(f*ai[n] + 1);
2882     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
2883     /* ajfill is level of fill for each fill entry */
2884     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
2885     /* fill is a linked list of nonzeros in active row */
2886     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
2887     /* im is level for each filled value */
2888     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
2889     /* dloc is location of diagonal in factor */
2890     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
2891     dloc[0]  = 0;
2892     for (prow=0; prow<n; prow++) {
2893 
2894       /* copy prow into linked list */
2895       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
2896       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
2897       xi         = aj + ai[r[prow]];
2898       fill[n]    = n;
2899       fill[prow] = -1; /* marker for diagonal entry */
2900       while (nz--) {
2901 	fm  = n;
2902 	idx = ic[*xi++];
2903 	do {
2904 	  m  = fm;
2905 	  fm = fill[m];
2906 	} while (fm < idx);
2907 	fill[m]   = idx;
2908 	fill[idx] = fm;
2909 	im[idx]   = 0;
2910       }
2911 
2912       /* make sure diagonal entry is included */
2913       if (diagonal_fill && fill[prow] == -1) {
2914 	fm = n;
2915 	while (fill[fm] < prow) fm = fill[fm];
2916 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
2917 	fill[fm]   = prow;
2918 	im[prow]   = 0;
2919 	nzf++;
2920 	dcount++;
2921       }
2922 
2923       nzi = 0;
2924       row = fill[n];
2925       while (row < prow) {
2926 	incrlev = im[row] + 1;
2927 	nz      = dloc[row];
2928 	xi      = ajnew  + ainew[row] + nz + 1;
2929 	flev    = ajfill + ainew[row] + nz + 1;
2930 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
2931 	fm      = row;
2932 	while (nnz-- > 0) {
2933 	  idx = *xi++;
2934 	  if (*flev + incrlev > levels) {
2935 	    flev++;
2936 	    continue;
2937 	  }
2938 	  do {
2939 	    m  = fm;
2940 	    fm = fill[m];
2941 	  } while (fm < idx);
2942 	  if (fm != idx) {
2943 	    im[idx]   = *flev + incrlev;
2944 	    fill[m]   = idx;
2945 	    fill[idx] = fm;
2946 	    fm        = idx;
2947 	    nzf++;
2948 	  } else {
2949 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2950 	  }
2951 	  flev++;
2952 	}
2953 	row = fill[row];
2954 	nzi++;
2955       }
2956       /* copy new filled row into permanent storage */
2957       ainew[prow+1] = ainew[prow] + nzf;
2958       if (ainew[prow+1] > jmax) {
2959 
2960 	/* estimate how much additional space we will need */
2961 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2962 	/* just double the memory each time */
2963 	int maxadd = jmax;
2964 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
2965 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2966 	jmax += maxadd;
2967 
2968 	/* allocate a longer ajnew and ajfill */
2969 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2970 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2971 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
2972 	ajnew = xi;
2973 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2974 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2975 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
2976 	ajfill = xi;
2977 	realloc++; /* count how many reallocations are needed */
2978       }
2979       xi          = ajnew + ainew[prow];
2980       flev        = ajfill + ainew[prow];
2981       dloc[prow]  = nzi;
2982       fm          = fill[n];
2983       while (nzf--) {
2984 	*xi++   = fm;
2985 	*flev++ = im[fm];
2986 	fm      = fill[fm];
2987       }
2988       /* make sure row has diagonal entry */
2989       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2990 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
2991     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2992       }
2993     }
2994     ierr = PetscFree(ajfill);CHKERRQ(ierr);
2995     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2996     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2997     ierr = PetscFree(fill);CHKERRQ(ierr);
2998     ierr = PetscFree(im);CHKERRQ(ierr);
2999 
3000     {
3001       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3002       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
3003       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3004       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3005       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3006       if (diagonal_fill) {
3007 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3008       }
3009     }
3010 
3011     /* put together the new matrix */
3012     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
3013     PetscLogObjectParent(*fact,isicol);
3014     b = (Mat_SeqBAIJ*)(*fact)->data;
3015     ierr = PetscFree(b->imax);CHKERRQ(ierr);
3016     b->singlemalloc = PETSC_FALSE;
3017     len = bs2*ainew[n]*sizeof(MatScalar);
3018     /* the next line frees the default space generated by the Create() */
3019     ierr = PetscFree(b->a);CHKERRQ(ierr);
3020     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
3021     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
3022     b->j          = ajnew;
3023     b->i          = ainew;
3024     for (i=0; i<n; i++) dloc[i] += ainew[i];
3025     b->diag       = dloc;
3026     b->ilen       = 0;
3027     b->imax       = 0;
3028     b->row        = isrow;
3029     b->col        = iscol;
3030     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3031     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3032     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3033     b->icol       = isicol;
3034     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3035     /* In b structure:  Free imax, ilen, old a, old j.
3036        Allocate dloc, solve_work, new a, new j */
3037     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3038     b->maxnz          = b->nz = ainew[n];
3039     (*fact)->factor   = FACTOR_LU;
3040 
3041     (*fact)->info.factor_mallocs    = realloc;
3042     (*fact)->info.fill_ratio_given  = f;
3043     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3044   }
3045 
3046   if (row_identity && col_identity) {
3047     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
3048   }
3049   PetscFunctionReturn(0);
3050 }
3051 
3052 #undef __FUNCT__
3053 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3054 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3055 {
3056   /*
3057       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3058       with natural ordering
3059   */
3060   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3061   PetscTruth  sse_enabled_local,sse_enabled_global;
3062   int         ierr;
3063 
3064   PetscFunctionBegin;
3065   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3066   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3067   switch (a->bs) {
3068   case 1:
3069     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3070     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3071     break;
3072   case 2:
3073     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3074     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3075     break;
3076   case 3:
3077     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3078     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3079     break;
3080   case 4:
3081     ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr);
3082     if (sse_enabled_local) {
3083 #if defined(PETSC_HAVE_SSE)
3084       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3085       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n");
3086 #else
3087       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3088       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3089 #endif
3090     } else {
3091       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3092       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3093     }
3094     break;
3095   case 5:
3096     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3097     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3098     break;
3099   case 6:
3100     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3101     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3102     break;
3103   case 7:
3104     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3105     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3106     break;
3107   }
3108   PetscFunctionReturn(0);
3109 }
3110 
3111 #undef __FUNCT__
3112 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3113 int MatSeqBAIJ_UpdateSolvers(Mat A)
3114 {
3115   /*
3116       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3117       with natural ordering
3118   */
3119   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3120   IS          row = a->row, col = a->col;
3121   PetscTruth  row_identity, col_identity;
3122   PetscTruth  use_single, use_natural;
3123   int         ierr;
3124 
3125   PetscFunctionBegin;
3126 
3127   use_single  = PETSC_FALSE;
3128   use_natural = PETSC_FALSE;
3129 
3130   ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3131   ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3132 
3133   if (row_identity && col_identity) {
3134     use_natural = PETSC_TRUE;
3135   } else {
3136     use_natural = PETSC_FALSE;
3137   }
3138   switch (a->bs) {
3139   case 1:
3140     if (use_natural) {
3141       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3142       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3143       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3144       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3145     } else {
3146       A->ops->solve           = MatSolve_SeqBAIJ_1;
3147       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3148     }
3149     break;
3150   case 2:
3151     if (use_natural) {
3152       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3153       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3154       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3155       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3156     } else {
3157       A->ops->solve           = MatSolve_SeqBAIJ_2;
3158       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3159     }
3160     break;
3161   case 3:
3162     if (use_natural) {
3163       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3164       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3165       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3166       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3167     } else {
3168       A->ops->solve           = MatSolve_SeqBAIJ_3;
3169       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3170     }
3171     break;
3172   case 4:
3173     {
3174       PetscTruth sse_enabled_local, sse_enabled_global;
3175       PetscTruth single_prec, flg;
3176       single_prec = flg = PETSC_FALSE;
3177 
3178       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr);
3179       ierr = PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);CHKERRQ(ierr);
3180       if (flg) {
3181         a->single_precision_solves = single_prec;
3182       }
3183       if (a->single_precision_solves) {
3184         use_single = PETSC_TRUE;
3185       }
3186       if (use_natural) {
3187         if (use_single) {
3188           if (sse_enabled_local) { /* Natural + Single + SSE */
3189 #if defined(PETSC_HAVE_SSE)
3190             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3191             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4\n");
3192 #else
3193             /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3194             SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3195 #endif
3196           } else { /* Natural + Single */
3197             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3198             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place natural ordering solve BS=4\n");
3199           }
3200         } else { /* Natural */
3201           A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3202           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3203         } /* Only one version of SolveTranspose for Natural Ordering */
3204         A->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3205         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3206       } else { /* Arbitrary ordering */
3207         if (use_single) {
3208           if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3209 #if defined(PETSC_HAVE_SSE)
3210             A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3211             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3212 #else
3213             /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3214             SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3215 #endif
3216           } else { /* Arbitrary + Single */
3217             A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3218             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3219           }
3220         } else { /* Arbitrary */
3221           A->ops->solve           = MatSolve_SeqBAIJ_4;
3222         } /* Only one version of SolveTranspose for Natural Ordering */
3223         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3224       }
3225     }
3226     break;
3227   case 5:
3228     if (use_natural) {
3229       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3230       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3231       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3232       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3233     } else {
3234       A->ops->solve           = MatSolve_SeqBAIJ_5;
3235       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3236     }
3237     break;
3238   case 6:
3239     if (use_natural) {
3240       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3241       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3242       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3243       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3244     } else {
3245       A->ops->solve           = MatSolve_SeqBAIJ_6;
3246       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3247     }
3248     break;
3249   case 7:
3250     if (use_natural) {
3251       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3252       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3253       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3254       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3255     } else {
3256       A->ops->solve           = MatSolve_SeqBAIJ_7;
3257       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3258     }
3259     break;
3260   default:
3261     A->ops->solve             = MatSolve_SeqBAIJ_N;
3262     break;
3263   }
3264   PetscFunctionReturn(0);
3265 }
3266 
3267 #undef __FUNCT__
3268 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3269 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3270   int ierr;
3271 
3272   PetscFunctionBegin;
3273   ierr = MatSeqBAIJ_UpdateSolvers(A);
3274   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3275     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3276   } else {
3277     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3278   }
3279   PetscFunctionReturn(0);
3280 }
3281 
3282 #undef __FUNCT__
3283 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3284 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3285   int ierr;
3286 
3287   PetscFunctionBegin;
3288   ierr = MatSeqBAIJ_UpdateSolvers(A);
3289   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3290   PetscFunctionReturn(0);
3291 }
3292 
3293 
3294