xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision b271bb04eaec0d477b629c8be871cf42cb4980f5)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16   PetscErrorCode ierr;
17   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18   PetscInt       *diag = a->diag;
19   MatScalar      *aa=a->a,*v;
20   PetscScalar    s1,*x,*b;
21 
22   PetscFunctionBegin;
23   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
24   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26 
27   /* forward solve the U^T */
28   for (i=0; i<n; i++) {
29 
30     v     = aa + diag[i];
31     /* multiply by the inverse of the block diagonal */
32     s1    = (*v++)*x[i];
33     vi    = aj + diag[i] + 1;
34     nz    = ai[i+1] - diag[i] - 1;
35     while (nz--) {
36       x[*vi++]  -= (*v++)*s1;
37     }
38     x[i]   = s1;
39   }
40   /* backward solve the L^T */
41   for (i=n-1; i>=0; i--){
42     v    = aa + diag[i] - 1;
43     vi   = aj + diag[i] - 1;
44     nz   = diag[i] - ai[i];
45     s1   = x[i];
46     while (nz--) {
47       x[*vi--]   -=  (*v--)*s1;
48     }
49   }
50   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
51   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53   PetscFunctionReturn(0);
54 }
55 
56 #undef __FUNCT__
57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59 {
60   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61   PetscErrorCode ierr;
62   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63   PetscInt       *diag = a->diag,oidx;
64   MatScalar      *aa=a->a,*v;
65   PetscScalar    s1,s2,x1,x2;
66   PetscScalar    *x,*b;
67 
68   PetscFunctionBegin;
69   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
70   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
71   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72 
73   /* forward solve the U^T */
74   idx = 0;
75   for (i=0; i<n; i++) {
76 
77     v     = aa + 4*diag[i];
78     /* multiply by the inverse of the block diagonal */
79     x1 = x[idx];   x2 = x[1+idx];
80     s1 = v[0]*x1  +  v[1]*x2;
81     s2 = v[2]*x1  +  v[3]*x2;
82     v += 4;
83 
84     vi    = aj + diag[i] + 1;
85     nz    = ai[i+1] - diag[i] - 1;
86     while (nz--) {
87       oidx = 2*(*vi++);
88       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90       v  += 4;
91     }
92     x[idx]   = s1;x[1+idx] = s2;
93     idx += 2;
94   }
95   /* backward solve the L^T */
96   for (i=n-1; i>=0; i--){
97     v    = aa + 4*diag[i] - 4;
98     vi   = aj + diag[i] - 1;
99     nz   = diag[i] - ai[i];
100     idt  = 2*i;
101     s1   = x[idt];  s2 = x[1+idt];
102     while (nz--) {
103       idx   = 2*(*vi--);
104       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106       v -= 4;
107     }
108   }
109   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
110   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
112   PetscFunctionReturn(0);
113 }
114 
115 #undef __FUNCT__
116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118 {
119   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120   PetscErrorCode ierr;
121   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122   PetscInt       *diag = a->diag,oidx;
123   MatScalar      *aa=a->a,*v;
124   PetscScalar    s1,s2,s3,x1,x2,x3;
125   PetscScalar    *x,*b;
126 
127   PetscFunctionBegin;
128   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
129   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
130   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131 
132   /* forward solve the U^T */
133   idx = 0;
134   for (i=0; i<n; i++) {
135 
136     v     = aa + 9*diag[i];
137     /* multiply by the inverse of the block diagonal */
138     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142     v += 9;
143 
144     vi    = aj + diag[i] + 1;
145     nz    = ai[i+1] - diag[i] - 1;
146     while (nz--) {
147       oidx = 3*(*vi++);
148       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151       v  += 9;
152     }
153     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154     idx += 3;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + 9*diag[i] - 9;
159     vi   = aj + diag[i] - 1;
160     nz   = diag[i] - ai[i];
161     idt  = 3*i;
162     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163     while (nz--) {
164       idx   = 3*(*vi--);
165       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168       v -= 9;
169     }
170   }
171   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
172   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180 {
181   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182   PetscErrorCode ierr;
183   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184   PetscInt       *diag = a->diag,oidx;
185   MatScalar      *aa=a->a,*v;
186   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
187   PetscScalar    *x,*b;
188 
189   PetscFunctionBegin;
190   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
191   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
192   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193 
194   /* forward solve the U^T */
195   idx = 0;
196   for (i=0; i<n; i++) {
197 
198     v     = aa + 16*diag[i];
199     /* multiply by the inverse of the block diagonal */
200     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205     v += 16;
206 
207     vi    = aj + diag[i] + 1;
208     nz    = ai[i+1] - diag[i] - 1;
209     while (nz--) {
210       oidx = 4*(*vi++);
211       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215       v  += 16;
216     }
217     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218     idx += 4;
219   }
220   /* backward solve the L^T */
221   for (i=n-1; i>=0; i--){
222     v    = aa + 16*diag[i] - 16;
223     vi   = aj + diag[i] - 1;
224     nz   = diag[i] - ai[i];
225     idt  = 4*i;
226     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227     while (nz--) {
228       idx   = 4*(*vi--);
229       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233       v -= 16;
234     }
235   }
236   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
237   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 #undef __FUNCT__
243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245 {
246   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247   PetscErrorCode ierr;
248   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249   PetscInt       *diag = a->diag,oidx;
250   MatScalar      *aa=a->a,*v;
251   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252   PetscScalar    *x,*b;
253 
254   PetscFunctionBegin;
255   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258 
259   /* forward solve the U^T */
260   idx = 0;
261   for (i=0; i<n; i++) {
262 
263     v     = aa + 25*diag[i];
264     /* multiply by the inverse of the block diagonal */
265     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271     v += 25;
272 
273     vi    = aj + diag[i] + 1;
274     nz    = ai[i+1] - diag[i] - 1;
275     while (nz--) {
276       oidx = 5*(*vi++);
277       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282       v  += 25;
283     }
284     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285     idx += 5;
286   }
287   /* backward solve the L^T */
288   for (i=n-1; i>=0; i--){
289     v    = aa + 25*diag[i] - 25;
290     vi   = aj + diag[i] - 1;
291     nz   = diag[i] - ai[i];
292     idt  = 5*i;
293     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294     while (nz--) {
295       idx   = 5*(*vi--);
296       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301       v -= 25;
302     }
303   }
304   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
305   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
307   PetscFunctionReturn(0);
308 }
309 
310 #undef __FUNCT__
311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313 {
314   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315   PetscErrorCode ierr;
316   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317   PetscInt       *diag = a->diag,oidx;
318   MatScalar      *aa=a->a,*v;
319   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320   PetscScalar    *x,*b;
321 
322   PetscFunctionBegin;
323   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
324   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
325   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326 
327   /* forward solve the U^T */
328   idx = 0;
329   for (i=0; i<n; i++) {
330 
331     v     = aa + 36*diag[i];
332     /* multiply by the inverse of the block diagonal */
333     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334     x6    = x[5+idx];
335     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341     v += 36;
342 
343     vi    = aj + diag[i] + 1;
344     nz    = ai[i+1] - diag[i] - 1;
345     while (nz--) {
346       oidx = 6*(*vi++);
347       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353       v  += 36;
354     }
355     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356     x[5+idx] = s6;
357     idx += 6;
358   }
359   /* backward solve the L^T */
360   for (i=n-1; i>=0; i--){
361     v    = aa + 36*diag[i] - 36;
362     vi   = aj + diag[i] - 1;
363     nz   = diag[i] - ai[i];
364     idt  = 6*i;
365     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366     s6 = x[5+idt];
367     while (nz--) {
368       idx   = 6*(*vi--);
369       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375       v -= 36;
376     }
377   }
378   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
379   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
381   PetscFunctionReturn(0);
382 }
383 
384 #undef __FUNCT__
385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387 {
388   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389   PetscErrorCode ierr;
390   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391   PetscInt       *diag = a->diag,oidx;
392   MatScalar      *aa=a->a,*v;
393   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394   PetscScalar    *x,*b;
395 
396   PetscFunctionBegin;
397   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
398   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
399   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400 
401   /* forward solve the U^T */
402   idx = 0;
403   for (i=0; i<n; i++) {
404 
405     v     = aa + 49*diag[i];
406     /* multiply by the inverse of the block diagonal */
407     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408     x6    = x[5+idx]; x7 = x[6+idx];
409     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416     v += 49;
417 
418     vi    = aj + diag[i] + 1;
419     nz    = ai[i+1] - diag[i] - 1;
420     while (nz--) {
421       oidx = 7*(*vi++);
422       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429       v  += 49;
430     }
431     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432     x[5+idx] = s6;x[6+idx] = s7;
433     idx += 7;
434   }
435   /* backward solve the L^T */
436   for (i=n-1; i>=0; i--){
437     v    = aa + 49*diag[i] - 49;
438     vi   = aj + diag[i] - 1;
439     nz   = diag[i] - ai[i];
440     idt  = 7*i;
441     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442     s6 = x[5+idt];s7 = x[6+idt];
443     while (nz--) {
444       idx   = 7*(*vi--);
445       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452       v -= 49;
453     }
454   }
455   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
458   PetscFunctionReturn(0);
459 }
460 
461 /*---------------------------------------------------------------------------------------------*/
462 #undef __FUNCT__
463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465 {
466   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467   IS             iscol=a->col,isrow=a->row;
468   PetscErrorCode ierr;
469   const PetscInt *r,*c,*rout,*cout;
470   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
471   PetscInt       *diag = a->diag;
472   MatScalar      *aa=a->a,*v;
473   PetscScalar    s1,*x,*b,*t;
474 
475   PetscFunctionBegin;
476   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
477   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
478   t  = a->solve_work;
479 
480   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
481   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
482 
483   /* copy the b into temp work space according to permutation */
484   for (i=0; i<n; i++) {
485     t[i] = b[c[i]];
486   }
487 
488   /* forward solve the U^T */
489   for (i=0; i<n; i++) {
490 
491     v     = aa + diag[i];
492     /* multiply by the inverse of the block diagonal */
493     s1    = (*v++)*t[i];
494     vi    = aj + diag[i] + 1;
495     nz    = ai[i+1] - diag[i] - 1;
496     while (nz--) {
497       t[*vi++]  -= (*v++)*s1;
498     }
499     t[i]   = s1;
500   }
501   /* backward solve the L^T */
502   for (i=n-1; i>=0; i--){
503     v    = aa + diag[i] - 1;
504     vi   = aj + diag[i] - 1;
505     nz   = diag[i] - ai[i];
506     s1   = t[i];
507     while (nz--) {
508       t[*vi--]   -=  (*v--)*s1;
509     }
510   }
511 
512   /* copy t into x according to permutation */
513   for (i=0; i<n; i++) {
514     x[r[i]]   = t[i];
515   }
516 
517   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
518   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
519   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
520   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
522   PetscFunctionReturn(0);
523 }
524 
525 #undef __FUNCT__
526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
527 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
528 {
529   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
530   IS             iscol=a->col,isrow=a->row;
531   PetscErrorCode ierr;
532   const PetscInt *r,*c,*rout,*cout;
533   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
534   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
535   MatScalar      *aa=a->a,*v;
536   PetscScalar    s1,s2,x1,x2;
537   PetscScalar    *x,*b,*t;
538 
539   PetscFunctionBegin;
540   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
541   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
542   t  = a->solve_work;
543 
544   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
545   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
546 
547   /* copy the b into temp work space according to permutation */
548   ii = 0;
549   for (i=0; i<n; i++) {
550     ic      = 2*c[i];
551     t[ii]   = b[ic];
552     t[ii+1] = b[ic+1];
553     ii += 2;
554   }
555 
556   /* forward solve the U^T */
557   idx = 0;
558   for (i=0; i<n; i++) {
559 
560     v     = aa + 4*diag[i];
561     /* multiply by the inverse of the block diagonal */
562     x1    = t[idx];   x2 = t[1+idx];
563     s1 = v[0]*x1  +  v[1]*x2;
564     s2 = v[2]*x1  +  v[3]*x2;
565     v += 4;
566 
567     vi    = aj + diag[i] + 1;
568     nz    = ai[i+1] - diag[i] - 1;
569     while (nz--) {
570       oidx = 2*(*vi++);
571       t[oidx]   -= v[0]*s1  +  v[1]*s2;
572       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
573       v  += 4;
574     }
575     t[idx]   = s1;t[1+idx] = s2;
576     idx += 2;
577   }
578   /* backward solve the L^T */
579   for (i=n-1; i>=0; i--){
580     v    = aa + 4*diag[i] - 4;
581     vi   = aj + diag[i] - 1;
582     nz   = diag[i] - ai[i];
583     idt  = 2*i;
584     s1 = t[idt];  s2 = t[1+idt];
585     while (nz--) {
586       idx   = 2*(*vi--);
587       t[idx]   -=  v[0]*s1 +  v[1]*s2;
588       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
589       v -= 4;
590     }
591   }
592 
593   /* copy t into x according to permutation */
594   ii = 0;
595   for (i=0; i<n; i++) {
596     ir      = 2*r[i];
597     x[ir]   = t[ii];
598     x[ir+1] = t[ii+1];
599     ii += 2;
600   }
601 
602   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
603   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
604   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
605   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
606   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
607   PetscFunctionReturn(0);
608 }
609 
610 #undef __FUNCT__
611 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
612 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
613 {
614   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
615   IS             iscol=a->col,isrow=a->row;
616   PetscErrorCode ierr;
617   const PetscInt *r,*c,*rout,*cout;
618   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
619   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
620   MatScalar      *aa=a->a,*v;
621   PetscScalar    s1,s2,s3,x1,x2,x3;
622   PetscScalar    *x,*b,*t;
623 
624   PetscFunctionBegin;
625   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
626   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
627   t  = a->solve_work;
628 
629   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
630   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
631 
632   /* copy the b into temp work space according to permutation */
633   ii = 0;
634   for (i=0; i<n; i++) {
635     ic      = 3*c[i];
636     t[ii]   = b[ic];
637     t[ii+1] = b[ic+1];
638     t[ii+2] = b[ic+2];
639     ii += 3;
640   }
641 
642   /* forward solve the U^T */
643   idx = 0;
644   for (i=0; i<n; i++) {
645 
646     v     = aa + 9*diag[i];
647     /* multiply by the inverse of the block diagonal */
648     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
649     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
650     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
651     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
652     v += 9;
653 
654     vi    = aj + diag[i] + 1;
655     nz    = ai[i+1] - diag[i] - 1;
656     while (nz--) {
657       oidx = 3*(*vi++);
658       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
659       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
660       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
661       v  += 9;
662     }
663     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
664     idx += 3;
665   }
666   /* backward solve the L^T */
667   for (i=n-1; i>=0; i--){
668     v    = aa + 9*diag[i] - 9;
669     vi   = aj + diag[i] - 1;
670     nz   = diag[i] - ai[i];
671     idt  = 3*i;
672     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
673     while (nz--) {
674       idx   = 3*(*vi--);
675       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
676       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
677       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
678       v -= 9;
679     }
680   }
681 
682   /* copy t into x according to permutation */
683   ii = 0;
684   for (i=0; i<n; i++) {
685     ir      = 3*r[i];
686     x[ir]   = t[ii];
687     x[ir+1] = t[ii+1];
688     x[ir+2] = t[ii+2];
689     ii += 3;
690   }
691 
692   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
693   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
694   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
695   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
696   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
697   PetscFunctionReturn(0);
698 }
699 
700 #undef __FUNCT__
701 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
702 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
703 {
704   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
705   IS             iscol=a->col,isrow=a->row;
706   PetscErrorCode ierr;
707   const PetscInt *r,*c,*rout,*cout;
708   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
709   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
710   MatScalar      *aa=a->a,*v;
711   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
712   PetscScalar    *x,*b,*t;
713 
714   PetscFunctionBegin;
715   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
716   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
717   t  = a->solve_work;
718 
719   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
720   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
721 
722   /* copy the b into temp work space according to permutation */
723   ii = 0;
724   for (i=0; i<n; i++) {
725     ic      = 4*c[i];
726     t[ii]   = b[ic];
727     t[ii+1] = b[ic+1];
728     t[ii+2] = b[ic+2];
729     t[ii+3] = b[ic+3];
730     ii += 4;
731   }
732 
733   /* forward solve the U^T */
734   idx = 0;
735   for (i=0; i<n; i++) {
736 
737     v     = aa + 16*diag[i];
738     /* multiply by the inverse of the block diagonal */
739     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
741     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
742     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
743     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
744     v += 16;
745 
746     vi    = aj + diag[i] + 1;
747     nz    = ai[i+1] - diag[i] - 1;
748     while (nz--) {
749       oidx = 4*(*vi++);
750       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
751       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
752       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
753       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
754       v  += 16;
755     }
756     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
757     idx += 4;
758   }
759   /* backward solve the L^T */
760   for (i=n-1; i>=0; i--){
761     v    = aa + 16*diag[i] - 16;
762     vi   = aj + diag[i] - 1;
763     nz   = diag[i] - ai[i];
764     idt  = 4*i;
765     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
766     while (nz--) {
767       idx   = 4*(*vi--);
768       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
769       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
770       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
771       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
772       v -= 16;
773     }
774   }
775 
776   /* copy t into x according to permutation */
777   ii = 0;
778   for (i=0; i<n; i++) {
779     ir      = 4*r[i];
780     x[ir]   = t[ii];
781     x[ir+1] = t[ii+1];
782     x[ir+2] = t[ii+2];
783     x[ir+3] = t[ii+3];
784     ii += 4;
785   }
786 
787   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
788   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
789   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
790   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
791   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
792   PetscFunctionReturn(0);
793 }
794 
795 #undef __FUNCT__
796 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
797 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
798 {
799   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
800   IS             iscol=a->col,isrow=a->row;
801   PetscErrorCode ierr;
802   const PetscInt *r,*c,*rout,*cout;
803   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
804   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
805   MatScalar      *aa=a->a,*v;
806   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
807   PetscScalar    *x,*b,*t;
808 
809   PetscFunctionBegin;
810   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
811   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
812   t  = a->solve_work;
813 
814   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
815   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
816 
817   /* copy the b into temp work space according to permutation */
818   ii = 0;
819   for (i=0; i<n; i++) {
820     ic      = 5*c[i];
821     t[ii]   = b[ic];
822     t[ii+1] = b[ic+1];
823     t[ii+2] = b[ic+2];
824     t[ii+3] = b[ic+3];
825     t[ii+4] = b[ic+4];
826     ii += 5;
827   }
828 
829   /* forward solve the U^T */
830   idx = 0;
831   for (i=0; i<n; i++) {
832 
833     v     = aa + 25*diag[i];
834     /* multiply by the inverse of the block diagonal */
835     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
836     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
837     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
838     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
839     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
840     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
841     v += 25;
842 
843     vi    = aj + diag[i] + 1;
844     nz    = ai[i+1] - diag[i] - 1;
845     while (nz--) {
846       oidx = 5*(*vi++);
847       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
848       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
849       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
850       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
851       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
852       v  += 25;
853     }
854     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
855     idx += 5;
856   }
857   /* backward solve the L^T */
858   for (i=n-1; i>=0; i--){
859     v    = aa + 25*diag[i] - 25;
860     vi   = aj + diag[i] - 1;
861     nz   = diag[i] - ai[i];
862     idt  = 5*i;
863     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
864     while (nz--) {
865       idx   = 5*(*vi--);
866       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
867       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
868       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
869       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
870       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
871       v -= 25;
872     }
873   }
874 
875   /* copy t into x according to permutation */
876   ii = 0;
877   for (i=0; i<n; i++) {
878     ir      = 5*r[i];
879     x[ir]   = t[ii];
880     x[ir+1] = t[ii+1];
881     x[ir+2] = t[ii+2];
882     x[ir+3] = t[ii+3];
883     x[ir+4] = t[ii+4];
884     ii += 5;
885   }
886 
887   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
888   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
889   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
890   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
891   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
892   PetscFunctionReturn(0);
893 }
894 
895 #undef __FUNCT__
896 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
897 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
898 {
899   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
900   IS             iscol=a->col,isrow=a->row;
901   PetscErrorCode ierr;
902   const PetscInt *r,*c,*rout,*cout;
903   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
904   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
905   MatScalar      *aa=a->a,*v;
906   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
907   PetscScalar    *x,*b,*t;
908 
909   PetscFunctionBegin;
910   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
911   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
912   t  = a->solve_work;
913 
914   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
915   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
916 
917   /* copy the b into temp work space according to permutation */
918   ii = 0;
919   for (i=0; i<n; i++) {
920     ic      = 6*c[i];
921     t[ii]   = b[ic];
922     t[ii+1] = b[ic+1];
923     t[ii+2] = b[ic+2];
924     t[ii+3] = b[ic+3];
925     t[ii+4] = b[ic+4];
926     t[ii+5] = b[ic+5];
927     ii += 6;
928   }
929 
930   /* forward solve the U^T */
931   idx = 0;
932   for (i=0; i<n; i++) {
933 
934     v     = aa + 36*diag[i];
935     /* multiply by the inverse of the block diagonal */
936     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
937     x6    = t[5+idx];
938     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
939     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
940     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
941     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
942     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
943     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
944     v += 36;
945 
946     vi    = aj + diag[i] + 1;
947     nz    = ai[i+1] - diag[i] - 1;
948     while (nz--) {
949       oidx = 6*(*vi++);
950       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
951       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
952       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
953       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
954       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
955       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
956       v  += 36;
957     }
958     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
959     t[5+idx] = s6;
960     idx += 6;
961   }
962   /* backward solve the L^T */
963   for (i=n-1; i>=0; i--){
964     v    = aa + 36*diag[i] - 36;
965     vi   = aj + diag[i] - 1;
966     nz   = diag[i] - ai[i];
967     idt  = 6*i;
968     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
969     s6 = t[5+idt];
970     while (nz--) {
971       idx   = 6*(*vi--);
972       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
973       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
974       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
975       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
976       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
977       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
978       v -= 36;
979     }
980   }
981 
982   /* copy t into x according to permutation */
983   ii = 0;
984   for (i=0; i<n; i++) {
985     ir      = 6*r[i];
986     x[ir]   = t[ii];
987     x[ir+1] = t[ii+1];
988     x[ir+2] = t[ii+2];
989     x[ir+3] = t[ii+3];
990     x[ir+4] = t[ii+4];
991     x[ir+5] = t[ii+5];
992     ii += 6;
993   }
994 
995   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
996   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
997   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
998   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
999   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
1000   PetscFunctionReturn(0);
1001 }
1002 
1003 #undef __FUNCT__
1004 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1005 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1006 {
1007   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1008   IS             iscol=a->col,isrow=a->row;
1009   PetscErrorCode ierr;
1010   const PetscInt *r,*c,*rout,*cout;
1011   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1012   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1013   MatScalar      *aa=a->a,*v;
1014   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1015   PetscScalar    *x,*b,*t;
1016 
1017   PetscFunctionBegin;
1018   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1019   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1020   t  = a->solve_work;
1021 
1022   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1023   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1024 
1025   /* copy the b into temp work space according to permutation */
1026   ii = 0;
1027   for (i=0; i<n; i++) {
1028     ic      = 7*c[i];
1029     t[ii]   = b[ic];
1030     t[ii+1] = b[ic+1];
1031     t[ii+2] = b[ic+2];
1032     t[ii+3] = b[ic+3];
1033     t[ii+4] = b[ic+4];
1034     t[ii+5] = b[ic+5];
1035     t[ii+6] = b[ic+6];
1036     ii += 7;
1037   }
1038 
1039   /* forward solve the U^T */
1040   idx = 0;
1041   for (i=0; i<n; i++) {
1042 
1043     v     = aa + 49*diag[i];
1044     /* multiply by the inverse of the block diagonal */
1045     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1046     x6    = t[5+idx]; x7 = t[6+idx];
1047     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1048     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1049     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1050     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1051     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1052     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1053     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1054     v += 49;
1055 
1056     vi    = aj + diag[i] + 1;
1057     nz    = ai[i+1] - diag[i] - 1;
1058     while (nz--) {
1059       oidx = 7*(*vi++);
1060       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1061       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1062       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1063       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1064       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1065       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1066       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1067       v  += 49;
1068     }
1069     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1070     t[5+idx] = s6;t[6+idx] = s7;
1071     idx += 7;
1072   }
1073   /* backward solve the L^T */
1074   for (i=n-1; i>=0; i--){
1075     v    = aa + 49*diag[i] - 49;
1076     vi   = aj + diag[i] - 1;
1077     nz   = diag[i] - ai[i];
1078     idt  = 7*i;
1079     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1080     s6 = t[5+idt];s7 = t[6+idt];
1081     while (nz--) {
1082       idx   = 7*(*vi--);
1083       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1084       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1085       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1086       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1087       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1088       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1089       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1090       v -= 49;
1091     }
1092   }
1093 
1094   /* copy t into x according to permutation */
1095   ii = 0;
1096   for (i=0; i<n; i++) {
1097     ir      = 7*r[i];
1098     x[ir]   = t[ii];
1099     x[ir+1] = t[ii+1];
1100     x[ir+2] = t[ii+2];
1101     x[ir+3] = t[ii+3];
1102     x[ir+4] = t[ii+4];
1103     x[ir+5] = t[ii+5];
1104     x[ir+6] = t[ii+6];
1105     ii += 7;
1106   }
1107 
1108   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1109   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1110   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1111   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1112   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
1113   PetscFunctionReturn(0);
1114 }
1115 
1116 /* ----------------------------------------------------------- */
1117 #undef __FUNCT__
1118 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1119 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1120 {
1121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1122   IS             iscol=a->col,isrow=a->row;
1123   PetscErrorCode ierr;
1124   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1125   PetscInt       i,n=a->mbs;
1126   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1127   MatScalar      *aa=a->a,*v;
1128   PetscScalar    *x,*b,*s,*t,*ls;
1129 
1130   PetscFunctionBegin;
1131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133   t  = a->solve_work;
1134 
1135   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1136   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1137 
1138   /* forward solve the lower triangular */
1139   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1140   for (i=1; i<n; i++) {
1141     v   = aa + bs2*ai[i];
1142     vi  = aj + ai[i];
1143     nz  = a->diag[i] - ai[i];
1144     s = t + bs*i;
1145     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1146     while (nz--) {
1147       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1148       v += bs2;
1149     }
1150   }
1151   /* backward solve the upper triangular */
1152   ls = a->solve_work + A->cmap->n;
1153   for (i=n-1; i>=0; i--){
1154     v   = aa + bs2*(a->diag[i] + 1);
1155     vi  = aj + a->diag[i] + 1;
1156     nz  = ai[i+1] - a->diag[i] - 1;
1157     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1158     while (nz--) {
1159       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1160       v += bs2;
1161     }
1162     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1163     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1164   }
1165 
1166   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1167   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1168   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170   ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1176 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1177 {
1178   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1179   IS             iscol=a->col,isrow=a->row;
1180   PetscErrorCode ierr;
1181   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1182   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1183   MatScalar      *aa=a->a,*v;
1184   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1185   PetscScalar    *x,*b,*t;
1186 
1187   PetscFunctionBegin;
1188   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1189   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1190   t  = a->solve_work;
1191 
1192   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1193   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1194 
1195   /* forward solve the lower triangular */
1196   idx    = 7*(*r++);
1197   t[0] = b[idx];   t[1] = b[1+idx];
1198   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1199   t[5] = b[5+idx]; t[6] = b[6+idx];
1200 
1201   for (i=1; i<n; i++) {
1202     v     = aa + 49*ai[i];
1203     vi    = aj + ai[i];
1204     nz    = diag[i] - ai[i];
1205     idx   = 7*(*r++);
1206     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1207     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1208     while (nz--) {
1209       idx   = 7*(*vi++);
1210       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1211       x4    = t[3+idx];x5 = t[4+idx];
1212       x6    = t[5+idx];x7 = t[6+idx];
1213       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1214       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1215       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1216       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1217       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1218       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1219       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1220       v += 49;
1221     }
1222     idx = 7*i;
1223     t[idx]   = s1;t[1+idx] = s2;
1224     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1225     t[5+idx] = s6;t[6+idx] = s7;
1226   }
1227   /* backward solve the upper triangular */
1228   for (i=n-1; i>=0; i--){
1229     v    = aa + 49*diag[i] + 49;
1230     vi   = aj + diag[i] + 1;
1231     nz   = ai[i+1] - diag[i] - 1;
1232     idt  = 7*i;
1233     s1 = t[idt];  s2 = t[1+idt];
1234     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1235     s6 = t[5+idt];s7 = t[6+idt];
1236     while (nz--) {
1237       idx   = 7*(*vi++);
1238       x1    = t[idx];   x2 = t[1+idx];
1239       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1240       x6    = t[5+idx]; x7 = t[6+idx];
1241       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1242       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1243       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1244       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1245       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1246       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1247       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1248       v += 49;
1249     }
1250     idc = 7*(*c--);
1251     v   = aa + 49*diag[i];
1252     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1253                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1254     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1255                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1256     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1257                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1258     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1259                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1260     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1261                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1262     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1263                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1264     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1265                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1266   }
1267 
1268   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1269   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1270   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1271   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1272   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
1273   PetscFunctionReturn(0);
1274 }
1275 
1276 #undef __FUNCT__
1277 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1278 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1279 {
1280   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1281   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1282   PetscErrorCode    ierr;
1283   PetscInt          *diag = a->diag,jdx;
1284   const MatScalar   *aa=a->a,*v;
1285   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1286   const PetscScalar *b;
1287 
1288   PetscFunctionBegin;
1289   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1290   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1291   /* forward solve the lower triangular */
1292   idx    = 0;
1293   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1294   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1295   x[6] = b[6+idx];
1296   for (i=1; i<n; i++) {
1297     v     =  aa + 49*ai[i];
1298     vi    =  aj + ai[i];
1299     nz    =  diag[i] - ai[i];
1300     idx   =  7*i;
1301     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1302     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1303     s7  =  b[6+idx];
1304     while (nz--) {
1305       jdx   = 7*(*vi++);
1306       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1307       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1308       x7    = x[6+jdx];
1309       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1310       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1311       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1312       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1313       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1314       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1315       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1316       v += 49;
1317      }
1318     x[idx]   = s1;
1319     x[1+idx] = s2;
1320     x[2+idx] = s3;
1321     x[3+idx] = s4;
1322     x[4+idx] = s5;
1323     x[5+idx] = s6;
1324     x[6+idx] = s7;
1325   }
1326   /* backward solve the upper triangular */
1327   for (i=n-1; i>=0; i--){
1328     v    = aa + 49*diag[i] + 49;
1329     vi   = aj + diag[i] + 1;
1330     nz   = ai[i+1] - diag[i] - 1;
1331     idt  = 7*i;
1332     s1 = x[idt];   s2 = x[1+idt];
1333     s3 = x[2+idt]; s4 = x[3+idt];
1334     s5 = x[4+idt]; s6 = x[5+idt];
1335     s7 = x[6+idt];
1336     while (nz--) {
1337       idx   = 7*(*vi++);
1338       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1339       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1340       x7    = x[6+idx];
1341       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1342       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1343       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1344       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1345       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1346       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1347       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1348       v += 49;
1349     }
1350     v        = aa + 49*diag[i];
1351     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1352                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1353     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1354                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1355     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1356                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1357     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1358                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1359     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1360                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1361     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1362                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1363     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1364                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1365   }
1366 
1367   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1369   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 #undef __FUNCT__
1374 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1375 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1376 {
1377   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1378   IS                iscol=a->col,isrow=a->row;
1379   PetscErrorCode    ierr;
1380   const PetscInt    *r,*c,*rout,*cout;
1381   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1382   const MatScalar   *aa=a->a,*v;
1383   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1384   const PetscScalar *b;
1385   PetscFunctionBegin;
1386   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1387   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1388   t  = a->solve_work;
1389 
1390   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1391   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1392 
1393   /* forward solve the lower triangular */
1394   idx    = 6*(*r++);
1395   t[0] = b[idx];   t[1] = b[1+idx];
1396   t[2] = b[2+idx]; t[3] = b[3+idx];
1397   t[4] = b[4+idx]; t[5] = b[5+idx];
1398   for (i=1; i<n; i++) {
1399     v     = aa + 36*ai[i];
1400     vi    = aj + ai[i];
1401     nz    = diag[i] - ai[i];
1402     idx   = 6*(*r++);
1403     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1404     s5  = b[4+idx]; s6 = b[5+idx];
1405     while (nz--) {
1406       idx   = 6*(*vi++);
1407       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1408       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1409       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415       v += 36;
1416     }
1417     idx = 6*i;
1418     t[idx]   = s1;t[1+idx] = s2;
1419     t[2+idx] = s3;t[3+idx] = s4;
1420     t[4+idx] = s5;t[5+idx] = s6;
1421   }
1422   /* backward solve the upper triangular */
1423   for (i=n-1; i>=0; i--){
1424     v    = aa + 36*diag[i] + 36;
1425     vi   = aj + diag[i] + 1;
1426     nz   = ai[i+1] - diag[i] - 1;
1427     idt  = 6*i;
1428     s1 = t[idt];  s2 = t[1+idt];
1429     s3 = t[2+idt];s4 = t[3+idt];
1430     s5 = t[4+idt];s6 = t[5+idt];
1431     while (nz--) {
1432       idx   = 6*(*vi++);
1433       x1    = t[idx];   x2 = t[1+idx];
1434       x3    = t[2+idx]; x4 = t[3+idx];
1435       x5    = t[4+idx]; x6 = t[5+idx];
1436       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1437       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1438       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1439       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1440       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1441       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1442       v += 36;
1443     }
1444     idc = 6*(*c--);
1445     v   = aa + 36*diag[i];
1446     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1447                                  v[18]*s4+v[24]*s5+v[30]*s6;
1448     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1449                                  v[19]*s4+v[25]*s5+v[31]*s6;
1450     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1451                                  v[20]*s4+v[26]*s5+v[32]*s6;
1452     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1453                                  v[21]*s4+v[27]*s5+v[33]*s6;
1454     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1455                                  v[22]*s4+v[28]*s5+v[34]*s6;
1456     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1457                                  v[23]*s4+v[29]*s5+v[35]*s6;
1458   }
1459 
1460   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1461   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1462   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1463   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1464   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
1465   PetscFunctionReturn(0);
1466 }
1467 
1468 #undef __FUNCT__
1469 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1470 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1471 {
1472   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1473   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1474   PetscErrorCode    ierr;
1475   PetscInt          *diag = a->diag,jdx;
1476   const MatScalar   *aa=a->a,*v;
1477   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1478   const PetscScalar *b;
1479 
1480   PetscFunctionBegin;
1481   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1482   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1483   /* forward solve the lower triangular */
1484   idx    = 0;
1485   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1486   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1487   for (i=1; i<n; i++) {
1488     v     =  aa + 36*ai[i];
1489     vi    =  aj + ai[i];
1490     nz    =  diag[i] - ai[i];
1491     idx   =  6*i;
1492     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1493     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1494     while (nz--) {
1495       jdx   = 6*(*vi++);
1496       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1497       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1498       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1499       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1500       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1501       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1502       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1503       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1504       v += 36;
1505      }
1506     x[idx]   = s1;
1507     x[1+idx] = s2;
1508     x[2+idx] = s3;
1509     x[3+idx] = s4;
1510     x[4+idx] = s5;
1511     x[5+idx] = s6;
1512   }
1513   /* backward solve the upper triangular */
1514   for (i=n-1; i>=0; i--){
1515     v    = aa + 36*diag[i] + 36;
1516     vi   = aj + diag[i] + 1;
1517     nz   = ai[i+1] - diag[i] - 1;
1518     idt  = 6*i;
1519     s1 = x[idt];   s2 = x[1+idt];
1520     s3 = x[2+idt]; s4 = x[3+idt];
1521     s5 = x[4+idt]; s6 = x[5+idt];
1522     while (nz--) {
1523       idx   = 6*(*vi++);
1524       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1525       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1526       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1527       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1528       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1529       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1530       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1531       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1532       v += 36;
1533     }
1534     v        = aa + 36*diag[i];
1535     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1536     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1537     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1538     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1539     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1540     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1541   }
1542 
1543   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1544   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1545   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
1546   PetscFunctionReturn(0);
1547 }
1548 
1549 #undef __FUNCT__
1550 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1551 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1552 {
1553   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1554   IS                iscol=a->col,isrow=a->row;
1555   PetscErrorCode    ierr;
1556   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
1557   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1558   const MatScalar   *aa=a->a,*v;
1559   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1560   const PetscScalar *b;
1561 
1562   PetscFunctionBegin;
1563   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1564   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1565   t  = a->solve_work;
1566 
1567   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1568   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1569 
1570   /* forward solve the lower triangular */
1571   idx    = 5*(*r++);
1572   t[0] = b[idx];   t[1] = b[1+idx];
1573   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1574   for (i=1; i<n; i++) {
1575     v     = aa + 25*ai[i];
1576     vi    = aj + ai[i];
1577     nz    = diag[i] - ai[i];
1578     idx   = 5*(*r++);
1579     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1580     s5  = b[4+idx];
1581     while (nz--) {
1582       idx   = 5*(*vi++);
1583       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1584       x4    = t[3+idx];x5 = t[4+idx];
1585       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1586       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1587       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1588       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1589       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1590       v += 25;
1591     }
1592     idx = 5*i;
1593     t[idx]   = s1;t[1+idx] = s2;
1594     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1595   }
1596   /* backward solve the upper triangular */
1597   for (i=n-1; i>=0; i--){
1598     v    = aa + 25*diag[i] + 25;
1599     vi   = aj + diag[i] + 1;
1600     nz   = ai[i+1] - diag[i] - 1;
1601     idt  = 5*i;
1602     s1 = t[idt];  s2 = t[1+idt];
1603     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1604     while (nz--) {
1605       idx   = 5*(*vi++);
1606       x1    = t[idx];   x2 = t[1+idx];
1607       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1608       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1609       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1610       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1611       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1612       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1613       v += 25;
1614     }
1615     idc = 5*(*c--);
1616     v   = aa + 25*diag[i];
1617     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1618                                  v[15]*s4+v[20]*s5;
1619     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1620                                  v[16]*s4+v[21]*s5;
1621     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1622                                  v[17]*s4+v[22]*s5;
1623     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1624                                  v[18]*s4+v[23]*s5;
1625     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1626                                  v[19]*s4+v[24]*s5;
1627   }
1628 
1629   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1630   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1631   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1633   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 #undef __FUNCT__
1638 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1639 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1640 {
1641   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1642   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1643   PetscErrorCode    ierr;
1644   PetscInt          *diag = a->diag,jdx;
1645   const MatScalar   *aa=a->a,*v;
1646   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1647   const PetscScalar *b;
1648 
1649   PetscFunctionBegin;
1650   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1651   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1652   /* forward solve the lower triangular */
1653   idx    = 0;
1654   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1655   for (i=1; i<n; i++) {
1656     v     =  aa + 25*ai[i];
1657     vi    =  aj + ai[i];
1658     nz    =  diag[i] - ai[i];
1659     idx   =  5*i;
1660     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1661     while (nz--) {
1662       jdx   = 5*(*vi++);
1663       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1664       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1665       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1666       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1667       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1668       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1669       v    += 25;
1670     }
1671     x[idx]   = s1;
1672     x[1+idx] = s2;
1673     x[2+idx] = s3;
1674     x[3+idx] = s4;
1675     x[4+idx] = s5;
1676   }
1677   /* backward solve the upper triangular */
1678   for (i=n-1; i>=0; i--){
1679     v    = aa + 25*diag[i] + 25;
1680     vi   = aj + diag[i] + 1;
1681     nz   = ai[i+1] - diag[i] - 1;
1682     idt  = 5*i;
1683     s1 = x[idt];  s2 = x[1+idt];
1684     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1685     while (nz--) {
1686       idx   = 5*(*vi++);
1687       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1688       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1689       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1690       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1691       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1692       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1693       v    += 25;
1694     }
1695     v        = aa + 25*diag[i];
1696     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1697     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1698     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1699     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1700     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1701   }
1702 
1703   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1704   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1705   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
1706   PetscFunctionReturn(0);
1707 }
1708 
1709 #undef __FUNCT__
1710 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1711 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1712 {
1713   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1714   IS                iscol=a->col,isrow=a->row;
1715   PetscErrorCode    ierr;
1716   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1717   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1718   const MatScalar   *aa=a->a,*v;
1719   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1720   const PetscScalar *b;
1721 
1722   PetscFunctionBegin;
1723   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1724   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1725   t  = a->solve_work;
1726 
1727   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1728   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1729 
1730   /* forward solve the lower triangular */
1731   idx    = 4*(*r++);
1732   t[0] = b[idx];   t[1] = b[1+idx];
1733   t[2] = b[2+idx]; t[3] = b[3+idx];
1734   for (i=1; i<n; i++) {
1735     v     = aa + 16*ai[i];
1736     vi    = aj + ai[i];
1737     nz    = diag[i] - ai[i];
1738     idx   = 4*(*r++);
1739     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1740     while (nz--) {
1741       idx   = 4*(*vi++);
1742       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1743       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1744       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1745       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1746       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1747       v    += 16;
1748     }
1749     idx        = 4*i;
1750     t[idx]   = s1;t[1+idx] = s2;
1751     t[2+idx] = s3;t[3+idx] = s4;
1752   }
1753   /* backward solve the upper triangular */
1754   for (i=n-1; i>=0; i--){
1755     v    = aa + 16*diag[i] + 16;
1756     vi   = aj + diag[i] + 1;
1757     nz   = ai[i+1] - diag[i] - 1;
1758     idt  = 4*i;
1759     s1 = t[idt];  s2 = t[1+idt];
1760     s3 = t[2+idt];s4 = t[3+idt];
1761     while (nz--) {
1762       idx   = 4*(*vi++);
1763       x1    = t[idx];   x2 = t[1+idx];
1764       x3    = t[2+idx]; x4 = t[3+idx];
1765       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1766       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1767       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1768       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1769       v += 16;
1770     }
1771     idc      = 4*(*c--);
1772     v        = aa + 16*diag[i];
1773     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1774     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1775     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1776     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1777   }
1778 
1779   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1780   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1781   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1782   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1783   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
1784   PetscFunctionReturn(0);
1785 }
1786 
1787 #undef __FUNCT__
1788 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1789 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1790 {
1791   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1792   IS                iscol=a->col,isrow=a->row;
1793   PetscErrorCode    ierr;
1794   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1795   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1796   const MatScalar   *aa=a->a,*v;
1797   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1798   PetscScalar       *x;
1799   const PetscScalar *b;
1800 
1801   PetscFunctionBegin;
1802   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1803   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1804   t  = (MatScalar *)a->solve_work;
1805 
1806   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1807   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1808 
1809   /* forward solve the lower triangular */
1810   idx    = 4*(*r++);
1811   t[0] = (MatScalar)b[idx];
1812   t[1] = (MatScalar)b[1+idx];
1813   t[2] = (MatScalar)b[2+idx];
1814   t[3] = (MatScalar)b[3+idx];
1815   for (i=1; i<n; i++) {
1816     v     = aa + 16*ai[i];
1817     vi    = aj + ai[i];
1818     nz    = diag[i] - ai[i];
1819     idx   = 4*(*r++);
1820     s1 = (MatScalar)b[idx];
1821     s2 = (MatScalar)b[1+idx];
1822     s3 = (MatScalar)b[2+idx];
1823     s4 = (MatScalar)b[3+idx];
1824     while (nz--) {
1825       idx   = 4*(*vi++);
1826       x1  = t[idx];
1827       x2  = t[1+idx];
1828       x3  = t[2+idx];
1829       x4  = t[3+idx];
1830       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1831       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1832       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1833       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1834       v    += 16;
1835     }
1836     idx        = 4*i;
1837     t[idx]   = s1;
1838     t[1+idx] = s2;
1839     t[2+idx] = s3;
1840     t[3+idx] = s4;
1841   }
1842   /* backward solve the upper triangular */
1843   for (i=n-1; i>=0; i--){
1844     v    = aa + 16*diag[i] + 16;
1845     vi   = aj + diag[i] + 1;
1846     nz   = ai[i+1] - diag[i] - 1;
1847     idt  = 4*i;
1848     s1 = t[idt];
1849     s2 = t[1+idt];
1850     s3 = t[2+idt];
1851     s4 = t[3+idt];
1852     while (nz--) {
1853       idx   = 4*(*vi++);
1854       x1  = t[idx];
1855       x2  = t[1+idx];
1856       x3  = t[2+idx];
1857       x4  = t[3+idx];
1858       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1859       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1860       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1861       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1862       v += 16;
1863     }
1864     idc      = 4*(*c--);
1865     v        = aa + 16*diag[i];
1866     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1867     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1868     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1869     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1870     x[idc]   = (PetscScalar)t[idt];
1871     x[1+idc] = (PetscScalar)t[1+idt];
1872     x[2+idc] = (PetscScalar)t[2+idt];
1873     x[3+idc] = (PetscScalar)t[3+idt];
1874  }
1875 
1876   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1877   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1878   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1879   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1880   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
1881   PetscFunctionReturn(0);
1882 }
1883 
1884 #if defined (PETSC_HAVE_SSE)
1885 
1886 #include PETSC_HAVE_SSE
1887 
1888 #undef __FUNCT__
1889 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1890 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1891 {
1892   /*
1893      Note: This code uses demotion of double
1894      to float when performing the mixed-mode computation.
1895      This may not be numerically reasonable for all applications.
1896   */
1897   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1898   IS             iscol=a->col,isrow=a->row;
1899   PetscErrorCode ierr;
1900   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
1901   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1902   MatScalar      *aa=a->a,*v;
1903   PetscScalar    *x,*b,*t;
1904 
1905   /* Make space in temp stack for 16 Byte Aligned arrays */
1906   float           ssealignedspace[11],*tmps,*tmpx;
1907   unsigned long   offset;
1908 
1909   PetscFunctionBegin;
1910   SSE_SCOPE_BEGIN;
1911 
1912     offset = (unsigned long)ssealignedspace % 16;
1913     if (offset) offset = (16 - offset)/4;
1914     tmps = &ssealignedspace[offset];
1915     tmpx = &ssealignedspace[offset+4];
1916     PREFETCH_NTA(aa+16*ai[1]);
1917 
1918     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1919     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1920     t  = a->solve_work;
1921 
1922     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1923     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1924 
1925     /* forward solve the lower triangular */
1926     idx  = 4*(*r++);
1927     t[0] = b[idx];   t[1] = b[1+idx];
1928     t[2] = b[2+idx]; t[3] = b[3+idx];
1929     v    =  aa + 16*ai[1];
1930 
1931     for (i=1; i<n;) {
1932       PREFETCH_NTA(&v[8]);
1933       vi   =  aj      + ai[i];
1934       nz   =  diag[i] - ai[i];
1935       idx  =  4*(*r++);
1936 
1937       /* Demote sum from double to float */
1938       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1939       LOAD_PS(tmps,XMM7);
1940 
1941       while (nz--) {
1942         PREFETCH_NTA(&v[16]);
1943         idx = 4*(*vi++);
1944 
1945         /* Demote solution (so far) from double to float */
1946         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1947 
1948         /* 4x4 Matrix-Vector product with negative accumulation: */
1949         SSE_INLINE_BEGIN_2(tmpx,v)
1950           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1951 
1952           /* First Column */
1953           SSE_COPY_PS(XMM0,XMM6)
1954           SSE_SHUFFLE(XMM0,XMM0,0x00)
1955           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1956           SSE_SUB_PS(XMM7,XMM0)
1957 
1958           /* Second Column */
1959           SSE_COPY_PS(XMM1,XMM6)
1960           SSE_SHUFFLE(XMM1,XMM1,0x55)
1961           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1962           SSE_SUB_PS(XMM7,XMM1)
1963 
1964           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1965 
1966           /* Third Column */
1967           SSE_COPY_PS(XMM2,XMM6)
1968           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1969           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1970           SSE_SUB_PS(XMM7,XMM2)
1971 
1972           /* Fourth Column */
1973           SSE_COPY_PS(XMM3,XMM6)
1974           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1975           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1976           SSE_SUB_PS(XMM7,XMM3)
1977         SSE_INLINE_END_2
1978 
1979         v  += 16;
1980       }
1981       idx = 4*i;
1982       v   = aa + 16*ai[++i];
1983       PREFETCH_NTA(v);
1984       STORE_PS(tmps,XMM7);
1985 
1986       /* Promote result from float to double */
1987       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1988     }
1989     /* backward solve the upper triangular */
1990     idt  = 4*(n-1);
1991     ai16 = 16*diag[n-1];
1992     v    = aa + ai16 + 16;
1993     for (i=n-1; i>=0;){
1994       PREFETCH_NTA(&v[8]);
1995       vi = aj + diag[i] + 1;
1996       nz = ai[i+1] - diag[i] - 1;
1997 
1998       /* Demote accumulator from double to float */
1999       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2000       LOAD_PS(tmps,XMM7);
2001 
2002       while (nz--) {
2003         PREFETCH_NTA(&v[16]);
2004         idx = 4*(*vi++);
2005 
2006         /* Demote solution (so far) from double to float */
2007         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2008 
2009         /* 4x4 Matrix-Vector Product with negative accumulation: */
2010         SSE_INLINE_BEGIN_2(tmpx,v)
2011           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2012 
2013           /* First Column */
2014           SSE_COPY_PS(XMM0,XMM6)
2015           SSE_SHUFFLE(XMM0,XMM0,0x00)
2016           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2017           SSE_SUB_PS(XMM7,XMM0)
2018 
2019           /* Second Column */
2020           SSE_COPY_PS(XMM1,XMM6)
2021           SSE_SHUFFLE(XMM1,XMM1,0x55)
2022           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2023           SSE_SUB_PS(XMM7,XMM1)
2024 
2025           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2026 
2027           /* Third Column */
2028           SSE_COPY_PS(XMM2,XMM6)
2029           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2030           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2031           SSE_SUB_PS(XMM7,XMM2)
2032 
2033           /* Fourth Column */
2034           SSE_COPY_PS(XMM3,XMM6)
2035           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2036           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2037           SSE_SUB_PS(XMM7,XMM3)
2038         SSE_INLINE_END_2
2039         v  += 16;
2040       }
2041       v    = aa + ai16;
2042       ai16 = 16*diag[--i];
2043       PREFETCH_NTA(aa+ai16+16);
2044       /*
2045          Scale the result by the diagonal 4x4 block,
2046          which was inverted as part of the factorization
2047       */
2048       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2049         /* First Column */
2050         SSE_COPY_PS(XMM0,XMM7)
2051         SSE_SHUFFLE(XMM0,XMM0,0x00)
2052         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2053 
2054         /* Second Column */
2055         SSE_COPY_PS(XMM1,XMM7)
2056         SSE_SHUFFLE(XMM1,XMM1,0x55)
2057         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2058         SSE_ADD_PS(XMM0,XMM1)
2059 
2060         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2061 
2062         /* Third Column */
2063         SSE_COPY_PS(XMM2,XMM7)
2064         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2065         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2066         SSE_ADD_PS(XMM0,XMM2)
2067 
2068         /* Fourth Column */
2069         SSE_COPY_PS(XMM3,XMM7)
2070         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2071         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2072         SSE_ADD_PS(XMM0,XMM3)
2073 
2074         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2075       SSE_INLINE_END_3
2076 
2077       /* Promote solution from float to double */
2078       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2079 
2080       /* Apply reordering to t and stream into x.    */
2081       /* This way, x doesn't pollute the cache.      */
2082       /* Be careful with size: 2 doubles = 4 floats! */
2083       idc  = 4*(*c--);
2084       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2085         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2086         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2087         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2088         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2089         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2090         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2091       SSE_INLINE_END_2
2092       v    = aa + ai16 + 16;
2093       idt -= 4;
2094     }
2095 
2096     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2097     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2098     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2099     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2100     ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2101   SSE_SCOPE_END;
2102   PetscFunctionReturn(0);
2103 }
2104 
2105 #endif
2106 
2107 
2108 /*
2109       Special case where the matrix was ILU(0) factored in the natural
2110    ordering. This eliminates the need for the column and row permutation.
2111 */
2112 #undef __FUNCT__
2113 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2114 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2115 {
2116   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2117   PetscInt          n=a->mbs;
2118   const PetscInt    *ai=a->i,*aj=a->j;
2119   PetscErrorCode    ierr;
2120   const PetscInt    *diag = a->diag;
2121   const MatScalar   *aa=a->a;
2122   PetscScalar       *x;
2123   const PetscScalar *b;
2124 
2125   PetscFunctionBegin;
2126   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2127   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2128 
2129 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2130   {
2131     static PetscScalar w[2000]; /* very BAD need to fix */
2132     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2133   }
2134 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2135   {
2136     static PetscScalar w[2000]; /* very BAD need to fix */
2137     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2138   }
2139 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2140   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2141 #else
2142   {
2143     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2144     const MatScalar *v;
2145     PetscInt        jdx,idt,idx,nz,i,ai16;
2146     const PetscInt  *vi;
2147 
2148   /* forward solve the lower triangular */
2149   idx    = 0;
2150   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2151   for (i=1; i<n; i++) {
2152     v     =  aa      + 16*ai[i];
2153     vi    =  aj      + ai[i];
2154     nz    =  diag[i] - ai[i];
2155     idx   +=  4;
2156     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2157     while (nz--) {
2158       jdx   = 4*(*vi++);
2159       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2160       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2161       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2162       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2163       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2164       v    += 16;
2165     }
2166     x[idx]   = s1;
2167     x[1+idx] = s2;
2168     x[2+idx] = s3;
2169     x[3+idx] = s4;
2170   }
2171   /* backward solve the upper triangular */
2172   idt = 4*(n-1);
2173   for (i=n-1; i>=0; i--){
2174     ai16 = 16*diag[i];
2175     v    = aa + ai16 + 16;
2176     vi   = aj + diag[i] + 1;
2177     nz   = ai[i+1] - diag[i] - 1;
2178     s1 = x[idt];  s2 = x[1+idt];
2179     s3 = x[2+idt];s4 = x[3+idt];
2180     while (nz--) {
2181       idx   = 4*(*vi++);
2182       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2183       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2184       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2185       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2186       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2187       v    += 16;
2188     }
2189     v        = aa + ai16;
2190     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2191     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2192     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2193     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2194     idt -= 4;
2195   }
2196   }
2197 #endif
2198 
2199   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2200   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2201   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2202   PetscFunctionReturn(0);
2203 }
2204 
2205 #undef __FUNCT__
2206 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2207 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2208 {
2209   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2210   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2211   PetscErrorCode ierr;
2212   PetscInt       *diag = a->diag;
2213   MatScalar      *aa=a->a;
2214   PetscScalar    *x,*b;
2215 
2216   PetscFunctionBegin;
2217   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2218   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2219 
2220   {
2221     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2222     MatScalar  *v,*t=(MatScalar *)x;
2223     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2224 
2225     /* forward solve the lower triangular */
2226     idx  = 0;
2227     t[0] = (MatScalar)b[0];
2228     t[1] = (MatScalar)b[1];
2229     t[2] = (MatScalar)b[2];
2230     t[3] = (MatScalar)b[3];
2231     for (i=1; i<n; i++) {
2232       v     =  aa      + 16*ai[i];
2233       vi    =  aj      + ai[i];
2234       nz    =  diag[i] - ai[i];
2235       idx   +=  4;
2236       s1 = (MatScalar)b[idx];
2237       s2 = (MatScalar)b[1+idx];
2238       s3 = (MatScalar)b[2+idx];
2239       s4 = (MatScalar)b[3+idx];
2240       while (nz--) {
2241         jdx = 4*(*vi++);
2242         x1  = t[jdx];
2243         x2  = t[1+jdx];
2244         x3  = t[2+jdx];
2245         x4  = t[3+jdx];
2246         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2247         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2248         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2249         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2250         v    += 16;
2251       }
2252       t[idx]   = s1;
2253       t[1+idx] = s2;
2254       t[2+idx] = s3;
2255       t[3+idx] = s4;
2256     }
2257     /* backward solve the upper triangular */
2258     idt = 4*(n-1);
2259     for (i=n-1; i>=0; i--){
2260       ai16 = 16*diag[i];
2261       v    = aa + ai16 + 16;
2262       vi   = aj + diag[i] + 1;
2263       nz   = ai[i+1] - diag[i] - 1;
2264       s1   = t[idt];
2265       s2   = t[1+idt];
2266       s3   = t[2+idt];
2267       s4   = t[3+idt];
2268       while (nz--) {
2269         idx = 4*(*vi++);
2270         x1  = (MatScalar)x[idx];
2271         x2  = (MatScalar)x[1+idx];
2272         x3  = (MatScalar)x[2+idx];
2273         x4  = (MatScalar)x[3+idx];
2274         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2275         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2276         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2277         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2278         v    += 16;
2279       }
2280       v        = aa + ai16;
2281       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2282       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2283       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2284       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2285       idt -= 4;
2286     }
2287   }
2288 
2289   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2290   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2291   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2292   PetscFunctionReturn(0);
2293 }
2294 
2295 #if defined (PETSC_HAVE_SSE)
2296 
2297 #include PETSC_HAVE_SSE
2298 #undef __FUNCT__
2299 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2300 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2301 {
2302   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2303   unsigned short *aj=(unsigned short *)a->j;
2304   PetscErrorCode ierr;
2305   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2306   MatScalar      *aa=a->a;
2307   PetscScalar    *x,*b;
2308 
2309   PetscFunctionBegin;
2310   SSE_SCOPE_BEGIN;
2311   /*
2312      Note: This code currently uses demotion of double
2313      to float when performing the mixed-mode computation.
2314      This may not be numerically reasonable for all applications.
2315   */
2316   PREFETCH_NTA(aa+16*ai[1]);
2317 
2318   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2319   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2320   {
2321     /* x will first be computed in single precision then promoted inplace to double */
2322     MatScalar      *v,*t=(MatScalar *)x;
2323     int            nz,i,idt,ai16;
2324     unsigned int   jdx,idx;
2325     unsigned short *vi;
2326     /* Forward solve the lower triangular factor. */
2327 
2328     /* First block is the identity. */
2329     idx  = 0;
2330     CONVERT_DOUBLE4_FLOAT4(t,b);
2331     v    =  aa + 16*((unsigned int)ai[1]);
2332 
2333     for (i=1; i<n;) {
2334       PREFETCH_NTA(&v[8]);
2335       vi   =  aj      + ai[i];
2336       nz   =  diag[i] - ai[i];
2337       idx +=  4;
2338 
2339       /* Demote RHS from double to float. */
2340       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2341       LOAD_PS(&t[idx],XMM7);
2342 
2343       while (nz--) {
2344         PREFETCH_NTA(&v[16]);
2345         jdx = 4*((unsigned int)(*vi++));
2346 
2347         /* 4x4 Matrix-Vector product with negative accumulation: */
2348         SSE_INLINE_BEGIN_2(&t[jdx],v)
2349           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2350 
2351           /* First Column */
2352           SSE_COPY_PS(XMM0,XMM6)
2353           SSE_SHUFFLE(XMM0,XMM0,0x00)
2354           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2355           SSE_SUB_PS(XMM7,XMM0)
2356 
2357           /* Second Column */
2358           SSE_COPY_PS(XMM1,XMM6)
2359           SSE_SHUFFLE(XMM1,XMM1,0x55)
2360           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2361           SSE_SUB_PS(XMM7,XMM1)
2362 
2363           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2364 
2365           /* Third Column */
2366           SSE_COPY_PS(XMM2,XMM6)
2367           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2368           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2369           SSE_SUB_PS(XMM7,XMM2)
2370 
2371           /* Fourth Column */
2372           SSE_COPY_PS(XMM3,XMM6)
2373           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2374           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2375           SSE_SUB_PS(XMM7,XMM3)
2376         SSE_INLINE_END_2
2377 
2378         v  += 16;
2379       }
2380       v    =  aa + 16*ai[++i];
2381       PREFETCH_NTA(v);
2382       STORE_PS(&t[idx],XMM7);
2383     }
2384 
2385     /* Backward solve the upper triangular factor.*/
2386 
2387     idt  = 4*(n-1);
2388     ai16 = 16*diag[n-1];
2389     v    = aa + ai16 + 16;
2390     for (i=n-1; i>=0;){
2391       PREFETCH_NTA(&v[8]);
2392       vi = aj + diag[i] + 1;
2393       nz = ai[i+1] - diag[i] - 1;
2394 
2395       LOAD_PS(&t[idt],XMM7);
2396 
2397       while (nz--) {
2398         PREFETCH_NTA(&v[16]);
2399         idx = 4*((unsigned int)(*vi++));
2400 
2401         /* 4x4 Matrix-Vector Product with negative accumulation: */
2402         SSE_INLINE_BEGIN_2(&t[idx],v)
2403           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2404 
2405           /* First Column */
2406           SSE_COPY_PS(XMM0,XMM6)
2407           SSE_SHUFFLE(XMM0,XMM0,0x00)
2408           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2409           SSE_SUB_PS(XMM7,XMM0)
2410 
2411           /* Second Column */
2412           SSE_COPY_PS(XMM1,XMM6)
2413           SSE_SHUFFLE(XMM1,XMM1,0x55)
2414           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2415           SSE_SUB_PS(XMM7,XMM1)
2416 
2417           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2418 
2419           /* Third Column */
2420           SSE_COPY_PS(XMM2,XMM6)
2421           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2422           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2423           SSE_SUB_PS(XMM7,XMM2)
2424 
2425           /* Fourth Column */
2426           SSE_COPY_PS(XMM3,XMM6)
2427           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2428           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2429           SSE_SUB_PS(XMM7,XMM3)
2430         SSE_INLINE_END_2
2431         v  += 16;
2432       }
2433       v    = aa + ai16;
2434       ai16 = 16*diag[--i];
2435       PREFETCH_NTA(aa+ai16+16);
2436       /*
2437          Scale the result by the diagonal 4x4 block,
2438          which was inverted as part of the factorization
2439       */
2440       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2441         /* First Column */
2442         SSE_COPY_PS(XMM0,XMM7)
2443         SSE_SHUFFLE(XMM0,XMM0,0x00)
2444         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2445 
2446         /* Second Column */
2447         SSE_COPY_PS(XMM1,XMM7)
2448         SSE_SHUFFLE(XMM1,XMM1,0x55)
2449         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2450         SSE_ADD_PS(XMM0,XMM1)
2451 
2452         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2453 
2454         /* Third Column */
2455         SSE_COPY_PS(XMM2,XMM7)
2456         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2457         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2458         SSE_ADD_PS(XMM0,XMM2)
2459 
2460         /* Fourth Column */
2461         SSE_COPY_PS(XMM3,XMM7)
2462         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2463         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2464         SSE_ADD_PS(XMM0,XMM3)
2465 
2466         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2467       SSE_INLINE_END_3
2468 
2469       v    = aa + ai16 + 16;
2470       idt -= 4;
2471     }
2472 
2473     /* Convert t from single precision back to double precision (inplace)*/
2474     idt = 4*(n-1);
2475     for (i=n-1;i>=0;i--) {
2476       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2477       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2478       PetscScalar *xtemp=&x[idt];
2479       MatScalar   *ttemp=&t[idt];
2480       xtemp[3] = (PetscScalar)ttemp[3];
2481       xtemp[2] = (PetscScalar)ttemp[2];
2482       xtemp[1] = (PetscScalar)ttemp[1];
2483       xtemp[0] = (PetscScalar)ttemp[0];
2484       idt -= 4;
2485     }
2486 
2487   } /* End of artificial scope. */
2488   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2489   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2490   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2491   SSE_SCOPE_END;
2492   PetscFunctionReturn(0);
2493 }
2494 
2495 #undef __FUNCT__
2496 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2497 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2498 {
2499   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2500   int            *aj=a->j;
2501   PetscErrorCode ierr;
2502   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2503   MatScalar      *aa=a->a;
2504   PetscScalar    *x,*b;
2505 
2506   PetscFunctionBegin;
2507   SSE_SCOPE_BEGIN;
2508   /*
2509      Note: This code currently uses demotion of double
2510      to float when performing the mixed-mode computation.
2511      This may not be numerically reasonable for all applications.
2512   */
2513   PREFETCH_NTA(aa+16*ai[1]);
2514 
2515   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2516   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2517   {
2518     /* x will first be computed in single precision then promoted inplace to double */
2519     MatScalar *v,*t=(MatScalar *)x;
2520     int       nz,i,idt,ai16;
2521     int       jdx,idx;
2522     int       *vi;
2523     /* Forward solve the lower triangular factor. */
2524 
2525     /* First block is the identity. */
2526     idx  = 0;
2527     CONVERT_DOUBLE4_FLOAT4(t,b);
2528     v    =  aa + 16*ai[1];
2529 
2530     for (i=1; i<n;) {
2531       PREFETCH_NTA(&v[8]);
2532       vi   =  aj      + ai[i];
2533       nz   =  diag[i] - ai[i];
2534       idx +=  4;
2535 
2536       /* Demote RHS from double to float. */
2537       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2538       LOAD_PS(&t[idx],XMM7);
2539 
2540       while (nz--) {
2541         PREFETCH_NTA(&v[16]);
2542         jdx = 4*(*vi++);
2543 /*          jdx = *vi++; */
2544 
2545         /* 4x4 Matrix-Vector product with negative accumulation: */
2546         SSE_INLINE_BEGIN_2(&t[jdx],v)
2547           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2548 
2549           /* First Column */
2550           SSE_COPY_PS(XMM0,XMM6)
2551           SSE_SHUFFLE(XMM0,XMM0,0x00)
2552           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2553           SSE_SUB_PS(XMM7,XMM0)
2554 
2555           /* Second Column */
2556           SSE_COPY_PS(XMM1,XMM6)
2557           SSE_SHUFFLE(XMM1,XMM1,0x55)
2558           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2559           SSE_SUB_PS(XMM7,XMM1)
2560 
2561           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2562 
2563           /* Third Column */
2564           SSE_COPY_PS(XMM2,XMM6)
2565           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2566           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2567           SSE_SUB_PS(XMM7,XMM2)
2568 
2569           /* Fourth Column */
2570           SSE_COPY_PS(XMM3,XMM6)
2571           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2572           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2573           SSE_SUB_PS(XMM7,XMM3)
2574         SSE_INLINE_END_2
2575 
2576         v  += 16;
2577       }
2578       v    =  aa + 16*ai[++i];
2579       PREFETCH_NTA(v);
2580       STORE_PS(&t[idx],XMM7);
2581     }
2582 
2583     /* Backward solve the upper triangular factor.*/
2584 
2585     idt  = 4*(n-1);
2586     ai16 = 16*diag[n-1];
2587     v    = aa + ai16 + 16;
2588     for (i=n-1; i>=0;){
2589       PREFETCH_NTA(&v[8]);
2590       vi = aj + diag[i] + 1;
2591       nz = ai[i+1] - diag[i] - 1;
2592 
2593       LOAD_PS(&t[idt],XMM7);
2594 
2595       while (nz--) {
2596         PREFETCH_NTA(&v[16]);
2597         idx = 4*(*vi++);
2598 /*          idx = *vi++; */
2599 
2600         /* 4x4 Matrix-Vector Product with negative accumulation: */
2601         SSE_INLINE_BEGIN_2(&t[idx],v)
2602           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2603 
2604           /* First Column */
2605           SSE_COPY_PS(XMM0,XMM6)
2606           SSE_SHUFFLE(XMM0,XMM0,0x00)
2607           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2608           SSE_SUB_PS(XMM7,XMM0)
2609 
2610           /* Second Column */
2611           SSE_COPY_PS(XMM1,XMM6)
2612           SSE_SHUFFLE(XMM1,XMM1,0x55)
2613           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2614           SSE_SUB_PS(XMM7,XMM1)
2615 
2616           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2617 
2618           /* Third Column */
2619           SSE_COPY_PS(XMM2,XMM6)
2620           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2621           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2622           SSE_SUB_PS(XMM7,XMM2)
2623 
2624           /* Fourth Column */
2625           SSE_COPY_PS(XMM3,XMM6)
2626           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2627           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2628           SSE_SUB_PS(XMM7,XMM3)
2629         SSE_INLINE_END_2
2630         v  += 16;
2631       }
2632       v    = aa + ai16;
2633       ai16 = 16*diag[--i];
2634       PREFETCH_NTA(aa+ai16+16);
2635       /*
2636          Scale the result by the diagonal 4x4 block,
2637          which was inverted as part of the factorization
2638       */
2639       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2640         /* First Column */
2641         SSE_COPY_PS(XMM0,XMM7)
2642         SSE_SHUFFLE(XMM0,XMM0,0x00)
2643         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2644 
2645         /* Second Column */
2646         SSE_COPY_PS(XMM1,XMM7)
2647         SSE_SHUFFLE(XMM1,XMM1,0x55)
2648         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2649         SSE_ADD_PS(XMM0,XMM1)
2650 
2651         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2652 
2653         /* Third Column */
2654         SSE_COPY_PS(XMM2,XMM7)
2655         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2656         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2657         SSE_ADD_PS(XMM0,XMM2)
2658 
2659         /* Fourth Column */
2660         SSE_COPY_PS(XMM3,XMM7)
2661         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2662         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2663         SSE_ADD_PS(XMM0,XMM3)
2664 
2665         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2666       SSE_INLINE_END_3
2667 
2668       v    = aa + ai16 + 16;
2669       idt -= 4;
2670     }
2671 
2672     /* Convert t from single precision back to double precision (inplace)*/
2673     idt = 4*(n-1);
2674     for (i=n-1;i>=0;i--) {
2675       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2676       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2677       PetscScalar *xtemp=&x[idt];
2678       MatScalar   *ttemp=&t[idt];
2679       xtemp[3] = (PetscScalar)ttemp[3];
2680       xtemp[2] = (PetscScalar)ttemp[2];
2681       xtemp[1] = (PetscScalar)ttemp[1];
2682       xtemp[0] = (PetscScalar)ttemp[0];
2683       idt -= 4;
2684     }
2685 
2686   } /* End of artificial scope. */
2687   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2688   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2689   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2690   SSE_SCOPE_END;
2691   PetscFunctionReturn(0);
2692 }
2693 
2694 #endif
2695 #undef __FUNCT__
2696 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2697 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2698 {
2699   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2700   IS                iscol=a->col,isrow=a->row;
2701   PetscErrorCode    ierr;
2702   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2703   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2704   const MatScalar   *aa=a->a,*v;
2705   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2706   const PetscScalar *b;
2707 
2708   PetscFunctionBegin;
2709   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2710   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2711   t  = a->solve_work;
2712 
2713   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2714   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2715 
2716   /* forward solve the lower triangular */
2717   idx    = 3*(*r++);
2718   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2719   for (i=1; i<n; i++) {
2720     v     = aa + 9*ai[i];
2721     vi    = aj + ai[i];
2722     nz    = diag[i] - ai[i];
2723     idx   = 3*(*r++);
2724     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2725     while (nz--) {
2726       idx   = 3*(*vi++);
2727       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2728       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2729       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2730       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2731       v += 9;
2732     }
2733     idx = 3*i;
2734     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2735   }
2736   /* backward solve the upper triangular */
2737   for (i=n-1; i>=0; i--){
2738     v    = aa + 9*diag[i] + 9;
2739     vi   = aj + diag[i] + 1;
2740     nz   = ai[i+1] - diag[i] - 1;
2741     idt  = 3*i;
2742     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2743     while (nz--) {
2744       idx   = 3*(*vi++);
2745       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2746       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2747       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2748       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2749       v += 9;
2750     }
2751     idc = 3*(*c--);
2752     v   = aa + 9*diag[i];
2753     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2754     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2755     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2756   }
2757   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2758   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2759   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2760   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2761   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
2762   PetscFunctionReturn(0);
2763 }
2764 
2765 /*
2766       Special case where the matrix was ILU(0) factored in the natural
2767    ordering. This eliminates the need for the column and row permutation.
2768 */
2769 #undef __FUNCT__
2770 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2771 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2772 {
2773   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2774   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2775   PetscErrorCode    ierr;
2776   PetscInt          *diag = a->diag;
2777   const MatScalar   *aa=a->a,*v;
2778   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2779   const PetscScalar *b;
2780   PetscInt          jdx,idt,idx,nz,*vi,i;
2781 
2782   PetscFunctionBegin;
2783   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2784   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2785 
2786   /* forward solve the lower triangular */
2787   idx    = 0;
2788   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2789   for (i=1; i<n; i++) {
2790     v     =  aa      + 9*ai[i];
2791     vi    =  aj      + ai[i];
2792     nz    =  diag[i] - ai[i];
2793     idx   +=  3;
2794     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2795     while (nz--) {
2796       jdx   = 3*(*vi++);
2797       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2798       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2799       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2800       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2801       v    += 9;
2802     }
2803     x[idx]   = s1;
2804     x[1+idx] = s2;
2805     x[2+idx] = s3;
2806   }
2807   /* backward solve the upper triangular */
2808   for (i=n-1; i>=0; i--){
2809     v    = aa + 9*diag[i] + 9;
2810     vi   = aj + diag[i] + 1;
2811     nz   = ai[i+1] - diag[i] - 1;
2812     idt  = 3*i;
2813     s1 = x[idt];  s2 = x[1+idt];
2814     s3 = x[2+idt];
2815     while (nz--) {
2816       idx   = 3*(*vi++);
2817       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2818       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2819       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2820       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2821       v    += 9;
2822     }
2823     v        = aa +  9*diag[i];
2824     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2825     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2826     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2827   }
2828 
2829   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2830   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2831   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
2832   PetscFunctionReturn(0);
2833 }
2834 
2835 #undef __FUNCT__
2836 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2837 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2838 {
2839   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2840   IS                iscol=a->col,isrow=a->row;
2841   PetscErrorCode    ierr;
2842   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2843   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2844   const MatScalar   *aa=a->a,*v;
2845   PetscScalar       *x,s1,s2,x1,x2,*t;
2846   const PetscScalar *b;
2847 
2848   PetscFunctionBegin;
2849   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2850   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2851   t  = a->solve_work;
2852 
2853   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2854   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2855 
2856   /* forward solve the lower triangular */
2857   idx    = 2*(*r++);
2858   t[0] = b[idx]; t[1] = b[1+idx];
2859   for (i=1; i<n; i++) {
2860     v     = aa + 4*ai[i];
2861     vi    = aj + ai[i];
2862     nz    = diag[i] - ai[i];
2863     idx   = 2*(*r++);
2864     s1  = b[idx]; s2 = b[1+idx];
2865     while (nz--) {
2866       idx   = 2*(*vi++);
2867       x1    = t[idx]; x2 = t[1+idx];
2868       s1 -= v[0]*x1 + v[2]*x2;
2869       s2 -= v[1]*x1 + v[3]*x2;
2870       v += 4;
2871     }
2872     idx = 2*i;
2873     t[idx] = s1; t[1+idx] = s2;
2874   }
2875   /* backward solve the upper triangular */
2876   for (i=n-1; i>=0; i--){
2877     v    = aa + 4*diag[i] + 4;
2878     vi   = aj + diag[i] + 1;
2879     nz   = ai[i+1] - diag[i] - 1;
2880     idt  = 2*i;
2881     s1 = t[idt]; s2 = t[1+idt];
2882     while (nz--) {
2883       idx   = 2*(*vi++);
2884       x1    = t[idx]; x2 = t[1+idx];
2885       s1 -= v[0]*x1 + v[2]*x2;
2886       s2 -= v[1]*x1 + v[3]*x2;
2887       v += 4;
2888     }
2889     idc = 2*(*c--);
2890     v   = aa + 4*diag[i];
2891     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2892     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2893   }
2894   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2895   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2896   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2897   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2898   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
2899   PetscFunctionReturn(0);
2900 }
2901 
2902 /*
2903       Special case where the matrix was ILU(0) factored in the natural
2904    ordering. This eliminates the need for the column and row permutation.
2905 */
2906 #undef __FUNCT__
2907 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2908 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2909 {
2910   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2911   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2912   PetscErrorCode    ierr;
2913   PetscInt          *diag = a->diag;
2914   const MatScalar   *aa=a->a,*v;
2915   PetscScalar       *x,s1,s2,x1,x2;
2916   const PetscScalar *b;
2917   PetscInt          jdx,idt,idx,nz,*vi,i;
2918 
2919   PetscFunctionBegin;
2920   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2921   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2922 
2923   /* forward solve the lower triangular */
2924   idx    = 0;
2925   x[0]   = b[0]; x[1] = b[1];
2926   for (i=1; i<n; i++) {
2927     v     =  aa      + 4*ai[i];
2928     vi    =  aj      + ai[i];
2929     nz    =  diag[i] - ai[i];
2930     idx   +=  2;
2931     s1  =  b[idx];s2 = b[1+idx];
2932     while (nz--) {
2933       jdx   = 2*(*vi++);
2934       x1    = x[jdx];x2 = x[1+jdx];
2935       s1 -= v[0]*x1 + v[2]*x2;
2936       s2 -= v[1]*x1 + v[3]*x2;
2937       v    += 4;
2938     }
2939     x[idx]   = s1;
2940     x[1+idx] = s2;
2941   }
2942   /* backward solve the upper triangular */
2943   for (i=n-1; i>=0; i--){
2944     v    = aa + 4*diag[i] + 4;
2945     vi   = aj + diag[i] + 1;
2946     nz   = ai[i+1] - diag[i] - 1;
2947     idt  = 2*i;
2948     s1 = x[idt];  s2 = x[1+idt];
2949     while (nz--) {
2950       idx   = 2*(*vi++);
2951       x1    = x[idx];   x2 = x[1+idx];
2952       s1 -= v[0]*x1 + v[2]*x2;
2953       s2 -= v[1]*x1 + v[3]*x2;
2954       v    += 4;
2955     }
2956     v        = aa +  4*diag[i];
2957     x[idt]   = v[0]*s1 + v[2]*s2;
2958     x[1+idt] = v[1]*s1 + v[3]*s2;
2959   }
2960 
2961   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2962   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2963   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
2964   PetscFunctionReturn(0);
2965 }
2966 
2967 #undef __FUNCT__
2968 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2969 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2970 {
2971   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2972   IS             iscol=a->col,isrow=a->row;
2973   PetscErrorCode ierr;
2974   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2975   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2976   MatScalar      *aa=a->a,*v;
2977   PetscScalar    *x,*b,s1,*t;
2978 
2979   PetscFunctionBegin;
2980   if (!n) PetscFunctionReturn(0);
2981 
2982   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2983   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2984   t  = a->solve_work;
2985 
2986   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2987   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2988 
2989   /* forward solve the lower triangular */
2990   t[0] = b[*r++];
2991   for (i=1; i<n; i++) {
2992     v     = aa + ai[i];
2993     vi    = aj + ai[i];
2994     nz    = diag[i] - ai[i];
2995     s1  = b[*r++];
2996     while (nz--) {
2997       s1 -= (*v++)*t[*vi++];
2998     }
2999     t[i] = s1;
3000   }
3001   /* backward solve the upper triangular */
3002   for (i=n-1; i>=0; i--){
3003     v    = aa + diag[i] + 1;
3004     vi   = aj + diag[i] + 1;
3005     nz   = ai[i+1] - diag[i] - 1;
3006     s1 = t[i];
3007     while (nz--) {
3008       s1 -= (*v++)*t[*vi++];
3009     }
3010     x[*c--] = t[i] = aa[diag[i]]*s1;
3011   }
3012 
3013   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3014   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3015   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3016   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3017   ierr = PetscLogFlops(2*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3018   PetscFunctionReturn(0);
3019 }
3020 /*
3021       Special case where the matrix was ILU(0) factored in the natural
3022    ordering. This eliminates the need for the column and row permutation.
3023 */
3024 #undef __FUNCT__
3025 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3026 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3027 {
3028   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3029   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3030   PetscErrorCode ierr;
3031   PetscInt       *diag = a->diag;
3032   MatScalar      *aa=a->a;
3033   PetscScalar    *x,*b;
3034   PetscScalar    s1,x1;
3035   MatScalar      *v;
3036   PetscInt       jdx,idt,idx,nz,*vi,i;
3037 
3038   PetscFunctionBegin;
3039   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3040   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3041 
3042   /* forward solve the lower triangular */
3043   idx    = 0;
3044   x[0]   = b[0];
3045   for (i=1; i<n; i++) {
3046     v     =  aa      + ai[i];
3047     vi    =  aj      + ai[i];
3048     nz    =  diag[i] - ai[i];
3049     idx   +=  1;
3050     s1  =  b[idx];
3051     while (nz--) {
3052       jdx   = *vi++;
3053       x1    = x[jdx];
3054       s1 -= v[0]*x1;
3055       v    += 1;
3056     }
3057     x[idx]   = s1;
3058   }
3059   /* backward solve the upper triangular */
3060   for (i=n-1; i>=0; i--){
3061     v    = aa + diag[i] + 1;
3062     vi   = aj + diag[i] + 1;
3063     nz   = ai[i+1] - diag[i] - 1;
3064     idt  = i;
3065     s1 = x[idt];
3066     while (nz--) {
3067       idx   = *vi++;
3068       x1    = x[idx];
3069       s1 -= v[0]*x1;
3070       v    += 1;
3071     }
3072     v        = aa +  diag[i];
3073     x[idt]   = v[0]*s1;
3074   }
3075   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3076   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3077   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3078   PetscFunctionReturn(0);
3079 }
3080 
3081 /* ----------------------------------------------------------------*/
3082 /*
3083      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3084    except that the data structure of Mat_SeqAIJ is slightly different.
3085    Not a good example of code reuse.
3086 */
3087 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
3088 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3089 
3090 #undef __FUNCT__
3091 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3092 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3093 {
3094   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3095   IS             isicol;
3096   PetscErrorCode ierr;
3097   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
3098   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3099   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3100   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3101   PetscTruth     col_identity,row_identity,both_identity,flg;
3102   PetscReal      f;
3103 
3104   PetscFunctionBegin;
3105   f             = info->fill;
3106   levels        = (PetscInt)info->levels;
3107   diagonal_fill = (PetscInt)info->diagonal_fill;
3108   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3109   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3110   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3111   both_identity = (PetscTruth) (row_identity && col_identity);
3112 
3113   if (!levels && both_identity) {  /* special case copy the nonzero structure */
3114     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3115     fact->factor = MAT_FACTOR_ILU;
3116     b               = (Mat_SeqBAIJ*)(fact)->data;
3117     ierr = MatMissingDiagonal_SeqBAIJ(fact,&flg,&dd);CHKERRQ(ierr);
3118     if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd);
3119     b->row        = isrow;
3120     b->col        = iscol;
3121     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3122     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3123     b->icol       = isicol;
3124     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3125     ierr          = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3126   } else { /* general case perform the symbolic factorization */
3127     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3128     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3129 
3130     /* get new row pointers */
3131     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
3132     ainew[0] = 0;
3133     /* don't know how many column pointers are needed so estimate */
3134     jmax = (PetscInt)(f*ai[n] + 1);
3135     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
3136     /* ajfill is level of fill for each fill entry */
3137     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
3138     /* fill is a linked list of nonzeros in active row */
3139     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
3140     /* im is level for each filled value */
3141     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
3142     /* dloc is location of diagonal in factor */
3143     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
3144     dloc[0]  = 0;
3145     for (prow=0; prow<n; prow++) {
3146 
3147       /* copy prow into linked list */
3148       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3149       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3150       xi         = aj + ai[r[prow]];
3151       fill[n]    = n;
3152       fill[prow] = -1; /* marker for diagonal entry */
3153       while (nz--) {
3154 	fm  = n;
3155 	idx = ic[*xi++];
3156 	do {
3157 	  m  = fm;
3158 	  fm = fill[m];
3159 	} while (fm < idx);
3160 	fill[m]   = idx;
3161 	fill[idx] = fm;
3162 	im[idx]   = 0;
3163       }
3164 
3165       /* make sure diagonal entry is included */
3166       if (diagonal_fill && fill[prow] == -1) {
3167 	fm = n;
3168 	while (fill[fm] < prow) fm = fill[fm];
3169 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3170 	fill[fm]   = prow;
3171 	im[prow]   = 0;
3172 	nzf++;
3173 	dcount++;
3174       }
3175 
3176       nzi = 0;
3177       row = fill[n];
3178       while (row < prow) {
3179 	incrlev = im[row] + 1;
3180 	nz      = dloc[row];
3181 	xi      = ajnew  + ainew[row] + nz + 1;
3182 	flev    = ajfill + ainew[row] + nz + 1;
3183 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3184 	fm      = row;
3185 	while (nnz-- > 0) {
3186 	  idx = *xi++;
3187 	  if (*flev + incrlev > levels) {
3188 	    flev++;
3189 	    continue;
3190 	  }
3191 	  do {
3192 	    m  = fm;
3193 	    fm = fill[m];
3194 	  } while (fm < idx);
3195 	  if (fm != idx) {
3196 	    im[idx]   = *flev + incrlev;
3197 	    fill[m]   = idx;
3198 	    fill[idx] = fm;
3199 	    fm        = idx;
3200 	    nzf++;
3201 	  } else {
3202 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3203 	  }
3204 	  flev++;
3205 	}
3206 	row = fill[row];
3207 	nzi++;
3208       }
3209       /* copy new filled row into permanent storage */
3210       ainew[prow+1] = ainew[prow] + nzf;
3211       if (ainew[prow+1] > jmax) {
3212 
3213 	/* estimate how much additional space we will need */
3214 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3215 	/* just double the memory each time */
3216 	PetscInt maxadd = jmax;
3217 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3218 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3219 	jmax += maxadd;
3220 
3221 	/* allocate a longer ajnew and ajfill */
3222 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3223 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3224 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3225 	ajnew = xitmp;
3226 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3227 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3228 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3229 	ajfill = xitmp;
3230 	reallocate++; /* count how many reallocations are needed */
3231       }
3232       xitmp       = ajnew + ainew[prow];
3233       flev        = ajfill + ainew[prow];
3234       dloc[prow]  = nzi;
3235       fm          = fill[n];
3236       while (nzf--) {
3237 	*xitmp++ = fm;
3238 	*flev++ = im[fm];
3239 	fm      = fill[fm];
3240       }
3241       /* make sure row has diagonal entry */
3242       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3243 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3244     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3245       }
3246     }
3247     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3248     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3249     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3250     ierr = PetscFree(fill);CHKERRQ(ierr);
3251     ierr = PetscFree(im);CHKERRQ(ierr);
3252 
3253 #if defined(PETSC_USE_INFO)
3254     {
3255       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3256       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3257       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3258       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3259       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3260       if (diagonal_fill) {
3261 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3262       }
3263     }
3264 #endif
3265 
3266     /* put together the new matrix */
3267     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3268     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3269     b    = (Mat_SeqBAIJ*)(fact)->data;
3270     b->free_a       = PETSC_TRUE;
3271     b->free_ij      = PETSC_TRUE;
3272     b->singlemalloc = PETSC_FALSE;
3273     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
3274     b->j          = ajnew;
3275     b->i          = ainew;
3276     for (i=0; i<n; i++) dloc[i] += ainew[i];
3277     b->diag       = dloc;
3278     b->ilen       = 0;
3279     b->imax       = 0;
3280     b->row        = isrow;
3281     b->col        = iscol;
3282     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3283     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3284     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3285     b->icol       = isicol;
3286     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3287     /* In b structure:  Free imax, ilen, old a, old j.
3288        Allocate dloc, solve_work, new a, new j */
3289     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
3290     b->maxnz          = b->nz = ainew[n];
3291 
3292     (fact)->info.factor_mallocs    = reallocate;
3293     (fact)->info.fill_ratio_given  = f;
3294     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3295   }
3296   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
3297   PetscFunctionReturn(0);
3298 }
3299 
3300 #undef __FUNCT__
3301 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3302 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3303 {
3304   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3305   /* int i,*AJ=a->j,nz=a->nz; */
3306   PetscFunctionBegin;
3307   /* Undo Column scaling */
3308 /*    while (nz--) { */
3309 /*      AJ[i] = AJ[i]/4; */
3310 /*    } */
3311   /* This should really invoke a push/pop logic, but we don't have that yet. */
3312   A->ops->setunfactored = PETSC_NULL;
3313   PetscFunctionReturn(0);
3314 }
3315 
3316 #undef __FUNCT__
3317 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3318 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3319 {
3320   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3321   PetscInt       *AJ=a->j,nz=a->nz;
3322   unsigned short *aj=(unsigned short *)AJ;
3323   PetscFunctionBegin;
3324   /* Is this really necessary? */
3325   while (nz--) {
3326     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3327   }
3328   A->ops->setunfactored = PETSC_NULL;
3329   PetscFunctionReturn(0);
3330 }
3331 
3332 
3333