xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision d4bf62d159b4eec940d9872c577cf78ea329f539)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16   PetscErrorCode ierr;
17   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18   PetscInt       *diag = a->diag;
19   MatScalar      *aa=a->a,*v;
20   PetscScalar    s1,*x,*b;
21 
22   PetscFunctionBegin;
23   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
24   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26 
27   /* forward solve the U^T */
28   for (i=0; i<n; i++) {
29 
30     v     = aa + diag[i];
31     /* multiply by the inverse of the block diagonal */
32     s1    = (*v++)*x[i];
33     vi    = aj + diag[i] + 1;
34     nz    = ai[i+1] - diag[i] - 1;
35     while (nz--) {
36       x[*vi++]  -= (*v++)*s1;
37     }
38     x[i]   = s1;
39   }
40   /* backward solve the L^T */
41   for (i=n-1; i>=0; i--){
42     v    = aa + diag[i] - 1;
43     vi   = aj + diag[i] - 1;
44     nz   = diag[i] - ai[i];
45     s1   = x[i];
46     while (nz--) {
47       x[*vi--]   -=  (*v--)*s1;
48     }
49   }
50   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
51   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53   PetscFunctionReturn(0);
54 }
55 
56 #undef __FUNCT__
57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59 {
60   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61   PetscErrorCode ierr;
62   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63   PetscInt       *diag = a->diag,oidx;
64   MatScalar      *aa=a->a,*v;
65   PetscScalar    s1,s2,x1,x2;
66   PetscScalar    *x,*b;
67 
68   PetscFunctionBegin;
69   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
70   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
71   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72 
73   /* forward solve the U^T */
74   idx = 0;
75   for (i=0; i<n; i++) {
76 
77     v     = aa + 4*diag[i];
78     /* multiply by the inverse of the block diagonal */
79     x1 = x[idx];   x2 = x[1+idx];
80     s1 = v[0]*x1  +  v[1]*x2;
81     s2 = v[2]*x1  +  v[3]*x2;
82     v += 4;
83 
84     vi    = aj + diag[i] + 1;
85     nz    = ai[i+1] - diag[i] - 1;
86     while (nz--) {
87       oidx = 2*(*vi++);
88       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90       v  += 4;
91     }
92     x[idx]   = s1;x[1+idx] = s2;
93     idx += 2;
94   }
95   /* backward solve the L^T */
96   for (i=n-1; i>=0; i--){
97     v    = aa + 4*diag[i] - 4;
98     vi   = aj + diag[i] - 1;
99     nz   = diag[i] - ai[i];
100     idt  = 2*i;
101     s1   = x[idt];  s2 = x[1+idt];
102     while (nz--) {
103       idx   = 2*(*vi--);
104       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106       v -= 4;
107     }
108   }
109   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
110   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
112   PetscFunctionReturn(0);
113 }
114 
115 #undef __FUNCT__
116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118 {
119   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120   PetscErrorCode ierr;
121   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122   PetscInt       *diag = a->diag,oidx;
123   MatScalar      *aa=a->a,*v;
124   PetscScalar    s1,s2,s3,x1,x2,x3;
125   PetscScalar    *x,*b;
126 
127   PetscFunctionBegin;
128   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
129   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
130   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131 
132   /* forward solve the U^T */
133   idx = 0;
134   for (i=0; i<n; i++) {
135 
136     v     = aa + 9*diag[i];
137     /* multiply by the inverse of the block diagonal */
138     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142     v += 9;
143 
144     vi    = aj + diag[i] + 1;
145     nz    = ai[i+1] - diag[i] - 1;
146     while (nz--) {
147       oidx = 3*(*vi++);
148       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151       v  += 9;
152     }
153     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154     idx += 3;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + 9*diag[i] - 9;
159     vi   = aj + diag[i] - 1;
160     nz   = diag[i] - ai[i];
161     idt  = 3*i;
162     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163     while (nz--) {
164       idx   = 3*(*vi--);
165       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168       v -= 9;
169     }
170   }
171   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
172   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180 {
181   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182   PetscErrorCode ierr;
183   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184   PetscInt       *diag = a->diag,oidx;
185   MatScalar      *aa=a->a,*v;
186   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
187   PetscScalar    *x,*b;
188 
189   PetscFunctionBegin;
190   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
191   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
192   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193 
194   /* forward solve the U^T */
195   idx = 0;
196   for (i=0; i<n; i++) {
197 
198     v     = aa + 16*diag[i];
199     /* multiply by the inverse of the block diagonal */
200     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205     v += 16;
206 
207     vi    = aj + diag[i] + 1;
208     nz    = ai[i+1] - diag[i] - 1;
209     while (nz--) {
210       oidx = 4*(*vi++);
211       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215       v  += 16;
216     }
217     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218     idx += 4;
219   }
220   /* backward solve the L^T */
221   for (i=n-1; i>=0; i--){
222     v    = aa + 16*diag[i] - 16;
223     vi   = aj + diag[i] - 1;
224     nz   = diag[i] - ai[i];
225     idt  = 4*i;
226     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227     while (nz--) {
228       idx   = 4*(*vi--);
229       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233       v -= 16;
234     }
235   }
236   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
237   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 #undef __FUNCT__
243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245 {
246   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247   PetscErrorCode ierr;
248   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249   PetscInt       *diag = a->diag,oidx;
250   MatScalar      *aa=a->a,*v;
251   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252   PetscScalar    *x,*b;
253 
254   PetscFunctionBegin;
255   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258 
259   /* forward solve the U^T */
260   idx = 0;
261   for (i=0; i<n; i++) {
262 
263     v     = aa + 25*diag[i];
264     /* multiply by the inverse of the block diagonal */
265     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271     v += 25;
272 
273     vi    = aj + diag[i] + 1;
274     nz    = ai[i+1] - diag[i] - 1;
275     while (nz--) {
276       oidx = 5*(*vi++);
277       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282       v  += 25;
283     }
284     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285     idx += 5;
286   }
287   /* backward solve the L^T */
288   for (i=n-1; i>=0; i--){
289     v    = aa + 25*diag[i] - 25;
290     vi   = aj + diag[i] - 1;
291     nz   = diag[i] - ai[i];
292     idt  = 5*i;
293     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294     while (nz--) {
295       idx   = 5*(*vi--);
296       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301       v -= 25;
302     }
303   }
304   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
305   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
307   PetscFunctionReturn(0);
308 }
309 
310 #undef __FUNCT__
311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313 {
314   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315   PetscErrorCode ierr;
316   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317   PetscInt       *diag = a->diag,oidx;
318   MatScalar      *aa=a->a,*v;
319   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320   PetscScalar    *x,*b;
321 
322   PetscFunctionBegin;
323   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
324   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
325   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326 
327   /* forward solve the U^T */
328   idx = 0;
329   for (i=0; i<n; i++) {
330 
331     v     = aa + 36*diag[i];
332     /* multiply by the inverse of the block diagonal */
333     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334     x6    = x[5+idx];
335     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341     v += 36;
342 
343     vi    = aj + diag[i] + 1;
344     nz    = ai[i+1] - diag[i] - 1;
345     while (nz--) {
346       oidx = 6*(*vi++);
347       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353       v  += 36;
354     }
355     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356     x[5+idx] = s6;
357     idx += 6;
358   }
359   /* backward solve the L^T */
360   for (i=n-1; i>=0; i--){
361     v    = aa + 36*diag[i] - 36;
362     vi   = aj + diag[i] - 1;
363     nz   = diag[i] - ai[i];
364     idt  = 6*i;
365     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366     s6 = x[5+idt];
367     while (nz--) {
368       idx   = 6*(*vi--);
369       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375       v -= 36;
376     }
377   }
378   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
379   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
381   PetscFunctionReturn(0);
382 }
383 
384 #undef __FUNCT__
385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387 {
388   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389   PetscErrorCode ierr;
390   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391   PetscInt       *diag = a->diag,oidx;
392   MatScalar      *aa=a->a,*v;
393   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394   PetscScalar    *x,*b;
395 
396   PetscFunctionBegin;
397   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
398   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
399   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400 
401   /* forward solve the U^T */
402   idx = 0;
403   for (i=0; i<n; i++) {
404 
405     v     = aa + 49*diag[i];
406     /* multiply by the inverse of the block diagonal */
407     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408     x6    = x[5+idx]; x7 = x[6+idx];
409     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416     v += 49;
417 
418     vi    = aj + diag[i] + 1;
419     nz    = ai[i+1] - diag[i] - 1;
420     while (nz--) {
421       oidx = 7*(*vi++);
422       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429       v  += 49;
430     }
431     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432     x[5+idx] = s6;x[6+idx] = s7;
433     idx += 7;
434   }
435   /* backward solve the L^T */
436   for (i=n-1; i>=0; i--){
437     v    = aa + 49*diag[i] - 49;
438     vi   = aj + diag[i] - 1;
439     nz   = diag[i] - ai[i];
440     idt  = 7*i;
441     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442     s6 = x[5+idt];s7 = x[6+idt];
443     while (nz--) {
444       idx   = 7*(*vi--);
445       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452       v -= 49;
453     }
454   }
455   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
458   PetscFunctionReturn(0);
459 }
460 
461 /*---------------------------------------------------------------------------------------------*/
462 #undef __FUNCT__
463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465 {
466   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467   IS             iscol=a->col,isrow=a->row;
468   PetscErrorCode ierr;
469   const PetscInt *r,*c,*rout,*cout;
470   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
471   PetscInt       *diag = a->diag;
472   MatScalar      *aa=a->a,*v;
473   PetscScalar    s1,*x,*b,*t;
474 
475   PetscFunctionBegin;
476   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
477   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
478   t  = a->solve_work;
479 
480   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
481   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
482 
483   /* copy the b into temp work space according to permutation */
484   for (i=0; i<n; i++) {
485     t[i] = b[c[i]];
486   }
487 
488   /* forward solve the U^T */
489   for (i=0; i<n; i++) {
490 
491     v     = aa + diag[i];
492     /* multiply by the inverse of the block diagonal */
493     s1    = (*v++)*t[i];
494     vi    = aj + diag[i] + 1;
495     nz    = ai[i+1] - diag[i] - 1;
496     while (nz--) {
497       t[*vi++]  -= (*v++)*s1;
498     }
499     t[i]   = s1;
500   }
501   /* backward solve the L^T */
502   for (i=n-1; i>=0; i--){
503     v    = aa + diag[i] - 1;
504     vi   = aj + diag[i] - 1;
505     nz   = diag[i] - ai[i];
506     s1   = t[i];
507     while (nz--) {
508       t[*vi--]   -=  (*v--)*s1;
509     }
510   }
511 
512   /* copy t into x according to permutation */
513   for (i=0; i<n; i++) {
514     x[r[i]]   = t[i];
515   }
516 
517   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
518   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
519   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
520   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
522   PetscFunctionReturn(0);
523 }
524 
525 #undef __FUNCT__
526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
527 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
528 {
529   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
530   IS             iscol=a->col,isrow=a->row;
531   PetscErrorCode ierr;
532   const PetscInt *r,*c,*rout,*cout;
533   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
534   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
535   MatScalar      *aa=a->a,*v;
536   PetscScalar    s1,s2,x1,x2;
537   PetscScalar    *x,*b,*t;
538 
539   PetscFunctionBegin;
540   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
541   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
542   t  = a->solve_work;
543 
544   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
545   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
546 
547   /* copy the b into temp work space according to permutation */
548   ii = 0;
549   for (i=0; i<n; i++) {
550     ic      = 2*c[i];
551     t[ii]   = b[ic];
552     t[ii+1] = b[ic+1];
553     ii += 2;
554   }
555 
556   /* forward solve the U^T */
557   idx = 0;
558   for (i=0; i<n; i++) {
559 
560     v     = aa + 4*diag[i];
561     /* multiply by the inverse of the block diagonal */
562     x1    = t[idx];   x2 = t[1+idx];
563     s1 = v[0]*x1  +  v[1]*x2;
564     s2 = v[2]*x1  +  v[3]*x2;
565     v += 4;
566 
567     vi    = aj + diag[i] + 1;
568     nz    = ai[i+1] - diag[i] - 1;
569     while (nz--) {
570       oidx = 2*(*vi++);
571       t[oidx]   -= v[0]*s1  +  v[1]*s2;
572       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
573       v  += 4;
574     }
575     t[idx]   = s1;t[1+idx] = s2;
576     idx += 2;
577   }
578   /* backward solve the L^T */
579   for (i=n-1; i>=0; i--){
580     v    = aa + 4*diag[i] - 4;
581     vi   = aj + diag[i] - 1;
582     nz   = diag[i] - ai[i];
583     idt  = 2*i;
584     s1 = t[idt];  s2 = t[1+idt];
585     while (nz--) {
586       idx   = 2*(*vi--);
587       t[idx]   -=  v[0]*s1 +  v[1]*s2;
588       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
589       v -= 4;
590     }
591   }
592 
593   /* copy t into x according to permutation */
594   ii = 0;
595   for (i=0; i<n; i++) {
596     ir      = 2*r[i];
597     x[ir]   = t[ii];
598     x[ir+1] = t[ii+1];
599     ii += 2;
600   }
601 
602   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
603   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
604   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
605   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
606   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
607   PetscFunctionReturn(0);
608 }
609 
610 #undef __FUNCT__
611 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
612 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
613 {
614   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
615   IS             iscol=a->col,isrow=a->row;
616   PetscErrorCode ierr;
617   const PetscInt *r,*c,*rout,*cout;
618   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
619   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
620   MatScalar      *aa=a->a,*v;
621   PetscScalar    s1,s2,s3,x1,x2,x3;
622   PetscScalar    *x,*b,*t;
623 
624   PetscFunctionBegin;
625   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
626   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
627   t  = a->solve_work;
628 
629   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
630   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
631 
632   /* copy the b into temp work space according to permutation */
633   ii = 0;
634   for (i=0; i<n; i++) {
635     ic      = 3*c[i];
636     t[ii]   = b[ic];
637     t[ii+1] = b[ic+1];
638     t[ii+2] = b[ic+2];
639     ii += 3;
640   }
641 
642   /* forward solve the U^T */
643   idx = 0;
644   for (i=0; i<n; i++) {
645 
646     v     = aa + 9*diag[i];
647     /* multiply by the inverse of the block diagonal */
648     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
649     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
650     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
651     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
652     v += 9;
653 
654     vi    = aj + diag[i] + 1;
655     nz    = ai[i+1] - diag[i] - 1;
656     while (nz--) {
657       oidx = 3*(*vi++);
658       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
659       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
660       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
661       v  += 9;
662     }
663     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
664     idx += 3;
665   }
666   /* backward solve the L^T */
667   for (i=n-1; i>=0; i--){
668     v    = aa + 9*diag[i] - 9;
669     vi   = aj + diag[i] - 1;
670     nz   = diag[i] - ai[i];
671     idt  = 3*i;
672     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
673     while (nz--) {
674       idx   = 3*(*vi--);
675       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
676       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
677       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
678       v -= 9;
679     }
680   }
681 
682   /* copy t into x according to permutation */
683   ii = 0;
684   for (i=0; i<n; i++) {
685     ir      = 3*r[i];
686     x[ir]   = t[ii];
687     x[ir+1] = t[ii+1];
688     x[ir+2] = t[ii+2];
689     ii += 3;
690   }
691 
692   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
693   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
694   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
695   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
696   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
697   PetscFunctionReturn(0);
698 }
699 
700 #undef __FUNCT__
701 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
702 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
703 {
704   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
705   IS             iscol=a->col,isrow=a->row;
706   PetscErrorCode ierr;
707   const PetscInt *r,*c,*rout,*cout;
708   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
709   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
710   MatScalar      *aa=a->a,*v;
711   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
712   PetscScalar    *x,*b,*t;
713 
714   PetscFunctionBegin;
715   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
716   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
717   t  = a->solve_work;
718 
719   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
720   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
721 
722   /* copy the b into temp work space according to permutation */
723   ii = 0;
724   for (i=0; i<n; i++) {
725     ic      = 4*c[i];
726     t[ii]   = b[ic];
727     t[ii+1] = b[ic+1];
728     t[ii+2] = b[ic+2];
729     t[ii+3] = b[ic+3];
730     ii += 4;
731   }
732 
733   /* forward solve the U^T */
734   idx = 0;
735   for (i=0; i<n; i++) {
736 
737     v     = aa + 16*diag[i];
738     /* multiply by the inverse of the block diagonal */
739     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
741     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
742     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
743     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
744     v += 16;
745 
746     vi    = aj + diag[i] + 1;
747     nz    = ai[i+1] - diag[i] - 1;
748     while (nz--) {
749       oidx = 4*(*vi++);
750       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
751       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
752       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
753       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
754       v  += 16;
755     }
756     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
757     idx += 4;
758   }
759   /* backward solve the L^T */
760   for (i=n-1; i>=0; i--){
761     v    = aa + 16*diag[i] - 16;
762     vi   = aj + diag[i] - 1;
763     nz   = diag[i] - ai[i];
764     idt  = 4*i;
765     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
766     while (nz--) {
767       idx   = 4*(*vi--);
768       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
769       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
770       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
771       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
772       v -= 16;
773     }
774   }
775 
776   /* copy t into x according to permutation */
777   ii = 0;
778   for (i=0; i<n; i++) {
779     ir      = 4*r[i];
780     x[ir]   = t[ii];
781     x[ir+1] = t[ii+1];
782     x[ir+2] = t[ii+2];
783     x[ir+3] = t[ii+3];
784     ii += 4;
785   }
786 
787   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
788   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
789   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
790   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
791   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
792   PetscFunctionReturn(0);
793 }
794 
795 #undef __FUNCT__
796 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
797 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
798 {
799   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
800   IS             iscol=a->col,isrow=a->row;
801   PetscErrorCode ierr;
802   const PetscInt *r,*c,*rout,*cout;
803   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
804   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
805   MatScalar      *aa=a->a,*v;
806   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
807   PetscScalar    *x,*b,*t;
808 
809   PetscFunctionBegin;
810   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
811   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
812   t  = a->solve_work;
813 
814   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
815   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
816 
817   /* copy the b into temp work space according to permutation */
818   ii = 0;
819   for (i=0; i<n; i++) {
820     ic      = 5*c[i];
821     t[ii]   = b[ic];
822     t[ii+1] = b[ic+1];
823     t[ii+2] = b[ic+2];
824     t[ii+3] = b[ic+3];
825     t[ii+4] = b[ic+4];
826     ii += 5;
827   }
828 
829   /* forward solve the U^T */
830   idx = 0;
831   for (i=0; i<n; i++) {
832 
833     v     = aa + 25*diag[i];
834     /* multiply by the inverse of the block diagonal */
835     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
836     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
837     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
838     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
839     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
840     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
841     v += 25;
842 
843     vi    = aj + diag[i] + 1;
844     nz    = ai[i+1] - diag[i] - 1;
845     while (nz--) {
846       oidx = 5*(*vi++);
847       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
848       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
849       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
850       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
851       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
852       v  += 25;
853     }
854     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
855     idx += 5;
856   }
857   /* backward solve the L^T */
858   for (i=n-1; i>=0; i--){
859     v    = aa + 25*diag[i] - 25;
860     vi   = aj + diag[i] - 1;
861     nz   = diag[i] - ai[i];
862     idt  = 5*i;
863     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
864     while (nz--) {
865       idx   = 5*(*vi--);
866       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
867       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
868       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
869       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
870       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
871       v -= 25;
872     }
873   }
874 
875   /* copy t into x according to permutation */
876   ii = 0;
877   for (i=0; i<n; i++) {
878     ir      = 5*r[i];
879     x[ir]   = t[ii];
880     x[ir+1] = t[ii+1];
881     x[ir+2] = t[ii+2];
882     x[ir+3] = t[ii+3];
883     x[ir+4] = t[ii+4];
884     ii += 5;
885   }
886 
887   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
888   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
889   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
890   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
891   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
892   PetscFunctionReturn(0);
893 }
894 
895 #undef __FUNCT__
896 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
897 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
898 {
899   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
900   IS             iscol=a->col,isrow=a->row;
901   PetscErrorCode ierr;
902   const PetscInt *r,*c,*rout,*cout;
903   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
904   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
905   MatScalar      *aa=a->a,*v;
906   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
907   PetscScalar    *x,*b,*t;
908 
909   PetscFunctionBegin;
910   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
911   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
912   t  = a->solve_work;
913 
914   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
915   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
916 
917   /* copy the b into temp work space according to permutation */
918   ii = 0;
919   for (i=0; i<n; i++) {
920     ic      = 6*c[i];
921     t[ii]   = b[ic];
922     t[ii+1] = b[ic+1];
923     t[ii+2] = b[ic+2];
924     t[ii+3] = b[ic+3];
925     t[ii+4] = b[ic+4];
926     t[ii+5] = b[ic+5];
927     ii += 6;
928   }
929 
930   /* forward solve the U^T */
931   idx = 0;
932   for (i=0; i<n; i++) {
933 
934     v     = aa + 36*diag[i];
935     /* multiply by the inverse of the block diagonal */
936     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
937     x6    = t[5+idx];
938     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
939     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
940     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
941     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
942     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
943     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
944     v += 36;
945 
946     vi    = aj + diag[i] + 1;
947     nz    = ai[i+1] - diag[i] - 1;
948     while (nz--) {
949       oidx = 6*(*vi++);
950       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
951       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
952       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
953       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
954       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
955       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
956       v  += 36;
957     }
958     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
959     t[5+idx] = s6;
960     idx += 6;
961   }
962   /* backward solve the L^T */
963   for (i=n-1; i>=0; i--){
964     v    = aa + 36*diag[i] - 36;
965     vi   = aj + diag[i] - 1;
966     nz   = diag[i] - ai[i];
967     idt  = 6*i;
968     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
969     s6 = t[5+idt];
970     while (nz--) {
971       idx   = 6*(*vi--);
972       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
973       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
974       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
975       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
976       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
977       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
978       v -= 36;
979     }
980   }
981 
982   /* copy t into x according to permutation */
983   ii = 0;
984   for (i=0; i<n; i++) {
985     ir      = 6*r[i];
986     x[ir]   = t[ii];
987     x[ir+1] = t[ii+1];
988     x[ir+2] = t[ii+2];
989     x[ir+3] = t[ii+3];
990     x[ir+4] = t[ii+4];
991     x[ir+5] = t[ii+5];
992     ii += 6;
993   }
994 
995   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
996   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
997   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
998   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
999   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1000   PetscFunctionReturn(0);
1001 }
1002 
1003 #undef __FUNCT__
1004 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1005 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1006 {
1007   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1008   IS             iscol=a->col,isrow=a->row;
1009   PetscErrorCode ierr;
1010   const PetscInt *r,*c,*rout,*cout;
1011   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1012   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1013   MatScalar      *aa=a->a,*v;
1014   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1015   PetscScalar    *x,*b,*t;
1016 
1017   PetscFunctionBegin;
1018   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1019   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1020   t  = a->solve_work;
1021 
1022   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1023   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1024 
1025   /* copy the b into temp work space according to permutation */
1026   ii = 0;
1027   for (i=0; i<n; i++) {
1028     ic      = 7*c[i];
1029     t[ii]   = b[ic];
1030     t[ii+1] = b[ic+1];
1031     t[ii+2] = b[ic+2];
1032     t[ii+3] = b[ic+3];
1033     t[ii+4] = b[ic+4];
1034     t[ii+5] = b[ic+5];
1035     t[ii+6] = b[ic+6];
1036     ii += 7;
1037   }
1038 
1039   /* forward solve the U^T */
1040   idx = 0;
1041   for (i=0; i<n; i++) {
1042 
1043     v     = aa + 49*diag[i];
1044     /* multiply by the inverse of the block diagonal */
1045     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1046     x6    = t[5+idx]; x7 = t[6+idx];
1047     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1048     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1049     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1050     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1051     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1052     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1053     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1054     v += 49;
1055 
1056     vi    = aj + diag[i] + 1;
1057     nz    = ai[i+1] - diag[i] - 1;
1058     while (nz--) {
1059       oidx = 7*(*vi++);
1060       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1061       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1062       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1063       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1064       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1065       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1066       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1067       v  += 49;
1068     }
1069     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1070     t[5+idx] = s6;t[6+idx] = s7;
1071     idx += 7;
1072   }
1073   /* backward solve the L^T */
1074   for (i=n-1; i>=0; i--){
1075     v    = aa + 49*diag[i] - 49;
1076     vi   = aj + diag[i] - 1;
1077     nz   = diag[i] - ai[i];
1078     idt  = 7*i;
1079     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1080     s6 = t[5+idt];s7 = t[6+idt];
1081     while (nz--) {
1082       idx   = 7*(*vi--);
1083       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1084       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1085       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1086       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1087       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1088       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1089       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1090       v -= 49;
1091     }
1092   }
1093 
1094   /* copy t into x according to permutation */
1095   ii = 0;
1096   for (i=0; i<n; i++) {
1097     ir      = 7*r[i];
1098     x[ir]   = t[ii];
1099     x[ir+1] = t[ii+1];
1100     x[ir+2] = t[ii+2];
1101     x[ir+3] = t[ii+3];
1102     x[ir+4] = t[ii+4];
1103     x[ir+5] = t[ii+5];
1104     x[ir+6] = t[ii+6];
1105     ii += 7;
1106   }
1107 
1108   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1109   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1110   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1111   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1112   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1113   PetscFunctionReturn(0);
1114 }
1115 
1116 /* ----------------------------------------------------------- */
1117 #undef __FUNCT__
1118 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1119 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1120 {
1121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1122   IS             iscol=a->col,isrow=a->row;
1123   PetscErrorCode ierr;
1124   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1125   PetscInt       i,n=a->mbs;
1126   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1127   MatScalar      *aa=a->a,*v;
1128   PetscScalar    *x,*b,*s,*t,*ls;
1129 
1130   PetscFunctionBegin;
1131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133   t  = a->solve_work;
1134 
1135   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1136   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1137 
1138   /* forward solve the lower triangular */
1139   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1140   for (i=1; i<n; i++) {
1141     v   = aa + bs2*ai[i];
1142     vi  = aj + ai[i];
1143     nz  = a->diag[i] - ai[i];
1144     s = t + bs*i;
1145     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1146     while (nz--) {
1147       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1148       v += bs2;
1149     }
1150   }
1151   /* backward solve the upper triangular */
1152   ls = a->solve_work + A->cmap->n;
1153   for (i=n-1; i>=0; i--){
1154     v   = aa + bs2*(a->diag[i] + 1);
1155     vi  = aj + a->diag[i] + 1;
1156     nz  = ai[i+1] - a->diag[i] - 1;
1157     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1158     while (nz--) {
1159       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1160       v += bs2;
1161     }
1162     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1163     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1164   }
1165 
1166   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1167   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1168   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1176 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1177 {
1178   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1179   IS             iscol=a->col,isrow=a->row;
1180   PetscErrorCode ierr;
1181   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1182   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1183   MatScalar      *aa=a->a,*v;
1184   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1185   PetscScalar    *x,*b,*t;
1186 
1187   PetscFunctionBegin;
1188   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1189   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1190   t  = a->solve_work;
1191 
1192   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1193   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1194 
1195   /* forward solve the lower triangular */
1196   idx    = 7*(*r++);
1197   t[0] = b[idx];   t[1] = b[1+idx];
1198   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1199   t[5] = b[5+idx]; t[6] = b[6+idx];
1200 
1201   for (i=1; i<n; i++) {
1202     v     = aa + 49*ai[i];
1203     vi    = aj + ai[i];
1204     nz    = diag[i] - ai[i];
1205     idx   = 7*(*r++);
1206     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1207     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1208     while (nz--) {
1209       idx   = 7*(*vi++);
1210       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1211       x4    = t[3+idx];x5 = t[4+idx];
1212       x6    = t[5+idx];x7 = t[6+idx];
1213       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1214       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1215       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1216       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1217       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1218       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1219       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1220       v += 49;
1221     }
1222     idx = 7*i;
1223     t[idx]   = s1;t[1+idx] = s2;
1224     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1225     t[5+idx] = s6;t[6+idx] = s7;
1226   }
1227   /* backward solve the upper triangular */
1228   for (i=n-1; i>=0; i--){
1229     v    = aa + 49*diag[i] + 49;
1230     vi   = aj + diag[i] + 1;
1231     nz   = ai[i+1] - diag[i] - 1;
1232     idt  = 7*i;
1233     s1 = t[idt];  s2 = t[1+idt];
1234     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1235     s6 = t[5+idt];s7 = t[6+idt];
1236     while (nz--) {
1237       idx   = 7*(*vi++);
1238       x1    = t[idx];   x2 = t[1+idx];
1239       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1240       x6    = t[5+idx]; x7 = t[6+idx];
1241       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1242       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1243       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1244       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1245       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1246       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1247       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1248       v += 49;
1249     }
1250     idc = 7*(*c--);
1251     v   = aa + 49*diag[i];
1252     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1253                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1254     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1255                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1256     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1257                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1258     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1259                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1260     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1261                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1262     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1263                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1264     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1265                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1266   }
1267 
1268   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1269   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1270   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1271   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1272   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1273   PetscFunctionReturn(0);
1274 }
1275 
1276 #undef __FUNCT__
1277 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1278 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1279 {
1280   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1281   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1282   PetscErrorCode    ierr;
1283   PetscInt          *diag = a->diag,jdx;
1284   const MatScalar   *aa=a->a,*v;
1285   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1286   const PetscScalar *b;
1287 
1288   PetscFunctionBegin;
1289   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1290   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1291   /* forward solve the lower triangular */
1292   idx    = 0;
1293   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1294   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1295   x[6] = b[6+idx];
1296   for (i=1; i<n; i++) {
1297     v     =  aa + 49*ai[i];
1298     vi    =  aj + ai[i];
1299     nz    =  diag[i] - ai[i];
1300     idx   =  7*i;
1301     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1302     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1303     s7  =  b[6+idx];
1304     while (nz--) {
1305       jdx   = 7*(*vi++);
1306       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1307       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1308       x7    = x[6+jdx];
1309       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1310       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1311       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1312       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1313       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1314       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1315       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1316       v += 49;
1317      }
1318     x[idx]   = s1;
1319     x[1+idx] = s2;
1320     x[2+idx] = s3;
1321     x[3+idx] = s4;
1322     x[4+idx] = s5;
1323     x[5+idx] = s6;
1324     x[6+idx] = s7;
1325   }
1326   /* backward solve the upper triangular */
1327   for (i=n-1; i>=0; i--){
1328     v    = aa + 49*diag[i] + 49;
1329     vi   = aj + diag[i] + 1;
1330     nz   = ai[i+1] - diag[i] - 1;
1331     idt  = 7*i;
1332     s1 = x[idt];   s2 = x[1+idt];
1333     s3 = x[2+idt]; s4 = x[3+idt];
1334     s5 = x[4+idt]; s6 = x[5+idt];
1335     s7 = x[6+idt];
1336     while (nz--) {
1337       idx   = 7*(*vi++);
1338       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1339       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1340       x7    = x[6+idx];
1341       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1342       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1343       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1344       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1345       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1346       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1347       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1348       v += 49;
1349     }
1350     v        = aa + 49*diag[i];
1351     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1352                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1353     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1354                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1355     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1356                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1357     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1358                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1359     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1360                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1361     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1362                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1363     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1364                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1365   }
1366 
1367   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1369   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 #undef __FUNCT__
1374 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1375 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1376 {
1377   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1378   IS                iscol=a->col,isrow=a->row;
1379   PetscErrorCode    ierr;
1380   const PetscInt    *r,*c,*rout,*cout;
1381   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1382   const MatScalar   *aa=a->a,*v;
1383   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1384   const PetscScalar *b;
1385   PetscFunctionBegin;
1386   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1387   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1388   t  = a->solve_work;
1389 
1390   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1391   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1392 
1393   /* forward solve the lower triangular */
1394   idx    = 6*(*r++);
1395   t[0] = b[idx];   t[1] = b[1+idx];
1396   t[2] = b[2+idx]; t[3] = b[3+idx];
1397   t[4] = b[4+idx]; t[5] = b[5+idx];
1398   for (i=1; i<n; i++) {
1399     v     = aa + 36*ai[i];
1400     vi    = aj + ai[i];
1401     nz    = diag[i] - ai[i];
1402     idx   = 6*(*r++);
1403     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1404     s5  = b[4+idx]; s6 = b[5+idx];
1405     while (nz--) {
1406       idx   = 6*(*vi++);
1407       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1408       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1409       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415       v += 36;
1416     }
1417     idx = 6*i;
1418     t[idx]   = s1;t[1+idx] = s2;
1419     t[2+idx] = s3;t[3+idx] = s4;
1420     t[4+idx] = s5;t[5+idx] = s6;
1421   }
1422   /* backward solve the upper triangular */
1423   for (i=n-1; i>=0; i--){
1424     v    = aa + 36*diag[i] + 36;
1425     vi   = aj + diag[i] + 1;
1426     nz   = ai[i+1] - diag[i] - 1;
1427     idt  = 6*i;
1428     s1 = t[idt];  s2 = t[1+idt];
1429     s3 = t[2+idt];s4 = t[3+idt];
1430     s5 = t[4+idt];s6 = t[5+idt];
1431     while (nz--) {
1432       idx   = 6*(*vi++);
1433       x1    = t[idx];   x2 = t[1+idx];
1434       x3    = t[2+idx]; x4 = t[3+idx];
1435       x5    = t[4+idx]; x6 = t[5+idx];
1436       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1437       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1438       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1439       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1440       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1441       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1442       v += 36;
1443     }
1444     idc = 6*(*c--);
1445     v   = aa + 36*diag[i];
1446     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1447                                  v[18]*s4+v[24]*s5+v[30]*s6;
1448     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1449                                  v[19]*s4+v[25]*s5+v[31]*s6;
1450     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1451                                  v[20]*s4+v[26]*s5+v[32]*s6;
1452     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1453                                  v[21]*s4+v[27]*s5+v[33]*s6;
1454     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1455                                  v[22]*s4+v[28]*s5+v[34]*s6;
1456     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1457                                  v[23]*s4+v[29]*s5+v[35]*s6;
1458   }
1459 
1460   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1461   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1462   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1463   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1464   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1465   PetscFunctionReturn(0);
1466 }
1467 
1468 #undef __FUNCT__
1469 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1470 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1471 {
1472   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1473   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1474   PetscErrorCode    ierr;
1475   PetscInt          *diag = a->diag,jdx;
1476   const MatScalar   *aa=a->a,*v;
1477   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1478   const PetscScalar *b;
1479 
1480   PetscFunctionBegin;
1481   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1482   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1483   /* forward solve the lower triangular */
1484   idx    = 0;
1485   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1486   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1487   for (i=1; i<n; i++) {
1488     v     =  aa + 36*ai[i];
1489     vi    =  aj + ai[i];
1490     nz    =  diag[i] - ai[i];
1491     idx   =  6*i;
1492     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1493     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1494     while (nz--) {
1495       jdx   = 6*(*vi++);
1496       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1497       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1498       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1499       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1500       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1501       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1502       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1503       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1504       v += 36;
1505      }
1506     x[idx]   = s1;
1507     x[1+idx] = s2;
1508     x[2+idx] = s3;
1509     x[3+idx] = s4;
1510     x[4+idx] = s5;
1511     x[5+idx] = s6;
1512   }
1513   /* backward solve the upper triangular */
1514   for (i=n-1; i>=0; i--){
1515     v    = aa + 36*diag[i] + 36;
1516     vi   = aj + diag[i] + 1;
1517     nz   = ai[i+1] - diag[i] - 1;
1518     idt  = 6*i;
1519     s1 = x[idt];   s2 = x[1+idt];
1520     s3 = x[2+idt]; s4 = x[3+idt];
1521     s5 = x[4+idt]; s6 = x[5+idt];
1522     while (nz--) {
1523       idx   = 6*(*vi++);
1524       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1525       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1526       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1527       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1528       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1529       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1530       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1531       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1532       v += 36;
1533     }
1534     v        = aa + 36*diag[i];
1535     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1536     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1537     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1538     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1539     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1540     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1541   }
1542 
1543   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1544   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1545   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1546   PetscFunctionReturn(0);
1547 }
1548 
1549 #undef __FUNCT__
1550 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1551 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1552 {
1553   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1554   IS                iscol=a->col,isrow=a->row;
1555   PetscErrorCode    ierr;
1556   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
1557   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1558   const MatScalar   *aa=a->a,*v;
1559   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1560   const PetscScalar *b;
1561 
1562   PetscFunctionBegin;
1563   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1564   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1565   t  = a->solve_work;
1566 
1567   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1568   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1569 
1570   /* forward solve the lower triangular */
1571   idx    = 5*(*r++);
1572   t[0] = b[idx];   t[1] = b[1+idx];
1573   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1574   for (i=1; i<n; i++) {
1575     v     = aa + 25*ai[i];
1576     vi    = aj + ai[i];
1577     nz    = diag[i] - ai[i];
1578     idx   = 5*(*r++);
1579     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1580     s5  = b[4+idx];
1581     while (nz--) {
1582       idx   = 5*(*vi++);
1583       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1584       x4    = t[3+idx];x5 = t[4+idx];
1585       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1586       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1587       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1588       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1589       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1590       v += 25;
1591     }
1592     idx = 5*i;
1593     t[idx]   = s1;t[1+idx] = s2;
1594     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1595   }
1596   /* backward solve the upper triangular */
1597   for (i=n-1; i>=0; i--){
1598     v    = aa + 25*diag[i] + 25;
1599     vi   = aj + diag[i] + 1;
1600     nz   = ai[i+1] - diag[i] - 1;
1601     idt  = 5*i;
1602     s1 = t[idt];  s2 = t[1+idt];
1603     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1604     while (nz--) {
1605       idx   = 5*(*vi++);
1606       x1    = t[idx];   x2 = t[1+idx];
1607       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1608       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1609       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1610       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1611       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1612       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1613       v += 25;
1614     }
1615     idc = 5*(*c--);
1616     v   = aa + 25*diag[i];
1617     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1618                                  v[15]*s4+v[20]*s5;
1619     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1620                                  v[16]*s4+v[21]*s5;
1621     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1622                                  v[17]*s4+v[22]*s5;
1623     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1624                                  v[18]*s4+v[23]*s5;
1625     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1626                                  v[19]*s4+v[24]*s5;
1627   }
1628 
1629   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1630   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1631   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1633   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1638 {
1639   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1640   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1641   PetscErrorCode    ierr;
1642   PetscInt          jdx;
1643   const MatScalar   *aa=a->a,*v;
1644   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1645   const PetscScalar *b;
1646 
1647   PetscFunctionBegin;
1648   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1649   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1650   /* forward solve the lower triangular */
1651   idx    = 0;
1652   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1653   for (i=1; i<n; i++) {
1654     v   = aa + 25*ai[i];
1655     vi  = aj + ai[i];
1656     nz  = ai[i+1] - ai[i];
1657     idx = 5*i;
1658     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1659     while (nz--) {
1660       jdx   = 5*(*vi++);
1661       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1662       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1663       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1664       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1665       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1666       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1667       v    += 25;
1668     }
1669     x[idx]   = s1;
1670     x[1+idx] = s2;
1671     x[2+idx] = s3;
1672     x[3+idx] = s4;
1673     x[4+idx] = s5;
1674   }
1675 
1676   /* backward solve the upper triangular */
1677   for (i=n-1; i>=0; i--){
1678     v   = aa + 25*ai[2*n-i];
1679     vi  = aj + ai[2*n-i];
1680     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1681     idt = 5*i;
1682     s1 = x[idt];  s2 = x[1+idt];
1683     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1684     while (nz--) {
1685       idx   = 5*(*vi++);
1686       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1687       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1688       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1689       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1690       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1691       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1692       v    += 25;
1693     }
1694     /* x = inv_diagonal*x */
1695     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1696     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1697     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1698     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1699     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1700   }
1701 
1702   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1703   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1704   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1705   PetscFunctionReturn(0);
1706 }
1707 
1708 #undef __FUNCT__
1709 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1710 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1711 {
1712   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1713   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1714   PetscErrorCode    ierr;
1715   PetscInt          *diag = a->diag,jdx;
1716   const MatScalar   *aa=a->a,*v;
1717   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1718   const PetscScalar *b;
1719 
1720   PetscFunctionBegin;
1721   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1722   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1723   /* forward solve the lower triangular */
1724   idx    = 0;
1725   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1726   for (i=1; i<n; i++) {
1727     v     =  aa + 25*ai[i];
1728     vi    =  aj + ai[i];
1729     nz    =  diag[i] - ai[i];
1730     idx   =  5*i;
1731     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1732     while (nz--) {
1733       jdx   = 5*(*vi++);
1734       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1735       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1736       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1737       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1738       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1739       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1740       v    += 25;
1741     }
1742     x[idx]   = s1;
1743     x[1+idx] = s2;
1744     x[2+idx] = s3;
1745     x[3+idx] = s4;
1746     x[4+idx] = s5;
1747   }
1748   /* backward solve the upper triangular */
1749   for (i=n-1; i>=0; i--){
1750     v    = aa + 25*diag[i] + 25;
1751     vi   = aj + diag[i] + 1;
1752     nz   = ai[i+1] - diag[i] - 1;
1753     idt  = 5*i;
1754     s1 = x[idt];  s2 = x[1+idt];
1755     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1756     while (nz--) {
1757       idx   = 5*(*vi++);
1758       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1759       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1760       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1761       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1762       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1763       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1764       v    += 25;
1765     }
1766     v        = aa + 25*diag[i];
1767     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1768     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1769     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1770     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1771     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1772   }
1773 
1774   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1775   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1776   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1777   PetscFunctionReturn(0);
1778 }
1779 
1780 #undef __FUNCT__
1781 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1782 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1783 {
1784   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1785   IS                iscol=a->col,isrow=a->row;
1786   PetscErrorCode    ierr;
1787   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1788   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1789   const MatScalar   *aa=a->a,*v;
1790   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1791   const PetscScalar *b;
1792 
1793   PetscFunctionBegin;
1794   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1796   t  = a->solve_work;
1797 
1798   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1799   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1800 
1801   /* forward solve the lower triangular */
1802   idx    = 4*(*r++);
1803   t[0] = b[idx];   t[1] = b[1+idx];
1804   t[2] = b[2+idx]; t[3] = b[3+idx];
1805   for (i=1; i<n; i++) {
1806     v     = aa + 16*ai[i];
1807     vi    = aj + ai[i];
1808     nz    = diag[i] - ai[i];
1809     idx   = 4*(*r++);
1810     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1811     while (nz--) {
1812       idx   = 4*(*vi++);
1813       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1814       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1815       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1816       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1817       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1818       v    += 16;
1819     }
1820     idx        = 4*i;
1821     t[idx]   = s1;t[1+idx] = s2;
1822     t[2+idx] = s3;t[3+idx] = s4;
1823   }
1824   /* backward solve the upper triangular */
1825   for (i=n-1; i>=0; i--){
1826     v    = aa + 16*diag[i] + 16;
1827     vi   = aj + diag[i] + 1;
1828     nz   = ai[i+1] - diag[i] - 1;
1829     idt  = 4*i;
1830     s1 = t[idt];  s2 = t[1+idt];
1831     s3 = t[2+idt];s4 = t[3+idt];
1832     while (nz--) {
1833       idx   = 4*(*vi++);
1834       x1    = t[idx];   x2 = t[1+idx];
1835       x3    = t[2+idx]; x4 = t[3+idx];
1836       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1837       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1838       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1839       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1840       v += 16;
1841     }
1842     idc      = 4*(*c--);
1843     v        = aa + 16*diag[i];
1844     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1845     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1846     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1847     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1848   }
1849 
1850   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1851   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1852   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1853   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1854   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1855   PetscFunctionReturn(0);
1856 }
1857 
1858 #undef __FUNCT__
1859 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1860 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1861 {
1862   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1863   IS                iscol=a->col,isrow=a->row;
1864   PetscErrorCode    ierr;
1865   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1866   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1867   const MatScalar   *aa=a->a,*v;
1868   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1869   PetscScalar       *x;
1870   const PetscScalar *b;
1871 
1872   PetscFunctionBegin;
1873   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1874   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1875   t  = (MatScalar *)a->solve_work;
1876 
1877   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1878   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1879 
1880   /* forward solve the lower triangular */
1881   idx    = 4*(*r++);
1882   t[0] = (MatScalar)b[idx];
1883   t[1] = (MatScalar)b[1+idx];
1884   t[2] = (MatScalar)b[2+idx];
1885   t[3] = (MatScalar)b[3+idx];
1886   for (i=1; i<n; i++) {
1887     v     = aa + 16*ai[i];
1888     vi    = aj + ai[i];
1889     nz    = diag[i] - ai[i];
1890     idx   = 4*(*r++);
1891     s1 = (MatScalar)b[idx];
1892     s2 = (MatScalar)b[1+idx];
1893     s3 = (MatScalar)b[2+idx];
1894     s4 = (MatScalar)b[3+idx];
1895     while (nz--) {
1896       idx   = 4*(*vi++);
1897       x1  = t[idx];
1898       x2  = t[1+idx];
1899       x3  = t[2+idx];
1900       x4  = t[3+idx];
1901       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1902       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1903       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1904       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1905       v    += 16;
1906     }
1907     idx        = 4*i;
1908     t[idx]   = s1;
1909     t[1+idx] = s2;
1910     t[2+idx] = s3;
1911     t[3+idx] = s4;
1912   }
1913   /* backward solve the upper triangular */
1914   for (i=n-1; i>=0; i--){
1915     v    = aa + 16*diag[i] + 16;
1916     vi   = aj + diag[i] + 1;
1917     nz   = ai[i+1] - diag[i] - 1;
1918     idt  = 4*i;
1919     s1 = t[idt];
1920     s2 = t[1+idt];
1921     s3 = t[2+idt];
1922     s4 = t[3+idt];
1923     while (nz--) {
1924       idx   = 4*(*vi++);
1925       x1  = t[idx];
1926       x2  = t[1+idx];
1927       x3  = t[2+idx];
1928       x4  = t[3+idx];
1929       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1930       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1931       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1932       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1933       v += 16;
1934     }
1935     idc      = 4*(*c--);
1936     v        = aa + 16*diag[i];
1937     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1938     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1939     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1940     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1941     x[idc]   = (PetscScalar)t[idt];
1942     x[1+idc] = (PetscScalar)t[1+idt];
1943     x[2+idc] = (PetscScalar)t[2+idt];
1944     x[3+idc] = (PetscScalar)t[3+idt];
1945  }
1946 
1947   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1948   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1949   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1950   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1951   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 
1955 #if defined (PETSC_HAVE_SSE)
1956 
1957 #include PETSC_HAVE_SSE
1958 
1959 #undef __FUNCT__
1960 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1961 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1962 {
1963   /*
1964      Note: This code uses demotion of double
1965      to float when performing the mixed-mode computation.
1966      This may not be numerically reasonable for all applications.
1967   */
1968   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1969   IS             iscol=a->col,isrow=a->row;
1970   PetscErrorCode ierr;
1971   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
1972   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1973   MatScalar      *aa=a->a,*v;
1974   PetscScalar    *x,*b,*t;
1975 
1976   /* Make space in temp stack for 16 Byte Aligned arrays */
1977   float           ssealignedspace[11],*tmps,*tmpx;
1978   unsigned long   offset;
1979 
1980   PetscFunctionBegin;
1981   SSE_SCOPE_BEGIN;
1982 
1983     offset = (unsigned long)ssealignedspace % 16;
1984     if (offset) offset = (16 - offset)/4;
1985     tmps = &ssealignedspace[offset];
1986     tmpx = &ssealignedspace[offset+4];
1987     PREFETCH_NTA(aa+16*ai[1]);
1988 
1989     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1990     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1991     t  = a->solve_work;
1992 
1993     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1994     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1995 
1996     /* forward solve the lower triangular */
1997     idx  = 4*(*r++);
1998     t[0] = b[idx];   t[1] = b[1+idx];
1999     t[2] = b[2+idx]; t[3] = b[3+idx];
2000     v    =  aa + 16*ai[1];
2001 
2002     for (i=1; i<n;) {
2003       PREFETCH_NTA(&v[8]);
2004       vi   =  aj      + ai[i];
2005       nz   =  diag[i] - ai[i];
2006       idx  =  4*(*r++);
2007 
2008       /* Demote sum from double to float */
2009       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2010       LOAD_PS(tmps,XMM7);
2011 
2012       while (nz--) {
2013         PREFETCH_NTA(&v[16]);
2014         idx = 4*(*vi++);
2015 
2016         /* Demote solution (so far) from double to float */
2017         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2018 
2019         /* 4x4 Matrix-Vector product with negative accumulation: */
2020         SSE_INLINE_BEGIN_2(tmpx,v)
2021           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2022 
2023           /* First Column */
2024           SSE_COPY_PS(XMM0,XMM6)
2025           SSE_SHUFFLE(XMM0,XMM0,0x00)
2026           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2027           SSE_SUB_PS(XMM7,XMM0)
2028 
2029           /* Second Column */
2030           SSE_COPY_PS(XMM1,XMM6)
2031           SSE_SHUFFLE(XMM1,XMM1,0x55)
2032           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2033           SSE_SUB_PS(XMM7,XMM1)
2034 
2035           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2036 
2037           /* Third Column */
2038           SSE_COPY_PS(XMM2,XMM6)
2039           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2040           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2041           SSE_SUB_PS(XMM7,XMM2)
2042 
2043           /* Fourth Column */
2044           SSE_COPY_PS(XMM3,XMM6)
2045           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2046           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2047           SSE_SUB_PS(XMM7,XMM3)
2048         SSE_INLINE_END_2
2049 
2050         v  += 16;
2051       }
2052       idx = 4*i;
2053       v   = aa + 16*ai[++i];
2054       PREFETCH_NTA(v);
2055       STORE_PS(tmps,XMM7);
2056 
2057       /* Promote result from float to double */
2058       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
2059     }
2060     /* backward solve the upper triangular */
2061     idt  = 4*(n-1);
2062     ai16 = 16*diag[n-1];
2063     v    = aa + ai16 + 16;
2064     for (i=n-1; i>=0;){
2065       PREFETCH_NTA(&v[8]);
2066       vi = aj + diag[i] + 1;
2067       nz = ai[i+1] - diag[i] - 1;
2068 
2069       /* Demote accumulator from double to float */
2070       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2071       LOAD_PS(tmps,XMM7);
2072 
2073       while (nz--) {
2074         PREFETCH_NTA(&v[16]);
2075         idx = 4*(*vi++);
2076 
2077         /* Demote solution (so far) from double to float */
2078         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2079 
2080         /* 4x4 Matrix-Vector Product with negative accumulation: */
2081         SSE_INLINE_BEGIN_2(tmpx,v)
2082           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2083 
2084           /* First Column */
2085           SSE_COPY_PS(XMM0,XMM6)
2086           SSE_SHUFFLE(XMM0,XMM0,0x00)
2087           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2088           SSE_SUB_PS(XMM7,XMM0)
2089 
2090           /* Second Column */
2091           SSE_COPY_PS(XMM1,XMM6)
2092           SSE_SHUFFLE(XMM1,XMM1,0x55)
2093           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2094           SSE_SUB_PS(XMM7,XMM1)
2095 
2096           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2097 
2098           /* Third Column */
2099           SSE_COPY_PS(XMM2,XMM6)
2100           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2101           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2102           SSE_SUB_PS(XMM7,XMM2)
2103 
2104           /* Fourth Column */
2105           SSE_COPY_PS(XMM3,XMM6)
2106           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2107           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2108           SSE_SUB_PS(XMM7,XMM3)
2109         SSE_INLINE_END_2
2110         v  += 16;
2111       }
2112       v    = aa + ai16;
2113       ai16 = 16*diag[--i];
2114       PREFETCH_NTA(aa+ai16+16);
2115       /*
2116          Scale the result by the diagonal 4x4 block,
2117          which was inverted as part of the factorization
2118       */
2119       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2120         /* First Column */
2121         SSE_COPY_PS(XMM0,XMM7)
2122         SSE_SHUFFLE(XMM0,XMM0,0x00)
2123         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2124 
2125         /* Second Column */
2126         SSE_COPY_PS(XMM1,XMM7)
2127         SSE_SHUFFLE(XMM1,XMM1,0x55)
2128         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2129         SSE_ADD_PS(XMM0,XMM1)
2130 
2131         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2132 
2133         /* Third Column */
2134         SSE_COPY_PS(XMM2,XMM7)
2135         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2136         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2137         SSE_ADD_PS(XMM0,XMM2)
2138 
2139         /* Fourth Column */
2140         SSE_COPY_PS(XMM3,XMM7)
2141         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2142         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2143         SSE_ADD_PS(XMM0,XMM3)
2144 
2145         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2146       SSE_INLINE_END_3
2147 
2148       /* Promote solution from float to double */
2149       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2150 
2151       /* Apply reordering to t and stream into x.    */
2152       /* This way, x doesn't pollute the cache.      */
2153       /* Be careful with size: 2 doubles = 4 floats! */
2154       idc  = 4*(*c--);
2155       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2156         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2157         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2158         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2159         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2160         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2161         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2162       SSE_INLINE_END_2
2163       v    = aa + ai16 + 16;
2164       idt -= 4;
2165     }
2166 
2167     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2168     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2169     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2170     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2171     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2172   SSE_SCOPE_END;
2173   PetscFunctionReturn(0);
2174 }
2175 
2176 #endif
2177 
2178 
2179 /*
2180       Special case where the matrix was ILU(0) factored in the natural
2181    ordering. This eliminates the need for the column and row permutation.
2182 */
2183 #undef __FUNCT__
2184 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2185 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2186 {
2187   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2188   PetscInt          n=a->mbs;
2189   const PetscInt    *ai=a->i,*aj=a->j;
2190   PetscErrorCode    ierr;
2191   const PetscInt    *diag = a->diag;
2192   const MatScalar   *aa=a->a;
2193   PetscScalar       *x;
2194   const PetscScalar *b;
2195 
2196   PetscFunctionBegin;
2197   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2198   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2199 
2200 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2201   {
2202     static PetscScalar w[2000]; /* very BAD need to fix */
2203     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2204   }
2205 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2206   {
2207     static PetscScalar w[2000]; /* very BAD need to fix */
2208     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2209   }
2210 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2211   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2212 #else
2213   {
2214     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2215     const MatScalar *v;
2216     PetscInt        jdx,idt,idx,nz,i,ai16;
2217     const PetscInt  *vi;
2218 
2219   /* forward solve the lower triangular */
2220   idx    = 0;
2221   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2222   for (i=1; i<n; i++) {
2223     v     =  aa      + 16*ai[i];
2224     vi    =  aj      + ai[i];
2225     nz    =  diag[i] - ai[i];
2226     idx   +=  4;
2227     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2228     while (nz--) {
2229       jdx   = 4*(*vi++);
2230       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2231       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2232       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2233       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2234       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2235       v    += 16;
2236     }
2237     x[idx]   = s1;
2238     x[1+idx] = s2;
2239     x[2+idx] = s3;
2240     x[3+idx] = s4;
2241   }
2242   /* backward solve the upper triangular */
2243   idt = 4*(n-1);
2244   for (i=n-1; i>=0; i--){
2245     ai16 = 16*diag[i];
2246     v    = aa + ai16 + 16;
2247     vi   = aj + diag[i] + 1;
2248     nz   = ai[i+1] - diag[i] - 1;
2249     s1 = x[idt];  s2 = x[1+idt];
2250     s3 = x[2+idt];s4 = x[3+idt];
2251     while (nz--) {
2252       idx   = 4*(*vi++);
2253       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2254       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2255       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2256       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2257       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2258       v    += 16;
2259     }
2260     v        = aa + ai16;
2261     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2262     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2263     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2264     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2265     idt -= 4;
2266   }
2267   }
2268 #endif
2269 
2270   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2271   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2272   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2273   PetscFunctionReturn(0);
2274 }
2275 
2276 #undef __FUNCT__
2277 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2278 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2279 {
2280   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2281   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2282   PetscErrorCode ierr;
2283   PetscInt       *diag = a->diag;
2284   MatScalar      *aa=a->a;
2285   PetscScalar    *x,*b;
2286 
2287   PetscFunctionBegin;
2288   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2289   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2290 
2291   {
2292     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2293     MatScalar  *v,*t=(MatScalar *)x;
2294     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2295 
2296     /* forward solve the lower triangular */
2297     idx  = 0;
2298     t[0] = (MatScalar)b[0];
2299     t[1] = (MatScalar)b[1];
2300     t[2] = (MatScalar)b[2];
2301     t[3] = (MatScalar)b[3];
2302     for (i=1; i<n; i++) {
2303       v     =  aa      + 16*ai[i];
2304       vi    =  aj      + ai[i];
2305       nz    =  diag[i] - ai[i];
2306       idx   +=  4;
2307       s1 = (MatScalar)b[idx];
2308       s2 = (MatScalar)b[1+idx];
2309       s3 = (MatScalar)b[2+idx];
2310       s4 = (MatScalar)b[3+idx];
2311       while (nz--) {
2312         jdx = 4*(*vi++);
2313         x1  = t[jdx];
2314         x2  = t[1+jdx];
2315         x3  = t[2+jdx];
2316         x4  = t[3+jdx];
2317         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2318         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2319         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2320         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2321         v    += 16;
2322       }
2323       t[idx]   = s1;
2324       t[1+idx] = s2;
2325       t[2+idx] = s3;
2326       t[3+idx] = s4;
2327     }
2328     /* backward solve the upper triangular */
2329     idt = 4*(n-1);
2330     for (i=n-1; i>=0; i--){
2331       ai16 = 16*diag[i];
2332       v    = aa + ai16 + 16;
2333       vi   = aj + diag[i] + 1;
2334       nz   = ai[i+1] - diag[i] - 1;
2335       s1   = t[idt];
2336       s2   = t[1+idt];
2337       s3   = t[2+idt];
2338       s4   = t[3+idt];
2339       while (nz--) {
2340         idx = 4*(*vi++);
2341         x1  = (MatScalar)x[idx];
2342         x2  = (MatScalar)x[1+idx];
2343         x3  = (MatScalar)x[2+idx];
2344         x4  = (MatScalar)x[3+idx];
2345         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2346         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2347         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2348         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2349         v    += 16;
2350       }
2351       v        = aa + ai16;
2352       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2353       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2354       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2355       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2356       idt -= 4;
2357     }
2358   }
2359 
2360   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2361   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2362   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2363   PetscFunctionReturn(0);
2364 }
2365 
2366 #if defined (PETSC_HAVE_SSE)
2367 
2368 #include PETSC_HAVE_SSE
2369 #undef __FUNCT__
2370 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2371 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2372 {
2373   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2374   unsigned short *aj=(unsigned short *)a->j;
2375   PetscErrorCode ierr;
2376   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2377   MatScalar      *aa=a->a;
2378   PetscScalar    *x,*b;
2379 
2380   PetscFunctionBegin;
2381   SSE_SCOPE_BEGIN;
2382   /*
2383      Note: This code currently uses demotion of double
2384      to float when performing the mixed-mode computation.
2385      This may not be numerically reasonable for all applications.
2386   */
2387   PREFETCH_NTA(aa+16*ai[1]);
2388 
2389   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2390   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2391   {
2392     /* x will first be computed in single precision then promoted inplace to double */
2393     MatScalar      *v,*t=(MatScalar *)x;
2394     int            nz,i,idt,ai16;
2395     unsigned int   jdx,idx;
2396     unsigned short *vi;
2397     /* Forward solve the lower triangular factor. */
2398 
2399     /* First block is the identity. */
2400     idx  = 0;
2401     CONVERT_DOUBLE4_FLOAT4(t,b);
2402     v    =  aa + 16*((unsigned int)ai[1]);
2403 
2404     for (i=1; i<n;) {
2405       PREFETCH_NTA(&v[8]);
2406       vi   =  aj      + ai[i];
2407       nz   =  diag[i] - ai[i];
2408       idx +=  4;
2409 
2410       /* Demote RHS from double to float. */
2411       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2412       LOAD_PS(&t[idx],XMM7);
2413 
2414       while (nz--) {
2415         PREFETCH_NTA(&v[16]);
2416         jdx = 4*((unsigned int)(*vi++));
2417 
2418         /* 4x4 Matrix-Vector product with negative accumulation: */
2419         SSE_INLINE_BEGIN_2(&t[jdx],v)
2420           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2421 
2422           /* First Column */
2423           SSE_COPY_PS(XMM0,XMM6)
2424           SSE_SHUFFLE(XMM0,XMM0,0x00)
2425           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2426           SSE_SUB_PS(XMM7,XMM0)
2427 
2428           /* Second Column */
2429           SSE_COPY_PS(XMM1,XMM6)
2430           SSE_SHUFFLE(XMM1,XMM1,0x55)
2431           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2432           SSE_SUB_PS(XMM7,XMM1)
2433 
2434           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2435 
2436           /* Third Column */
2437           SSE_COPY_PS(XMM2,XMM6)
2438           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2439           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2440           SSE_SUB_PS(XMM7,XMM2)
2441 
2442           /* Fourth Column */
2443           SSE_COPY_PS(XMM3,XMM6)
2444           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2445           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2446           SSE_SUB_PS(XMM7,XMM3)
2447         SSE_INLINE_END_2
2448 
2449         v  += 16;
2450       }
2451       v    =  aa + 16*ai[++i];
2452       PREFETCH_NTA(v);
2453       STORE_PS(&t[idx],XMM7);
2454     }
2455 
2456     /* Backward solve the upper triangular factor.*/
2457 
2458     idt  = 4*(n-1);
2459     ai16 = 16*diag[n-1];
2460     v    = aa + ai16 + 16;
2461     for (i=n-1; i>=0;){
2462       PREFETCH_NTA(&v[8]);
2463       vi = aj + diag[i] + 1;
2464       nz = ai[i+1] - diag[i] - 1;
2465 
2466       LOAD_PS(&t[idt],XMM7);
2467 
2468       while (nz--) {
2469         PREFETCH_NTA(&v[16]);
2470         idx = 4*((unsigned int)(*vi++));
2471 
2472         /* 4x4 Matrix-Vector Product with negative accumulation: */
2473         SSE_INLINE_BEGIN_2(&t[idx],v)
2474           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2475 
2476           /* First Column */
2477           SSE_COPY_PS(XMM0,XMM6)
2478           SSE_SHUFFLE(XMM0,XMM0,0x00)
2479           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2480           SSE_SUB_PS(XMM7,XMM0)
2481 
2482           /* Second Column */
2483           SSE_COPY_PS(XMM1,XMM6)
2484           SSE_SHUFFLE(XMM1,XMM1,0x55)
2485           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2486           SSE_SUB_PS(XMM7,XMM1)
2487 
2488           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2489 
2490           /* Third Column */
2491           SSE_COPY_PS(XMM2,XMM6)
2492           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2493           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2494           SSE_SUB_PS(XMM7,XMM2)
2495 
2496           /* Fourth Column */
2497           SSE_COPY_PS(XMM3,XMM6)
2498           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2499           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2500           SSE_SUB_PS(XMM7,XMM3)
2501         SSE_INLINE_END_2
2502         v  += 16;
2503       }
2504       v    = aa + ai16;
2505       ai16 = 16*diag[--i];
2506       PREFETCH_NTA(aa+ai16+16);
2507       /*
2508          Scale the result by the diagonal 4x4 block,
2509          which was inverted as part of the factorization
2510       */
2511       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2512         /* First Column */
2513         SSE_COPY_PS(XMM0,XMM7)
2514         SSE_SHUFFLE(XMM0,XMM0,0x00)
2515         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2516 
2517         /* Second Column */
2518         SSE_COPY_PS(XMM1,XMM7)
2519         SSE_SHUFFLE(XMM1,XMM1,0x55)
2520         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2521         SSE_ADD_PS(XMM0,XMM1)
2522 
2523         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2524 
2525         /* Third Column */
2526         SSE_COPY_PS(XMM2,XMM7)
2527         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2528         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2529         SSE_ADD_PS(XMM0,XMM2)
2530 
2531         /* Fourth Column */
2532         SSE_COPY_PS(XMM3,XMM7)
2533         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2534         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2535         SSE_ADD_PS(XMM0,XMM3)
2536 
2537         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2538       SSE_INLINE_END_3
2539 
2540       v    = aa + ai16 + 16;
2541       idt -= 4;
2542     }
2543 
2544     /* Convert t from single precision back to double precision (inplace)*/
2545     idt = 4*(n-1);
2546     for (i=n-1;i>=0;i--) {
2547       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2548       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2549       PetscScalar *xtemp=&x[idt];
2550       MatScalar   *ttemp=&t[idt];
2551       xtemp[3] = (PetscScalar)ttemp[3];
2552       xtemp[2] = (PetscScalar)ttemp[2];
2553       xtemp[1] = (PetscScalar)ttemp[1];
2554       xtemp[0] = (PetscScalar)ttemp[0];
2555       idt -= 4;
2556     }
2557 
2558   } /* End of artificial scope. */
2559   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2560   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2561   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2562   SSE_SCOPE_END;
2563   PetscFunctionReturn(0);
2564 }
2565 
2566 #undef __FUNCT__
2567 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2568 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2569 {
2570   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2571   int            *aj=a->j;
2572   PetscErrorCode ierr;
2573   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2574   MatScalar      *aa=a->a;
2575   PetscScalar    *x,*b;
2576 
2577   PetscFunctionBegin;
2578   SSE_SCOPE_BEGIN;
2579   /*
2580      Note: This code currently uses demotion of double
2581      to float when performing the mixed-mode computation.
2582      This may not be numerically reasonable for all applications.
2583   */
2584   PREFETCH_NTA(aa+16*ai[1]);
2585 
2586   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2587   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2588   {
2589     /* x will first be computed in single precision then promoted inplace to double */
2590     MatScalar *v,*t=(MatScalar *)x;
2591     int       nz,i,idt,ai16;
2592     int       jdx,idx;
2593     int       *vi;
2594     /* Forward solve the lower triangular factor. */
2595 
2596     /* First block is the identity. */
2597     idx  = 0;
2598     CONVERT_DOUBLE4_FLOAT4(t,b);
2599     v    =  aa + 16*ai[1];
2600 
2601     for (i=1; i<n;) {
2602       PREFETCH_NTA(&v[8]);
2603       vi   =  aj      + ai[i];
2604       nz   =  diag[i] - ai[i];
2605       idx +=  4;
2606 
2607       /* Demote RHS from double to float. */
2608       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2609       LOAD_PS(&t[idx],XMM7);
2610 
2611       while (nz--) {
2612         PREFETCH_NTA(&v[16]);
2613         jdx = 4*(*vi++);
2614 /*          jdx = *vi++; */
2615 
2616         /* 4x4 Matrix-Vector product with negative accumulation: */
2617         SSE_INLINE_BEGIN_2(&t[jdx],v)
2618           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2619 
2620           /* First Column */
2621           SSE_COPY_PS(XMM0,XMM6)
2622           SSE_SHUFFLE(XMM0,XMM0,0x00)
2623           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2624           SSE_SUB_PS(XMM7,XMM0)
2625 
2626           /* Second Column */
2627           SSE_COPY_PS(XMM1,XMM6)
2628           SSE_SHUFFLE(XMM1,XMM1,0x55)
2629           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2630           SSE_SUB_PS(XMM7,XMM1)
2631 
2632           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2633 
2634           /* Third Column */
2635           SSE_COPY_PS(XMM2,XMM6)
2636           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2637           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2638           SSE_SUB_PS(XMM7,XMM2)
2639 
2640           /* Fourth Column */
2641           SSE_COPY_PS(XMM3,XMM6)
2642           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2643           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2644           SSE_SUB_PS(XMM7,XMM3)
2645         SSE_INLINE_END_2
2646 
2647         v  += 16;
2648       }
2649       v    =  aa + 16*ai[++i];
2650       PREFETCH_NTA(v);
2651       STORE_PS(&t[idx],XMM7);
2652     }
2653 
2654     /* Backward solve the upper triangular factor.*/
2655 
2656     idt  = 4*(n-1);
2657     ai16 = 16*diag[n-1];
2658     v    = aa + ai16 + 16;
2659     for (i=n-1; i>=0;){
2660       PREFETCH_NTA(&v[8]);
2661       vi = aj + diag[i] + 1;
2662       nz = ai[i+1] - diag[i] - 1;
2663 
2664       LOAD_PS(&t[idt],XMM7);
2665 
2666       while (nz--) {
2667         PREFETCH_NTA(&v[16]);
2668         idx = 4*(*vi++);
2669 /*          idx = *vi++; */
2670 
2671         /* 4x4 Matrix-Vector Product with negative accumulation: */
2672         SSE_INLINE_BEGIN_2(&t[idx],v)
2673           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2674 
2675           /* First Column */
2676           SSE_COPY_PS(XMM0,XMM6)
2677           SSE_SHUFFLE(XMM0,XMM0,0x00)
2678           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2679           SSE_SUB_PS(XMM7,XMM0)
2680 
2681           /* Second Column */
2682           SSE_COPY_PS(XMM1,XMM6)
2683           SSE_SHUFFLE(XMM1,XMM1,0x55)
2684           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2685           SSE_SUB_PS(XMM7,XMM1)
2686 
2687           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2688 
2689           /* Third Column */
2690           SSE_COPY_PS(XMM2,XMM6)
2691           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2692           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2693           SSE_SUB_PS(XMM7,XMM2)
2694 
2695           /* Fourth Column */
2696           SSE_COPY_PS(XMM3,XMM6)
2697           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2698           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2699           SSE_SUB_PS(XMM7,XMM3)
2700         SSE_INLINE_END_2
2701         v  += 16;
2702       }
2703       v    = aa + ai16;
2704       ai16 = 16*diag[--i];
2705       PREFETCH_NTA(aa+ai16+16);
2706       /*
2707          Scale the result by the diagonal 4x4 block,
2708          which was inverted as part of the factorization
2709       */
2710       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2711         /* First Column */
2712         SSE_COPY_PS(XMM0,XMM7)
2713         SSE_SHUFFLE(XMM0,XMM0,0x00)
2714         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2715 
2716         /* Second Column */
2717         SSE_COPY_PS(XMM1,XMM7)
2718         SSE_SHUFFLE(XMM1,XMM1,0x55)
2719         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2720         SSE_ADD_PS(XMM0,XMM1)
2721 
2722         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2723 
2724         /* Third Column */
2725         SSE_COPY_PS(XMM2,XMM7)
2726         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2727         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2728         SSE_ADD_PS(XMM0,XMM2)
2729 
2730         /* Fourth Column */
2731         SSE_COPY_PS(XMM3,XMM7)
2732         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2733         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2734         SSE_ADD_PS(XMM0,XMM3)
2735 
2736         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2737       SSE_INLINE_END_3
2738 
2739       v    = aa + ai16 + 16;
2740       idt -= 4;
2741     }
2742 
2743     /* Convert t from single precision back to double precision (inplace)*/
2744     idt = 4*(n-1);
2745     for (i=n-1;i>=0;i--) {
2746       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2747       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2748       PetscScalar *xtemp=&x[idt];
2749       MatScalar   *ttemp=&t[idt];
2750       xtemp[3] = (PetscScalar)ttemp[3];
2751       xtemp[2] = (PetscScalar)ttemp[2];
2752       xtemp[1] = (PetscScalar)ttemp[1];
2753       xtemp[0] = (PetscScalar)ttemp[0];
2754       idt -= 4;
2755     }
2756 
2757   } /* End of artificial scope. */
2758   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2759   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2760   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2761   SSE_SCOPE_END;
2762   PetscFunctionReturn(0);
2763 }
2764 
2765 #endif
2766 #undef __FUNCT__
2767 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2768 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2769 {
2770   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2771   IS                iscol=a->col,isrow=a->row;
2772   PetscErrorCode    ierr;
2773   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2774   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2775   const MatScalar   *aa=a->a,*v;
2776   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2777   const PetscScalar *b;
2778 
2779   PetscFunctionBegin;
2780   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2781   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2782   t  = a->solve_work;
2783 
2784   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2785   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2786 
2787   /* forward solve the lower triangular */
2788   idx    = 3*(*r++);
2789   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2790   for (i=1; i<n; i++) {
2791     v     = aa + 9*ai[i];
2792     vi    = aj + ai[i];
2793     nz    = diag[i] - ai[i];
2794     idx   = 3*(*r++);
2795     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2796     while (nz--) {
2797       idx   = 3*(*vi++);
2798       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2799       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2800       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2801       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2802       v += 9;
2803     }
2804     idx = 3*i;
2805     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2806   }
2807   /* backward solve the upper triangular */
2808   for (i=n-1; i>=0; i--){
2809     v    = aa + 9*diag[i] + 9;
2810     vi   = aj + diag[i] + 1;
2811     nz   = ai[i+1] - diag[i] - 1;
2812     idt  = 3*i;
2813     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2814     while (nz--) {
2815       idx   = 3*(*vi++);
2816       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2817       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2818       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2819       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2820       v += 9;
2821     }
2822     idc = 3*(*c--);
2823     v   = aa + 9*diag[i];
2824     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2825     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2826     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2827   }
2828   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2829   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2830   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2831   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2832   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
2833   PetscFunctionReturn(0);
2834 }
2835 
2836 /*
2837       Special case where the matrix was ILU(0) factored in the natural
2838    ordering. This eliminates the need for the column and row permutation.
2839 */
2840 #undef __FUNCT__
2841 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2842 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2843 {
2844   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2845   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2846   PetscErrorCode    ierr;
2847   PetscInt          *diag = a->diag;
2848   const MatScalar   *aa=a->a,*v;
2849   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2850   const PetscScalar *b;
2851   PetscInt          jdx,idt,idx,nz,*vi,i;
2852 
2853   PetscFunctionBegin;
2854   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2855   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2856 
2857   /* forward solve the lower triangular */
2858   idx    = 0;
2859   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2860   for (i=1; i<n; i++) {
2861     v     =  aa      + 9*ai[i];
2862     vi    =  aj      + ai[i];
2863     nz    =  diag[i] - ai[i];
2864     idx   +=  3;
2865     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2866     while (nz--) {
2867       jdx   = 3*(*vi++);
2868       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2869       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2870       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2871       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2872       v    += 9;
2873     }
2874     x[idx]   = s1;
2875     x[1+idx] = s2;
2876     x[2+idx] = s3;
2877   }
2878   /* backward solve the upper triangular */
2879   for (i=n-1; i>=0; i--){
2880     v    = aa + 9*diag[i] + 9;
2881     vi   = aj + diag[i] + 1;
2882     nz   = ai[i+1] - diag[i] - 1;
2883     idt  = 3*i;
2884     s1 = x[idt];  s2 = x[1+idt];
2885     s3 = x[2+idt];
2886     while (nz--) {
2887       idx   = 3*(*vi++);
2888       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2889       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2890       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2891       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2892       v    += 9;
2893     }
2894     v        = aa +  9*diag[i];
2895     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2896     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2897     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2898   }
2899 
2900   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2901   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2902   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
2903   PetscFunctionReturn(0);
2904 }
2905 
2906 #undef __FUNCT__
2907 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2908 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2909 {
2910   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2911   IS                iscol=a->col,isrow=a->row;
2912   PetscErrorCode    ierr;
2913   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2914   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2915   const MatScalar   *aa=a->a,*v;
2916   PetscScalar       *x,s1,s2,x1,x2,*t;
2917   const PetscScalar *b;
2918 
2919   PetscFunctionBegin;
2920   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2921   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2922   t  = a->solve_work;
2923 
2924   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2925   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2926 
2927   /* forward solve the lower triangular */
2928   idx    = 2*(*r++);
2929   t[0] = b[idx]; t[1] = b[1+idx];
2930   for (i=1; i<n; i++) {
2931     v     = aa + 4*ai[i];
2932     vi    = aj + ai[i];
2933     nz    = diag[i] - ai[i];
2934     idx   = 2*(*r++);
2935     s1  = b[idx]; s2 = b[1+idx];
2936     while (nz--) {
2937       idx   = 2*(*vi++);
2938       x1    = t[idx]; x2 = t[1+idx];
2939       s1 -= v[0]*x1 + v[2]*x2;
2940       s2 -= v[1]*x1 + v[3]*x2;
2941       v += 4;
2942     }
2943     idx = 2*i;
2944     t[idx] = s1; t[1+idx] = s2;
2945   }
2946   /* backward solve the upper triangular */
2947   for (i=n-1; i>=0; i--){
2948     v    = aa + 4*diag[i] + 4;
2949     vi   = aj + diag[i] + 1;
2950     nz   = ai[i+1] - diag[i] - 1;
2951     idt  = 2*i;
2952     s1 = t[idt]; s2 = t[1+idt];
2953     while (nz--) {
2954       idx   = 2*(*vi++);
2955       x1    = t[idx]; x2 = t[1+idx];
2956       s1 -= v[0]*x1 + v[2]*x2;
2957       s2 -= v[1]*x1 + v[3]*x2;
2958       v += 4;
2959     }
2960     idc = 2*(*c--);
2961     v   = aa + 4*diag[i];
2962     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2963     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2964   }
2965   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2966   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2967   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2968   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2969   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
2970   PetscFunctionReturn(0);
2971 }
2972 
2973 /*
2974       Special case where the matrix was ILU(0) factored in the natural
2975    ordering. This eliminates the need for the column and row permutation.
2976 */
2977 #undef __FUNCT__
2978 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2979 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2980 {
2981   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2982   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2983   PetscErrorCode    ierr;
2984   PetscInt          *diag = a->diag;
2985   const MatScalar   *aa=a->a,*v;
2986   PetscScalar       *x,s1,s2,x1,x2;
2987   const PetscScalar *b;
2988   PetscInt          jdx,idt,idx,nz,*vi,i;
2989 
2990   PetscFunctionBegin;
2991   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2992   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2993 
2994   /* forward solve the lower triangular */
2995   idx    = 0;
2996   x[0]   = b[0]; x[1] = b[1];
2997   for (i=1; i<n; i++) {
2998     v     =  aa      + 4*ai[i];
2999     vi    =  aj      + ai[i];
3000     nz    =  diag[i] - ai[i];
3001     idx   +=  2;
3002     s1  =  b[idx];s2 = b[1+idx];
3003     while (nz--) {
3004       jdx   = 2*(*vi++);
3005       x1    = x[jdx];x2 = x[1+jdx];
3006       s1 -= v[0]*x1 + v[2]*x2;
3007       s2 -= v[1]*x1 + v[3]*x2;
3008       v    += 4;
3009     }
3010     x[idx]   = s1;
3011     x[1+idx] = s2;
3012   }
3013   /* backward solve the upper triangular */
3014   for (i=n-1; i>=0; i--){
3015     v    = aa + 4*diag[i] + 4;
3016     vi   = aj + diag[i] + 1;
3017     nz   = ai[i+1] - diag[i] - 1;
3018     idt  = 2*i;
3019     s1 = x[idt];  s2 = x[1+idt];
3020     while (nz--) {
3021       idx   = 2*(*vi++);
3022       x1    = x[idx];   x2 = x[1+idx];
3023       s1 -= v[0]*x1 + v[2]*x2;
3024       s2 -= v[1]*x1 + v[3]*x2;
3025       v    += 4;
3026     }
3027     v        = aa +  4*diag[i];
3028     x[idt]   = v[0]*s1 + v[2]*s2;
3029     x[1+idt] = v[1]*s1 + v[3]*s2;
3030   }
3031 
3032   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3033   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3034   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3035   PetscFunctionReturn(0);
3036 }
3037 
3038 #undef __FUNCT__
3039 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3040 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
3041 {
3042   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
3043   IS             iscol=a->col,isrow=a->row;
3044   PetscErrorCode ierr;
3045   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3046   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3047   MatScalar      *aa=a->a,*v;
3048   PetscScalar    *x,*b,s1,*t;
3049 
3050   PetscFunctionBegin;
3051   if (!n) PetscFunctionReturn(0);
3052 
3053   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3054   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3055   t  = a->solve_work;
3056 
3057   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3058   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3059 
3060   /* forward solve the lower triangular */
3061   t[0] = b[*r++];
3062   for (i=1; i<n; i++) {
3063     v     = aa + ai[i];
3064     vi    = aj + ai[i];
3065     nz    = diag[i] - ai[i];
3066     s1  = b[*r++];
3067     while (nz--) {
3068       s1 -= (*v++)*t[*vi++];
3069     }
3070     t[i] = s1;
3071   }
3072   /* backward solve the upper triangular */
3073   for (i=n-1; i>=0; i--){
3074     v    = aa + diag[i] + 1;
3075     vi   = aj + diag[i] + 1;
3076     nz   = ai[i+1] - diag[i] - 1;
3077     s1 = t[i];
3078     while (nz--) {
3079       s1 -= (*v++)*t[*vi++];
3080     }
3081     x[*c--] = t[i] = aa[diag[i]]*s1;
3082   }
3083 
3084   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3085   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3086   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3087   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3088   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3089   PetscFunctionReturn(0);
3090 }
3091 /*
3092       Special case where the matrix was ILU(0) factored in the natural
3093    ordering. This eliminates the need for the column and row permutation.
3094 */
3095 #undef __FUNCT__
3096 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3097 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3098 {
3099   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3100   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3101   PetscErrorCode ierr;
3102   PetscInt       *diag = a->diag;
3103   MatScalar      *aa=a->a;
3104   PetscScalar    *x,*b;
3105   PetscScalar    s1,x1;
3106   MatScalar      *v;
3107   PetscInt       jdx,idt,idx,nz,*vi,i;
3108 
3109   PetscFunctionBegin;
3110   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3111   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3112 
3113   /* forward solve the lower triangular */
3114   idx    = 0;
3115   x[0]   = b[0];
3116   for (i=1; i<n; i++) {
3117     v     =  aa      + ai[i];
3118     vi    =  aj      + ai[i];
3119     nz    =  diag[i] - ai[i];
3120     idx   +=  1;
3121     s1  =  b[idx];
3122     while (nz--) {
3123       jdx   = *vi++;
3124       x1    = x[jdx];
3125       s1 -= v[0]*x1;
3126       v    += 1;
3127     }
3128     x[idx]   = s1;
3129   }
3130   /* backward solve the upper triangular */
3131   for (i=n-1; i>=0; i--){
3132     v    = aa + diag[i] + 1;
3133     vi   = aj + diag[i] + 1;
3134     nz   = ai[i+1] - diag[i] - 1;
3135     idt  = i;
3136     s1 = x[idt];
3137     while (nz--) {
3138       idx   = *vi++;
3139       x1    = x[idx];
3140       s1 -= v[0]*x1;
3141       v    += 1;
3142     }
3143     v        = aa +  diag[i];
3144     x[idt]   = v[0]*s1;
3145   }
3146   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3147   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3148   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3149   PetscFunctionReturn(0);
3150 }
3151 
3152 /* ----------------------------------------------------------------*/
3153 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
3154 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3155 
3156 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
3157 #undef __FUNCT__
3158 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
3159 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
3160 {
3161   Mat            C=B;
3162   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
3163   IS             isrow = b->row,isicol = b->icol;
3164   PetscErrorCode ierr;
3165   const PetscInt *r,*ic,*ics;
3166   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
3167   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3168   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3169   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3170   MatScalar      *v_work;
3171 
3172   PetscFunctionBegin;
3173   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3174   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3175   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3176   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
3177   ics  = ic;
3178 
3179   /* generate work space needed by dense LU factorization */
3180   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3181   multiplier = v_work + bs;
3182   v_pivots   = (PetscInt*)(multiplier + bs2);
3183 
3184   for (i=0; i<n; i++){
3185     /* zero rtmp */
3186     /* L part */
3187     nz    = bi[i+1] - bi[i];
3188     bjtmp = bj + bi[i];
3189     for  (j=0; j<nz; j++){
3190       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3191     }
3192 
3193     /* U part */
3194     nz = bi[2*n-i+1] - bi[2*n-i];
3195     bjtmp = bj + bi[2*n-i];
3196     for  (j=0; j<nz; j++){
3197       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3198     }
3199 
3200     /* load in initial (unfactored row) */
3201     nz    = ai[r[i]+1] - ai[r[i]];
3202     ajtmp = aj + ai[r[i]];
3203     v     = aa + bs2*ai[r[i]];
3204     for (j=0; j<nz; j++) {
3205       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
3206     }
3207 
3208     /* elimination */
3209     bjtmp = bj + bi[i];
3210     row   = *bjtmp++;
3211     nzL   = bi[i+1] - bi[i];
3212     k   = 0;
3213     while  (k < nzL) {
3214       pc = rtmp + bs2*row;
3215       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3216       if (flg) {
3217         pv         = b->a + bs2*bdiag[row];
3218         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
3219         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3220         pv         = b->a + bs2*bi[2*n-row];
3221         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3222         for (j=0; j<nz; j++) {
3223           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3224         }
3225         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
3226       }
3227       row = *bjtmp++; k++;
3228     }
3229 
3230     /* finished row so stick it into b->a */
3231     /* L part */
3232     pv   = b->a + bs2*bi[i] ;
3233     pj   = b->j + bi[i] ;
3234     nz   = bi[i+1] - bi[i];
3235     for (j=0; j<nz; j++) {
3236       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3237     }
3238 
3239     /* Mark diagonal and invert diagonal for simplier triangular solves */
3240     pv  = b->a + bs2*bdiag[i];
3241     pj  = b->j + bdiag[i];
3242     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3243     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3244     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
3245 
3246     /* U part */
3247     pv = b->a + bs2*bi[2*n-i];
3248     pj = b->j + bi[2*n-i];
3249     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3250     for (j=0; j<nz; j++){
3251       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3252     }
3253   }
3254 
3255   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3256   ierr = PetscFree(v_work);CHKERRQ(ierr);
3257   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3258   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3259   if (bs == 5){
3260     C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3261   } else {
3262     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3263   }
3264   C->assembled = PETSC_TRUE;
3265   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
3266   PetscFunctionReturn(0);
3267 }
3268 
3269 /*
3270    ilu(0) with natural ordering under new data structure.
3271    Factored arrays bj and ba are stored as
3272      L(0,:), L(1,:), ...,L(n-1,:),  U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:)
3273 
3274    bi=fact->i is an array of size 2n+2, in which
3275    bi+
3276      bi[i]      ->  1st entry of L(i,:),i=0,...,i-1
3277      bi[n]      ->  end of L(n-1,:)+1
3278      bi[n+1]    ->  1st entry of U(n-1,:)
3279      bi[2n-i]   ->  1st entry of U(i,:)
3280      bi[2n-i+1] ->  end of U(i,:)+1, the 1st entry of U(i-1,:)
3281      bi[2n]     ->  end of U(0,:)+1
3282 
3283    U(i,:) contains diag[i] as its last entry, i.e.,
3284     U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
3285 */
3286 #undef __FUNCT__
3287 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
3288 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3289 {
3290 
3291   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
3292   PetscErrorCode     ierr;
3293   PetscInt           mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
3294   PetscInt           i,j,nz=a->nz,*bi,*bj,*bdiag;
3295 
3296   PetscFunctionBegin;
3297   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3298   b     = (Mat_SeqBAIJ*)(fact)->data;
3299   bdiag = b->diag;
3300 
3301   /* replace matrix arrays with single allocations, then reset values */
3302   ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
3303 
3304   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr);
3305   ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr);
3306   ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr);
3307   b->singlemalloc = PETSC_FALSE;
3308   if (mbs > 0) {
3309     ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr);
3310   }
3311 
3312   /* set bi and bj with new data structure */
3313   bi = b->i;
3314   bj = b->j;
3315 
3316   /* L part */
3317   bi[0] = 0;
3318   for (i=0; i<mbs; i++){
3319     nz = adiag[i] - ai[i];
3320     bi[i+1] = bi[i] + nz;
3321     aj = a->j + ai[i];
3322     for (j=0; j<nz; j++){
3323       *bj = aj[j]; bj++;
3324     }
3325   }
3326 
3327   /* U part */
3328   bi[mbs+1] = bi[mbs];
3329   for (i=mbs-1; i>=0; i--){
3330     nz = ai[i+1] - adiag[i] - 1;
3331     if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz);
3332     bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1;
3333     aj = a->j + adiag[i] + 1;
3334     for (j=0; j<nz; j++){
3335       *bj = aj[j]; bj++;
3336     }
3337     /* diag[i] */
3338     *bj = i; bj++;
3339     bdiag[i] = bi[2*mbs-i+1]-1;
3340   }
3341   PetscFunctionReturn(0);
3342 }
3343 
3344 /*
3345      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3346    except that the data structure of Mat_SeqAIJ is slightly different.
3347    Not a good example of code reuse.
3348 */
3349 
3350 #undef __FUNCT__
3351 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3352 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3353 {
3354   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3355   IS             isicol;
3356   PetscErrorCode ierr;
3357   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
3358   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3359   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3360   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3361   PetscTruth     col_identity,row_identity,both_identity,flg;
3362   PetscReal      f;
3363 
3364   PetscFunctionBegin;
3365   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
3366   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
3367 
3368   f             = info->fill;
3369   levels        = (PetscInt)info->levels;
3370   diagonal_fill = (PetscInt)info->diagonal_fill;
3371   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3372   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3373   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3374   both_identity = (PetscTruth) (row_identity && col_identity);
3375 
3376   if (!levels && both_identity) {  /* special case copy the nonzero structure */
3377 
3378     PetscTruth newdatastruct=PETSC_FALSE;
3379     ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
3380     if (newdatastruct){
3381       ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
3382       (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
3383     } else {
3384       ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3385       ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
3386     }
3387 
3388     fact->factor = MAT_FACTOR_ILU;
3389     b            = (Mat_SeqBAIJ*)(fact)->data;
3390     b->row       = isrow;
3391     b->col       = iscol;
3392     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3393     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3394     b->icol      = isicol;
3395     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3396     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3397     PetscFunctionReturn(0);
3398   }
3399 
3400   /* general case perform the symbolic factorization */
3401     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3402     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3403 
3404     /* get new row pointers */
3405     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
3406     ainew[0] = 0;
3407     /* don't know how many column pointers are needed so estimate */
3408     jmax = (PetscInt)(f*ai[n] + 1);
3409     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
3410     /* ajfill is level of fill for each fill entry */
3411     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
3412     /* fill is a linked list of nonzeros in active row */
3413     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
3414     /* im is level for each filled value */
3415     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
3416     /* dloc is location of diagonal in factor */
3417     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
3418     dloc[0]  = 0;
3419     for (prow=0; prow<n; prow++) {
3420 
3421       /* copy prow into linked list */
3422       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3423       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
3424       xi         = aj + ai[r[prow]];
3425       fill[n]    = n;
3426       fill[prow] = -1; /* marker for diagonal entry */
3427       while (nz--) {
3428 	fm  = n;
3429 	idx = ic[*xi++];
3430 	do {
3431 	  m  = fm;
3432 	  fm = fill[m];
3433 	} while (fm < idx);
3434 	fill[m]   = idx;
3435 	fill[idx] = fm;
3436 	im[idx]   = 0;
3437       }
3438 
3439       /* make sure diagonal entry is included */
3440       if (diagonal_fill && fill[prow] == -1) {
3441 	fm = n;
3442 	while (fill[fm] < prow) fm = fill[fm];
3443 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3444 	fill[fm]   = prow;
3445 	im[prow]   = 0;
3446 	nzf++;
3447 	dcount++;
3448       }
3449 
3450       nzi = 0;
3451       row = fill[n];
3452       while (row < prow) {
3453 	incrlev = im[row] + 1;
3454 	nz      = dloc[row];
3455 	xi      = ajnew  + ainew[row] + nz + 1;
3456 	flev    = ajfill + ainew[row] + nz + 1;
3457 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3458 	fm      = row;
3459 	while (nnz-- > 0) {
3460 	  idx = *xi++;
3461 	  if (*flev + incrlev > levels) {
3462 	    flev++;
3463 	    continue;
3464 	  }
3465 	  do {
3466 	    m  = fm;
3467 	    fm = fill[m];
3468 	  } while (fm < idx);
3469 	  if (fm != idx) {
3470 	    im[idx]   = *flev + incrlev;
3471 	    fill[m]   = idx;
3472 	    fill[idx] = fm;
3473 	    fm        = idx;
3474 	    nzf++;
3475 	  } else {
3476 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3477 	  }
3478 	  flev++;
3479 	}
3480 	row = fill[row];
3481 	nzi++;
3482       }
3483       /* copy new filled row into permanent storage */
3484       ainew[prow+1] = ainew[prow] + nzf;
3485       if (ainew[prow+1] > jmax) {
3486 
3487 	/* estimate how much additional space we will need */
3488 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3489 	/* just double the memory each time */
3490 	PetscInt maxadd = jmax;
3491 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3492 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3493 	jmax += maxadd;
3494 
3495 	/* allocate a longer ajnew and ajfill */
3496 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3497 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3498 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3499 	ajnew = xitmp;
3500 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3501 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3502 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3503 	ajfill = xitmp;
3504 	reallocate++; /* count how many reallocations are needed */
3505       }
3506       xitmp       = ajnew + ainew[prow];
3507       flev        = ajfill + ainew[prow];
3508       dloc[prow]  = nzi;
3509       fm          = fill[n];
3510       while (nzf--) {
3511 	*xitmp++ = fm;
3512 	*flev++ = im[fm];
3513 	fm      = fill[fm];
3514       }
3515       /* make sure row has diagonal entry */
3516       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3517 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3518     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3519       }
3520     }
3521     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3522     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3523     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3524     ierr = PetscFree(fill);CHKERRQ(ierr);
3525     ierr = PetscFree(im);CHKERRQ(ierr);
3526 
3527 #if defined(PETSC_USE_INFO)
3528     {
3529       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3530       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3531       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3532       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3533       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3534       if (diagonal_fill) {
3535 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3536       }
3537     }
3538 #endif
3539 
3540     /* put together the new matrix */
3541     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3542     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3543     b    = (Mat_SeqBAIJ*)(fact)->data;
3544     b->free_a       = PETSC_TRUE;
3545     b->free_ij      = PETSC_TRUE;
3546     b->singlemalloc = PETSC_FALSE;
3547     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
3548     b->j          = ajnew;
3549     b->i          = ainew;
3550     for (i=0; i<n; i++) dloc[i] += ainew[i];
3551     b->diag       = dloc;
3552     b->ilen       = 0;
3553     b->imax       = 0;
3554     b->row        = isrow;
3555     b->col        = iscol;
3556     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3557     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3558     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3559     b->icol       = isicol;
3560     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3561     /* In b structure:  Free imax, ilen, old a, old j.
3562        Allocate dloc, solve_work, new a, new j */
3563     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
3564     b->maxnz          = b->nz = ainew[n];
3565 
3566     (fact)->info.factor_mallocs    = reallocate;
3567     (fact)->info.fill_ratio_given  = f;
3568     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3569 
3570   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
3571   PetscFunctionReturn(0);
3572 }
3573 
3574 #undef __FUNCT__
3575 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3576 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3577 {
3578   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3579   /* int i,*AJ=a->j,nz=a->nz; */
3580   PetscFunctionBegin;
3581   /* Undo Column scaling */
3582 /*    while (nz--) { */
3583 /*      AJ[i] = AJ[i]/4; */
3584 /*    } */
3585   /* This should really invoke a push/pop logic, but we don't have that yet. */
3586   A->ops->setunfactored = PETSC_NULL;
3587   PetscFunctionReturn(0);
3588 }
3589 
3590 #undef __FUNCT__
3591 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3592 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3593 {
3594   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3595   PetscInt       *AJ=a->j,nz=a->nz;
3596   unsigned short *aj=(unsigned short *)AJ;
3597   PetscFunctionBegin;
3598   /* Is this really necessary? */
3599   while (nz--) {
3600     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3601   }
3602   A->ops->setunfactored = PETSC_NULL;
3603   PetscFunctionReturn(0);
3604 }
3605 
3606 
3607