xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 7b6bb2c608b6fc6714ef38fda02c2dbb91c82665)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include <../src/mat/impls/baij/seq/baij.h>
7 #include <../src/mat/blockinvert.h>
8 #include <petscbt.h>
9 #include <../src/mat/utils/freespace.h>
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
16   PetscErrorCode    ierr;
17   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
18   PetscInt          i,n = a->mbs,j;
19   PetscInt          nz;
20   PetscScalar       *x,*tmp,s1;
21   const MatScalar   *aa = a->a,*v;
22   const PetscScalar *b;
23 
24   PetscFunctionBegin;
25   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
26   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27   tmp  = a->solve_work;
28 
29 
30   /* copy the b into temp work space according to permutation */
31   for (i=0; i<n; i++) tmp[i] = b[i];
32 
33   /* forward solve the U^T */
34   for (i=0; i<n; i++) {
35     v   = aa + adiag[i+1] + 1;
36     vi  = aj + adiag[i+1] + 1;
37     nz  = adiag[i] - adiag[i+1] - 1;
38     s1  = tmp[i];
39     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
40     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
41     tmp[i] = s1;
42   }
43 
44   /* backward solve the L^T */
45   for (i=n-1; i>=0; i--){
46     v   = aa + ai[i];
47     vi  = aj + ai[i];
48     nz  = ai[i+1] - ai[i];
49     s1  = tmp[i];
50     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
51   }
52 
53   /* copy tmp into x according to permutation */
54   for (i=0; i<n; i++) x[i] = tmp[i];
55 
56   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
57   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
58 
59   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
60   PetscFunctionReturn(0);
61 }
62 
63 #undef __FUNCT__
64 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
65 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66 {
67   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
68   PetscErrorCode    ierr;
69   PetscInt          i,nz;
70   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
71   const MatScalar   *aa=a->a,*v;
72   PetscScalar       s1,*x;
73 
74   PetscFunctionBegin;
75   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
76   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
77 
78   /* forward solve the U^T */
79   for (i=0; i<n; i++) {
80 
81     v     = aa + diag[i];
82     /* multiply by the inverse of the block diagonal */
83     s1    = (*v++)*x[i];
84     vi    = aj + diag[i] + 1;
85     nz    = ai[i+1] - diag[i] - 1;
86     while (nz--) {
87       x[*vi++]  -= (*v++)*s1;
88     }
89     x[i]   = s1;
90   }
91   /* backward solve the L^T */
92   for (i=n-1; i>=0; i--){
93     v    = aa + diag[i] - 1;
94     vi   = aj + diag[i] - 1;
95     nz   = diag[i] - ai[i];
96     s1   = x[i];
97     while (nz--) {
98       x[*vi--]   -=  (*v--)*s1;
99     }
100   }
101   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
102   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
103   PetscFunctionReturn(0);
104 }
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
108 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109 {
110   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
111   PetscErrorCode    ierr;
112   PetscInt          i,nz,idx,idt,oidx;
113   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114   const MatScalar   *aa=a->a,*v;
115   PetscScalar       s1,s2,x1,x2,*x;
116 
117   PetscFunctionBegin;
118   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
119   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
120 
121   /* forward solve the U^T */
122   idx = 0;
123   for (i=0; i<n; i++) {
124 
125     v     = aa + 4*diag[i];
126     /* multiply by the inverse of the block diagonal */
127     x1 = x[idx];   x2 = x[1+idx];
128     s1 = v[0]*x1  +  v[1]*x2;
129     s2 = v[2]*x1  +  v[3]*x2;
130     v += 4;
131 
132     vi    = aj + diag[i] + 1;
133     nz    = ai[i+1] - diag[i] - 1;
134     while (nz--) {
135       oidx = 2*(*vi++);
136       x[oidx]   -= v[0]*s1  +  v[1]*s2;
137       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
138       v  += 4;
139     }
140     x[idx]   = s1;x[1+idx] = s2;
141     idx += 2;
142   }
143   /* backward solve the L^T */
144   for (i=n-1; i>=0; i--){
145     v    = aa + 4*diag[i] - 4;
146     vi   = aj + diag[i] - 1;
147     nz   = diag[i] - ai[i];
148     idt  = 2*i;
149     s1   = x[idt];  s2 = x[1+idt];
150     while (nz--) {
151       idx   = 2*(*vi--);
152       x[idx]   -=  v[0]*s1 +  v[1]*s2;
153       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
154       v -= 4;
155     }
156   }
157   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
158   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
159   PetscFunctionReturn(0);
160 }
161 
162 #undef __FUNCT__
163 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
164 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
165 {
166   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
167   PetscErrorCode    ierr;
168   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
169   PetscInt          nz,idx,idt,j,i,oidx;
170   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
171   const MatScalar   *aa=a->a,*v;
172   PetscScalar       s1,s2,x1,x2,*x;
173 
174   PetscFunctionBegin;
175   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
176   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
177 
178   /* forward solve the U^T */
179   idx = 0;
180   for (i=0; i<n; i++) {
181     v     = aa + bs2*diag[i];
182     /* multiply by the inverse of the block diagonal */
183     x1 = x[idx];   x2 = x[1+idx];
184     s1 = v[0]*x1  +  v[1]*x2;
185     s2 = v[2]*x1  +  v[3]*x2;
186     v -= bs2;
187 
188     vi    = aj + diag[i] - 1;
189     nz    = diag[i] - diag[i+1] - 1;
190     for(j=0;j>-nz;j--){
191       oidx = bs*vi[j];
192       x[oidx]   -= v[0]*s1  +  v[1]*s2;
193       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
194       v  -= bs2;
195     }
196     x[idx]   = s1;x[1+idx] = s2;
197     idx += bs;
198   }
199   /* backward solve the L^T */
200   for (i=n-1; i>=0; i--){
201     v    = aa + bs2*ai[i];
202     vi   = aj + ai[i];
203     nz   = ai[i+1] - ai[i];
204     idt  = bs*i;
205     s1   = x[idt];  s2 = x[1+idt];
206     for(j=0;j<nz;j++){
207       idx   = bs*vi[j];
208       x[idx]   -=  v[0]*s1 +  v[1]*s2;
209       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
210       v += bs2;
211     }
212   }
213   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
214   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
215   PetscFunctionReturn(0);
216 }
217 
218 #undef __FUNCT__
219 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
220 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221 {
222   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
223   PetscErrorCode    ierr;
224   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225   PetscInt          i,nz,idx,idt,oidx;
226   const MatScalar   *aa=a->a,*v;
227   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
228 
229   PetscFunctionBegin;
230   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
231   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
232 
233   /* forward solve the U^T */
234   idx = 0;
235   for (i=0; i<n; i++) {
236 
237     v     = aa + 9*diag[i];
238     /* multiply by the inverse of the block diagonal */
239     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
240     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
241     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
242     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
243     v += 9;
244 
245     vi    = aj + diag[i] + 1;
246     nz    = ai[i+1] - diag[i] - 1;
247     while (nz--) {
248       oidx = 3*(*vi++);
249       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
250       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
251       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252       v  += 9;
253     }
254     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
255     idx += 3;
256   }
257   /* backward solve the L^T */
258   for (i=n-1; i>=0; i--){
259     v    = aa + 9*diag[i] - 9;
260     vi   = aj + diag[i] - 1;
261     nz   = diag[i] - ai[i];
262     idt  = 3*i;
263     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
264     while (nz--) {
265       idx   = 3*(*vi--);
266       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
267       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
268       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269       v -= 9;
270     }
271   }
272   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
273   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
279 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
280 {
281   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
282   PetscErrorCode    ierr;
283   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
284   PetscInt          nz,idx,idt,j,i,oidx;
285   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
286   const MatScalar   *aa=a->a,*v;
287   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
288 
289   PetscFunctionBegin;
290   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
291   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
292 
293   /* forward solve the U^T */
294   idx = 0;
295   for (i=0; i<n; i++) {
296     v     = aa + bs2*diag[i];
297     /* multiply by the inverse of the block diagonal */
298     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
299     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
300     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
301     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
302     v -= bs2;
303 
304     vi    = aj + diag[i] - 1;
305     nz    = diag[i] - diag[i+1] - 1;
306     for(j=0;j>-nz;j--){
307       oidx = bs*vi[j];
308       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
309       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
310       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
311       v  -= bs2;
312     }
313     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
314     idx += bs;
315   }
316   /* backward solve the L^T */
317   for (i=n-1; i>=0; i--){
318     v    = aa + bs2*ai[i];
319     vi   = aj + ai[i];
320     nz   = ai[i+1] - ai[i];
321     idt  = bs*i;
322     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
323     for(j=0;j<nz;j++){
324       idx   = bs*vi[j];
325       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
326       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
327       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
328       v += bs2;
329     }
330   }
331   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
332   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
333   PetscFunctionReturn(0);
334 }
335 
336 #undef __FUNCT__
337 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
338 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339 {
340   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
341   PetscErrorCode    ierr;
342   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343   PetscInt          i,nz,idx,idt,oidx;
344   const MatScalar   *aa=a->a,*v;
345   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
346 
347   PetscFunctionBegin;
348   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
349   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
350 
351   /* forward solve the U^T */
352   idx = 0;
353   for (i=0; i<n; i++) {
354 
355     v     = aa + 16*diag[i];
356     /* multiply by the inverse of the block diagonal */
357     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
358     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
359     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
360     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
361     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362     v += 16;
363 
364     vi    = aj + diag[i] + 1;
365     nz    = ai[i+1] - diag[i] - 1;
366     while (nz--) {
367       oidx = 4*(*vi++);
368       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
369       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
370       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372       v  += 16;
373     }
374     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375     idx += 4;
376   }
377   /* backward solve the L^T */
378   for (i=n-1; i>=0; i--){
379     v    = aa + 16*diag[i] - 16;
380     vi   = aj + diag[i] - 1;
381     nz   = diag[i] - ai[i];
382     idt  = 4*i;
383     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384     while (nz--) {
385       idx   = 4*(*vi--);
386       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390       v -= 16;
391     }
392   }
393   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
394   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
395   PetscFunctionReturn(0);
396 }
397 
398 #undef __FUNCT__
399 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
400 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
401 {
402   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
403   PetscErrorCode    ierr;
404   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
405   PetscInt          nz,idx,idt,j,i,oidx;
406   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
407   const MatScalar   *aa=a->a,*v;
408   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
409 
410   PetscFunctionBegin;
411   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
412   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
413 
414   /* forward solve the U^T */
415   idx = 0;
416   for (i=0; i<n; i++) {
417     v     = aa + bs2*diag[i];
418     /* multiply by the inverse of the block diagonal */
419     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
420     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
421     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
422     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
423     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
424     v -= bs2;
425 
426     vi    = aj + diag[i] - 1;
427     nz    = diag[i] - diag[i+1] - 1;
428     for(j=0;j>-nz;j--){
429       oidx = bs*vi[j];
430       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
431       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
432       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
433       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
434       v  -= bs2;
435     }
436     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
437     idx += bs;
438   }
439   /* backward solve the L^T */
440   for (i=n-1; i>=0; i--){
441     v    = aa + bs2*ai[i];
442     vi   = aj + ai[i];
443     nz   = ai[i+1] - ai[i];
444     idt  = bs*i;
445     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
446     for(j=0;j<nz;j++){
447       idx   = bs*vi[j];
448       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
449       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
450       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
451       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
452       v += bs2;
453     }
454   }
455   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
456   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
457   PetscFunctionReturn(0);
458 }
459 
460 #undef __FUNCT__
461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
462 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463 {
464   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
465   PetscErrorCode    ierr;
466   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467   PetscInt          i,nz,idx,idt,oidx;
468   const MatScalar   *aa=a->a,*v;
469   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
470 
471   PetscFunctionBegin;
472   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
473   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474 
475   /* forward solve the U^T */
476   idx = 0;
477   for (i=0; i<n; i++) {
478 
479     v     = aa + 25*diag[i];
480     /* multiply by the inverse of the block diagonal */
481     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
483     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
484     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487     v += 25;
488 
489     vi    = aj + diag[i] + 1;
490     nz    = ai[i+1] - diag[i] - 1;
491     while (nz--) {
492       oidx = 5*(*vi++);
493       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
494       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
495       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498       v  += 25;
499     }
500     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501     idx += 5;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--){
505     v    = aa + 25*diag[i] - 25;
506     vi   = aj + diag[i] - 1;
507     nz   = diag[i] - ai[i];
508     idt  = 5*i;
509     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510     while (nz--) {
511       idx   = 5*(*vi--);
512       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
513       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
514       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517       v -= 25;
518     }
519   }
520   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
522   PetscFunctionReturn(0);
523 }
524 
525 #undef __FUNCT__
526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
527 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
528 {
529   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
530   PetscErrorCode ierr;
531   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
532   PetscInt       nz,idx,idt,j,i,oidx;
533   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
534   const MatScalar      *aa=a->a,*v;
535   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
536 
537   PetscFunctionBegin;
538   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
539   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
540 
541   /* forward solve the U^T */
542   idx = 0;
543   for (i=0; i<n; i++) {
544     v     = aa + bs2*diag[i];
545     /* multiply by the inverse of the block diagonal */
546     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
547     x5 = x[4+idx];
548     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
549     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
550     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
551     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
552     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
553     v -= bs2;
554 
555     vi    = aj + diag[i] - 1;
556     nz    = diag[i] - diag[i+1] - 1;
557     for(j=0;j>-nz;j--){
558       oidx = bs*vi[j];
559       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
560       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
561       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
562       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
563       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
564       v  -= bs2;
565     }
566     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
567     idx += bs;
568   }
569   /* backward solve the L^T */
570   for (i=n-1; i>=0; i--){
571     v    = aa + bs2*ai[i];
572     vi   = aj + ai[i];
573     nz   = ai[i+1] - ai[i];
574     idt  = bs*i;
575     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
576     for(j=0;j<nz;j++){
577       idx   = bs*vi[j];
578       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
579       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
580       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
581       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
582       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
583       v += bs2;
584     }
585   }
586   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
587   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
588   PetscFunctionReturn(0);
589 }
590 
591 #undef __FUNCT__
592 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
593 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594 {
595   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
596   PetscErrorCode    ierr;
597   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598   PetscInt          i,nz,idx,idt,oidx;
599   const MatScalar   *aa=a->a,*v;
600   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
601 
602   PetscFunctionBegin;
603   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
604   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
605 
606   /* forward solve the U^T */
607   idx = 0;
608   for (i=0; i<n; i++) {
609 
610     v     = aa + 36*diag[i];
611     /* multiply by the inverse of the block diagonal */
612     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613     x6    = x[5+idx];
614     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
615     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
616     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
620     v += 36;
621 
622     vi    = aj + diag[i] + 1;
623     nz    = ai[i+1] - diag[i] - 1;
624     while (nz--) {
625       oidx = 6*(*vi++);
626       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v  += 36;
633     }
634     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635     x[5+idx] = s6;
636     idx += 6;
637   }
638   /* backward solve the L^T */
639   for (i=n-1; i>=0; i--){
640     v    = aa + 36*diag[i] - 36;
641     vi   = aj + diag[i] - 1;
642     nz   = diag[i] - ai[i];
643     idt  = 6*i;
644     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645     s6 = x[5+idt];
646     while (nz--) {
647       idx   = 6*(*vi--);
648       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
649       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
650       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654       v -= 36;
655     }
656   }
657   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
658   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
659   PetscFunctionReturn(0);
660 }
661 
662 #undef __FUNCT__
663 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
664 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
665 {
666   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
667   PetscErrorCode    ierr;
668   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
669   PetscInt          nz,idx,idt,j,i,oidx;
670   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
671   const MatScalar   *aa=a->a,*v;
672   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
673 
674   PetscFunctionBegin;
675   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
676   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
677 
678   /* forward solve the U^T */
679   idx = 0;
680   for (i=0; i<n; i++) {
681     v     = aa + bs2*diag[i];
682     /* multiply by the inverse of the block diagonal */
683     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
684     x5 = x[4+idx]; x6 = x[5+idx];
685     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
686     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
687     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
688     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
689     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
690     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
691     v -= bs2;
692 
693     vi    = aj + diag[i] - 1;
694     nz    = diag[i] - diag[i+1] - 1;
695     for(j=0;j>-nz;j--){
696       oidx = bs*vi[j];
697       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
698       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
699       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
700       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
701       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
702       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
703       v  -= bs2;
704     }
705     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
706     x[5+idx] = s6;
707     idx += bs;
708   }
709   /* backward solve the L^T */
710   for (i=n-1; i>=0; i--){
711     v    = aa + bs2*ai[i];
712     vi   = aj + ai[i];
713     nz   = ai[i+1] - ai[i];
714     idt  = bs*i;
715     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
716     s6   = x[5+idt];
717     for(j=0;j<nz;j++){
718       idx   = bs*vi[j];
719       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
720       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
721       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
722       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
723       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
724       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
725       v += bs2;
726     }
727   }
728   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
729   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
730   PetscFunctionReturn(0);
731 }
732 
733 #undef __FUNCT__
734 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
735 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736 {
737   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
738   PetscErrorCode    ierr;
739   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740   PetscInt          i,nz,idx,idt,oidx;
741   const MatScalar   *aa=a->a,*v;
742   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
743 
744   PetscFunctionBegin;
745   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
746   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
747 
748   /* forward solve the U^T */
749   idx = 0;
750   for (i=0; i<n; i++) {
751 
752     v     = aa + 49*diag[i];
753     /* multiply by the inverse of the block diagonal */
754     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755     x6    = x[5+idx]; x7 = x[6+idx];
756     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
757     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
763     v += 49;
764 
765     vi    = aj + diag[i] + 1;
766     nz    = ai[i+1] - diag[i] - 1;
767     while (nz--) {
768       oidx = 7*(*vi++);
769       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
770       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776       v  += 49;
777     }
778     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779     x[5+idx] = s6;x[6+idx] = s7;
780     idx += 7;
781   }
782   /* backward solve the L^T */
783   for (i=n-1; i>=0; i--){
784     v    = aa + 49*diag[i] - 49;
785     vi   = aj + diag[i] - 1;
786     nz   = diag[i] - ai[i];
787     idt  = 7*i;
788     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789     s6 = x[5+idt];s7 = x[6+idt];
790     while (nz--) {
791       idx   = 7*(*vi--);
792       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
793       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799       v -= 49;
800     }
801   }
802   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
803   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
804   PetscFunctionReturn(0);
805 }
806 #undef __FUNCT__
807 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
808 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
809 {
810   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
811   PetscErrorCode    ierr;
812   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
813   PetscInt          nz,idx,idt,j,i,oidx;
814   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
815   const MatScalar   *aa=a->a,*v;
816   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
817 
818   PetscFunctionBegin;
819   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
820   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
821 
822   /* forward solve the U^T */
823   idx = 0;
824   for (i=0; i<n; i++) {
825     v     = aa + bs2*diag[i];
826     /* multiply by the inverse of the block diagonal */
827     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
828     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
829     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
830     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
831     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
832     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
833     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
834     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
835     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
836     v -= bs2;
837     vi    = aj + diag[i] - 1;
838     nz    = diag[i] - diag[i+1] - 1;
839     for(j=0;j>-nz;j--){
840       oidx = bs*vi[j];
841       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
842       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
843       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
844       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
845       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
846       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
847       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
848       v  -= bs2;
849     }
850     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
851     x[5+idx] = s6;  x[6+idx] = s7;
852     idx += bs;
853   }
854   /* backward solve the L^T */
855   for (i=n-1; i>=0; i--){
856     v    = aa + bs2*ai[i];
857     vi   = aj + ai[i];
858     nz   = ai[i+1] - ai[i];
859     idt  = bs*i;
860     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
861     s6   = x[5+idt];  s7 = x[6+idt];
862     for(j=0;j<nz;j++){
863       idx   = bs*vi[j];
864       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
865       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
866       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
867       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
868       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
869       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
870       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
871       v += bs2;
872     }
873   }
874   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
875   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
876   PetscFunctionReturn(0);
877 }
878 
879 /*---------------------------------------------------------------------------------------------*/
880 #undef __FUNCT__
881 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
882 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
883 {
884   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
885   IS                iscol = a->col,isrow = a->row;
886   PetscErrorCode    ierr;
887   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
888   PetscInt          i,n = a->mbs,j;
889   PetscInt          nz;
890   PetscScalar       *x,*tmp,s1;
891   const MatScalar   *aa = a->a,*v;
892   const PetscScalar *b;
893 
894   PetscFunctionBegin;
895   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
896   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
897   tmp  = a->solve_work;
898 
899   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
900   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
901 
902   /* copy the b into temp work space according to permutation */
903   for (i=0; i<n; i++) tmp[i] = b[c[i]];
904 
905   /* forward solve the U^T */
906   for (i=0; i<n; i++) {
907     v   = aa + adiag[i+1] + 1;
908     vi  = aj + adiag[i+1] + 1;
909     nz  = adiag[i] - adiag[i+1] - 1;
910     s1  = tmp[i];
911     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
912     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
913     tmp[i] = s1;
914   }
915 
916   /* backward solve the L^T */
917   for (i=n-1; i>=0; i--){
918     v   = aa + ai[i];
919     vi  = aj + ai[i];
920     nz  = ai[i+1] - ai[i];
921     s1  = tmp[i];
922     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
923   }
924 
925   /* copy tmp into x according to permutation */
926   for (i=0; i<n; i++) x[r[i]] = tmp[i];
927 
928   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
929   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
930   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
931   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
932 
933   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
934   PetscFunctionReturn(0);
935 }
936 
937 #undef __FUNCT__
938 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
939 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940 {
941   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
942   IS                iscol=a->col,isrow=a->row;
943   PetscErrorCode    ierr;
944   const PetscInt    *r,*c,*rout,*cout;
945   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946   PetscInt          i,nz;
947   const MatScalar   *aa=a->a,*v;
948   PetscScalar       s1,*x,*t;
949   const PetscScalar *b;
950 
951   PetscFunctionBegin;
952   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
953   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
954   t  = a->solve_work;
955 
956   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
957   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
958 
959   /* copy the b into temp work space according to permutation */
960   for (i=0; i<n; i++) {
961     t[i] = b[c[i]];
962   }
963 
964   /* forward solve the U^T */
965   for (i=0; i<n; i++) {
966 
967     v     = aa + diag[i];
968     /* multiply by the inverse of the block diagonal */
969     s1    = (*v++)*t[i];
970     vi    = aj + diag[i] + 1;
971     nz    = ai[i+1] - diag[i] - 1;
972     while (nz--) {
973       t[*vi++]  -= (*v++)*s1;
974     }
975     t[i]   = s1;
976   }
977   /* backward solve the L^T */
978   for (i=n-1; i>=0; i--){
979     v    = aa + diag[i] - 1;
980     vi   = aj + diag[i] - 1;
981     nz   = diag[i] - ai[i];
982     s1   = t[i];
983     while (nz--) {
984       t[*vi--]   -=  (*v--)*s1;
985     }
986   }
987 
988   /* copy t into x according to permutation */
989   for (i=0; i<n; i++) {
990     x[r[i]]   = t[i];
991   }
992 
993   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
994   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
995   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
996   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
997   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
998   PetscFunctionReturn(0);
999 }
1000 
1001 #undef __FUNCT__
1002 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
1003 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1004 {
1005   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1006   IS                iscol=a->col,isrow=a->row;
1007   PetscErrorCode    ierr;
1008   const PetscInt    *r,*c,*rout,*cout;
1009   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1010   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1011   const MatScalar   *aa=a->a,*v;
1012   PetscScalar       s1,s2,x1,x2,*x,*t;
1013   const PetscScalar *b;
1014 
1015   PetscFunctionBegin;
1016   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1017   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1018   t  = a->solve_work;
1019 
1020   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1021   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1022 
1023   /* copy the b into temp work space according to permutation */
1024   ii = 0;
1025   for (i=0; i<n; i++) {
1026     ic      = 2*c[i];
1027     t[ii]   = b[ic];
1028     t[ii+1] = b[ic+1];
1029     ii += 2;
1030   }
1031 
1032   /* forward solve the U^T */
1033   idx = 0;
1034   for (i=0; i<n; i++) {
1035 
1036     v     = aa + 4*diag[i];
1037     /* multiply by the inverse of the block diagonal */
1038     x1    = t[idx];   x2 = t[1+idx];
1039     s1 = v[0]*x1  +  v[1]*x2;
1040     s2 = v[2]*x1  +  v[3]*x2;
1041     v += 4;
1042 
1043     vi    = aj + diag[i] + 1;
1044     nz    = ai[i+1] - diag[i] - 1;
1045     while (nz--) {
1046       oidx = 2*(*vi++);
1047       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1048       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1049       v  += 4;
1050     }
1051     t[idx]   = s1;t[1+idx] = s2;
1052     idx += 2;
1053   }
1054   /* backward solve the L^T */
1055   for (i=n-1; i>=0; i--){
1056     v    = aa + 4*diag[i] - 4;
1057     vi   = aj + diag[i] - 1;
1058     nz   = diag[i] - ai[i];
1059     idt  = 2*i;
1060     s1 = t[idt];  s2 = t[1+idt];
1061     while (nz--) {
1062       idx   = 2*(*vi--);
1063       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1064       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1065       v -= 4;
1066     }
1067   }
1068 
1069   /* copy t into x according to permutation */
1070   ii = 0;
1071   for (i=0; i<n; i++) {
1072     ir      = 2*r[i];
1073     x[ir]   = t[ii];
1074     x[ir+1] = t[ii+1];
1075     ii += 2;
1076   }
1077 
1078   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1079   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1080   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1081   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1082   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1083   PetscFunctionReturn(0);
1084 }
1085 
1086 #undef __FUNCT__
1087 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1088 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1089 {
1090   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1091   PetscErrorCode    ierr;
1092   IS                iscol=a->col,isrow=a->row;
1093   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1094   const PetscInt    *r,*c,*rout,*cout;
1095   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1096   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1097   const MatScalar   *aa=a->a,*v;
1098   PetscScalar       s1,s2,x1,x2,*x,*t;
1099   const PetscScalar *b;
1100 
1101   PetscFunctionBegin;
1102   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1103   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1104   t = a->solve_work;
1105 
1106   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1107   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1108 
1109   /* copy b into temp work space according to permutation */
1110   for(i=0;i<n;i++){
1111     ii = bs*i; ic = bs*c[i];
1112     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1113   }
1114 
1115   /* forward solve the U^T */
1116   idx = 0;
1117   for (i=0; i<n; i++) {
1118     v     = aa + bs2*diag[i];
1119     /* multiply by the inverse of the block diagonal */
1120     x1 = t[idx];   x2 = t[1+idx];
1121     s1 = v[0]*x1  +  v[1]*x2;
1122     s2 = v[2]*x1  +  v[3]*x2;
1123     v -= bs2;
1124 
1125     vi    = aj + diag[i] - 1;
1126     nz    = diag[i] - diag[i+1] - 1;
1127     for(j=0;j>-nz;j--){
1128       oidx = bs*vi[j];
1129       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1130       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1131       v  -= bs2;
1132     }
1133     t[idx]   = s1;t[1+idx] = s2;
1134     idx += bs;
1135   }
1136   /* backward solve the L^T */
1137   for (i=n-1; i>=0; i--){
1138     v    = aa + bs2*ai[i];
1139     vi   = aj + ai[i];
1140     nz   = ai[i+1] - ai[i];
1141     idt  = bs*i;
1142     s1   = t[idt];  s2 = t[1+idt];
1143     for(j=0;j<nz;j++){
1144       idx   = bs*vi[j];
1145       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1146       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1147       v += bs2;
1148     }
1149   }
1150 
1151   /* copy t into x according to permutation */
1152   for(i=0;i<n;i++){
1153     ii = bs*i;  ir = bs*r[i];
1154     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1155   }
1156 
1157   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1158   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1159   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1160   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1161   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1162   PetscFunctionReturn(0);
1163 }
1164 
1165 #undef __FUNCT__
1166 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1167 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1168 {
1169   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1170   IS                iscol=a->col,isrow=a->row;
1171   PetscErrorCode    ierr;
1172   const PetscInt    *r,*c,*rout,*cout;
1173   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1174   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1175   const MatScalar   *aa=a->a,*v;
1176   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1177   const PetscScalar *b;
1178 
1179   PetscFunctionBegin;
1180   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1181   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182   t  = a->solve_work;
1183 
1184   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1185   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1186 
1187   /* copy the b into temp work space according to permutation */
1188   ii = 0;
1189   for (i=0; i<n; i++) {
1190     ic      = 3*c[i];
1191     t[ii]   = b[ic];
1192     t[ii+1] = b[ic+1];
1193     t[ii+2] = b[ic+2];
1194     ii += 3;
1195   }
1196 
1197   /* forward solve the U^T */
1198   idx = 0;
1199   for (i=0; i<n; i++) {
1200 
1201     v     = aa + 9*diag[i];
1202     /* multiply by the inverse of the block diagonal */
1203     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1204     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1205     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1206     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1207     v += 9;
1208 
1209     vi    = aj + diag[i] + 1;
1210     nz    = ai[i+1] - diag[i] - 1;
1211     while (nz--) {
1212       oidx = 3*(*vi++);
1213       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1214       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1215       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1216       v  += 9;
1217     }
1218     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1219     idx += 3;
1220   }
1221   /* backward solve the L^T */
1222   for (i=n-1; i>=0; i--){
1223     v    = aa + 9*diag[i] - 9;
1224     vi   = aj + diag[i] - 1;
1225     nz   = diag[i] - ai[i];
1226     idt  = 3*i;
1227     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1228     while (nz--) {
1229       idx   = 3*(*vi--);
1230       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1231       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1232       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233       v -= 9;
1234     }
1235   }
1236 
1237   /* copy t into x according to permutation */
1238   ii = 0;
1239   for (i=0; i<n; i++) {
1240     ir      = 3*r[i];
1241     x[ir]   = t[ii];
1242     x[ir+1] = t[ii+1];
1243     x[ir+2] = t[ii+2];
1244     ii += 3;
1245   }
1246 
1247   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1248   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1249   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1250   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1251   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1252   PetscFunctionReturn(0);
1253 }
1254 
1255 #undef __FUNCT__
1256 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1257 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1258 {
1259   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1260   PetscErrorCode    ierr;
1261   IS                iscol=a->col,isrow=a->row;
1262   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1263   const PetscInt    *r,*c,*rout,*cout;
1264   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1265   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1266   const MatScalar   *aa=a->a,*v;
1267   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1268   const PetscScalar *b;
1269 
1270   PetscFunctionBegin;
1271   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1272   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1273   t = a->solve_work;
1274 
1275   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1276   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1277 
1278   /* copy b into temp work space according to permutation */
1279   for(i=0;i<n;i++){
1280     ii = bs*i; ic = bs*c[i];
1281     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1282   }
1283 
1284   /* forward solve the U^T */
1285   idx = 0;
1286   for (i=0; i<n; i++) {
1287     v     = aa + bs2*diag[i];
1288     /* multiply by the inverse of the block diagonal */
1289     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1290     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1291     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1292     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1293     v -= bs2;
1294 
1295     vi    = aj + diag[i] - 1;
1296     nz    = diag[i] - diag[i+1] - 1;
1297     for(j=0;j>-nz;j--){
1298       oidx = bs*vi[j];
1299       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1300       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1301       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1302       v  -= bs2;
1303     }
1304     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1305     idx += bs;
1306   }
1307   /* backward solve the L^T */
1308   for (i=n-1; i>=0; i--){
1309     v    = aa + bs2*ai[i];
1310     vi   = aj + ai[i];
1311     nz   = ai[i+1] - ai[i];
1312     idt  = bs*i;
1313     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1314     for(j=0;j<nz;j++){
1315       idx   = bs*vi[j];
1316       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1317       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1318       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1319       v += bs2;
1320     }
1321   }
1322 
1323   /* copy t into x according to permutation */
1324   for(i=0;i<n;i++){
1325     ii = bs*i;  ir = bs*r[i];
1326     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1327   }
1328 
1329   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1330   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1331   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1332   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1333   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1334   PetscFunctionReturn(0);
1335 }
1336 
1337 #undef __FUNCT__
1338 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1339 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1340 {
1341   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1342   IS                iscol=a->col,isrow=a->row;
1343   PetscErrorCode    ierr;
1344   const PetscInt    *r,*c,*rout,*cout;
1345   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1346   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1347   const MatScalar   *aa=a->a,*v;
1348   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1349   const PetscScalar *b;
1350 
1351   PetscFunctionBegin;
1352   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354   t  = a->solve_work;
1355 
1356   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1358 
1359   /* copy the b into temp work space according to permutation */
1360   ii = 0;
1361   for (i=0; i<n; i++) {
1362     ic      = 4*c[i];
1363     t[ii]   = b[ic];
1364     t[ii+1] = b[ic+1];
1365     t[ii+2] = b[ic+2];
1366     t[ii+3] = b[ic+3];
1367     ii += 4;
1368   }
1369 
1370   /* forward solve the U^T */
1371   idx = 0;
1372   for (i=0; i<n; i++) {
1373 
1374     v     = aa + 16*diag[i];
1375     /* multiply by the inverse of the block diagonal */
1376     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1377     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1378     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1379     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1380     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1381     v += 16;
1382 
1383     vi    = aj + diag[i] + 1;
1384     nz    = ai[i+1] - diag[i] - 1;
1385     while (nz--) {
1386       oidx = 4*(*vi++);
1387       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1388       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1389       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1390       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1391       v  += 16;
1392     }
1393     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1394     idx += 4;
1395   }
1396   /* backward solve the L^T */
1397   for (i=n-1; i>=0; i--){
1398     v    = aa + 16*diag[i] - 16;
1399     vi   = aj + diag[i] - 1;
1400     nz   = diag[i] - ai[i];
1401     idt  = 4*i;
1402     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1403     while (nz--) {
1404       idx   = 4*(*vi--);
1405       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1406       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1407       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1408       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1409       v -= 16;
1410     }
1411   }
1412 
1413   /* copy t into x according to permutation */
1414   ii = 0;
1415   for (i=0; i<n; i++) {
1416     ir      = 4*r[i];
1417     x[ir]   = t[ii];
1418     x[ir+1] = t[ii+1];
1419     x[ir+2] = t[ii+2];
1420     x[ir+3] = t[ii+3];
1421     ii += 4;
1422   }
1423 
1424   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1425   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1426   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1427   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1428   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1429   PetscFunctionReturn(0);
1430 }
1431 
1432 #undef __FUNCT__
1433 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1434 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1435 {
1436   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1437   PetscErrorCode    ierr;
1438   IS                iscol=a->col,isrow=a->row;
1439   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1440   const PetscInt    *r,*c,*rout,*cout;
1441   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1442   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1443   const MatScalar   *aa=a->a,*v;
1444   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1445   const PetscScalar *b;
1446 
1447   PetscFunctionBegin;
1448   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1449   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1450   t = a->solve_work;
1451 
1452   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1453   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1454 
1455   /* copy b into temp work space according to permutation */
1456   for(i=0;i<n;i++){
1457     ii = bs*i; ic = bs*c[i];
1458     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1459   }
1460 
1461   /* forward solve the U^T */
1462   idx = 0;
1463   for (i=0; i<n; i++) {
1464     v     = aa + bs2*diag[i];
1465     /* multiply by the inverse of the block diagonal */
1466     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1467     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1468     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1469     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1470     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1471     v -= bs2;
1472 
1473     vi    = aj + diag[i] - 1;
1474     nz    = diag[i] - diag[i+1] - 1;
1475     for(j=0;j>-nz;j--){
1476       oidx = bs*vi[j];
1477       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1478       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1479       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1480       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1481       v  -= bs2;
1482     }
1483     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1484     idx += bs;
1485   }
1486   /* backward solve the L^T */
1487   for (i=n-1; i>=0; i--){
1488     v    = aa + bs2*ai[i];
1489     vi   = aj + ai[i];
1490     nz   = ai[i+1] - ai[i];
1491     idt  = bs*i;
1492     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1493     for(j=0;j<nz;j++){
1494       idx   = bs*vi[j];
1495       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1496       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1497       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1498       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1499       v += bs2;
1500     }
1501   }
1502 
1503   /* copy t into x according to permutation */
1504   for(i=0;i<n;i++){
1505     ii = bs*i;  ir = bs*r[i];
1506     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1507   }
1508 
1509   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1510   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1511   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1512   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1513   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1514   PetscFunctionReturn(0);
1515 }
1516 
1517 #undef __FUNCT__
1518 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1519 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1520 {
1521   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1522   IS                iscol=a->col,isrow=a->row;
1523   PetscErrorCode    ierr;
1524   const PetscInt    *r,*c,*rout,*cout;
1525   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1526   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1527   const MatScalar   *aa=a->a,*v;
1528   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1529   const PetscScalar *b;
1530 
1531   PetscFunctionBegin;
1532   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1533   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534   t  = a->solve_work;
1535 
1536   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1537   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1538 
1539   /* copy the b into temp work space according to permutation */
1540   ii = 0;
1541   for (i=0; i<n; i++) {
1542     ic      = 5*c[i];
1543     t[ii]   = b[ic];
1544     t[ii+1] = b[ic+1];
1545     t[ii+2] = b[ic+2];
1546     t[ii+3] = b[ic+3];
1547     t[ii+4] = b[ic+4];
1548     ii += 5;
1549   }
1550 
1551   /* forward solve the U^T */
1552   idx = 0;
1553   for (i=0; i<n; i++) {
1554 
1555     v     = aa + 25*diag[i];
1556     /* multiply by the inverse of the block diagonal */
1557     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1558     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1559     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1560     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1561     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1562     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1563     v += 25;
1564 
1565     vi    = aj + diag[i] + 1;
1566     nz    = ai[i+1] - diag[i] - 1;
1567     while (nz--) {
1568       oidx = 5*(*vi++);
1569       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1570       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1571       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1572       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1573       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1574       v  += 25;
1575     }
1576     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1577     idx += 5;
1578   }
1579   /* backward solve the L^T */
1580   for (i=n-1; i>=0; i--){
1581     v    = aa + 25*diag[i] - 25;
1582     vi   = aj + diag[i] - 1;
1583     nz   = diag[i] - ai[i];
1584     idt  = 5*i;
1585     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1586     while (nz--) {
1587       idx   = 5*(*vi--);
1588       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1589       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1590       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1591       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1592       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1593       v -= 25;
1594     }
1595   }
1596 
1597   /* copy t into x according to permutation */
1598   ii = 0;
1599   for (i=0; i<n; i++) {
1600     ir      = 5*r[i];
1601     x[ir]   = t[ii];
1602     x[ir+1] = t[ii+1];
1603     x[ir+2] = t[ii+2];
1604     x[ir+3] = t[ii+3];
1605     x[ir+4] = t[ii+4];
1606     ii += 5;
1607   }
1608 
1609   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1610   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1611   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1612   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1613   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1614   PetscFunctionReturn(0);
1615 }
1616 
1617 #undef __FUNCT__
1618 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1619 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1620 {
1621   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1622   PetscErrorCode    ierr;
1623   IS                iscol=a->col,isrow=a->row;
1624   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1625   const PetscInt    *r,*c,*rout,*cout;
1626   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1627   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1628   const MatScalar   *aa=a->a,*v;
1629   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1630   const PetscScalar *b;
1631 
1632   PetscFunctionBegin;
1633   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1634   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1635   t = a->solve_work;
1636 
1637   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1638   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1639 
1640   /* copy b into temp work space according to permutation */
1641   for(i=0;i<n;i++){
1642     ii = bs*i; ic = bs*c[i];
1643     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1644     t[ii+4] = b[ic+4];
1645   }
1646 
1647   /* forward solve the U^T */
1648   idx = 0;
1649   for (i=0; i<n; i++) {
1650     v     = aa + bs2*diag[i];
1651     /* multiply by the inverse of the block diagonal */
1652     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1653     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1654     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1655     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1656     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1657     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1658     v -= bs2;
1659 
1660     vi    = aj + diag[i] - 1;
1661     nz    = diag[i] - diag[i+1] - 1;
1662     for(j=0;j>-nz;j--){
1663       oidx = bs*vi[j];
1664       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1665       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1666       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1667       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1668       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1669       v  -= bs2;
1670     }
1671     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1672     idx += bs;
1673   }
1674   /* backward solve the L^T */
1675   for (i=n-1; i>=0; i--){
1676     v    = aa + bs2*ai[i];
1677     vi   = aj + ai[i];
1678     nz   = ai[i+1] - ai[i];
1679     idt  = bs*i;
1680     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1681     for(j=0;j<nz;j++){
1682       idx   = bs*vi[j];
1683       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1684       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1685       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1686       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1687       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1688       v += bs2;
1689     }
1690   }
1691 
1692   /* copy t into x according to permutation */
1693   for(i=0;i<n;i++){
1694     ii = bs*i;  ir = bs*r[i];
1695     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1696     x[ir+4] = t[ii+4];
1697   }
1698 
1699   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1700   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1701   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1703   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1704   PetscFunctionReturn(0);
1705 }
1706 
1707 #undef __FUNCT__
1708 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1709 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1710 {
1711   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1712   IS                iscol=a->col,isrow=a->row;
1713   PetscErrorCode    ierr;
1714   const PetscInt    *r,*c,*rout,*cout;
1715   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1716   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1717   const MatScalar   *aa=a->a,*v;
1718   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1719   const PetscScalar *b;
1720 
1721   PetscFunctionBegin;
1722   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1723   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1724   t  = a->solve_work;
1725 
1726   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1727   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1728 
1729   /* copy the b into temp work space according to permutation */
1730   ii = 0;
1731   for (i=0; i<n; i++) {
1732     ic      = 6*c[i];
1733     t[ii]   = b[ic];
1734     t[ii+1] = b[ic+1];
1735     t[ii+2] = b[ic+2];
1736     t[ii+3] = b[ic+3];
1737     t[ii+4] = b[ic+4];
1738     t[ii+5] = b[ic+5];
1739     ii += 6;
1740   }
1741 
1742   /* forward solve the U^T */
1743   idx = 0;
1744   for (i=0; i<n; i++) {
1745 
1746     v     = aa + 36*diag[i];
1747     /* multiply by the inverse of the block diagonal */
1748     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1749     x6    = t[5+idx];
1750     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1751     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1752     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1753     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1754     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1755     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1756     v += 36;
1757 
1758     vi    = aj + diag[i] + 1;
1759     nz    = ai[i+1] - diag[i] - 1;
1760     while (nz--) {
1761       oidx = 6*(*vi++);
1762       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1763       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1764       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1765       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1766       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1767       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1768       v  += 36;
1769     }
1770     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1771     t[5+idx] = s6;
1772     idx += 6;
1773   }
1774   /* backward solve the L^T */
1775   for (i=n-1; i>=0; i--){
1776     v    = aa + 36*diag[i] - 36;
1777     vi   = aj + diag[i] - 1;
1778     nz   = diag[i] - ai[i];
1779     idt  = 6*i;
1780     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1781     s6 = t[5+idt];
1782     while (nz--) {
1783       idx   = 6*(*vi--);
1784       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1785       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1786       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1787       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1788       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1789       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1790       v -= 36;
1791     }
1792   }
1793 
1794   /* copy t into x according to permutation */
1795   ii = 0;
1796   for (i=0; i<n; i++) {
1797     ir      = 6*r[i];
1798     x[ir]   = t[ii];
1799     x[ir+1] = t[ii+1];
1800     x[ir+2] = t[ii+2];
1801     x[ir+3] = t[ii+3];
1802     x[ir+4] = t[ii+4];
1803     x[ir+5] = t[ii+5];
1804     ii += 6;
1805   }
1806 
1807   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1808   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1809   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1810   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1811   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1812   PetscFunctionReturn(0);
1813 }
1814 
1815 #undef __FUNCT__
1816 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1817 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1818 {
1819   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1820   PetscErrorCode    ierr;
1821   IS                iscol=a->col,isrow=a->row;
1822   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1823   const PetscInt    *r,*c,*rout,*cout;
1824   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1825   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1826   const MatScalar   *aa=a->a,*v;
1827   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1828   const PetscScalar *b;
1829 
1830   PetscFunctionBegin;
1831   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1832   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1833   t = a->solve_work;
1834 
1835   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1836   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1837 
1838   /* copy b into temp work space according to permutation */
1839   for(i=0;i<n;i++){
1840     ii = bs*i; ic = bs*c[i];
1841     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1842     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1843   }
1844 
1845   /* forward solve the U^T */
1846   idx = 0;
1847   for (i=0; i<n; i++) {
1848     v     = aa + bs2*diag[i];
1849     /* multiply by the inverse of the block diagonal */
1850     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1851     x6    = t[5+idx];
1852     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1853     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1854     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1855     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1856     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1857     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1858     v -= bs2;
1859 
1860     vi    = aj + diag[i] - 1;
1861     nz    = diag[i] - diag[i+1] - 1;
1862     for(j=0;j>-nz;j--){
1863       oidx = bs*vi[j];
1864       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1865       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1866       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1867       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1868       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1869       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1870       v  -= bs2;
1871     }
1872     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1873     t[5+idx] = s6;
1874     idx += bs;
1875   }
1876   /* backward solve the L^T */
1877   for (i=n-1; i>=0; i--){
1878     v    = aa + bs2*ai[i];
1879     vi   = aj + ai[i];
1880     nz   = ai[i+1] - ai[i];
1881     idt  = bs*i;
1882     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1883     s6   = t[5+idt];
1884    for(j=0;j<nz;j++){
1885       idx   = bs*vi[j];
1886       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1887       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1888       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1889       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1890       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1891       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1892       v += bs2;
1893     }
1894   }
1895 
1896   /* copy t into x according to permutation */
1897   for(i=0;i<n;i++){
1898     ii = bs*i;  ir = bs*r[i];
1899     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1900     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1901   }
1902 
1903   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1904   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1905   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1906   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1907   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1908   PetscFunctionReturn(0);
1909 }
1910 
1911 #undef __FUNCT__
1912 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1913 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1914 {
1915   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1916   IS                iscol=a->col,isrow=a->row;
1917   PetscErrorCode    ierr;
1918   const PetscInt    *r,*c,*rout,*cout;
1919   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1920   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1921   const MatScalar   *aa=a->a,*v;
1922   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1923   const PetscScalar *b;
1924 
1925   PetscFunctionBegin;
1926   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1927   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1928   t  = a->solve_work;
1929 
1930   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1931   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1932 
1933   /* copy the b into temp work space according to permutation */
1934   ii = 0;
1935   for (i=0; i<n; i++) {
1936     ic      = 7*c[i];
1937     t[ii]   = b[ic];
1938     t[ii+1] = b[ic+1];
1939     t[ii+2] = b[ic+2];
1940     t[ii+3] = b[ic+3];
1941     t[ii+4] = b[ic+4];
1942     t[ii+5] = b[ic+5];
1943     t[ii+6] = b[ic+6];
1944     ii += 7;
1945   }
1946 
1947   /* forward solve the U^T */
1948   idx = 0;
1949   for (i=0; i<n; i++) {
1950 
1951     v     = aa + 49*diag[i];
1952     /* multiply by the inverse of the block diagonal */
1953     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1954     x6    = t[5+idx]; x7 = t[6+idx];
1955     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1956     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1957     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1958     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1959     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1960     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1961     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1962     v += 49;
1963 
1964     vi    = aj + diag[i] + 1;
1965     nz    = ai[i+1] - diag[i] - 1;
1966     while (nz--) {
1967       oidx = 7*(*vi++);
1968       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1969       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1970       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1971       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1972       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1973       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1974       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1975       v  += 49;
1976     }
1977     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1978     t[5+idx] = s6;t[6+idx] = s7;
1979     idx += 7;
1980   }
1981   /* backward solve the L^T */
1982   for (i=n-1; i>=0; i--){
1983     v    = aa + 49*diag[i] - 49;
1984     vi   = aj + diag[i] - 1;
1985     nz   = diag[i] - ai[i];
1986     idt  = 7*i;
1987     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1988     s6 = t[5+idt];s7 = t[6+idt];
1989     while (nz--) {
1990       idx   = 7*(*vi--);
1991       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1992       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1993       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1994       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1995       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1996       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1997       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1998       v -= 49;
1999     }
2000   }
2001 
2002   /* copy t into x according to permutation */
2003   ii = 0;
2004   for (i=0; i<n; i++) {
2005     ir      = 7*r[i];
2006     x[ir]   = t[ii];
2007     x[ir+1] = t[ii+1];
2008     x[ir+2] = t[ii+2];
2009     x[ir+3] = t[ii+3];
2010     x[ir+4] = t[ii+4];
2011     x[ir+5] = t[ii+5];
2012     x[ir+6] = t[ii+6];
2013     ii += 7;
2014   }
2015 
2016   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2017   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2018   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2019   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2020   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2021   PetscFunctionReturn(0);
2022 }
2023 #undef __FUNCT__
2024 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
2025 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2026 {
2027   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2028   PetscErrorCode    ierr;
2029   IS                iscol=a->col,isrow=a->row;
2030   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2031   const PetscInt    *r,*c,*rout,*cout;
2032   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2033   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2034   const MatScalar   *aa=a->a,*v;
2035   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2036   const PetscScalar *b;
2037 
2038   PetscFunctionBegin;
2039   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2040   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2041   t = a->solve_work;
2042 
2043   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2044   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2045 
2046   /* copy b into temp work space according to permutation */
2047   for(i=0;i<n;i++){
2048     ii = bs*i; ic = bs*c[i];
2049     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2050     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
2051   }
2052 
2053   /* forward solve the U^T */
2054   idx = 0;
2055   for (i=0; i<n; i++) {
2056     v     = aa + bs2*diag[i];
2057     /* multiply by the inverse of the block diagonal */
2058     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2059     x6    = t[5+idx]; x7 = t[6+idx];
2060     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
2061     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
2062     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
2063     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
2064     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2065     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2066     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2067     v -= bs2;
2068 
2069     vi    = aj + diag[i] - 1;
2070     nz    = diag[i] - diag[i+1] - 1;
2071     for(j=0;j>-nz;j--){
2072       oidx = bs*vi[j];
2073       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2074       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2075       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2076       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2077       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2078       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2079       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2080       v  -= bs2;
2081     }
2082     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2083     t[5+idx] = s6;  t[6+idx] = s7;
2084     idx += bs;
2085   }
2086   /* backward solve the L^T */
2087   for (i=n-1; i>=0; i--){
2088     v    = aa + bs2*ai[i];
2089     vi   = aj + ai[i];
2090     nz   = ai[i+1] - ai[i];
2091     idt  = bs*i;
2092     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2093     s6   = t[5+idt];  s7 = t[6+idt];
2094    for(j=0;j<nz;j++){
2095       idx   = bs*vi[j];
2096       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2097       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2098       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2099       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2100       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2101       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2102       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2103       v += bs2;
2104     }
2105   }
2106 
2107   /* copy t into x according to permutation */
2108   for(i=0;i<n;i++){
2109     ii = bs*i;  ir = bs*r[i];
2110     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2111     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2112   }
2113 
2114   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2115   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2116   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2117   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2118   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2119   PetscFunctionReturn(0);
2120 }
2121 
2122 /* ----------------------------------------------------------- */
2123 #undef __FUNCT__
2124 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2125 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2126 {
2127   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2128   IS                iscol=a->col,isrow=a->row;
2129   PetscErrorCode    ierr;
2130   const PetscInt    *r,*c,*rout,*cout;
2131   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2132   PetscInt          i,nz;
2133   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2134   const MatScalar   *aa=a->a,*v;
2135   PetscScalar       *x,*s,*t,*ls;
2136   const PetscScalar *b;
2137 
2138   PetscFunctionBegin;
2139   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2140   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2141   t  = a->solve_work;
2142 
2143   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2144   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2145 
2146   /* forward solve the lower triangular */
2147   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2148   for (i=1; i<n; i++) {
2149     v   = aa + bs2*ai[i];
2150     vi  = aj + ai[i];
2151     nz  = a->diag[i] - ai[i];
2152     s = t + bs*i;
2153     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2154     while (nz--) {
2155       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2156       v += bs2;
2157     }
2158   }
2159   /* backward solve the upper triangular */
2160   ls = a->solve_work + A->cmap->n;
2161   for (i=n-1; i>=0; i--){
2162     v   = aa + bs2*(a->diag[i] + 1);
2163     vi  = aj + a->diag[i] + 1;
2164     nz  = ai[i+1] - a->diag[i] - 1;
2165     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2166     while (nz--) {
2167       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2168       v += bs2;
2169     }
2170     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2171     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2172   }
2173 
2174   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2175   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2176   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2177   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2178   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2179   PetscFunctionReturn(0);
2180 }
2181 
2182 /* ----------------------------------------------------------- */
2183 #undef __FUNCT__
2184 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2185 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2186 {
2187   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2188   IS                iscol=a->col,isrow=a->row;
2189   PetscErrorCode    ierr;
2190   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2191   PetscInt          i,nz,j;
2192   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2193   const MatScalar   *aa=a->a,*v;
2194   PetscScalar       *x,*t,*ls;
2195   const PetscScalar *b;
2196   PetscFunctionBegin;
2197   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2198   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2199   t    = a->solve_work;
2200 
2201   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2202   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2203 
2204   /* copy the b into temp work space according to permutation */
2205   for (i=0; i<n; i++) {
2206     for (j=0; j<bs; j++) {
2207       t[i*bs+j] = b[c[i]*bs+j];
2208     }
2209   }
2210 
2211 
2212   /* forward solve the upper triangular transpose */
2213   ls = a->solve_work + A->cmap->n;
2214   for (i=0; i<n; i++){
2215     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2216     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2217     v   = aa + bs2*(a->diag[i] + 1);
2218     vi  = aj + a->diag[i] + 1;
2219     nz  = ai[i+1] - a->diag[i] - 1;
2220     while (nz--) {
2221       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2222       v += bs2;
2223     }
2224   }
2225 
2226   /* backward solve the lower triangular transpose */
2227   for (i=n-1; i>=0; i--) {
2228     v   = aa + bs2*ai[i];
2229     vi  = aj + ai[i];
2230     nz  = a->diag[i] - ai[i];
2231     while (nz--) {
2232       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2233       v += bs2;
2234     }
2235   }
2236 
2237   /* copy t into x according to permutation */
2238   for (i=0; i<n; i++) {
2239     for (j=0; j<bs; j++) {
2240       x[bs*r[i]+j]   = t[bs*i+j];
2241     }
2242   }
2243 
2244   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2245   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2246   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2247   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2248   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2249   PetscFunctionReturn(0);
2250 }
2251 
2252 #undef __FUNCT__
2253 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2254 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2255 {
2256   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2257   IS                iscol=a->col,isrow=a->row;
2258   PetscErrorCode    ierr;
2259   const PetscInt    *r,*c,*rout,*cout;
2260   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2261   PetscInt          i,j,nz;
2262   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2263   const MatScalar   *aa=a->a,*v;
2264   PetscScalar       *x,*t,*ls;
2265   const PetscScalar *b;
2266 
2267   PetscFunctionBegin;
2268   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2269   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2270   t    = a->solve_work;
2271 
2272   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2273   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2274 
2275   /* copy the b into temp work space according to permutation */
2276   for (i=0; i<n; i++) {
2277     for (j=0; j<bs; j++) {
2278       t[i*bs+j] = b[c[i]*bs+j];
2279     }
2280   }
2281 
2282 
2283   /* forward solve the upper triangular transpose */
2284   ls = a->solve_work + A->cmap->n;
2285   for (i=0; i<n; i++){
2286     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2287     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2288     v   = aa + bs2*(diag[i] - 1);
2289     vi  = aj + diag[i] - 1;
2290     nz  = diag[i] - diag[i+1] - 1;
2291     for(j=0;j>-nz;j--){
2292       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2293       v -= bs2;
2294     }
2295   }
2296 
2297   /* backward solve the lower triangular transpose */
2298   for (i=n-1; i>=0; i--) {
2299     v   = aa + bs2*ai[i];
2300     vi  = aj + ai[i];
2301     nz  = ai[i+1] - ai[i];
2302     for(j=0;j<nz;j++){
2303       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2304       v += bs2;
2305     }
2306   }
2307 
2308   /* copy t into x according to permutation */
2309   for (i=0; i<n; i++) {
2310     for (j=0; j<bs; j++) {
2311       x[bs*r[i]+j]   = t[bs*i+j];
2312     }
2313   }
2314 
2315   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2316   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2317   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2318   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2319   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2320   PetscFunctionReturn(0);
2321 }
2322 
2323 /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */
2324 
2325 #undef __FUNCT__
2326 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2327 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2328 {
2329   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2330   PetscErrorCode    ierr;
2331   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2332   PetscInt          i,nz,idx,idt,m;
2333   const MatScalar   *aa=a->a,*v;
2334   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2335   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2336   PetscScalar       *x;
2337   const PetscScalar *b;
2338 
2339   PetscFunctionBegin;
2340   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2341   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2342 
2343   /* forward solve the lower triangular */
2344   idx    = 0;
2345   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2346   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2347   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2348 
2349   for (i=1; i<n; i++) {
2350     v     = aa + bs2*ai[i];
2351     vi    = aj + ai[i];
2352     nz    = ai[i+1] - ai[i];
2353     idt   = bs*i;
2354     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
2355     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
2356     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2357     for(m=0;m<nz;m++){
2358       idx   = bs*vi[m];
2359       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2360       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2361       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2362 
2363 
2364       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2365       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2366       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2367       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2368       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2369       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2370       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2371       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2372       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2373       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2374       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2375       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2376       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2377       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2378       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2379 
2380       v += bs2;
2381     }
2382     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
2383     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
2384     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2385 
2386   }
2387   /* backward solve the upper triangular */
2388   for (i=n-1; i>=0; i--){
2389     v    = aa + bs2*(adiag[i+1]+1);
2390     vi   = aj + adiag[i+1]+1;
2391     nz   = adiag[i] - adiag[i+1] - 1;
2392     idt  = bs*i;
2393     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
2394     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
2395     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2396 
2397     for(m=0;m<nz;m++){
2398       idx   = bs*vi[m];
2399       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2400       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2401       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2402 
2403       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2404       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2405       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2406       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2407       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2408       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2409       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2410       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2411       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2412       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2413       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2414       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2415       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2416       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2417       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2418 
2419       v += bs2;
2420     }
2421 
2422     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2423     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2424     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2425     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2426     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2427     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2428     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2429     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2430     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2431     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2432     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2433     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2434     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2435     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2436     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2437 
2438   }
2439 
2440   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2441   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2442   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2443   PetscFunctionReturn(0);
2444 }
2445 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2447 /* Default MatSolve for block size 15 */
2448 
2449 #undef __FUNCT__
2450 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2451 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2452 {
2453   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2454   PetscErrorCode    ierr;
2455   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2456   PetscInt          i,k,nz,idx,idt,m;
2457   const MatScalar   *aa=a->a,*v;
2458   PetscScalar       s[15];
2459   PetscScalar       *x,xv;
2460   const PetscScalar *b;
2461 
2462   PetscFunctionBegin;
2463   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2464   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2465 
2466   /* forward solve the lower triangular */
2467   for (i=0; i<n; i++) {
2468     v     = aa + bs2*ai[i];
2469     vi    = aj + ai[i];
2470     nz    = ai[i+1] - ai[i];
2471     idt   = bs*i;
2472     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2473     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2474     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2475     for(m=0;m<nz;m++){
2476       idx   = bs*vi[m];
2477       for(k=0;k<15;k++){
2478 	xv        = x[k + idx];
2479 	x[idt]    -= v[0]*xv;
2480 	x[1+idt]  -= v[1]*xv;
2481 	x[2+idt]  -= v[2]*xv;
2482         x[3+idt]  -= v[3]*xv;
2483 	x[4+idt]  -= v[4]*xv;
2484 	x[5+idt]  -= v[5]*xv;
2485 	x[6+idt]  -= v[6]*xv;
2486         x[7+idt]  -= v[7]*xv;
2487 	x[8+idt]  -= v[8]*xv;
2488 	x[9+idt]  -= v[9]*xv;
2489 	x[10+idt] -= v[10]*xv;
2490         x[11+idt] -= v[11]*xv;
2491 	x[12+idt] -= v[12]*xv;
2492 	x[13+idt] -= v[13]*xv;
2493 	x[14+idt] -= v[14]*xv;
2494 	v += 15;
2495       }
2496     }
2497   }
2498   /* backward solve the upper triangular */
2499   for (i=n-1; i>=0; i--){
2500     v    = aa + bs2*(adiag[i+1]+1);
2501     vi   = aj + adiag[i+1]+1;
2502     nz   = adiag[i] - adiag[i+1] - 1;
2503     idt  = bs*i;
2504     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2505     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2506     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2507 
2508     for(m=0;m<nz;m++){
2509       idx   = bs*vi[m];
2510       for(k=0;k<15;k++){
2511 	xv = x[k + idx];
2512 	s[0]  -= v[0]*xv;
2513 	s[1]  -= v[1]*xv;
2514 	s[2]  -= v[2]*xv;
2515         s[3]  -= v[3]*xv;
2516 	s[4]  -= v[4]*xv;
2517 	s[5]  -= v[5]*xv;
2518 	s[6]  -= v[6]*xv;
2519         s[7]  -= v[7]*xv;
2520 	s[8]  -= v[8]*xv;
2521 	s[9]  -= v[9]*xv;
2522 	s[10] -= v[10]*xv;
2523         s[11] -= v[11]*xv;
2524 	s[12] -= v[12]*xv;
2525 	s[13] -= v[13]*xv;
2526 	s[14] -= v[14]*xv;
2527 	v += 15;
2528       }
2529     }
2530     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
2531     for(k=0;k<15;k++){
2532       x[idt]    += v[0]*s[k];
2533       x[1+idt]  += v[1]*s[k];
2534       x[2+idt]  += v[2]*s[k];
2535       x[3+idt]  += v[3]*s[k];
2536       x[4+idt]  += v[4]*s[k];
2537       x[5+idt]  += v[5]*s[k];
2538       x[6+idt]  += v[6]*s[k];
2539       x[7+idt]  += v[7]*s[k];
2540       x[8+idt]  += v[8]*s[k];
2541       x[9+idt]  += v[9]*s[k];
2542       x[10+idt] += v[10]*s[k];
2543       x[11+idt] += v[11]*s[k];
2544       x[12+idt] += v[12]*s[k];
2545       x[13+idt] += v[13]*s[k];
2546       x[14+idt] += v[14]*s[k];
2547       v += 15;
2548     }
2549   }
2550   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2551   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2552   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2553   PetscFunctionReturn(0);
2554 }
2555 
2556 
2557 #undef __FUNCT__
2558 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2559 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2560 {
2561   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2562   IS                iscol=a->col,isrow=a->row;
2563   PetscErrorCode    ierr;
2564   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2565   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2566   PetscInt          i,nz,idx,idt,idc;
2567   const MatScalar   *aa=a->a,*v;
2568   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2569   const PetscScalar *b;
2570 
2571   PetscFunctionBegin;
2572   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2573   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2574   t  = a->solve_work;
2575 
2576   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2577   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2578 
2579   /* forward solve the lower triangular */
2580   idx    = 7*(*r++);
2581   t[0] = b[idx];   t[1] = b[1+idx];
2582   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2583   t[5] = b[5+idx]; t[6] = b[6+idx];
2584 
2585   for (i=1; i<n; i++) {
2586     v     = aa + 49*ai[i];
2587     vi    = aj + ai[i];
2588     nz    = diag[i] - ai[i];
2589     idx   = 7*(*r++);
2590     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2591     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2592     while (nz--) {
2593       idx   = 7*(*vi++);
2594       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2595       x4    = t[3+idx];x5 = t[4+idx];
2596       x6    = t[5+idx];x7 = t[6+idx];
2597       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2598       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2599       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2600       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2601       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2602       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2603       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2604       v += 49;
2605     }
2606     idx = 7*i;
2607     t[idx]   = s1;t[1+idx] = s2;
2608     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2609     t[5+idx] = s6;t[6+idx] = s7;
2610   }
2611   /* backward solve the upper triangular */
2612   for (i=n-1; i>=0; i--){
2613     v    = aa + 49*diag[i] + 49;
2614     vi   = aj + diag[i] + 1;
2615     nz   = ai[i+1] - diag[i] - 1;
2616     idt  = 7*i;
2617     s1 = t[idt];  s2 = t[1+idt];
2618     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2619     s6 = t[5+idt];s7 = t[6+idt];
2620     while (nz--) {
2621       idx   = 7*(*vi++);
2622       x1    = t[idx];   x2 = t[1+idx];
2623       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2624       x6    = t[5+idx]; x7 = t[6+idx];
2625       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2626       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2627       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2628       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2629       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2630       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2631       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2632       v += 49;
2633     }
2634     idc = 7*(*c--);
2635     v   = aa + 49*diag[i];
2636     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2637                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2638     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2639                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2640     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2641                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2642     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2643                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2644     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2645                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2646     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2647                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2648     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2649                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2650   }
2651 
2652   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2653   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2654   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2655   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2656   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2657   PetscFunctionReturn(0);
2658 }
2659 
2660 #undef __FUNCT__
2661 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2662 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2663 {
2664   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2665   IS                iscol=a->col,isrow=a->row;
2666   PetscErrorCode    ierr;
2667   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2668   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2669   PetscInt          i,nz,idx,idt,idc,m;
2670   const MatScalar   *aa=a->a,*v;
2671   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2672   const PetscScalar *b;
2673 
2674   PetscFunctionBegin;
2675   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2676   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2677   t  = a->solve_work;
2678 
2679   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2680   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2681 
2682   /* forward solve the lower triangular */
2683   idx    = 7*r[0];
2684   t[0] = b[idx];   t[1] = b[1+idx];
2685   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2686   t[5] = b[5+idx]; t[6] = b[6+idx];
2687 
2688   for (i=1; i<n; i++) {
2689     v     = aa + 49*ai[i];
2690     vi    = aj + ai[i];
2691     nz    = ai[i+1] - ai[i];
2692     idx   = 7*r[i];
2693     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2694     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2695     for(m=0;m<nz;m++){
2696       idx   = 7*vi[m];
2697       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2698       x4    = t[3+idx];x5 = t[4+idx];
2699       x6    = t[5+idx];x7 = t[6+idx];
2700       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2701       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2702       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2703       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2704       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2705       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2706       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2707       v += 49;
2708     }
2709     idx = 7*i;
2710     t[idx]   = s1;t[1+idx] = s2;
2711     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2712     t[5+idx] = s6;t[6+idx] = s7;
2713   }
2714   /* backward solve the upper triangular */
2715   for (i=n-1; i>=0; i--){
2716     v    = aa + 49*(adiag[i+1]+1);
2717     vi   = aj + adiag[i+1]+1;
2718     nz   = adiag[i] - adiag[i+1] - 1;
2719     idt  = 7*i;
2720     s1 = t[idt];  s2 = t[1+idt];
2721     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2722     s6 = t[5+idt];s7 = t[6+idt];
2723     for(m=0;m<nz;m++){
2724       idx   = 7*vi[m];
2725       x1    = t[idx];   x2 = t[1+idx];
2726       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2727       x6    = t[5+idx]; x7 = t[6+idx];
2728       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2729       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2730       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2731       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2732       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2733       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2734       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2735       v += 49;
2736     }
2737     idc = 7*c[i];
2738     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2739                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2740     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2741                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2742     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2743                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2744     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2745                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2746     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2747                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2748     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2749                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2750     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2751                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2752   }
2753 
2754   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2755   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2756   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2757   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2758   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2759   PetscFunctionReturn(0);
2760 }
2761 
2762 #undef __FUNCT__
2763 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2764 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2765 {
2766   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2767   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2768   PetscErrorCode    ierr;
2769   PetscInt          i,nz,idx,idt,jdx;
2770   const MatScalar   *aa=a->a,*v;
2771   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2772   const PetscScalar *b;
2773 
2774   PetscFunctionBegin;
2775   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2776   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2777   /* forward solve the lower triangular */
2778   idx    = 0;
2779   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2780   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2781   x[6] = b[6+idx];
2782   for (i=1; i<n; i++) {
2783     v     =  aa + 49*ai[i];
2784     vi    =  aj + ai[i];
2785     nz    =  diag[i] - ai[i];
2786     idx   =  7*i;
2787     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2788     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2789     s7  =  b[6+idx];
2790     while (nz--) {
2791       jdx   = 7*(*vi++);
2792       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2793       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2794       x7    = x[6+jdx];
2795       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2796       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2797       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2798       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2799       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2800       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2801       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2802       v += 49;
2803      }
2804     x[idx]   = s1;
2805     x[1+idx] = s2;
2806     x[2+idx] = s3;
2807     x[3+idx] = s4;
2808     x[4+idx] = s5;
2809     x[5+idx] = s6;
2810     x[6+idx] = s7;
2811   }
2812   /* backward solve the upper triangular */
2813   for (i=n-1; i>=0; i--){
2814     v    = aa + 49*diag[i] + 49;
2815     vi   = aj + diag[i] + 1;
2816     nz   = ai[i+1] - diag[i] - 1;
2817     idt  = 7*i;
2818     s1 = x[idt];   s2 = x[1+idt];
2819     s3 = x[2+idt]; s4 = x[3+idt];
2820     s5 = x[4+idt]; s6 = x[5+idt];
2821     s7 = x[6+idt];
2822     while (nz--) {
2823       idx   = 7*(*vi++);
2824       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2825       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2826       x7    = x[6+idx];
2827       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2828       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2829       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2830       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2831       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2832       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2833       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2834       v += 49;
2835     }
2836     v        = aa + 49*diag[i];
2837     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2838                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2839     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2840                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2841     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2842                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2843     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2844                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2845     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2846                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2847     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2848                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2849     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2850                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2851   }
2852 
2853   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2854   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2855   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2856   PetscFunctionReturn(0);
2857 }
2858 
2859 #undef __FUNCT__
2860 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2861 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2862 {
2863     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2864     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2865     PetscErrorCode    ierr;
2866     PetscInt          i,k,nz,idx,jdx,idt;
2867     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2868     const MatScalar   *aa=a->a,*v;
2869     PetscScalar       *x;
2870     const PetscScalar *b;
2871     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2872 
2873     PetscFunctionBegin;
2874     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2875     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2876     /* forward solve the lower triangular */
2877     idx    = 0;
2878     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2879     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2880     for (i=1; i<n; i++) {
2881        v    = aa + bs2*ai[i];
2882        vi   = aj + ai[i];
2883        nz   = ai[i+1] - ai[i];
2884       idx   = bs*i;
2885        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2886        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2887        for(k=0;k<nz;k++) {
2888           jdx   = bs*vi[k];
2889           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2890 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2891           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2892           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2893           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2894 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2895           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2896 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2897 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2898           v   +=  bs2;
2899         }
2900 
2901        x[idx]   = s1;
2902        x[1+idx] = s2;
2903        x[2+idx] = s3;
2904        x[3+idx] = s4;
2905        x[4+idx] = s5;
2906        x[5+idx] = s6;
2907        x[6+idx] = s7;
2908     }
2909 
2910    /* backward solve the upper triangular */
2911   for (i=n-1; i>=0; i--){
2912     v   = aa + bs2*(adiag[i+1]+1);
2913      vi  = aj + adiag[i+1]+1;
2914      nz  = adiag[i] - adiag[i+1]-1;
2915      idt = bs*i;
2916      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2917      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2918     for(k=0;k<nz;k++) {
2919       idx   = bs*vi[k];
2920        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2921        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2922        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2923        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2924        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2925        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2926        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2927        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2928        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2929         v   +=  bs2;
2930     }
2931     /* x = inv_diagonal*x */
2932     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2933     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2934     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2935     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2936     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2937     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2938     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2939   }
2940 
2941   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2942   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2943   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2944   PetscFunctionReturn(0);
2945 }
2946 
2947 #undef __FUNCT__
2948 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2949 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2950 {
2951   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2952   IS                iscol=a->col,isrow=a->row;
2953   PetscErrorCode    ierr;
2954   const PetscInt    *r,*c,*rout,*cout;
2955   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2956   PetscInt          i,nz,idx,idt,idc;
2957   const MatScalar   *aa=a->a,*v;
2958   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2959   const PetscScalar *b;
2960 
2961   PetscFunctionBegin;
2962   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2963   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2964   t  = a->solve_work;
2965 
2966   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2967   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2968 
2969   /* forward solve the lower triangular */
2970   idx    = 6*(*r++);
2971   t[0] = b[idx];   t[1] = b[1+idx];
2972   t[2] = b[2+idx]; t[3] = b[3+idx];
2973   t[4] = b[4+idx]; t[5] = b[5+idx];
2974   for (i=1; i<n; i++) {
2975     v     = aa + 36*ai[i];
2976     vi    = aj + ai[i];
2977     nz    = diag[i] - ai[i];
2978     idx   = 6*(*r++);
2979     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2980     s5  = b[4+idx]; s6 = b[5+idx];
2981     while (nz--) {
2982       idx   = 6*(*vi++);
2983       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2984       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2985       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2986       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2987       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2988       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2989       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2990       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2991       v += 36;
2992     }
2993     idx = 6*i;
2994     t[idx]   = s1;t[1+idx] = s2;
2995     t[2+idx] = s3;t[3+idx] = s4;
2996     t[4+idx] = s5;t[5+idx] = s6;
2997   }
2998   /* backward solve the upper triangular */
2999   for (i=n-1; i>=0; i--){
3000     v    = aa + 36*diag[i] + 36;
3001     vi   = aj + diag[i] + 1;
3002     nz   = ai[i+1] - diag[i] - 1;
3003     idt  = 6*i;
3004     s1 = t[idt];  s2 = t[1+idt];
3005     s3 = t[2+idt];s4 = t[3+idt];
3006     s5 = t[4+idt];s6 = t[5+idt];
3007     while (nz--) {
3008       idx   = 6*(*vi++);
3009       x1    = t[idx];   x2 = t[1+idx];
3010       x3    = t[2+idx]; x4 = t[3+idx];
3011       x5    = t[4+idx]; x6 = t[5+idx];
3012       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3013       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3014       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3015       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3016       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3017       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3018       v += 36;
3019     }
3020     idc = 6*(*c--);
3021     v   = aa + 36*diag[i];
3022     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3023                                  v[18]*s4+v[24]*s5+v[30]*s6;
3024     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3025                                  v[19]*s4+v[25]*s5+v[31]*s6;
3026     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3027                                  v[20]*s4+v[26]*s5+v[32]*s6;
3028     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3029                                  v[21]*s4+v[27]*s5+v[33]*s6;
3030     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3031                                  v[22]*s4+v[28]*s5+v[34]*s6;
3032     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3033                                  v[23]*s4+v[29]*s5+v[35]*s6;
3034   }
3035 
3036   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3037   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3038   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3039   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3040   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3041   PetscFunctionReturn(0);
3042 }
3043 
3044 #undef __FUNCT__
3045 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
3046 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
3047 {
3048   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3049   IS                iscol=a->col,isrow=a->row;
3050   PetscErrorCode    ierr;
3051   const PetscInt    *r,*c,*rout,*cout;
3052   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3053   PetscInt          i,nz,idx,idt,idc,m;
3054   const MatScalar   *aa=a->a,*v;
3055   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3056   const PetscScalar *b;
3057 
3058   PetscFunctionBegin;
3059   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3060   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3061   t  = a->solve_work;
3062 
3063   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3064   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3065 
3066   /* forward solve the lower triangular */
3067   idx    = 6*r[0];
3068   t[0] = b[idx];   t[1] = b[1+idx];
3069   t[2] = b[2+idx]; t[3] = b[3+idx];
3070   t[4] = b[4+idx]; t[5] = b[5+idx];
3071   for (i=1; i<n; i++) {
3072     v     = aa + 36*ai[i];
3073     vi    = aj + ai[i];
3074     nz    = ai[i+1] - ai[i];
3075     idx   = 6*r[i];
3076     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3077     s5  = b[4+idx]; s6 = b[5+idx];
3078     for(m=0;m<nz;m++){
3079       idx   = 6*vi[m];
3080       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3081       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3082       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3083       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3084       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3085       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3086       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3087       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3088       v += 36;
3089     }
3090     idx = 6*i;
3091     t[idx]   = s1;t[1+idx] = s2;
3092     t[2+idx] = s3;t[3+idx] = s4;
3093     t[4+idx] = s5;t[5+idx] = s6;
3094   }
3095   /* backward solve the upper triangular */
3096   for (i=n-1; i>=0; i--){
3097     v    = aa + 36*(adiag[i+1]+1);
3098     vi   = aj + adiag[i+1]+1;
3099     nz   = adiag[i] - adiag[i+1] - 1;
3100     idt  = 6*i;
3101     s1 = t[idt];  s2 = t[1+idt];
3102     s3 = t[2+idt];s4 = t[3+idt];
3103     s5 = t[4+idt];s6 = t[5+idt];
3104     for(m=0;m<nz;m++){
3105       idx   = 6*vi[m];
3106       x1    = t[idx];   x2 = t[1+idx];
3107       x3    = t[2+idx]; x4 = t[3+idx];
3108       x5    = t[4+idx]; x6 = t[5+idx];
3109       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3110       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3111       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3112       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3113       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3114       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3115       v += 36;
3116     }
3117     idc = 6*c[i];
3118     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3119                                  v[18]*s4+v[24]*s5+v[30]*s6;
3120     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3121                                  v[19]*s4+v[25]*s5+v[31]*s6;
3122     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3123                                  v[20]*s4+v[26]*s5+v[32]*s6;
3124     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3125                                  v[21]*s4+v[27]*s5+v[33]*s6;
3126     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3127                                  v[22]*s4+v[28]*s5+v[34]*s6;
3128     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3129                                  v[23]*s4+v[29]*s5+v[35]*s6;
3130   }
3131 
3132   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3133   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3134   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3135   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3136   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3137   PetscFunctionReturn(0);
3138 }
3139 
3140 #undef __FUNCT__
3141 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
3142 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3143 {
3144   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3145   PetscInt          i,nz,idx,idt,jdx;
3146   PetscErrorCode    ierr;
3147   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3148   const MatScalar   *aa=a->a,*v;
3149   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3150   const PetscScalar *b;
3151 
3152   PetscFunctionBegin;
3153   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3154   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3155   /* forward solve the lower triangular */
3156   idx    = 0;
3157   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3158   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3159   for (i=1; i<n; i++) {
3160     v     =  aa + 36*ai[i];
3161     vi    =  aj + ai[i];
3162     nz    =  diag[i] - ai[i];
3163     idx   =  6*i;
3164     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3165     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3166     while (nz--) {
3167       jdx   = 6*(*vi++);
3168       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3169       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3170       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3171       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3172       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3173       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3174       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3175       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3176       v += 36;
3177      }
3178     x[idx]   = s1;
3179     x[1+idx] = s2;
3180     x[2+idx] = s3;
3181     x[3+idx] = s4;
3182     x[4+idx] = s5;
3183     x[5+idx] = s6;
3184   }
3185   /* backward solve the upper triangular */
3186   for (i=n-1; i>=0; i--){
3187     v    = aa + 36*diag[i] + 36;
3188     vi   = aj + diag[i] + 1;
3189     nz   = ai[i+1] - diag[i] - 1;
3190     idt  = 6*i;
3191     s1 = x[idt];   s2 = x[1+idt];
3192     s3 = x[2+idt]; s4 = x[3+idt];
3193     s5 = x[4+idt]; s6 = x[5+idt];
3194     while (nz--) {
3195       idx   = 6*(*vi++);
3196       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3197       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3198       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3199       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3200       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3201       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3202       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3203       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3204       v += 36;
3205     }
3206     v        = aa + 36*diag[i];
3207     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3208     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3209     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3210     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3211     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3212     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3213   }
3214 
3215   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3216   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3217   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3218   PetscFunctionReturn(0);
3219 }
3220 
3221 #undef __FUNCT__
3222 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3223 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3224 {
3225     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3226     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3227     PetscErrorCode    ierr;
3228     PetscInt          i,k,nz,idx,jdx,idt;
3229     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3230     const MatScalar   *aa=a->a,*v;
3231     PetscScalar       *x;
3232     const PetscScalar *b;
3233     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3234 
3235     PetscFunctionBegin;
3236     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3237     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3238     /* forward solve the lower triangular */
3239     idx    = 0;
3240     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3241     x[4] = b[4+idx];x[5] = b[5+idx];
3242     for (i=1; i<n; i++) {
3243        v    = aa + bs2*ai[i];
3244        vi   = aj + ai[i];
3245        nz   = ai[i+1] - ai[i];
3246       idx   = bs*i;
3247        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3248        s5   = b[4+idx];s6 = b[5+idx];
3249        for(k=0;k<nz;k++){
3250           jdx   = bs*vi[k];
3251           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3252 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3253           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3254           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3255           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3256 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3257           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3258 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3259           v   +=  bs2;
3260         }
3261 
3262        x[idx]   = s1;
3263        x[1+idx] = s2;
3264        x[2+idx] = s3;
3265        x[3+idx] = s4;
3266        x[4+idx] = s5;
3267        x[5+idx] = s6;
3268     }
3269 
3270    /* backward solve the upper triangular */
3271   for (i=n-1; i>=0; i--){
3272     v   = aa + bs2*(adiag[i+1]+1);
3273      vi  = aj + adiag[i+1]+1;
3274      nz  = adiag[i] - adiag[i+1]-1;
3275      idt = bs*i;
3276      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3277      s5 = x[4+idt];s6 = x[5+idt];
3278      for(k=0;k<nz;k++){
3279       idx   = bs*vi[k];
3280        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3281        x5    = x[4+idx];x6 = x[5+idx];
3282        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3283        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3284        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3285        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3286        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3287        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3288         v   +=  bs2;
3289     }
3290     /* x = inv_diagonal*x */
3291    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3292    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3293    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3294    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3295    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3296    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3297   }
3298 
3299   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3300   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3301   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3302   PetscFunctionReturn(0);
3303 }
3304 
3305 #undef __FUNCT__
3306 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3307 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3308 {
3309   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3310   IS                iscol=a->col,isrow=a->row;
3311   PetscErrorCode    ierr;
3312   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3313   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3314   PetscInt          i,nz,idx,idt,idc;
3315   const MatScalar   *aa=a->a,*v;
3316   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3317   const PetscScalar *b;
3318 
3319   PetscFunctionBegin;
3320   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3321   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3322   t  = a->solve_work;
3323 
3324   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3325   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3326 
3327   /* forward solve the lower triangular */
3328   idx    = 5*(*r++);
3329   t[0] = b[idx];   t[1] = b[1+idx];
3330   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3331   for (i=1; i<n; i++) {
3332     v     = aa + 25*ai[i];
3333     vi    = aj + ai[i];
3334     nz    = diag[i] - ai[i];
3335     idx   = 5*(*r++);
3336     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3337     s5  = b[4+idx];
3338     while (nz--) {
3339       idx   = 5*(*vi++);
3340       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3341       x4    = t[3+idx];x5 = t[4+idx];
3342       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3343       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3344       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3345       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3346       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3347       v += 25;
3348     }
3349     idx = 5*i;
3350     t[idx]   = s1;t[1+idx] = s2;
3351     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3352   }
3353   /* backward solve the upper triangular */
3354   for (i=n-1; i>=0; i--){
3355     v    = aa + 25*diag[i] + 25;
3356     vi   = aj + diag[i] + 1;
3357     nz   = ai[i+1] - diag[i] - 1;
3358     idt  = 5*i;
3359     s1 = t[idt];  s2 = t[1+idt];
3360     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3361     while (nz--) {
3362       idx   = 5*(*vi++);
3363       x1    = t[idx];   x2 = t[1+idx];
3364       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3365       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3366       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3367       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3368       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3369       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3370       v += 25;
3371     }
3372     idc = 5*(*c--);
3373     v   = aa + 25*diag[i];
3374     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3375                                  v[15]*s4+v[20]*s5;
3376     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3377                                  v[16]*s4+v[21]*s5;
3378     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3379                                  v[17]*s4+v[22]*s5;
3380     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3381                                  v[18]*s4+v[23]*s5;
3382     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3383                                  v[19]*s4+v[24]*s5;
3384   }
3385 
3386   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3387   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3388   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3389   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3390   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3391   PetscFunctionReturn(0);
3392 }
3393 
3394 #undef __FUNCT__
3395 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3396 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3397 {
3398   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3399   IS                iscol=a->col,isrow=a->row;
3400   PetscErrorCode    ierr;
3401   const PetscInt    *r,*c,*rout,*cout;
3402   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3403   PetscInt          i,nz,idx,idt,idc,m;
3404   const MatScalar   *aa=a->a,*v;
3405   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3406   const PetscScalar *b;
3407 
3408   PetscFunctionBegin;
3409   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3410   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3411   t  = a->solve_work;
3412 
3413   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3414   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3415 
3416   /* forward solve the lower triangular */
3417   idx    = 5*r[0];
3418   t[0] = b[idx];   t[1] = b[1+idx];
3419   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3420   for (i=1; i<n; i++) {
3421     v     = aa + 25*ai[i];
3422     vi    = aj + ai[i];
3423     nz    = ai[i+1] - ai[i];
3424     idx   = 5*r[i];
3425     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3426     s5  = b[4+idx];
3427     for(m=0;m<nz;m++){
3428       idx   = 5*vi[m];
3429       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3430       x4    = t[3+idx];x5 = t[4+idx];
3431       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3432       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3433       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3434       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3435       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3436       v += 25;
3437     }
3438     idx = 5*i;
3439     t[idx]   = s1;t[1+idx] = s2;
3440     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3441   }
3442   /* backward solve the upper triangular */
3443   for (i=n-1; i>=0; i--){
3444     v    = aa + 25*(adiag[i+1]+1);
3445     vi   = aj + adiag[i+1]+1;
3446     nz   = adiag[i] - adiag[i+1] - 1;
3447     idt  = 5*i;
3448     s1 = t[idt];  s2 = t[1+idt];
3449     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3450     for(m=0;m<nz;m++){
3451       idx   = 5*vi[m];
3452       x1    = t[idx];   x2 = t[1+idx];
3453       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3454       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3455       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3456       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3457       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3458       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3459       v += 25;
3460     }
3461     idc = 5*c[i];
3462     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3463                                  v[15]*s4+v[20]*s5;
3464     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3465                                  v[16]*s4+v[21]*s5;
3466     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3467                                  v[17]*s4+v[22]*s5;
3468     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3469                                  v[18]*s4+v[23]*s5;
3470     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3471                                  v[19]*s4+v[24]*s5;
3472   }
3473 
3474   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3475   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3476   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3477   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3478   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3479   PetscFunctionReturn(0);
3480 }
3481 
3482 #undef __FUNCT__
3483 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3484 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3485 {
3486   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3487   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3488   PetscInt          i,nz,idx,idt,jdx;
3489   PetscErrorCode    ierr;
3490   const MatScalar   *aa=a->a,*v;
3491   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3492   const PetscScalar *b;
3493 
3494   PetscFunctionBegin;
3495   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3496   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3497   /* forward solve the lower triangular */
3498   idx    = 0;
3499   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3500   for (i=1; i<n; i++) {
3501     v     =  aa + 25*ai[i];
3502     vi    =  aj + ai[i];
3503     nz    =  diag[i] - ai[i];
3504     idx   =  5*i;
3505     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3506     while (nz--) {
3507       jdx   = 5*(*vi++);
3508       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3509       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3510       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3511       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3512       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3513       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3514       v    += 25;
3515     }
3516     x[idx]   = s1;
3517     x[1+idx] = s2;
3518     x[2+idx] = s3;
3519     x[3+idx] = s4;
3520     x[4+idx] = s5;
3521   }
3522   /* backward solve the upper triangular */
3523   for (i=n-1; i>=0; i--){
3524     v    = aa + 25*diag[i] + 25;
3525     vi   = aj + diag[i] + 1;
3526     nz   = ai[i+1] - diag[i] - 1;
3527     idt  = 5*i;
3528     s1 = x[idt];  s2 = x[1+idt];
3529     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3530     while (nz--) {
3531       idx   = 5*(*vi++);
3532       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3533       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3534       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3535       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3536       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3537       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3538       v    += 25;
3539     }
3540     v        = aa + 25*diag[i];
3541     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3542     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3543     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3544     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3545     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3546   }
3547 
3548   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3549   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3550   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3551   PetscFunctionReturn(0);
3552 }
3553 
3554 #undef __FUNCT__
3555 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3556 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3557 {
3558   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3559   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3560   PetscInt          i,k,nz,idx,idt,jdx;
3561   PetscErrorCode    ierr;
3562   const MatScalar   *aa=a->a,*v;
3563   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3564   const PetscScalar *b;
3565 
3566   PetscFunctionBegin;
3567   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3568   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3569   /* forward solve the lower triangular */
3570   idx    = 0;
3571   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3572   for (i=1; i<n; i++) {
3573     v   = aa + 25*ai[i];
3574     vi  = aj + ai[i];
3575     nz  = ai[i+1] - ai[i];
3576     idx = 5*i;
3577     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3578     for(k=0;k<nz;k++) {
3579       jdx   = 5*vi[k];
3580       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3581       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3582       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3583       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3584       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3585       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3586       v    += 25;
3587     }
3588     x[idx]   = s1;
3589     x[1+idx] = s2;
3590     x[2+idx] = s3;
3591     x[3+idx] = s4;
3592     x[4+idx] = s5;
3593   }
3594 
3595   /* backward solve the upper triangular */
3596   for (i=n-1; i>=0; i--){
3597     v   = aa + 25*(adiag[i+1]+1);
3598     vi  = aj + adiag[i+1]+1;
3599     nz  = adiag[i] - adiag[i+1]-1;
3600     idt = 5*i;
3601     s1 = x[idt];  s2 = x[1+idt];
3602     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3603     for(k=0;k<nz;k++){
3604       idx   = 5*vi[k];
3605       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3606       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3607       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3608       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3609       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3610       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3611       v    += 25;
3612     }
3613     /* x = inv_diagonal*x */
3614     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3615     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3616     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3617     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3618     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3619   }
3620 
3621   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3622   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3623   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3624   PetscFunctionReturn(0);
3625 }
3626 
3627 #undef __FUNCT__
3628 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3629 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3630 {
3631   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3632   IS                iscol=a->col,isrow=a->row;
3633   PetscErrorCode    ierr;
3634   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3635   PetscInt          i,nz,idx,idt,idc;
3636   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3637   const MatScalar   *aa=a->a,*v;
3638   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3639   const PetscScalar *b;
3640 
3641   PetscFunctionBegin;
3642   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3643   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3644   t  = a->solve_work;
3645 
3646   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3647   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3648 
3649   /* forward solve the lower triangular */
3650   idx    = 4*(*r++);
3651   t[0] = b[idx];   t[1] = b[1+idx];
3652   t[2] = b[2+idx]; t[3] = b[3+idx];
3653   for (i=1; i<n; i++) {
3654     v     = aa + 16*ai[i];
3655     vi    = aj + ai[i];
3656     nz    = diag[i] - ai[i];
3657     idx   = 4*(*r++);
3658     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3659     while (nz--) {
3660       idx   = 4*(*vi++);
3661       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3662       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3663       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3664       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3665       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3666       v    += 16;
3667     }
3668     idx        = 4*i;
3669     t[idx]   = s1;t[1+idx] = s2;
3670     t[2+idx] = s3;t[3+idx] = s4;
3671   }
3672   /* backward solve the upper triangular */
3673   for (i=n-1; i>=0; i--){
3674     v    = aa + 16*diag[i] + 16;
3675     vi   = aj + diag[i] + 1;
3676     nz   = ai[i+1] - diag[i] - 1;
3677     idt  = 4*i;
3678     s1 = t[idt];  s2 = t[1+idt];
3679     s3 = t[2+idt];s4 = t[3+idt];
3680     while (nz--) {
3681       idx   = 4*(*vi++);
3682       x1    = t[idx];   x2 = t[1+idx];
3683       x3    = t[2+idx]; x4 = t[3+idx];
3684       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3685       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3686       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3687       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3688       v += 16;
3689     }
3690     idc      = 4*(*c--);
3691     v        = aa + 16*diag[i];
3692     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3693     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3694     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3695     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3696   }
3697 
3698   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3699   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3700   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3701   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3702   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3703   PetscFunctionReturn(0);
3704 }
3705 
3706 #undef __FUNCT__
3707 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3708 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3709 {
3710   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3711   IS                iscol=a->col,isrow=a->row;
3712   PetscErrorCode    ierr;
3713   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3714   PetscInt          i,nz,idx,idt,idc,m;
3715   const PetscInt    *r,*c,*rout,*cout;
3716   const MatScalar   *aa=a->a,*v;
3717   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3718   const PetscScalar *b;
3719 
3720   PetscFunctionBegin;
3721   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3722   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3723   t  = a->solve_work;
3724 
3725   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3726   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3727 
3728   /* forward solve the lower triangular */
3729   idx    = 4*r[0];
3730   t[0] = b[idx];   t[1] = b[1+idx];
3731   t[2] = b[2+idx]; t[3] = b[3+idx];
3732   for (i=1; i<n; i++) {
3733     v     = aa + 16*ai[i];
3734     vi    = aj + ai[i];
3735     nz    = ai[i+1] - ai[i];
3736     idx   = 4*r[i];
3737     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3738     for(m=0;m<nz;m++){
3739       idx   = 4*vi[m];
3740       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3741       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3742       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3743       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3744       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3745       v    += 16;
3746     }
3747     idx        = 4*i;
3748     t[idx]   = s1;t[1+idx] = s2;
3749     t[2+idx] = s3;t[3+idx] = s4;
3750   }
3751   /* backward solve the upper triangular */
3752   for (i=n-1; i>=0; i--){
3753     v    = aa + 16*(adiag[i+1]+1);
3754     vi   = aj + adiag[i+1]+1;
3755     nz   = adiag[i] - adiag[i+1] - 1;
3756     idt  = 4*i;
3757     s1 = t[idt];  s2 = t[1+idt];
3758     s3 = t[2+idt];s4 = t[3+idt];
3759     for(m=0;m<nz;m++){
3760       idx   = 4*vi[m];
3761       x1    = t[idx];   x2 = t[1+idx];
3762       x3    = t[2+idx]; x4 = t[3+idx];
3763       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3764       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3765       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3766       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3767       v += 16;
3768     }
3769     idc      = 4*c[i];
3770     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3771     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3772     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3773     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3774   }
3775 
3776   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3777   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3778   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3779   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3780   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3781   PetscFunctionReturn(0);
3782 }
3783 
3784 #undef __FUNCT__
3785 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3786 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3787 {
3788   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3789   IS                iscol=a->col,isrow=a->row;
3790   PetscErrorCode    ierr;
3791   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3792   PetscInt          i,nz,idx,idt,idc;
3793   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3794   const MatScalar   *aa=a->a,*v;
3795   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3796   PetscScalar       *x;
3797   const PetscScalar *b;
3798 
3799   PetscFunctionBegin;
3800   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3801   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3802   t  = (MatScalar *)a->solve_work;
3803 
3804   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3805   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3806 
3807   /* forward solve the lower triangular */
3808   idx    = 4*(*r++);
3809   t[0] = (MatScalar)b[idx];
3810   t[1] = (MatScalar)b[1+idx];
3811   t[2] = (MatScalar)b[2+idx];
3812   t[3] = (MatScalar)b[3+idx];
3813   for (i=1; i<n; i++) {
3814     v     = aa + 16*ai[i];
3815     vi    = aj + ai[i];
3816     nz    = diag[i] - ai[i];
3817     idx   = 4*(*r++);
3818     s1 = (MatScalar)b[idx];
3819     s2 = (MatScalar)b[1+idx];
3820     s3 = (MatScalar)b[2+idx];
3821     s4 = (MatScalar)b[3+idx];
3822     while (nz--) {
3823       idx   = 4*(*vi++);
3824       x1  = t[idx];
3825       x2  = t[1+idx];
3826       x3  = t[2+idx];
3827       x4  = t[3+idx];
3828       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3829       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3830       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3831       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3832       v    += 16;
3833     }
3834     idx        = 4*i;
3835     t[idx]   = s1;
3836     t[1+idx] = s2;
3837     t[2+idx] = s3;
3838     t[3+idx] = s4;
3839   }
3840   /* backward solve the upper triangular */
3841   for (i=n-1; i>=0; i--){
3842     v    = aa + 16*diag[i] + 16;
3843     vi   = aj + diag[i] + 1;
3844     nz   = ai[i+1] - diag[i] - 1;
3845     idt  = 4*i;
3846     s1 = t[idt];
3847     s2 = t[1+idt];
3848     s3 = t[2+idt];
3849     s4 = t[3+idt];
3850     while (nz--) {
3851       idx   = 4*(*vi++);
3852       x1  = t[idx];
3853       x2  = t[1+idx];
3854       x3  = t[2+idx];
3855       x4  = t[3+idx];
3856       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3857       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3858       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3859       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3860       v += 16;
3861     }
3862     idc      = 4*(*c--);
3863     v        = aa + 16*diag[i];
3864     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3865     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3866     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3867     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3868     x[idc]   = (PetscScalar)t[idt];
3869     x[1+idc] = (PetscScalar)t[1+idt];
3870     x[2+idc] = (PetscScalar)t[2+idt];
3871     x[3+idc] = (PetscScalar)t[3+idt];
3872  }
3873 
3874   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3875   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3876   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3877   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3878   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3879   PetscFunctionReturn(0);
3880 }
3881 
3882 #if defined (PETSC_HAVE_SSE)
3883 
3884 #include PETSC_HAVE_SSE
3885 
3886 #undef __FUNCT__
3887 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3888 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3889 {
3890   /*
3891      Note: This code uses demotion of double
3892      to float when performing the mixed-mode computation.
3893      This may not be numerically reasonable for all applications.
3894   */
3895   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3896   IS             iscol=a->col,isrow=a->row;
3897   PetscErrorCode ierr;
3898   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3899   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3900   MatScalar      *aa=a->a,*v;
3901   PetscScalar    *x,*b,*t;
3902 
3903   /* Make space in temp stack for 16 Byte Aligned arrays */
3904   float           ssealignedspace[11],*tmps,*tmpx;
3905   unsigned long   offset;
3906 
3907   PetscFunctionBegin;
3908   SSE_SCOPE_BEGIN;
3909 
3910     offset = (unsigned long)ssealignedspace % 16;
3911     if (offset) offset = (16 - offset)/4;
3912     tmps = &ssealignedspace[offset];
3913     tmpx = &ssealignedspace[offset+4];
3914     PREFETCH_NTA(aa+16*ai[1]);
3915 
3916     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3917     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3918     t  = a->solve_work;
3919 
3920     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3921     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3922 
3923     /* forward solve the lower triangular */
3924     idx  = 4*(*r++);
3925     t[0] = b[idx];   t[1] = b[1+idx];
3926     t[2] = b[2+idx]; t[3] = b[3+idx];
3927     v    =  aa + 16*ai[1];
3928 
3929     for (i=1; i<n;) {
3930       PREFETCH_NTA(&v[8]);
3931       vi   =  aj      + ai[i];
3932       nz   =  diag[i] - ai[i];
3933       idx  =  4*(*r++);
3934 
3935       /* Demote sum from double to float */
3936       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3937       LOAD_PS(tmps,XMM7);
3938 
3939       while (nz--) {
3940         PREFETCH_NTA(&v[16]);
3941         idx = 4*(*vi++);
3942 
3943         /* Demote solution (so far) from double to float */
3944         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3945 
3946         /* 4x4 Matrix-Vector product with negative accumulation: */
3947         SSE_INLINE_BEGIN_2(tmpx,v)
3948           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3949 
3950           /* First Column */
3951           SSE_COPY_PS(XMM0,XMM6)
3952           SSE_SHUFFLE(XMM0,XMM0,0x00)
3953           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3954           SSE_SUB_PS(XMM7,XMM0)
3955 
3956           /* Second Column */
3957           SSE_COPY_PS(XMM1,XMM6)
3958           SSE_SHUFFLE(XMM1,XMM1,0x55)
3959           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3960           SSE_SUB_PS(XMM7,XMM1)
3961 
3962           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3963 
3964           /* Third Column */
3965           SSE_COPY_PS(XMM2,XMM6)
3966           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3967           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3968           SSE_SUB_PS(XMM7,XMM2)
3969 
3970           /* Fourth Column */
3971           SSE_COPY_PS(XMM3,XMM6)
3972           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3973           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3974           SSE_SUB_PS(XMM7,XMM3)
3975         SSE_INLINE_END_2
3976 
3977         v  += 16;
3978       }
3979       idx = 4*i;
3980       v   = aa + 16*ai[++i];
3981       PREFETCH_NTA(v);
3982       STORE_PS(tmps,XMM7);
3983 
3984       /* Promote result from float to double */
3985       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3986     }
3987     /* backward solve the upper triangular */
3988     idt  = 4*(n-1);
3989     ai16 = 16*diag[n-1];
3990     v    = aa + ai16 + 16;
3991     for (i=n-1; i>=0;){
3992       PREFETCH_NTA(&v[8]);
3993       vi = aj + diag[i] + 1;
3994       nz = ai[i+1] - diag[i] - 1;
3995 
3996       /* Demote accumulator from double to float */
3997       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3998       LOAD_PS(tmps,XMM7);
3999 
4000       while (nz--) {
4001         PREFETCH_NTA(&v[16]);
4002         idx = 4*(*vi++);
4003 
4004         /* Demote solution (so far) from double to float */
4005         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
4006 
4007         /* 4x4 Matrix-Vector Product with negative accumulation: */
4008         SSE_INLINE_BEGIN_2(tmpx,v)
4009           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4010 
4011           /* First Column */
4012           SSE_COPY_PS(XMM0,XMM6)
4013           SSE_SHUFFLE(XMM0,XMM0,0x00)
4014           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4015           SSE_SUB_PS(XMM7,XMM0)
4016 
4017           /* Second Column */
4018           SSE_COPY_PS(XMM1,XMM6)
4019           SSE_SHUFFLE(XMM1,XMM1,0x55)
4020           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4021           SSE_SUB_PS(XMM7,XMM1)
4022 
4023           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4024 
4025           /* Third Column */
4026           SSE_COPY_PS(XMM2,XMM6)
4027           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4028           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4029           SSE_SUB_PS(XMM7,XMM2)
4030 
4031           /* Fourth Column */
4032           SSE_COPY_PS(XMM3,XMM6)
4033           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4034           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4035           SSE_SUB_PS(XMM7,XMM3)
4036         SSE_INLINE_END_2
4037         v  += 16;
4038       }
4039       v    = aa + ai16;
4040       ai16 = 16*diag[--i];
4041       PREFETCH_NTA(aa+ai16+16);
4042       /*
4043          Scale the result by the diagonal 4x4 block,
4044          which was inverted as part of the factorization
4045       */
4046       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
4047         /* First Column */
4048         SSE_COPY_PS(XMM0,XMM7)
4049         SSE_SHUFFLE(XMM0,XMM0,0x00)
4050         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4051 
4052         /* Second Column */
4053         SSE_COPY_PS(XMM1,XMM7)
4054         SSE_SHUFFLE(XMM1,XMM1,0x55)
4055         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4056         SSE_ADD_PS(XMM0,XMM1)
4057 
4058         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4059 
4060         /* Third Column */
4061         SSE_COPY_PS(XMM2,XMM7)
4062         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4063         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4064         SSE_ADD_PS(XMM0,XMM2)
4065 
4066         /* Fourth Column */
4067         SSE_COPY_PS(XMM3,XMM7)
4068         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4069         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4070         SSE_ADD_PS(XMM0,XMM3)
4071 
4072         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4073       SSE_INLINE_END_3
4074 
4075       /* Promote solution from float to double */
4076       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4077 
4078       /* Apply reordering to t and stream into x.    */
4079       /* This way, x doesn't pollute the cache.      */
4080       /* Be careful with size: 2 doubles = 4 floats! */
4081       idc  = 4*(*c--);
4082       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4083         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4084         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4085         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4086         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4087         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4088         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4089       SSE_INLINE_END_2
4090       v    = aa + ai16 + 16;
4091       idt -= 4;
4092     }
4093 
4094     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4095     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4096     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4097     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4098     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4099   SSE_SCOPE_END;
4100   PetscFunctionReturn(0);
4101 }
4102 
4103 #endif
4104 
4105 
4106 /*
4107       Special case where the matrix was ILU(0) factored in the natural
4108    ordering. This eliminates the need for the column and row permutation.
4109 */
4110 #undef __FUNCT__
4111 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
4112 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4113 {
4114   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4115   PetscInt          n=a->mbs;
4116   const PetscInt    *ai=a->i,*aj=a->j;
4117   PetscErrorCode    ierr;
4118   const PetscInt    *diag = a->diag;
4119   const MatScalar   *aa=a->a;
4120   PetscScalar       *x;
4121   const PetscScalar *b;
4122 
4123   PetscFunctionBegin;
4124   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4125   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4126 
4127 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4128   {
4129     static PetscScalar w[2000]; /* very BAD need to fix */
4130     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4131   }
4132 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4133   {
4134     static PetscScalar w[2000]; /* very BAD need to fix */
4135     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4136   }
4137 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4138   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4139 #else
4140   {
4141     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4142     const MatScalar *v;
4143     PetscInt        jdx,idt,idx,nz,i,ai16;
4144     const PetscInt  *vi;
4145 
4146   /* forward solve the lower triangular */
4147   idx    = 0;
4148   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4149   for (i=1; i<n; i++) {
4150     v     =  aa      + 16*ai[i];
4151     vi    =  aj      + ai[i];
4152     nz    =  diag[i] - ai[i];
4153     idx   +=  4;
4154     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4155     while (nz--) {
4156       jdx   = 4*(*vi++);
4157       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4158       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4159       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4160       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4161       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4162       v    += 16;
4163     }
4164     x[idx]   = s1;
4165     x[1+idx] = s2;
4166     x[2+idx] = s3;
4167     x[3+idx] = s4;
4168   }
4169   /* backward solve the upper triangular */
4170   idt = 4*(n-1);
4171   for (i=n-1; i>=0; i--){
4172     ai16 = 16*diag[i];
4173     v    = aa + ai16 + 16;
4174     vi   = aj + diag[i] + 1;
4175     nz   = ai[i+1] - diag[i] - 1;
4176     s1 = x[idt];  s2 = x[1+idt];
4177     s3 = x[2+idt];s4 = x[3+idt];
4178     while (nz--) {
4179       idx   = 4*(*vi++);
4180       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4181       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4182       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4183       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4184       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4185       v    += 16;
4186     }
4187     v        = aa + ai16;
4188     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4189     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4190     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4191     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4192     idt -= 4;
4193   }
4194   }
4195 #endif
4196 
4197   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4198   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4199   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4200   PetscFunctionReturn(0);
4201 }
4202 
4203 #undef __FUNCT__
4204 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4205 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4206 {
4207     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4208     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4209     PetscInt          i,k,nz,idx,jdx,idt;
4210     PetscErrorCode    ierr;
4211     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4212     const MatScalar   *aa=a->a,*v;
4213     PetscScalar       *x;
4214     const PetscScalar *b;
4215     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4216 
4217     PetscFunctionBegin;
4218     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4219     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4220     /* forward solve the lower triangular */
4221     idx    = 0;
4222     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4223     for (i=1; i<n; i++) {
4224        v    = aa + bs2*ai[i];
4225        vi   = aj + ai[i];
4226        nz   = ai[i+1] - ai[i];
4227       idx   = bs*i;
4228        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4229       for(k=0;k<nz;k++) {
4230           jdx   = bs*vi[k];
4231           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4232           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4233           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4234           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4235 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4236 
4237           v   +=  bs2;
4238         }
4239 
4240        x[idx]   = s1;
4241        x[1+idx] = s2;
4242        x[2+idx] = s3;
4243        x[3+idx] = s4;
4244     }
4245 
4246    /* backward solve the upper triangular */
4247   for (i=n-1; i>=0; i--){
4248     v   = aa + bs2*(adiag[i+1]+1);
4249      vi  = aj + adiag[i+1]+1;
4250      nz  = adiag[i] - adiag[i+1]-1;
4251      idt = bs*i;
4252      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4253 
4254     for(k=0;k<nz;k++){
4255       idx   = bs*vi[k];
4256        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4257        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4258        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4259        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4260        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4261 
4262         v   +=  bs2;
4263     }
4264     /* x = inv_diagonal*x */
4265    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4266    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4267    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4268    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4269 
4270   }
4271 
4272   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4273   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4274   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4275   PetscFunctionReturn(0);
4276 }
4277 
4278 #undef __FUNCT__
4279 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4280 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4281 {
4282   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4283   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4284   PetscErrorCode    ierr;
4285   const MatScalar   *aa=a->a;
4286   const PetscScalar *b;
4287   PetscScalar       *x;
4288 
4289   PetscFunctionBegin;
4290   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4291   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4292 
4293   {
4294     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4295     const MatScalar  *v;
4296     MatScalar        *t=(MatScalar *)x;
4297     PetscInt         jdx,idt,idx,nz,i,ai16;
4298     const PetscInt   *vi;
4299 
4300     /* forward solve the lower triangular */
4301     idx  = 0;
4302     t[0] = (MatScalar)b[0];
4303     t[1] = (MatScalar)b[1];
4304     t[2] = (MatScalar)b[2];
4305     t[3] = (MatScalar)b[3];
4306     for (i=1; i<n; i++) {
4307       v     =  aa      + 16*ai[i];
4308       vi    =  aj      + ai[i];
4309       nz    =  diag[i] - ai[i];
4310       idx   +=  4;
4311       s1 = (MatScalar)b[idx];
4312       s2 = (MatScalar)b[1+idx];
4313       s3 = (MatScalar)b[2+idx];
4314       s4 = (MatScalar)b[3+idx];
4315       while (nz--) {
4316         jdx = 4*(*vi++);
4317         x1  = t[jdx];
4318         x2  = t[1+jdx];
4319         x3  = t[2+jdx];
4320         x4  = t[3+jdx];
4321         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4322         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4323         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4324         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4325         v    += 16;
4326       }
4327       t[idx]   = s1;
4328       t[1+idx] = s2;
4329       t[2+idx] = s3;
4330       t[3+idx] = s4;
4331     }
4332     /* backward solve the upper triangular */
4333     idt = 4*(n-1);
4334     for (i=n-1; i>=0; i--){
4335       ai16 = 16*diag[i];
4336       v    = aa + ai16 + 16;
4337       vi   = aj + diag[i] + 1;
4338       nz   = ai[i+1] - diag[i] - 1;
4339       s1   = t[idt];
4340       s2   = t[1+idt];
4341       s3   = t[2+idt];
4342       s4   = t[3+idt];
4343       while (nz--) {
4344         idx = 4*(*vi++);
4345         x1  = (MatScalar)x[idx];
4346         x2  = (MatScalar)x[1+idx];
4347         x3  = (MatScalar)x[2+idx];
4348         x4  = (MatScalar)x[3+idx];
4349         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4350         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4351         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4352         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4353         v    += 16;
4354       }
4355       v        = aa + ai16;
4356       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4357       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4358       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4359       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4360       idt -= 4;
4361     }
4362   }
4363 
4364   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4365   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4366   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4367   PetscFunctionReturn(0);
4368 }
4369 
4370 #if defined (PETSC_HAVE_SSE)
4371 
4372 #include PETSC_HAVE_SSE
4373 #undef __FUNCT__
4374 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4375 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4376 {
4377   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4378   unsigned short *aj=(unsigned short *)a->j;
4379   PetscErrorCode ierr;
4380   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4381   MatScalar      *aa=a->a;
4382   PetscScalar    *x,*b;
4383 
4384   PetscFunctionBegin;
4385   SSE_SCOPE_BEGIN;
4386   /*
4387      Note: This code currently uses demotion of double
4388      to float when performing the mixed-mode computation.
4389      This may not be numerically reasonable for all applications.
4390   */
4391   PREFETCH_NTA(aa+16*ai[1]);
4392 
4393   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4394   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4395   {
4396     /* x will first be computed in single precision then promoted inplace to double */
4397     MatScalar      *v,*t=(MatScalar *)x;
4398     int            nz,i,idt,ai16;
4399     unsigned int   jdx,idx;
4400     unsigned short *vi;
4401     /* Forward solve the lower triangular factor. */
4402 
4403     /* First block is the identity. */
4404     idx  = 0;
4405     CONVERT_DOUBLE4_FLOAT4(t,b);
4406     v    =  aa + 16*((unsigned int)ai[1]);
4407 
4408     for (i=1; i<n;) {
4409       PREFETCH_NTA(&v[8]);
4410       vi   =  aj      + ai[i];
4411       nz   =  diag[i] - ai[i];
4412       idx +=  4;
4413 
4414       /* Demote RHS from double to float. */
4415       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4416       LOAD_PS(&t[idx],XMM7);
4417 
4418       while (nz--) {
4419         PREFETCH_NTA(&v[16]);
4420         jdx = 4*((unsigned int)(*vi++));
4421 
4422         /* 4x4 Matrix-Vector product with negative accumulation: */
4423         SSE_INLINE_BEGIN_2(&t[jdx],v)
4424           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4425 
4426           /* First Column */
4427           SSE_COPY_PS(XMM0,XMM6)
4428           SSE_SHUFFLE(XMM0,XMM0,0x00)
4429           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4430           SSE_SUB_PS(XMM7,XMM0)
4431 
4432           /* Second Column */
4433           SSE_COPY_PS(XMM1,XMM6)
4434           SSE_SHUFFLE(XMM1,XMM1,0x55)
4435           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4436           SSE_SUB_PS(XMM7,XMM1)
4437 
4438           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4439 
4440           /* Third Column */
4441           SSE_COPY_PS(XMM2,XMM6)
4442           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4443           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4444           SSE_SUB_PS(XMM7,XMM2)
4445 
4446           /* Fourth Column */
4447           SSE_COPY_PS(XMM3,XMM6)
4448           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4449           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4450           SSE_SUB_PS(XMM7,XMM3)
4451         SSE_INLINE_END_2
4452 
4453         v  += 16;
4454       }
4455       v    =  aa + 16*ai[++i];
4456       PREFETCH_NTA(v);
4457       STORE_PS(&t[idx],XMM7);
4458     }
4459 
4460     /* Backward solve the upper triangular factor.*/
4461 
4462     idt  = 4*(n-1);
4463     ai16 = 16*diag[n-1];
4464     v    = aa + ai16 + 16;
4465     for (i=n-1; i>=0;){
4466       PREFETCH_NTA(&v[8]);
4467       vi = aj + diag[i] + 1;
4468       nz = ai[i+1] - diag[i] - 1;
4469 
4470       LOAD_PS(&t[idt],XMM7);
4471 
4472       while (nz--) {
4473         PREFETCH_NTA(&v[16]);
4474         idx = 4*((unsigned int)(*vi++));
4475 
4476         /* 4x4 Matrix-Vector Product with negative accumulation: */
4477         SSE_INLINE_BEGIN_2(&t[idx],v)
4478           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4479 
4480           /* First Column */
4481           SSE_COPY_PS(XMM0,XMM6)
4482           SSE_SHUFFLE(XMM0,XMM0,0x00)
4483           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4484           SSE_SUB_PS(XMM7,XMM0)
4485 
4486           /* Second Column */
4487           SSE_COPY_PS(XMM1,XMM6)
4488           SSE_SHUFFLE(XMM1,XMM1,0x55)
4489           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4490           SSE_SUB_PS(XMM7,XMM1)
4491 
4492           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4493 
4494           /* Third Column */
4495           SSE_COPY_PS(XMM2,XMM6)
4496           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4497           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4498           SSE_SUB_PS(XMM7,XMM2)
4499 
4500           /* Fourth Column */
4501           SSE_COPY_PS(XMM3,XMM6)
4502           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4503           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4504           SSE_SUB_PS(XMM7,XMM3)
4505         SSE_INLINE_END_2
4506         v  += 16;
4507       }
4508       v    = aa + ai16;
4509       ai16 = 16*diag[--i];
4510       PREFETCH_NTA(aa+ai16+16);
4511       /*
4512          Scale the result by the diagonal 4x4 block,
4513          which was inverted as part of the factorization
4514       */
4515       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4516         /* First Column */
4517         SSE_COPY_PS(XMM0,XMM7)
4518         SSE_SHUFFLE(XMM0,XMM0,0x00)
4519         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4520 
4521         /* Second Column */
4522         SSE_COPY_PS(XMM1,XMM7)
4523         SSE_SHUFFLE(XMM1,XMM1,0x55)
4524         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4525         SSE_ADD_PS(XMM0,XMM1)
4526 
4527         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4528 
4529         /* Third Column */
4530         SSE_COPY_PS(XMM2,XMM7)
4531         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4532         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4533         SSE_ADD_PS(XMM0,XMM2)
4534 
4535         /* Fourth Column */
4536         SSE_COPY_PS(XMM3,XMM7)
4537         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4538         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4539         SSE_ADD_PS(XMM0,XMM3)
4540 
4541         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4542       SSE_INLINE_END_3
4543 
4544       v    = aa + ai16 + 16;
4545       idt -= 4;
4546     }
4547 
4548     /* Convert t from single precision back to double precision (inplace)*/
4549     idt = 4*(n-1);
4550     for (i=n-1;i>=0;i--) {
4551       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4552       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4553       PetscScalar *xtemp=&x[idt];
4554       MatScalar   *ttemp=&t[idt];
4555       xtemp[3] = (PetscScalar)ttemp[3];
4556       xtemp[2] = (PetscScalar)ttemp[2];
4557       xtemp[1] = (PetscScalar)ttemp[1];
4558       xtemp[0] = (PetscScalar)ttemp[0];
4559       idt -= 4;
4560     }
4561 
4562   } /* End of artificial scope. */
4563   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4564   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4565   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4566   SSE_SCOPE_END;
4567   PetscFunctionReturn(0);
4568 }
4569 
4570 #undef __FUNCT__
4571 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4572 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4573 {
4574   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4575   int            *aj=a->j;
4576   PetscErrorCode ierr;
4577   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4578   MatScalar      *aa=a->a;
4579   PetscScalar    *x,*b;
4580 
4581   PetscFunctionBegin;
4582   SSE_SCOPE_BEGIN;
4583   /*
4584      Note: This code currently uses demotion of double
4585      to float when performing the mixed-mode computation.
4586      This may not be numerically reasonable for all applications.
4587   */
4588   PREFETCH_NTA(aa+16*ai[1]);
4589 
4590   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4591   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4592   {
4593     /* x will first be computed in single precision then promoted inplace to double */
4594     MatScalar *v,*t=(MatScalar *)x;
4595     int       nz,i,idt,ai16;
4596     int       jdx,idx;
4597     int       *vi;
4598     /* Forward solve the lower triangular factor. */
4599 
4600     /* First block is the identity. */
4601     idx  = 0;
4602     CONVERT_DOUBLE4_FLOAT4(t,b);
4603     v    =  aa + 16*ai[1];
4604 
4605     for (i=1; i<n;) {
4606       PREFETCH_NTA(&v[8]);
4607       vi   =  aj      + ai[i];
4608       nz   =  diag[i] - ai[i];
4609       idx +=  4;
4610 
4611       /* Demote RHS from double to float. */
4612       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4613       LOAD_PS(&t[idx],XMM7);
4614 
4615       while (nz--) {
4616         PREFETCH_NTA(&v[16]);
4617         jdx = 4*(*vi++);
4618 /*          jdx = *vi++; */
4619 
4620         /* 4x4 Matrix-Vector product with negative accumulation: */
4621         SSE_INLINE_BEGIN_2(&t[jdx],v)
4622           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4623 
4624           /* First Column */
4625           SSE_COPY_PS(XMM0,XMM6)
4626           SSE_SHUFFLE(XMM0,XMM0,0x00)
4627           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4628           SSE_SUB_PS(XMM7,XMM0)
4629 
4630           /* Second Column */
4631           SSE_COPY_PS(XMM1,XMM6)
4632           SSE_SHUFFLE(XMM1,XMM1,0x55)
4633           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4634           SSE_SUB_PS(XMM7,XMM1)
4635 
4636           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4637 
4638           /* Third Column */
4639           SSE_COPY_PS(XMM2,XMM6)
4640           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4641           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4642           SSE_SUB_PS(XMM7,XMM2)
4643 
4644           /* Fourth Column */
4645           SSE_COPY_PS(XMM3,XMM6)
4646           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4647           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4648           SSE_SUB_PS(XMM7,XMM3)
4649         SSE_INLINE_END_2
4650 
4651         v  += 16;
4652       }
4653       v    =  aa + 16*ai[++i];
4654       PREFETCH_NTA(v);
4655       STORE_PS(&t[idx],XMM7);
4656     }
4657 
4658     /* Backward solve the upper triangular factor.*/
4659 
4660     idt  = 4*(n-1);
4661     ai16 = 16*diag[n-1];
4662     v    = aa + ai16 + 16;
4663     for (i=n-1; i>=0;){
4664       PREFETCH_NTA(&v[8]);
4665       vi = aj + diag[i] + 1;
4666       nz = ai[i+1] - diag[i] - 1;
4667 
4668       LOAD_PS(&t[idt],XMM7);
4669 
4670       while (nz--) {
4671         PREFETCH_NTA(&v[16]);
4672         idx = 4*(*vi++);
4673 /*          idx = *vi++; */
4674 
4675         /* 4x4 Matrix-Vector Product with negative accumulation: */
4676         SSE_INLINE_BEGIN_2(&t[idx],v)
4677           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4678 
4679           /* First Column */
4680           SSE_COPY_PS(XMM0,XMM6)
4681           SSE_SHUFFLE(XMM0,XMM0,0x00)
4682           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4683           SSE_SUB_PS(XMM7,XMM0)
4684 
4685           /* Second Column */
4686           SSE_COPY_PS(XMM1,XMM6)
4687           SSE_SHUFFLE(XMM1,XMM1,0x55)
4688           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4689           SSE_SUB_PS(XMM7,XMM1)
4690 
4691           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4692 
4693           /* Third Column */
4694           SSE_COPY_PS(XMM2,XMM6)
4695           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4696           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4697           SSE_SUB_PS(XMM7,XMM2)
4698 
4699           /* Fourth Column */
4700           SSE_COPY_PS(XMM3,XMM6)
4701           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4702           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4703           SSE_SUB_PS(XMM7,XMM3)
4704         SSE_INLINE_END_2
4705         v  += 16;
4706       }
4707       v    = aa + ai16;
4708       ai16 = 16*diag[--i];
4709       PREFETCH_NTA(aa+ai16+16);
4710       /*
4711          Scale the result by the diagonal 4x4 block,
4712          which was inverted as part of the factorization
4713       */
4714       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4715         /* First Column */
4716         SSE_COPY_PS(XMM0,XMM7)
4717         SSE_SHUFFLE(XMM0,XMM0,0x00)
4718         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4719 
4720         /* Second Column */
4721         SSE_COPY_PS(XMM1,XMM7)
4722         SSE_SHUFFLE(XMM1,XMM1,0x55)
4723         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4724         SSE_ADD_PS(XMM0,XMM1)
4725 
4726         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4727 
4728         /* Third Column */
4729         SSE_COPY_PS(XMM2,XMM7)
4730         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4731         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4732         SSE_ADD_PS(XMM0,XMM2)
4733 
4734         /* Fourth Column */
4735         SSE_COPY_PS(XMM3,XMM7)
4736         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4737         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4738         SSE_ADD_PS(XMM0,XMM3)
4739 
4740         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4741       SSE_INLINE_END_3
4742 
4743       v    = aa + ai16 + 16;
4744       idt -= 4;
4745     }
4746 
4747     /* Convert t from single precision back to double precision (inplace)*/
4748     idt = 4*(n-1);
4749     for (i=n-1;i>=0;i--) {
4750       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4751       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4752       PetscScalar *xtemp=&x[idt];
4753       MatScalar   *ttemp=&t[idt];
4754       xtemp[3] = (PetscScalar)ttemp[3];
4755       xtemp[2] = (PetscScalar)ttemp[2];
4756       xtemp[1] = (PetscScalar)ttemp[1];
4757       xtemp[0] = (PetscScalar)ttemp[0];
4758       idt -= 4;
4759     }
4760 
4761   } /* End of artificial scope. */
4762   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4763   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4764   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4765   SSE_SCOPE_END;
4766   PetscFunctionReturn(0);
4767 }
4768 
4769 #endif
4770 
4771 #undef __FUNCT__
4772 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4773 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4774 {
4775   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4776   IS                iscol=a->col,isrow=a->row;
4777   PetscErrorCode    ierr;
4778   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4779   PetscInt          i,nz,idx,idt,idc;
4780   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4781   const MatScalar   *aa=a->a,*v;
4782   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4783   const PetscScalar *b;
4784 
4785   PetscFunctionBegin;
4786   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4787   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4788   t  = a->solve_work;
4789 
4790   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4791   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4792 
4793   /* forward solve the lower triangular */
4794   idx    = 3*(*r++);
4795   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4796   for (i=1; i<n; i++) {
4797     v     = aa + 9*ai[i];
4798     vi    = aj + ai[i];
4799     nz    = diag[i] - ai[i];
4800     idx   = 3*(*r++);
4801     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4802     while (nz--) {
4803       idx   = 3*(*vi++);
4804       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4805       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4806       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4807       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4808       v += 9;
4809     }
4810     idx = 3*i;
4811     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4812   }
4813   /* backward solve the upper triangular */
4814   for (i=n-1; i>=0; i--){
4815     v    = aa + 9*diag[i] + 9;
4816     vi   = aj + diag[i] + 1;
4817     nz   = ai[i+1] - diag[i] - 1;
4818     idt  = 3*i;
4819     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4820     while (nz--) {
4821       idx   = 3*(*vi++);
4822       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4823       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4824       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4825       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4826       v += 9;
4827     }
4828     idc = 3*(*c--);
4829     v   = aa + 9*diag[i];
4830     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4831     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4832     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4833   }
4834   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4835   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4836   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4837   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4838   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4839   PetscFunctionReturn(0);
4840 }
4841 
4842 #undef __FUNCT__
4843 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4844 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4845 {
4846   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4847   IS                iscol=a->col,isrow=a->row;
4848   PetscErrorCode    ierr;
4849   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4850   PetscInt          i,nz,idx,idt,idc,m;
4851   const PetscInt    *r,*c,*rout,*cout;
4852   const MatScalar   *aa=a->a,*v;
4853   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4854   const PetscScalar *b;
4855 
4856   PetscFunctionBegin;
4857   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4858   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4859   t  = a->solve_work;
4860 
4861   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4862   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4863 
4864   /* forward solve the lower triangular */
4865   idx    = 3*r[0];
4866   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4867   for (i=1; i<n; i++) {
4868     v     = aa + 9*ai[i];
4869     vi    = aj + ai[i];
4870     nz    = ai[i+1] - ai[i];
4871     idx   = 3*r[i];
4872     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4873     for(m=0;m<nz;m++){
4874       idx   = 3*vi[m];
4875       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4876       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4877       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4878       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4879       v += 9;
4880     }
4881     idx = 3*i;
4882     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4883   }
4884   /* backward solve the upper triangular */
4885   for (i=n-1; i>=0; i--){
4886     v    = aa + 9*(adiag[i+1]+1);
4887     vi   = aj + adiag[i+1]+1;
4888     nz   = adiag[i] - adiag[i+1] - 1;
4889     idt  = 3*i;
4890     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4891     for(m=0;m<nz;m++){
4892       idx   = 3*vi[m];
4893       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4894       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4895       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4896       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4897       v += 9;
4898     }
4899     idc = 3*c[i];
4900     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4901     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4902     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4903   }
4904   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4905   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4906   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4907   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4908   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4909   PetscFunctionReturn(0);
4910 }
4911 
4912 /*
4913       Special case where the matrix was ILU(0) factored in the natural
4914    ordering. This eliminates the need for the column and row permutation.
4915 */
4916 #undef __FUNCT__
4917 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4918 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4919 {
4920   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4921   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4922   PetscErrorCode    ierr;
4923   const PetscInt    *diag = a->diag,*vi;
4924   const MatScalar   *aa=a->a,*v;
4925   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4926   const PetscScalar *b;
4927   PetscInt          jdx,idt,idx,nz,i;
4928 
4929   PetscFunctionBegin;
4930   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4931   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4932 
4933   /* forward solve the lower triangular */
4934   idx    = 0;
4935   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4936   for (i=1; i<n; i++) {
4937     v     =  aa      + 9*ai[i];
4938     vi    =  aj      + ai[i];
4939     nz    =  diag[i] - ai[i];
4940     idx   +=  3;
4941     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4942     while (nz--) {
4943       jdx   = 3*(*vi++);
4944       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4945       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4946       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4947       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4948       v    += 9;
4949     }
4950     x[idx]   = s1;
4951     x[1+idx] = s2;
4952     x[2+idx] = s3;
4953   }
4954   /* backward solve the upper triangular */
4955   for (i=n-1; i>=0; i--){
4956     v    = aa + 9*diag[i] + 9;
4957     vi   = aj + diag[i] + 1;
4958     nz   = ai[i+1] - diag[i] - 1;
4959     idt  = 3*i;
4960     s1 = x[idt];  s2 = x[1+idt];
4961     s3 = x[2+idt];
4962     while (nz--) {
4963       idx   = 3*(*vi++);
4964       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4965       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4966       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4967       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4968       v    += 9;
4969     }
4970     v        = aa +  9*diag[i];
4971     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4972     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4973     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4974   }
4975 
4976   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4977   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4978   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4979   PetscFunctionReturn(0);
4980 }
4981 
4982 #undef __FUNCT__
4983 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4984 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4985 {
4986     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4987     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4988     PetscErrorCode    ierr;
4989     PetscInt          i,k,nz,idx,jdx,idt;
4990     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4991     const MatScalar   *aa=a->a,*v;
4992     PetscScalar       *x;
4993     const PetscScalar *b;
4994     PetscScalar        s1,s2,s3,x1,x2,x3;
4995 
4996     PetscFunctionBegin;
4997     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4998     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4999     /* forward solve the lower triangular */
5000     idx    = 0;
5001     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5002     for (i=1; i<n; i++) {
5003        v    = aa + bs2*ai[i];
5004        vi   = aj + ai[i];
5005        nz   = ai[i+1] - ai[i];
5006       idx   = bs*i;
5007        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5008       for(k=0;k<nz;k++){
5009          jdx   = bs*vi[k];
5010           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5011           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5012           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5013           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5014 
5015           v   +=  bs2;
5016         }
5017 
5018        x[idx]   = s1;
5019        x[1+idx] = s2;
5020        x[2+idx] = s3;
5021     }
5022 
5023    /* backward solve the upper triangular */
5024   for (i=n-1; i>=0; i--){
5025     v   = aa + bs2*(adiag[i+1]+1);
5026      vi  = aj + adiag[i+1]+1;
5027      nz  = adiag[i] - adiag[i+1]-1;
5028      idt = bs*i;
5029      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5030 
5031      for(k=0;k<nz;k++){
5032        idx   = bs*vi[k];
5033        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5034        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5035        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5036        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5037 
5038         v   +=  bs2;
5039     }
5040     /* x = inv_diagonal*x */
5041    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5042    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5043    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5044 
5045   }
5046 
5047   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5048   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5049   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5050   PetscFunctionReturn(0);
5051 }
5052 
5053 #undef __FUNCT__
5054 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
5055 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5056 {
5057   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5058   IS                iscol=a->col,isrow=a->row;
5059   PetscErrorCode    ierr;
5060   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5061   PetscInt          i,nz,idx,idt,idc;
5062   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5063   const MatScalar   *aa=a->a,*v;
5064   PetscScalar       *x,s1,s2,x1,x2,*t;
5065   const PetscScalar *b;
5066 
5067   PetscFunctionBegin;
5068   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5069   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5070   t  = a->solve_work;
5071 
5072   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5073   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5074 
5075   /* forward solve the lower triangular */
5076   idx    = 2*(*r++);
5077   t[0] = b[idx]; t[1] = b[1+idx];
5078   for (i=1; i<n; i++) {
5079     v     = aa + 4*ai[i];
5080     vi    = aj + ai[i];
5081     nz    = diag[i] - ai[i];
5082     idx   = 2*(*r++);
5083     s1  = b[idx]; s2 = b[1+idx];
5084     while (nz--) {
5085       idx   = 2*(*vi++);
5086       x1    = t[idx]; x2 = t[1+idx];
5087       s1 -= v[0]*x1 + v[2]*x2;
5088       s2 -= v[1]*x1 + v[3]*x2;
5089       v += 4;
5090     }
5091     idx = 2*i;
5092     t[idx] = s1; t[1+idx] = s2;
5093   }
5094   /* backward solve the upper triangular */
5095   for (i=n-1; i>=0; i--){
5096     v    = aa + 4*diag[i] + 4;
5097     vi   = aj + diag[i] + 1;
5098     nz   = ai[i+1] - diag[i] - 1;
5099     idt  = 2*i;
5100     s1 = t[idt]; s2 = t[1+idt];
5101     while (nz--) {
5102       idx   = 2*(*vi++);
5103       x1    = t[idx]; x2 = t[1+idx];
5104       s1 -= v[0]*x1 + v[2]*x2;
5105       s2 -= v[1]*x1 + v[3]*x2;
5106       v += 4;
5107     }
5108     idc = 2*(*c--);
5109     v   = aa + 4*diag[i];
5110     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5111     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5112   }
5113   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5114   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5115   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5116   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5117   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5118   PetscFunctionReturn(0);
5119 }
5120 
5121 #undef __FUNCT__
5122 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
5123 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5124 {
5125   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5126   IS                iscol=a->col,isrow=a->row;
5127   PetscErrorCode    ierr;
5128   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5129   PetscInt          i,nz,idx,jdx,idt,idc,m;
5130   const PetscInt    *r,*c,*rout,*cout;
5131   const MatScalar   *aa=a->a,*v;
5132   PetscScalar       *x,s1,s2,x1,x2,*t;
5133   const PetscScalar *b;
5134 
5135   PetscFunctionBegin;
5136   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5137   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5138   t  = a->solve_work;
5139 
5140   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5141   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5142 
5143   /* forward solve the lower triangular */
5144   idx    = 2*r[0];
5145   t[0] = b[idx]; t[1] = b[1+idx];
5146   for (i=1; i<n; i++) {
5147     v     = aa + 4*ai[i];
5148     vi    = aj + ai[i];
5149     nz    = ai[i+1] - ai[i];
5150     idx   = 2*r[i];
5151     s1  = b[idx]; s2 = b[1+idx];
5152     for(m=0;m<nz;m++){
5153       jdx   = 2*vi[m];
5154       x1    = t[jdx]; x2 = t[1+jdx];
5155       s1 -= v[0]*x1 + v[2]*x2;
5156       s2 -= v[1]*x1 + v[3]*x2;
5157       v += 4;
5158     }
5159     idx = 2*i;
5160     t[idx] = s1; t[1+idx] = s2;
5161   }
5162   /* backward solve the upper triangular */
5163   for (i=n-1; i>=0; i--){
5164     v    = aa + 4*(adiag[i+1]+1);
5165     vi   = aj + adiag[i+1]+1;
5166     nz   = adiag[i] - adiag[i+1] - 1;
5167     idt  = 2*i;
5168     s1 = t[idt]; s2 = t[1+idt];
5169     for(m=0;m<nz;m++){
5170       idx   = 2*vi[m];
5171       x1    = t[idx]; x2 = t[1+idx];
5172       s1 -= v[0]*x1 + v[2]*x2;
5173       s2 -= v[1]*x1 + v[3]*x2;
5174       v += 4;
5175     }
5176     idc = 2*c[i];
5177     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5178     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5179   }
5180   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5181   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5182   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5183   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5184   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5185   PetscFunctionReturn(0);
5186 }
5187 
5188 /*
5189       Special case where the matrix was ILU(0) factored in the natural
5190    ordering. This eliminates the need for the column and row permutation.
5191 */
5192 #undef __FUNCT__
5193 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
5194 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5195 {
5196   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5197   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5198   PetscErrorCode    ierr;
5199   const MatScalar   *aa=a->a,*v;
5200   PetscScalar       *x,s1,s2,x1,x2;
5201   const PetscScalar *b;
5202   PetscInt          jdx,idt,idx,nz,i;
5203 
5204   PetscFunctionBegin;
5205   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5206   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5207 
5208   /* forward solve the lower triangular */
5209   idx    = 0;
5210   x[0]   = b[0]; x[1] = b[1];
5211   for (i=1; i<n; i++) {
5212     v     =  aa      + 4*ai[i];
5213     vi    =  aj      + ai[i];
5214     nz    =  diag[i] - ai[i];
5215     idx   +=  2;
5216     s1  =  b[idx];s2 = b[1+idx];
5217     while (nz--) {
5218       jdx   = 2*(*vi++);
5219       x1    = x[jdx];x2 = x[1+jdx];
5220       s1 -= v[0]*x1 + v[2]*x2;
5221       s2 -= v[1]*x1 + v[3]*x2;
5222       v    += 4;
5223     }
5224     x[idx]   = s1;
5225     x[1+idx] = s2;
5226   }
5227   /* backward solve the upper triangular */
5228   for (i=n-1; i>=0; i--){
5229     v    = aa + 4*diag[i] + 4;
5230     vi   = aj + diag[i] + 1;
5231     nz   = ai[i+1] - diag[i] - 1;
5232     idt  = 2*i;
5233     s1 = x[idt];  s2 = x[1+idt];
5234     while (nz--) {
5235       idx   = 2*(*vi++);
5236       x1    = x[idx];   x2 = x[1+idx];
5237       s1 -= v[0]*x1 + v[2]*x2;
5238       s2 -= v[1]*x1 + v[3]*x2;
5239       v    += 4;
5240     }
5241     v        = aa +  4*diag[i];
5242     x[idt]   = v[0]*s1 + v[2]*s2;
5243     x[1+idt] = v[1]*s1 + v[3]*s2;
5244   }
5245 
5246   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5247   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5248   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5249   PetscFunctionReturn(0);
5250 }
5251 
5252 #undef __FUNCT__
5253 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5254 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5255 {
5256     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5257     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5258     PetscInt          i,k,nz,idx,idt,jdx;
5259     PetscErrorCode    ierr;
5260     const MatScalar   *aa=a->a,*v;
5261     PetscScalar       *x,s1,s2,x1,x2;
5262     const PetscScalar *b;
5263 
5264     PetscFunctionBegin;
5265     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5266     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5267     /* forward solve the lower triangular */
5268     idx    = 0;
5269     x[0] = b[idx]; x[1] = b[1+idx];
5270     for (i=1; i<n; i++) {
5271         v   = aa + 4*ai[i];
5272        vi   = aj + ai[i];
5273        nz   = ai[i+1] - ai[i];
5274        idx  = 2*i;
5275        s1   = b[idx];s2 = b[1+idx];
5276        PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5277        PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5278       for(k=0;k<nz;k++){
5279          jdx   = 2*vi[k];
5280           x1    = x[jdx];x2 = x[1+jdx];
5281           s1   -= v[0]*x1 + v[2]*x2;
5282           s2   -= v[1]*x1 + v[3]*x2;
5283            v   +=  4;
5284         }
5285        x[idx]   = s1;
5286        x[1+idx] = s2;
5287     }
5288 
5289    /* backward solve the upper triangular */
5290   for (i=n-1; i>=0; i--){
5291      v   = aa + 4*(adiag[i+1]+1);
5292      vi  = aj + adiag[i+1]+1;
5293      nz  = adiag[i] - adiag[i+1]-1;
5294      idt = 2*i;
5295      s1 = x[idt];  s2 = x[1+idt];
5296      PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5297      PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5298      for(k=0;k<nz;k++){
5299       idx   = 2*vi[k];
5300        x1    = x[idx];   x2 = x[1+idx];
5301        s1 -= v[0]*x1 + v[2]*x2;
5302        s2 -= v[1]*x1 + v[3]*x2;
5303          v    += 4;
5304     }
5305     /* x = inv_diagonal*x */
5306    x[idt]   = v[0]*s1 + v[2]*s2;
5307    x[1+idt] = v[1]*s1 + v[3]*s2;
5308   }
5309 
5310   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5311   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5312   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5313   PetscFunctionReturn(0);
5314 }
5315 
5316 #undef __FUNCT__
5317 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5318 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5319 {
5320   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5321   IS                iscol=a->col,isrow=a->row;
5322   PetscErrorCode    ierr;
5323   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5324   PetscInt          i,nz;
5325   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5326   const MatScalar   *aa=a->a,*v;
5327   PetscScalar       *x,s1,*t;
5328   const PetscScalar *b;
5329 
5330   PetscFunctionBegin;
5331   if (!n) PetscFunctionReturn(0);
5332 
5333   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5334   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5335   t  = a->solve_work;
5336 
5337   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5338   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5339 
5340   /* forward solve the lower triangular */
5341   t[0] = b[*r++];
5342   for (i=1; i<n; i++) {
5343     v     = aa + ai[i];
5344     vi    = aj + ai[i];
5345     nz    = diag[i] - ai[i];
5346     s1  = b[*r++];
5347     while (nz--) {
5348       s1 -= (*v++)*t[*vi++];
5349     }
5350     t[i] = s1;
5351   }
5352   /* backward solve the upper triangular */
5353   for (i=n-1; i>=0; i--){
5354     v    = aa + diag[i] + 1;
5355     vi   = aj + diag[i] + 1;
5356     nz   = ai[i+1] - diag[i] - 1;
5357     s1 = t[i];
5358     while (nz--) {
5359       s1 -= (*v++)*t[*vi++];
5360     }
5361     x[*c--] = t[i] = aa[diag[i]]*s1;
5362   }
5363 
5364   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5365   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5366   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5367   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5368   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5369   PetscFunctionReturn(0);
5370 }
5371 
5372 #undef __FUNCT__
5373 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5374 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5375 {
5376   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5377   IS                iscol = a->col,isrow = a->row;
5378   PetscErrorCode    ierr;
5379   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5380   const PetscInt    *rout,*cout,*r,*c;
5381   PetscScalar       *x,*tmp,sum;
5382   const PetscScalar *b;
5383   const MatScalar   *aa = a->a,*v;
5384 
5385   PetscFunctionBegin;
5386   if (!n) PetscFunctionReturn(0);
5387 
5388   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5389   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5390   tmp  = a->solve_work;
5391 
5392   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5393   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5394 
5395   /* forward solve the lower triangular */
5396   tmp[0] = b[r[0]];
5397   v      = aa;
5398   vi     = aj;
5399   for (i=1; i<n; i++) {
5400     nz  = ai[i+1] - ai[i];
5401     sum = b[r[i]];
5402     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5403     tmp[i] = sum;
5404     v += nz; vi += nz;
5405   }
5406 
5407   /* backward solve the upper triangular */
5408   for (i=n-1; i>=0; i--){
5409     v   = aa + adiag[i+1]+1;
5410     vi  = aj + adiag[i+1]+1;
5411     nz  = adiag[i]-adiag[i+1]-1;
5412     sum = tmp[i];
5413     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5414     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5415   }
5416 
5417   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5418   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5419   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5420   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5421   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5422   PetscFunctionReturn(0);
5423 }
5424 
5425 /*
5426       Special case where the matrix was ILU(0) factored in the natural
5427    ordering. This eliminates the need for the column and row permutation.
5428 */
5429 #undef __FUNCT__
5430 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5431 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5432 {
5433   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5434   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5435   PetscErrorCode    ierr;
5436   const MatScalar   *aa=a->a,*v;
5437   PetscScalar       *x;
5438   const PetscScalar *b;
5439   PetscScalar       s1,x1;
5440   PetscInt          jdx,idt,idx,nz,i;
5441 
5442   PetscFunctionBegin;
5443   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5444   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5445 
5446   /* forward solve the lower triangular */
5447   idx    = 0;
5448   x[0]   = b[0];
5449   for (i=1; i<n; i++) {
5450     v     =  aa      + ai[i];
5451     vi    =  aj      + ai[i];
5452     nz    =  diag[i] - ai[i];
5453     idx   +=  1;
5454     s1  =  b[idx];
5455     while (nz--) {
5456       jdx   = *vi++;
5457       x1    = x[jdx];
5458       s1 -= v[0]*x1;
5459       v    += 1;
5460     }
5461     x[idx]   = s1;
5462   }
5463   /* backward solve the upper triangular */
5464   for (i=n-1; i>=0; i--){
5465     v    = aa + diag[i] + 1;
5466     vi   = aj + diag[i] + 1;
5467     nz   = ai[i+1] - diag[i] - 1;
5468     idt  = i;
5469     s1 = x[idt];
5470     while (nz--) {
5471       idx   = *vi++;
5472       x1    = x[idx];
5473       s1 -= v[0]*x1;
5474       v    += 1;
5475     }
5476     v        = aa +  diag[i];
5477     x[idt]   = v[0]*s1;
5478   }
5479   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5480   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5481   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5482   PetscFunctionReturn(0);
5483 }
5484 
5485 
5486 #undef __FUNCT__
5487 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5488 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5489 {
5490   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5491   PetscErrorCode    ierr;
5492   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5493   PetscScalar       *x,sum;
5494   const PetscScalar *b;
5495   const MatScalar   *aa = a->a,*v;
5496   PetscInt          i,nz;
5497 
5498   PetscFunctionBegin;
5499   if (!n) PetscFunctionReturn(0);
5500 
5501   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5502   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5503 
5504   /* forward solve the lower triangular */
5505   x[0] = b[0];
5506   v    = aa;
5507   vi   = aj;
5508   for (i=1; i<n; i++) {
5509     nz  = ai[i+1] - ai[i];
5510     sum = b[i];
5511     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5512     v  += nz;
5513     vi += nz;
5514     x[i] = sum;
5515   }
5516 
5517   /* backward solve the upper triangular */
5518   for (i=n-1; i>=0; i--){
5519     v   = aa + adiag[i+1] + 1;
5520     vi  = aj + adiag[i+1] + 1;
5521     nz = adiag[i] - adiag[i+1]-1;
5522     sum = x[i];
5523     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5524     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5525   }
5526 
5527   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
5528   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5529   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5530   PetscFunctionReturn(0);
5531 }
5532 
5533 /* ----------------------------------------------------------------*/
5534 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool );
5535 
5536 #undef __FUNCT__
5537 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5538 /*
5539    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5540 */
5541 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5542 {
5543   Mat             C=B;
5544   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5545   PetscErrorCode  ierr;
5546   PetscInt        i,j,k,ipvt[15];
5547   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5548   PetscInt        nz,nzL,row;
5549   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5550   const MatScalar *v,*aa=a->a;
5551   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5552   PetscInt        sol_ver;
5553 
5554   PetscFunctionBegin;
5555 
5556   ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
5557 
5558   /* generate work space needed by the factorization */
5559   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5560   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5561 
5562   for (i=0; i<n; i++){
5563     /* zero rtmp */
5564     /* L part */
5565     nz    = bi[i+1] - bi[i];
5566     bjtmp = bj + bi[i];
5567     for  (j=0; j<nz; j++){
5568       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5569     }
5570 
5571     /* U part */
5572     nz = bdiag[i] - bdiag[i+1];
5573     bjtmp = bj + bdiag[i+1]+1;
5574     for  (j=0; j<nz; j++){
5575       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5576     }
5577 
5578     /* load in initial (unfactored row) */
5579     nz    = ai[i+1] - ai[i];
5580     ajtmp = aj + ai[i];
5581     v     = aa + bs2*ai[i];
5582     for (j=0; j<nz; j++) {
5583       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5584     }
5585 
5586     /* elimination */
5587     bjtmp = bj + bi[i];
5588     nzL   = bi[i+1] - bi[i];
5589     for(k=0;k < nzL;k++) {
5590       row = bjtmp[k];
5591       pc = rtmp + bs2*row;
5592       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5593       if (flg) {
5594         pv = b->a + bs2*bdiag[row];
5595 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5596 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5597 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5598         pv = b->a + bs2*(bdiag[row+1]+1);
5599         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5600         for (j=0; j<nz; j++) {
5601           vv   = rtmp + bs2*pj[j];
5602           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5603 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5604 	  pv  += bs2;
5605         }
5606         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5607       }
5608     }
5609 
5610     /* finished row so stick it into b->a */
5611     /* L part */
5612     pv   = b->a + bs2*bi[i] ;
5613     pj   = b->j + bi[i] ;
5614     nz   = bi[i+1] - bi[i];
5615     for (j=0; j<nz; j++) {
5616       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5617     }
5618 
5619     /* Mark diagonal and invert diagonal for simplier triangular solves */
5620     pv   = b->a + bs2*bdiag[i];
5621     pj   = b->j + bdiag[i];
5622     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5623     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5624     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
5625 
5626     /* U part */
5627     pv = b->a + bs2*(bdiag[i+1]+1);
5628     pj = b->j + bdiag[i+1]+1;
5629     nz = bdiag[i] - bdiag[i+1] - 1;
5630     for (j=0; j<nz; j++){
5631       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5632     }
5633   }
5634 
5635   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5636   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5637   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5638   C->assembled = PETSC_TRUE;
5639   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5640   PetscFunctionReturn(0);
5641 }
5642 
5643 #undef __FUNCT__
5644 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5645 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5646 {
5647   Mat            C=B;
5648   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5649   IS             isrow = b->row,isicol = b->icol;
5650   PetscErrorCode ierr;
5651   const PetscInt *r,*ic;
5652   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5653   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5654   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5655   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5656   MatScalar      *v_work;
5657   PetscBool      col_identity,row_identity,both_identity;
5658 
5659   PetscFunctionBegin;
5660   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5661   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5662 
5663   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5664   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5665 
5666   /* generate work space needed by dense LU factorization */
5667   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5668 
5669   for (i=0; i<n; i++){
5670     /* zero rtmp */
5671     /* L part */
5672     nz    = bi[i+1] - bi[i];
5673     bjtmp = bj + bi[i];
5674     for  (j=0; j<nz; j++){
5675       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5676     }
5677 
5678     /* U part */
5679     nz = bdiag[i] - bdiag[i+1];
5680     bjtmp = bj + bdiag[i+1]+1;
5681     for  (j=0; j<nz; j++){
5682       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5683     }
5684 
5685     /* load in initial (unfactored row) */
5686     nz    = ai[r[i]+1] - ai[r[i]];
5687     ajtmp = aj + ai[r[i]];
5688     v     = aa + bs2*ai[r[i]];
5689     for (j=0; j<nz; j++) {
5690       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5691     }
5692 
5693     /* elimination */
5694     bjtmp = bj + bi[i];
5695     nzL   = bi[i+1] - bi[i];
5696     for(k=0;k < nzL;k++) {
5697       row = bjtmp[k];
5698       pc = rtmp + bs2*row;
5699       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5700       if (flg) {
5701         pv         = b->a + bs2*bdiag[row];
5702         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5703         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5704         pv         = b->a + bs2*(bdiag[row+1]+1);
5705         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5706         for (j=0; j<nz; j++) {
5707           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5708         }
5709         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5710       }
5711     }
5712 
5713     /* finished row so stick it into b->a */
5714     /* L part */
5715     pv   = b->a + bs2*bi[i] ;
5716     pj   = b->j + bi[i] ;
5717     nz   = bi[i+1] - bi[i];
5718     for (j=0; j<nz; j++) {
5719       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5720     }
5721 
5722     /* Mark diagonal and invert diagonal for simplier triangular solves */
5723     pv  = b->a + bs2*bdiag[i];
5724     pj  = b->j + bdiag[i];
5725     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5726     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5727     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5728 
5729     /* U part */
5730     pv = b->a + bs2*(bdiag[i+1]+1);
5731     pj = b->j + bdiag[i+1]+1;
5732     nz = bdiag[i] - bdiag[i+1] - 1;
5733     for (j=0; j<nz; j++){
5734       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5735     }
5736   }
5737 
5738   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5739   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5740   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5741   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5742 
5743   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5744   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5745   both_identity = (PetscBool) (row_identity && col_identity);
5746   if (both_identity){
5747     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5748   } else {
5749     C->ops->solve = MatSolve_SeqBAIJ_N;
5750   }
5751   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5752 
5753   C->assembled = PETSC_TRUE;
5754   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5755   PetscFunctionReturn(0);
5756 }
5757 
5758 /*
5759    ilu(0) with natural ordering under new data structure.
5760    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5761    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5762 */
5763 
5764 #undef __FUNCT__
5765 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5766 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5767 {
5768 
5769   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5770   PetscErrorCode     ierr;
5771   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5772   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5773 
5774   PetscFunctionBegin;
5775   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5776   b    = (Mat_SeqBAIJ*)(fact)->data;
5777 
5778   /* allocate matrix arrays for new data structure */
5779   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5780   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5781   b->singlemalloc = PETSC_TRUE;
5782   if (!b->diag){
5783     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5784     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5785   }
5786   bdiag = b->diag;
5787 
5788   if (n > 0) {
5789     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5790   }
5791 
5792   /* set bi and bj with new data structure */
5793   bi = b->i;
5794   bj = b->j;
5795 
5796   /* L part */
5797   bi[0] = 0;
5798   for (i=0; i<n; i++){
5799     nz = adiag[i] - ai[i];
5800     bi[i+1] = bi[i] + nz;
5801     aj = a->j + ai[i];
5802     for (j=0; j<nz; j++){
5803       *bj = aj[j]; bj++;
5804     }
5805   }
5806 
5807   /* U part */
5808   bi_temp = bi[n];
5809   bdiag[n] = bi[n]-1;
5810   for (i=n-1; i>=0; i--){
5811     nz = ai[i+1] - adiag[i] - 1;
5812     bi_temp = bi_temp + nz + 1;
5813     aj = a->j + adiag[i] + 1;
5814     for (j=0; j<nz; j++){
5815       *bj = aj[j]; bj++;
5816     }
5817     /* diag[i] */
5818     *bj = i; bj++;
5819     bdiag[i] = bi_temp - 1;
5820   }
5821   PetscFunctionReturn(0);
5822 }
5823 
5824 #undef __FUNCT__
5825 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5826 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5827 {
5828   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5829   IS                 isicol;
5830   PetscErrorCode     ierr;
5831   const PetscInt     *r,*ic;
5832   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5833   PetscInt           *bi,*cols,nnz,*cols_lvl;
5834   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5835   PetscInt           i,levels,diagonal_fill;
5836   PetscBool          col_identity,row_identity,both_identity;
5837   PetscReal          f;
5838   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5839   PetscBT            lnkbt;
5840   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5841   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5842   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5843   PetscBool          missing;
5844   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5845 
5846   PetscFunctionBegin;
5847   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5848   if (bs>1){  /* check shifttype */
5849     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5850       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5851   }
5852 
5853   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5854   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5855 
5856   f             = info->fill;
5857   levels        = (PetscInt)info->levels;
5858   diagonal_fill = (PetscInt)info->diagonal_fill;
5859   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5860 
5861   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5862   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5863   both_identity = (PetscBool) (row_identity && col_identity);
5864 
5865   if (!levels && both_identity) {
5866     /* special case: ilu(0) with natural ordering */
5867     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5868     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5869 
5870     fact->factortype               = MAT_FACTOR_ILU;
5871     (fact)->info.factor_mallocs    = 0;
5872     (fact)->info.fill_ratio_given  = info->fill;
5873     (fact)->info.fill_ratio_needed = 1.0;
5874     b                = (Mat_SeqBAIJ*)(fact)->data;
5875     b->row           = isrow;
5876     b->col           = iscol;
5877     b->icol          = isicol;
5878     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5879     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5880     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5881     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5882     PetscFunctionReturn(0);
5883   }
5884 
5885   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5886   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5887 
5888   /* get new row pointers */
5889   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5890   bi[0] = 0;
5891   /* bdiag is location of diagonal in factor */
5892   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5893   bdiag[0]  = 0;
5894 
5895   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5896 
5897   /* create a linked list for storing column indices of the active row */
5898   nlnk = n + 1;
5899   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5900 
5901   /* initial FreeSpace size is f*(ai[n]+1) */
5902   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5903   current_space = free_space;
5904   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5905   current_space_lvl = free_space_lvl;
5906 
5907   for (i=0; i<n; i++) {
5908     nzi = 0;
5909     /* copy current row into linked list */
5910     nnz  = ai[r[i]+1] - ai[r[i]];
5911     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5912     cols = aj + ai[r[i]];
5913     lnk[i] = -1; /* marker to indicate if diagonal exists */
5914     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5915     nzi += nlnk;
5916 
5917     /* make sure diagonal entry is included */
5918     if (diagonal_fill && lnk[i] == -1) {
5919       fm = n;
5920       while (lnk[fm] < i) fm = lnk[fm];
5921       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5922       lnk[fm]    = i;
5923       lnk_lvl[i] = 0;
5924       nzi++; dcount++;
5925     }
5926 
5927     /* add pivot rows into the active row */
5928     nzbd = 0;
5929     prow = lnk[n];
5930     while (prow < i) {
5931       nnz      = bdiag[prow];
5932       cols     = bj_ptr[prow] + nnz + 1;
5933       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5934       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5935       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5936       nzi += nlnk;
5937       prow = lnk[prow];
5938       nzbd++;
5939     }
5940     bdiag[i] = nzbd;
5941     bi[i+1]  = bi[i] + nzi;
5942 
5943     /* if free space is not available, make more free space */
5944     if (current_space->local_remaining<nzi) {
5945       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5946       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5947       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5948       reallocs++;
5949     }
5950 
5951     /* copy data into free_space and free_space_lvl, then initialize lnk */
5952     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5953     bj_ptr[i]    = current_space->array;
5954     bjlvl_ptr[i] = current_space_lvl->array;
5955 
5956     /* make sure the active row i has diagonal entry */
5957     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5958 
5959     current_space->array           += nzi;
5960     current_space->local_used      += nzi;
5961     current_space->local_remaining -= nzi;
5962     current_space_lvl->array           += nzi;
5963     current_space_lvl->local_used      += nzi;
5964     current_space_lvl->local_remaining -= nzi;
5965   }
5966 
5967   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5968   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5969 
5970   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5971   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5972   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5973 
5974   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5975   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5976   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5977 
5978 #if defined(PETSC_USE_INFO)
5979   {
5980     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5981     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5982     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5983     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5984     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5985     if (diagonal_fill) {
5986       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5987     }
5988   }
5989 #endif
5990 
5991   /* put together the new matrix */
5992   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5993   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5994   b = (Mat_SeqBAIJ*)(fact)->data;
5995   b->free_a       = PETSC_TRUE;
5996   b->free_ij      = PETSC_TRUE;
5997   b->singlemalloc = PETSC_FALSE;
5998   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5999   b->j          = bj;
6000   b->i          = bi;
6001   b->diag       = bdiag;
6002   b->free_diag  = PETSC_TRUE;
6003   b->ilen       = 0;
6004   b->imax       = 0;
6005   b->row        = isrow;
6006   b->col        = iscol;
6007   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6008   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6009   b->icol       = isicol;
6010   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6011   /* In b structure:  Free imax, ilen, old a, old j.
6012      Allocate bdiag, solve_work, new a, new j */
6013   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
6014   b->maxnz = b->nz = bdiag[0]+1;
6015   fact->info.factor_mallocs    = reallocs;
6016   fact->info.fill_ratio_given  = f;
6017   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6018   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
6019   PetscFunctionReturn(0);
6020 }
6021 
6022 /*
6023      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6024    except that the data structure of Mat_SeqAIJ is slightly different.
6025    Not a good example of code reuse.
6026 */
6027 #undef __FUNCT__
6028 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
6029 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
6030 {
6031   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
6032   IS             isicol;
6033   PetscErrorCode ierr;
6034   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
6035   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6036   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6037   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6038   PetscBool      col_identity,row_identity,both_identity,flg;
6039   PetscReal      f;
6040 
6041   PetscFunctionBegin;
6042   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6043   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
6044 
6045   f             = info->fill;
6046   levels        = (PetscInt)info->levels;
6047   diagonal_fill = (PetscInt)info->diagonal_fill;
6048   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
6049 
6050   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6051   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6052   both_identity = (PetscBool) (row_identity && col_identity);
6053 
6054   if (!levels && both_identity) {  /* special case copy the nonzero structure */
6055     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
6056     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6057 
6058     fact->factortype = MAT_FACTOR_ILU;
6059     b            = (Mat_SeqBAIJ*)fact->data;
6060     b->row       = isrow;
6061     b->col       = iscol;
6062     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6063     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6064     b->icol      = isicol;
6065     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6066     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6067     PetscFunctionReturn(0);
6068   }
6069 
6070   /* general case perform the symbolic factorization */
6071     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
6072     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
6073 
6074     /* get new row pointers */
6075     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
6076     ainew[0] = 0;
6077     /* don't know how many column pointers are needed so estimate */
6078     jmax = (PetscInt)(f*ai[n] + 1);
6079     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
6080     /* ajfill is level of fill for each fill entry */
6081     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
6082     /* fill is a linked list of nonzeros in active row */
6083     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
6084     /* im is level for each filled value */
6085     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
6086     /* dloc is location of diagonal in factor */
6087     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
6088     dloc[0]  = 0;
6089     for (prow=0; prow<n; prow++) {
6090 
6091       /* copy prow into linked list */
6092       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6093       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
6094       xi         = aj + ai[r[prow]];
6095       fill[n]    = n;
6096       fill[prow] = -1; /* marker for diagonal entry */
6097       while (nz--) {
6098 	fm  = n;
6099 	idx = ic[*xi++];
6100 	do {
6101 	  m  = fm;
6102 	  fm = fill[m];
6103 	} while (fm < idx);
6104 	fill[m]   = idx;
6105 	fill[idx] = fm;
6106 	im[idx]   = 0;
6107       }
6108 
6109       /* make sure diagonal entry is included */
6110       if (diagonal_fill && fill[prow] == -1) {
6111 	fm = n;
6112 	while (fill[fm] < prow) fm = fill[fm];
6113 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
6114 	fill[fm]   = prow;
6115 	im[prow]   = 0;
6116 	nzf++;
6117 	dcount++;
6118       }
6119 
6120       nzi = 0;
6121       row = fill[n];
6122       while (row < prow) {
6123 	incrlev = im[row] + 1;
6124 	nz      = dloc[row];
6125 	xi      = ajnew  + ainew[row] + nz + 1;
6126 	flev    = ajfill + ainew[row] + nz + 1;
6127 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
6128 	fm      = row;
6129 	while (nnz-- > 0) {
6130 	  idx = *xi++;
6131 	  if (*flev + incrlev > levels) {
6132 	    flev++;
6133 	    continue;
6134 	  }
6135 	  do {
6136 	    m  = fm;
6137 	    fm = fill[m];
6138 	  } while (fm < idx);
6139 	  if (fm != idx) {
6140 	    im[idx]   = *flev + incrlev;
6141 	    fill[m]   = idx;
6142 	    fill[idx] = fm;
6143 	    fm        = idx;
6144 	    nzf++;
6145 	  } else {
6146 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
6147 	  }
6148 	  flev++;
6149 	}
6150 	row = fill[row];
6151 	nzi++;
6152       }
6153       /* copy new filled row into permanent storage */
6154       ainew[prow+1] = ainew[prow] + nzf;
6155       if (ainew[prow+1] > jmax) {
6156 
6157 	/* estimate how much additional space we will need */
6158 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6159 	/* just double the memory each time */
6160 	PetscInt maxadd = jmax;
6161 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
6162 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6163 	jmax += maxadd;
6164 
6165 	/* allocate a longer ajnew and ajfill */
6166 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6167 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6168 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
6169 	ajnew = xitmp;
6170 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6171 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6172 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
6173 	ajfill = xitmp;
6174 	reallocate++; /* count how many reallocations are needed */
6175       }
6176       xitmp       = ajnew + ainew[prow];
6177       flev        = ajfill + ainew[prow];
6178       dloc[prow]  = nzi;
6179       fm          = fill[n];
6180       while (nzf--) {
6181 	*xitmp++ = fm;
6182 	*flev++ = im[fm];
6183 	fm      = fill[fm];
6184       }
6185       /* make sure row has diagonal entry */
6186       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6187 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6188     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6189       }
6190     }
6191     ierr = PetscFree(ajfill);CHKERRQ(ierr);
6192     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6193     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6194     ierr = PetscFree(fill);CHKERRQ(ierr);
6195     ierr = PetscFree(im);CHKERRQ(ierr);
6196 
6197 #if defined(PETSC_USE_INFO)
6198     {
6199       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6200       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6201       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6202       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6203       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6204       if (diagonal_fill) {
6205 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6206       }
6207     }
6208 #endif
6209 
6210     /* put together the new matrix */
6211     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6212     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6213     b    = (Mat_SeqBAIJ*)fact->data;
6214     b->free_a       = PETSC_TRUE;
6215     b->free_ij      = PETSC_TRUE;
6216     b->singlemalloc = PETSC_FALSE;
6217     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6218     b->j          = ajnew;
6219     b->i          = ainew;
6220     for (i=0; i<n; i++) dloc[i] += ainew[i];
6221     b->diag       = dloc;
6222     b->free_diag  = PETSC_TRUE;
6223     b->ilen       = 0;
6224     b->imax       = 0;
6225     b->row        = isrow;
6226     b->col        = iscol;
6227     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6228     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6229     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6230     b->icol       = isicol;
6231     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6232     /* In b structure:  Free imax, ilen, old a, old j.
6233        Allocate dloc, solve_work, new a, new j */
6234     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
6235     b->maxnz          = b->nz = ainew[n];
6236 
6237     fact->info.factor_mallocs    = reallocate;
6238     fact->info.fill_ratio_given  = f;
6239     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6240 
6241   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6242   PetscFunctionReturn(0);
6243 }
6244 
6245 #undef __FUNCT__
6246 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6247 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6248 {
6249   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6250   /* int i,*AJ=a->j,nz=a->nz; */
6251   PetscFunctionBegin;
6252   /* Undo Column scaling */
6253 /*    while (nz--) { */
6254 /*      AJ[i] = AJ[i]/4; */
6255 /*    } */
6256   /* This should really invoke a push/pop logic, but we don't have that yet. */
6257   A->ops->setunfactored = PETSC_NULL;
6258   PetscFunctionReturn(0);
6259 }
6260 
6261 #undef __FUNCT__
6262 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6263 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6264 {
6265   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6266   PetscInt       *AJ=a->j,nz=a->nz;
6267   unsigned short *aj=(unsigned short *)AJ;
6268   PetscFunctionBegin;
6269   /* Is this really necessary? */
6270   while (nz--) {
6271     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6272   }
6273   A->ops->setunfactored = PETSC_NULL;
6274   PetscFunctionReturn(0);
6275 }
6276 
6277 
6278