xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 047240e14af00aad1ef65e96f6fface8924f7f7e)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include <../src/mat/impls/baij/seq/baij.h>
7 #include <../src/mat/blockinvert.h>
8 #include <petscbt.h>
9 #include <../src/mat/utils/freespace.h>
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
16   PetscErrorCode    ierr;
17   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
18   PetscInt          i,n = a->mbs,j;
19   PetscInt          nz;
20   PetscScalar       *x,*tmp,s1;
21   const MatScalar   *aa = a->a,*v;
22   const PetscScalar *b;
23 
24   PetscFunctionBegin;
25   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
26   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27   tmp  = a->solve_work;
28 
29 
30   /* copy the b into temp work space according to permutation */
31   for (i=0; i<n; i++) tmp[i] = b[i];
32 
33   /* forward solve the U^T */
34   for (i=0; i<n; i++) {
35     v   = aa + adiag[i+1] + 1;
36     vi  = aj + adiag[i+1] + 1;
37     nz  = adiag[i] - adiag[i+1] - 1;
38     s1  = tmp[i];
39     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
40     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
41     tmp[i] = s1;
42   }
43 
44   /* backward solve the L^T */
45   for (i=n-1; i>=0; i--) {
46     v  = aa + ai[i];
47     vi = aj + ai[i];
48     nz = ai[i+1] - ai[i];
49     s1 = tmp[i];
50     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
51   }
52 
53   /* copy tmp into x according to permutation */
54   for (i=0; i<n; i++) x[i] = tmp[i];
55 
56   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
57   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
58 
59   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
60   PetscFunctionReturn(0);
61 }
62 
63 #undef __FUNCT__
64 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
65 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66 {
67   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
68   PetscErrorCode  ierr;
69   PetscInt        i,nz;
70   const PetscInt  *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
71   const MatScalar *aa   =a->a,*v;
72   PetscScalar     s1,*x;
73 
74   PetscFunctionBegin;
75   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
76   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
77 
78   /* forward solve the U^T */
79   for (i=0; i<n; i++) {
80 
81     v = aa + diag[i];
82     /* multiply by the inverse of the block diagonal */
83     s1 = (*v++)*x[i];
84     vi = aj + diag[i] + 1;
85     nz = ai[i+1] - diag[i] - 1;
86     while (nz--) {
87       x[*vi++] -= (*v++)*s1;
88     }
89     x[i] = s1;
90   }
91   /* backward solve the L^T */
92   for (i=n-1; i>=0; i--) {
93     v  = aa + diag[i] - 1;
94     vi = aj + diag[i] - 1;
95     nz = diag[i] - ai[i];
96     s1 = x[i];
97     while (nz--) {
98       x[*vi--] -=  (*v--)*s1;
99     }
100   }
101   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
102   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
103   PetscFunctionReturn(0);
104 }
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
108 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
109 {
110   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
111   PetscErrorCode  ierr;
112   PetscInt        i,nz,idx,idt,oidx;
113   const PetscInt  *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
114   const MatScalar *aa   =a->a,*v;
115   PetscScalar     s1,s2,x1,x2,*x;
116 
117   PetscFunctionBegin;
118   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
119   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
120 
121   /* forward solve the U^T */
122   idx = 0;
123   for (i=0; i<n; i++) {
124 
125     v = aa + 4*diag[i];
126     /* multiply by the inverse of the block diagonal */
127     x1 = x[idx];   x2 = x[1+idx];
128     s1 = v[0]*x1  +  v[1]*x2;
129     s2 = v[2]*x1  +  v[3]*x2;
130     v += 4;
131 
132     vi = aj + diag[i] + 1;
133     nz = ai[i+1] - diag[i] - 1;
134     while (nz--) {
135       oidx       = 2*(*vi++);
136       x[oidx]   -= v[0]*s1  +  v[1]*s2;
137       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
138       v         += 4;
139     }
140     x[idx] = s1;x[1+idx] = s2;
141     idx   += 2;
142   }
143   /* backward solve the L^T */
144   for (i=n-1; i>=0; i--) {
145     v   = aa + 4*diag[i] - 4;
146     vi  = aj + diag[i] - 1;
147     nz  = diag[i] - ai[i];
148     idt = 2*i;
149     s1  = x[idt];  s2 = x[1+idt];
150     while (nz--) {
151       idx       = 2*(*vi--);
152       x[idx]   -=  v[0]*s1 +  v[1]*s2;
153       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
154       v        -= 4;
155     }
156   }
157   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
158   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
159   PetscFunctionReturn(0);
160 }
161 
162 #undef __FUNCT__
163 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
164 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
165 {
166   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
167   PetscErrorCode  ierr;
168   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
169   PetscInt        nz,idx,idt,j,i,oidx;
170   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
171   const MatScalar *aa=a->a,*v;
172   PetscScalar     s1,s2,x1,x2,*x;
173 
174   PetscFunctionBegin;
175   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
176   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
177 
178   /* forward solve the U^T */
179   idx = 0;
180   for (i=0; i<n; i++) {
181     v = aa + bs2*diag[i];
182     /* multiply by the inverse of the block diagonal */
183     x1 = x[idx];   x2 = x[1+idx];
184     s1 = v[0]*x1  +  v[1]*x2;
185     s2 = v[2]*x1  +  v[3]*x2;
186     v -= bs2;
187 
188     vi = aj + diag[i] - 1;
189     nz = diag[i] - diag[i+1] - 1;
190     for (j=0; j>-nz; j--) {
191       oidx       = bs*vi[j];
192       x[oidx]   -= v[0]*s1  +  v[1]*s2;
193       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
194       v         -= bs2;
195     }
196     x[idx] = s1;x[1+idx] = s2;
197     idx   += bs;
198   }
199   /* backward solve the L^T */
200   for (i=n-1; i>=0; i--) {
201     v   = aa + bs2*ai[i];
202     vi  = aj + ai[i];
203     nz  = ai[i+1] - ai[i];
204     idt = bs*i;
205     s1  = x[idt];  s2 = x[1+idt];
206     for (j=0; j<nz; j++) {
207       idx       = bs*vi[j];
208       x[idx]   -=  v[0]*s1 +  v[1]*s2;
209       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
210       v        += bs2;
211     }
212   }
213   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
214   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
215   PetscFunctionReturn(0);
216 }
217 
218 #undef __FUNCT__
219 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
220 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
221 {
222   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
223   PetscErrorCode  ierr;
224   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
225   PetscInt        i,nz,idx,idt,oidx;
226   const MatScalar *aa=a->a,*v;
227   PetscScalar     s1,s2,s3,x1,x2,x3,*x;
228 
229   PetscFunctionBegin;
230   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
231   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
232 
233   /* forward solve the U^T */
234   idx = 0;
235   for (i=0; i<n; i++) {
236 
237     v = aa + 9*diag[i];
238     /* multiply by the inverse of the block diagonal */
239     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
240     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
241     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
242     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
243     v += 9;
244 
245     vi = aj + diag[i] + 1;
246     nz = ai[i+1] - diag[i] - 1;
247     while (nz--) {
248       oidx       = 3*(*vi++);
249       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
250       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
251       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
252       v         += 9;
253     }
254     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
255     idx   += 3;
256   }
257   /* backward solve the L^T */
258   for (i=n-1; i>=0; i--) {
259     v   = aa + 9*diag[i] - 9;
260     vi  = aj + diag[i] - 1;
261     nz  = diag[i] - ai[i];
262     idt = 3*i;
263     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
264     while (nz--) {
265       idx       = 3*(*vi--);
266       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
267       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
268       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
269       v        -= 9;
270     }
271   }
272   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
273   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
279 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
280 {
281   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
282   PetscErrorCode  ierr;
283   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
284   PetscInt        nz,idx,idt,j,i,oidx;
285   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
286   const MatScalar *aa=a->a,*v;
287   PetscScalar     s1,s2,s3,x1,x2,x3,*x;
288 
289   PetscFunctionBegin;
290   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
291   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
292 
293   /* forward solve the U^T */
294   idx = 0;
295   for (i=0; i<n; i++) {
296     v = aa + bs2*diag[i];
297     /* multiply by the inverse of the block diagonal */
298     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
299     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
300     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
301     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
302     v -= bs2;
303 
304     vi = aj + diag[i] - 1;
305     nz = diag[i] - diag[i+1] - 1;
306     for (j=0; j>-nz; j--) {
307       oidx       = bs*vi[j];
308       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
309       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
310       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
311       v         -= bs2;
312     }
313     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;
314     idx   += bs;
315   }
316   /* backward solve the L^T */
317   for (i=n-1; i>=0; i--) {
318     v   = aa + bs2*ai[i];
319     vi  = aj + ai[i];
320     nz  = ai[i+1] - ai[i];
321     idt = bs*i;
322     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
323     for (j=0; j<nz; j++) {
324       idx       = bs*vi[j];
325       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
326       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
327       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
328       v        += bs2;
329     }
330   }
331   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
332   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
333   PetscFunctionReturn(0);
334 }
335 
336 #undef __FUNCT__
337 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
338 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
339 {
340   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
341   PetscErrorCode  ierr;
342   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
343   PetscInt        i,nz,idx,idt,oidx;
344   const MatScalar *aa=a->a,*v;
345   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;
346 
347   PetscFunctionBegin;
348   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
349   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
350 
351   /* forward solve the U^T */
352   idx = 0;
353   for (i=0; i<n; i++) {
354 
355     v = aa + 16*diag[i];
356     /* multiply by the inverse of the block diagonal */
357     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
358     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
359     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
360     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
361     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
362     v += 16;
363 
364     vi = aj + diag[i] + 1;
365     nz = ai[i+1] - diag[i] - 1;
366     while (nz--) {
367       oidx       = 4*(*vi++);
368       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
369       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
370       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
371       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
372       v         += 16;
373     }
374     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
375     idx   += 4;
376   }
377   /* backward solve the L^T */
378   for (i=n-1; i>=0; i--) {
379     v   = aa + 16*diag[i] - 16;
380     vi  = aj + diag[i] - 1;
381     nz  = diag[i] - ai[i];
382     idt = 4*i;
383     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
384     while (nz--) {
385       idx       = 4*(*vi--);
386       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390       v        -= 16;
391     }
392   }
393   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
394   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
395   PetscFunctionReturn(0);
396 }
397 
398 #undef __FUNCT__
399 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
400 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
401 {
402   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
403   PetscErrorCode  ierr;
404   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
405   PetscInt        nz,idx,idt,j,i,oidx;
406   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
407   const MatScalar *aa=a->a,*v;
408   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4,*x;
409 
410   PetscFunctionBegin;
411   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
412   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
413 
414   /* forward solve the U^T */
415   idx = 0;
416   for (i=0; i<n; i++) {
417     v = aa + bs2*diag[i];
418     /* multiply by the inverse of the block diagonal */
419     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
420     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
421     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
422     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
423     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
424     v -= bs2;
425 
426     vi = aj + diag[i] - 1;
427     nz = diag[i] - diag[i+1] - 1;
428     for (j=0; j>-nz; j--) {
429       oidx       = bs*vi[j];
430       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
431       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
432       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
433       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
434       v         -= bs2;
435     }
436     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
437     idx   += bs;
438   }
439   /* backward solve the L^T */
440   for (i=n-1; i>=0; i--) {
441     v   = aa + bs2*ai[i];
442     vi  = aj + ai[i];
443     nz  = ai[i+1] - ai[i];
444     idt = bs*i;
445     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
446     for (j=0; j<nz; j++) {
447       idx       = bs*vi[j];
448       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
449       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
450       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
451       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
452       v        += bs2;
453     }
454   }
455   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
456   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
457   PetscFunctionReturn(0);
458 }
459 
460 #undef __FUNCT__
461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
462 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
463 {
464   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
465   PetscErrorCode  ierr;
466   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
467   PetscInt        i,nz,idx,idt,oidx;
468   const MatScalar *aa=a->a,*v;
469   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
470 
471   PetscFunctionBegin;
472   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
473   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474 
475   /* forward solve the U^T */
476   idx = 0;
477   for (i=0; i<n; i++) {
478 
479     v = aa + 25*diag[i];
480     /* multiply by the inverse of the block diagonal */
481     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
482     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
483     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
484     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
485     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
486     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
487     v += 25;
488 
489     vi = aj + diag[i] + 1;
490     nz = ai[i+1] - diag[i] - 1;
491     while (nz--) {
492       oidx       = 5*(*vi++);
493       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
494       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
495       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
496       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
497       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
498       v         += 25;
499     }
500     x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
501     idx   += 5;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--) {
505     v   = aa + 25*diag[i] - 25;
506     vi  = aj + diag[i] - 1;
507     nz  = diag[i] - ai[i];
508     idt = 5*i;
509     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
510     while (nz--) {
511       idx       = 5*(*vi--);
512       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
513       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
514       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
515       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
516       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
517       v        -= 25;
518     }
519   }
520   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
522   PetscFunctionReturn(0);
523 }
524 
525 #undef __FUNCT__
526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
527 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
528 {
529   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
530   PetscErrorCode  ierr;
531   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
532   PetscInt        nz,idx,idt,j,i,oidx;
533   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
534   const MatScalar *aa=a->a,*v;
535   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
536 
537   PetscFunctionBegin;
538   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
539   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
540 
541   /* forward solve the U^T */
542   idx = 0;
543   for (i=0; i<n; i++) {
544     v = aa + bs2*diag[i];
545     /* multiply by the inverse of the block diagonal */
546     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
547     x5 = x[4+idx];
548     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
549     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
550     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
551     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
552     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
553     v -= bs2;
554 
555     vi = aj + diag[i] - 1;
556     nz = diag[i] - diag[i+1] - 1;
557     for (j=0; j>-nz; j--) {
558       oidx       = bs*vi[j];
559       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
560       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
561       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
562       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
563       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
564       v         -= bs2;
565     }
566     x[idx] = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
567     idx   += bs;
568   }
569   /* backward solve the L^T */
570   for (i=n-1; i>=0; i--) {
571     v   = aa + bs2*ai[i];
572     vi  = aj + ai[i];
573     nz  = ai[i+1] - ai[i];
574     idt = bs*i;
575     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
576     for (j=0; j<nz; j++) {
577       idx       = bs*vi[j];
578       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
579       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
580       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
581       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
582       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
583       v        += bs2;
584     }
585   }
586   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
587   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
588   PetscFunctionReturn(0);
589 }
590 
591 #undef __FUNCT__
592 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
593 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
594 {
595   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
596   PetscErrorCode  ierr;
597   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
598   PetscInt        i,nz,idx,idt,oidx;
599   const MatScalar *aa=a->a,*v;
600   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
601 
602   PetscFunctionBegin;
603   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
604   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
605 
606   /* forward solve the U^T */
607   idx = 0;
608   for (i=0; i<n; i++) {
609 
610     v = aa + 36*diag[i];
611     /* multiply by the inverse of the block diagonal */
612     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
613     x6 = x[5+idx];
614     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
615     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
616     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
617     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
618     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
619     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
620     v += 36;
621 
622     vi = aj + diag[i] + 1;
623     nz = ai[i+1] - diag[i] - 1;
624     while (nz--) {
625       oidx       = 6*(*vi++);
626       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v         += 36;
633     }
634     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
635     x[5+idx] = s6;
636     idx     += 6;
637   }
638   /* backward solve the L^T */
639   for (i=n-1; i>=0; i--) {
640     v   = aa + 36*diag[i] - 36;
641     vi  = aj + diag[i] - 1;
642     nz  = diag[i] - ai[i];
643     idt = 6*i;
644     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
645     s6  = x[5+idt];
646     while (nz--) {
647       idx       = 6*(*vi--);
648       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
649       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
650       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
651       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
652       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
653       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
654       v        -= 36;
655     }
656   }
657   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
658   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
659   PetscFunctionReturn(0);
660 }
661 
662 #undef __FUNCT__
663 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
664 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
665 {
666   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
667   PetscErrorCode  ierr;
668   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
669   PetscInt        nz,idx,idt,j,i,oidx;
670   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
671   const MatScalar *aa=a->a,*v;
672   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
673 
674   PetscFunctionBegin;
675   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
676   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
677 
678   /* forward solve the U^T */
679   idx = 0;
680   for (i=0; i<n; i++) {
681     v = aa + bs2*diag[i];
682     /* multiply by the inverse of the block diagonal */
683     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
684     x5 = x[4+idx]; x6 = x[5+idx];
685     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
686     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
687     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
688     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
689     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
690     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
691     v -= bs2;
692 
693     vi = aj + diag[i] - 1;
694     nz = diag[i] - diag[i+1] - 1;
695     for (j=0; j>-nz; j--) {
696       oidx       = bs*vi[j];
697       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
698       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
699       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
700       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
701       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
702       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
703       v         -= bs2;
704     }
705     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
706     x[5+idx] = s6;
707     idx     += bs;
708   }
709   /* backward solve the L^T */
710   for (i=n-1; i>=0; i--) {
711     v   = aa + bs2*ai[i];
712     vi  = aj + ai[i];
713     nz  = ai[i+1] - ai[i];
714     idt = bs*i;
715     s1  = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
716     s6  = x[5+idt];
717     for (j=0; j<nz; j++) {
718       idx       = bs*vi[j];
719       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
720       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
721       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
722       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
723       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
724       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
725       v        += bs2;
726     }
727   }
728   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
729   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
730   PetscFunctionReturn(0);
731 }
732 
733 #undef __FUNCT__
734 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
735 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
736 {
737   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
738   PetscErrorCode  ierr;
739   const PetscInt  *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
740   PetscInt        i,nz,idx,idt,oidx;
741   const MatScalar *aa=a->a,*v;
742   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
743 
744   PetscFunctionBegin;
745   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
746   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
747 
748   /* forward solve the U^T */
749   idx = 0;
750   for (i=0; i<n; i++) {
751 
752     v = aa + 49*diag[i];
753     /* multiply by the inverse of the block diagonal */
754     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
755     x6 = x[5+idx]; x7 = x[6+idx];
756     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
757     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
758     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
759     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
760     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
761     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
762     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
763     v += 49;
764 
765     vi = aj + diag[i] + 1;
766     nz = ai[i+1] - diag[i] - 1;
767     while (nz--) {
768       oidx       = 7*(*vi++);
769       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
770       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
771       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
772       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
773       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
774       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
775       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
776       v         += 49;
777     }
778     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
779     x[5+idx] = s6;x[6+idx] = s7;
780     idx     += 7;
781   }
782   /* backward solve the L^T */
783   for (i=n-1; i>=0; i--) {
784     v   = aa + 49*diag[i] - 49;
785     vi  = aj + diag[i] - 1;
786     nz  = diag[i] - ai[i];
787     idt = 7*i;
788     s1  = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
789     s6  = x[5+idt];s7 = x[6+idt];
790     while (nz--) {
791       idx       = 7*(*vi--);
792       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
793       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
794       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
795       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
796       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
797       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
798       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
799       v        -= 49;
800     }
801   }
802   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
803   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
804   PetscFunctionReturn(0);
805 }
806 #undef __FUNCT__
807 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
808 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
809 {
810   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data;
811   PetscErrorCode  ierr;
812   const PetscInt  n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
813   PetscInt        nz,idx,idt,j,i,oidx;
814   const PetscInt  bs =A->rmap->bs,bs2=a->bs2;
815   const MatScalar *aa=a->a,*v;
816   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
817 
818   PetscFunctionBegin;
819   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
820   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
821 
822   /* forward solve the U^T */
823   idx = 0;
824   for (i=0; i<n; i++) {
825     v = aa + bs2*diag[i];
826     /* multiply by the inverse of the block diagonal */
827     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
828     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
829     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
830     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
831     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
832     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
833     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
834     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
835     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
836     v -= bs2;
837     vi = aj + diag[i] - 1;
838     nz = diag[i] - diag[i+1] - 1;
839     for (j=0; j>-nz; j--) {
840       oidx       = bs*vi[j];
841       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
842       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
843       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
844       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
845       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
846       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
847       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
848       v         -= bs2;
849     }
850     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
851     x[5+idx] = s6;  x[6+idx] = s7;
852     idx     += bs;
853   }
854   /* backward solve the L^T */
855   for (i=n-1; i>=0; i--) {
856     v   = aa + bs2*ai[i];
857     vi  = aj + ai[i];
858     nz  = ai[i+1] - ai[i];
859     idt = bs*i;
860     s1  = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
861     s6  = x[5+idt];  s7 = x[6+idt];
862     for (j=0; j<nz; j++) {
863       idx       = bs*vi[j];
864       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
865       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
866       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
867       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
868       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
869       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
870       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
871       v        += bs2;
872     }
873   }
874   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
875   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
876   PetscFunctionReturn(0);
877 }
878 
879 /*---------------------------------------------------------------------------------------------*/
880 #undef __FUNCT__
881 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
882 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
883 {
884   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
885   IS                iscol = a->col,isrow = a->row;
886   PetscErrorCode    ierr;
887   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
888   PetscInt          i,n = a->mbs,j;
889   PetscInt          nz;
890   PetscScalar       *x,*tmp,s1;
891   const MatScalar   *aa = a->a,*v;
892   const PetscScalar *b;
893 
894   PetscFunctionBegin;
895   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
896   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
897   tmp  = a->solve_work;
898 
899   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
900   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
901 
902   /* copy the b into temp work space according to permutation */
903   for (i=0; i<n; i++) tmp[i] = b[c[i]];
904 
905   /* forward solve the U^T */
906   for (i=0; i<n; i++) {
907     v   = aa + adiag[i+1] + 1;
908     vi  = aj + adiag[i+1] + 1;
909     nz  = adiag[i] - adiag[i+1] - 1;
910     s1  = tmp[i];
911     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
912     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
913     tmp[i] = s1;
914   }
915 
916   /* backward solve the L^T */
917   for (i=n-1; i>=0; i--) {
918     v  = aa + ai[i];
919     vi = aj + ai[i];
920     nz = ai[i+1] - ai[i];
921     s1 = tmp[i];
922     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
923   }
924 
925   /* copy tmp into x according to permutation */
926   for (i=0; i<n; i++) x[r[i]] = tmp[i];
927 
928   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
929   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
930   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
931   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
932 
933   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
934   PetscFunctionReturn(0);
935 }
936 
937 #undef __FUNCT__
938 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
939 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
940 {
941   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
942   IS                iscol=a->col,isrow=a->row;
943   PetscErrorCode    ierr;
944   const PetscInt    *r,*c,*rout,*cout;
945   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
946   PetscInt          i,nz;
947   const MatScalar   *aa=a->a,*v;
948   PetscScalar       s1,*x,*t;
949   const PetscScalar *b;
950 
951   PetscFunctionBegin;
952   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
953   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
954   t    = a->solve_work;
955 
956   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
957   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
958 
959   /* copy the b into temp work space according to permutation */
960   for (i=0; i<n; i++) t[i] = b[c[i]];
961 
962   /* forward solve the U^T */
963   for (i=0; i<n; i++) {
964 
965     v = aa + diag[i];
966     /* multiply by the inverse of the block diagonal */
967     s1 = (*v++)*t[i];
968     vi = aj + diag[i] + 1;
969     nz = ai[i+1] - diag[i] - 1;
970     while (nz--) {
971       t[*vi++] -= (*v++)*s1;
972     }
973     t[i] = s1;
974   }
975   /* backward solve the L^T */
976   for (i=n-1; i>=0; i--) {
977     v  = aa + diag[i] - 1;
978     vi = aj + diag[i] - 1;
979     nz = diag[i] - ai[i];
980     s1 = t[i];
981     while (nz--) {
982       t[*vi--] -=  (*v--)*s1;
983     }
984   }
985 
986   /* copy t into x according to permutation */
987   for (i=0; i<n; i++) x[r[i]] = t[i];
988 
989   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
990   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
991   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
992   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
993   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
994   PetscFunctionReturn(0);
995 }
996 
997 #undef __FUNCT__
998 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
999 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1000 {
1001   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1002   IS                iscol=a->col,isrow=a->row;
1003   PetscErrorCode    ierr;
1004   const PetscInt    *r,*c,*rout,*cout;
1005   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1006   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1007   const MatScalar   *aa=a->a,*v;
1008   PetscScalar       s1,s2,x1,x2,*x,*t;
1009   const PetscScalar *b;
1010 
1011   PetscFunctionBegin;
1012   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1013   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1014   t    = a->solve_work;
1015 
1016   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1017   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1018 
1019   /* copy the b into temp work space according to permutation */
1020   ii = 0;
1021   for (i=0; i<n; i++) {
1022     ic      = 2*c[i];
1023     t[ii]   = b[ic];
1024     t[ii+1] = b[ic+1];
1025     ii     += 2;
1026   }
1027 
1028   /* forward solve the U^T */
1029   idx = 0;
1030   for (i=0; i<n; i++) {
1031 
1032     v = aa + 4*diag[i];
1033     /* multiply by the inverse of the block diagonal */
1034     x1 = t[idx];   x2 = t[1+idx];
1035     s1 = v[0]*x1  +  v[1]*x2;
1036     s2 = v[2]*x1  +  v[3]*x2;
1037     v += 4;
1038 
1039     vi = aj + diag[i] + 1;
1040     nz = ai[i+1] - diag[i] - 1;
1041     while (nz--) {
1042       oidx       = 2*(*vi++);
1043       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1044       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1045       v         += 4;
1046     }
1047     t[idx] = s1;t[1+idx] = s2;
1048     idx   += 2;
1049   }
1050   /* backward solve the L^T */
1051   for (i=n-1; i>=0; i--) {
1052     v   = aa + 4*diag[i] - 4;
1053     vi  = aj + diag[i] - 1;
1054     nz  = diag[i] - ai[i];
1055     idt = 2*i;
1056     s1  = t[idt];  s2 = t[1+idt];
1057     while (nz--) {
1058       idx       = 2*(*vi--);
1059       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1060       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1061       v        -= 4;
1062     }
1063   }
1064 
1065   /* copy t into x according to permutation */
1066   ii = 0;
1067   for (i=0; i<n; i++) {
1068     ir      = 2*r[i];
1069     x[ir]   = t[ii];
1070     x[ir+1] = t[ii+1];
1071     ii     += 2;
1072   }
1073 
1074   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1075   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1076   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1077   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1078   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1079   PetscFunctionReturn(0);
1080 }
1081 
1082 #undef __FUNCT__
1083 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1084 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1085 {
1086   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1087   PetscErrorCode    ierr;
1088   IS                iscol=a->col,isrow=a->row;
1089   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1090   const PetscInt    *r,*c,*rout,*cout;
1091   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1092   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1093   const MatScalar   *aa=a->a,*v;
1094   PetscScalar       s1,s2,x1,x2,*x,*t;
1095   const PetscScalar *b;
1096 
1097   PetscFunctionBegin;
1098   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1099   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1100   t    = a->solve_work;
1101 
1102   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1103   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1104 
1105   /* copy b into temp work space according to permutation */
1106   for (i=0; i<n; i++) {
1107     ii    = bs*i; ic = bs*c[i];
1108     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1109   }
1110 
1111   /* forward solve the U^T */
1112   idx = 0;
1113   for (i=0; i<n; i++) {
1114     v = aa + bs2*diag[i];
1115     /* multiply by the inverse of the block diagonal */
1116     x1 = t[idx];   x2 = t[1+idx];
1117     s1 = v[0]*x1  +  v[1]*x2;
1118     s2 = v[2]*x1  +  v[3]*x2;
1119     v -= bs2;
1120 
1121     vi = aj + diag[i] - 1;
1122     nz = diag[i] - diag[i+1] - 1;
1123     for (j=0; j>-nz; j--) {
1124       oidx       = bs*vi[j];
1125       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1126       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1127       v         -= bs2;
1128     }
1129     t[idx] = s1;t[1+idx] = s2;
1130     idx   += bs;
1131   }
1132   /* backward solve the L^T */
1133   for (i=n-1; i>=0; i--) {
1134     v   = aa + bs2*ai[i];
1135     vi  = aj + ai[i];
1136     nz  = ai[i+1] - ai[i];
1137     idt = bs*i;
1138     s1  = t[idt];  s2 = t[1+idt];
1139     for (j=0; j<nz; j++) {
1140       idx       = bs*vi[j];
1141       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1142       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1143       v        += bs2;
1144     }
1145   }
1146 
1147   /* copy t into x according to permutation */
1148   for (i=0; i<n; i++) {
1149     ii    = bs*i;  ir = bs*r[i];
1150     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1151   }
1152 
1153   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1154   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1155   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1156   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1157   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1158   PetscFunctionReturn(0);
1159 }
1160 
1161 #undef __FUNCT__
1162 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1163 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1164 {
1165   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1166   IS                iscol=a->col,isrow=a->row;
1167   PetscErrorCode    ierr;
1168   const PetscInt    *r,*c,*rout,*cout;
1169   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1170   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1171   const MatScalar   *aa=a->a,*v;
1172   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1173   const PetscScalar *b;
1174 
1175   PetscFunctionBegin;
1176   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1177   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1178   t    = a->solve_work;
1179 
1180   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1181   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1182 
1183   /* copy the b into temp work space according to permutation */
1184   ii = 0;
1185   for (i=0; i<n; i++) {
1186     ic      = 3*c[i];
1187     t[ii]   = b[ic];
1188     t[ii+1] = b[ic+1];
1189     t[ii+2] = b[ic+2];
1190     ii     += 3;
1191   }
1192 
1193   /* forward solve the U^T */
1194   idx = 0;
1195   for (i=0; i<n; i++) {
1196 
1197     v = aa + 9*diag[i];
1198     /* multiply by the inverse of the block diagonal */
1199     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1203     v += 9;
1204 
1205     vi = aj + diag[i] + 1;
1206     nz = ai[i+1] - diag[i] - 1;
1207     while (nz--) {
1208       oidx       = 3*(*vi++);
1209       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1210       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1211       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1212       v         += 9;
1213     }
1214     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1215     idx   += 3;
1216   }
1217   /* backward solve the L^T */
1218   for (i=n-1; i>=0; i--) {
1219     v   = aa + 9*diag[i] - 9;
1220     vi  = aj + diag[i] - 1;
1221     nz  = diag[i] - ai[i];
1222     idt = 3*i;
1223     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1224     while (nz--) {
1225       idx       = 3*(*vi--);
1226       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1227       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1228       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1229       v        -= 9;
1230     }
1231   }
1232 
1233   /* copy t into x according to permutation */
1234   ii = 0;
1235   for (i=0; i<n; i++) {
1236     ir      = 3*r[i];
1237     x[ir]   = t[ii];
1238     x[ir+1] = t[ii+1];
1239     x[ir+2] = t[ii+2];
1240     ii     += 3;
1241   }
1242 
1243   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1244   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1245   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1246   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1247   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 #undef __FUNCT__
1252 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1253 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1254 {
1255   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1256   PetscErrorCode    ierr;
1257   IS                iscol=a->col,isrow=a->row;
1258   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1259   const PetscInt    *r,*c,*rout,*cout;
1260   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1261   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1262   const MatScalar   *aa=a->a,*v;
1263   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1264   const PetscScalar *b;
1265 
1266   PetscFunctionBegin;
1267   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1268   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1269   t    = a->solve_work;
1270 
1271   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1272   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1273 
1274   /* copy b into temp work space according to permutation */
1275   for (i=0; i<n; i++) {
1276     ii    = bs*i; ic = bs*c[i];
1277     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1278   }
1279 
1280   /* forward solve the U^T */
1281   idx = 0;
1282   for (i=0; i<n; i++) {
1283     v = aa + bs2*diag[i];
1284     /* multiply by the inverse of the block diagonal */
1285     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1286     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1287     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1288     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1289     v -= bs2;
1290 
1291     vi = aj + diag[i] - 1;
1292     nz = diag[i] - diag[i+1] - 1;
1293     for (j=0; j>-nz; j--) {
1294       oidx       = bs*vi[j];
1295       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1296       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1297       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1298       v         -= bs2;
1299     }
1300     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;
1301     idx   += bs;
1302   }
1303   /* backward solve the L^T */
1304   for (i=n-1; i>=0; i--) {
1305     v   = aa + bs2*ai[i];
1306     vi  = aj + ai[i];
1307     nz  = ai[i+1] - ai[i];
1308     idt = bs*i;
1309     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1310     for (j=0; j<nz; j++) {
1311       idx       = bs*vi[j];
1312       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1313       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1314       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1315       v        += bs2;
1316     }
1317   }
1318 
1319   /* copy t into x according to permutation */
1320   for (i=0; i<n; i++) {
1321     ii    = bs*i;  ir = bs*r[i];
1322     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1323   }
1324 
1325   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1326   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1327   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1328   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1329   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1330   PetscFunctionReturn(0);
1331 }
1332 
1333 #undef __FUNCT__
1334 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1335 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1336 {
1337   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1338   IS                iscol=a->col,isrow=a->row;
1339   PetscErrorCode    ierr;
1340   const PetscInt    *r,*c,*rout,*cout;
1341   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1342   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1343   const MatScalar   *aa=a->a,*v;
1344   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1345   const PetscScalar *b;
1346 
1347   PetscFunctionBegin;
1348   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1349   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1350   t    = a->solve_work;
1351 
1352   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1353   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1354 
1355   /* copy the b into temp work space according to permutation */
1356   ii = 0;
1357   for (i=0; i<n; i++) {
1358     ic      = 4*c[i];
1359     t[ii]   = b[ic];
1360     t[ii+1] = b[ic+1];
1361     t[ii+2] = b[ic+2];
1362     t[ii+3] = b[ic+3];
1363     ii     += 4;
1364   }
1365 
1366   /* forward solve the U^T */
1367   idx = 0;
1368   for (i=0; i<n; i++) {
1369 
1370     v = aa + 16*diag[i];
1371     /* multiply by the inverse of the block diagonal */
1372     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1373     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1374     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1375     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1376     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1377     v += 16;
1378 
1379     vi = aj + diag[i] + 1;
1380     nz = ai[i+1] - diag[i] - 1;
1381     while (nz--) {
1382       oidx       = 4*(*vi++);
1383       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1384       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1385       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1386       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1387       v         += 16;
1388     }
1389     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1390     idx   += 4;
1391   }
1392   /* backward solve the L^T */
1393   for (i=n-1; i>=0; i--) {
1394     v   = aa + 16*diag[i] - 16;
1395     vi  = aj + diag[i] - 1;
1396     nz  = diag[i] - ai[i];
1397     idt = 4*i;
1398     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1399     while (nz--) {
1400       idx       = 4*(*vi--);
1401       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1402       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1403       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1404       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1405       v        -= 16;
1406     }
1407   }
1408 
1409   /* copy t into x according to permutation */
1410   ii = 0;
1411   for (i=0; i<n; i++) {
1412     ir      = 4*r[i];
1413     x[ir]   = t[ii];
1414     x[ir+1] = t[ii+1];
1415     x[ir+2] = t[ii+2];
1416     x[ir+3] = t[ii+3];
1417     ii     += 4;
1418   }
1419 
1420   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1421   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1422   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1423   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1424   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1425   PetscFunctionReturn(0);
1426 }
1427 
1428 #undef __FUNCT__
1429 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1430 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1431 {
1432   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1433   PetscErrorCode    ierr;
1434   IS                iscol=a->col,isrow=a->row;
1435   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1436   const PetscInt    *r,*c,*rout,*cout;
1437   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1438   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1439   const MatScalar   *aa=a->a,*v;
1440   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1441   const PetscScalar *b;
1442 
1443   PetscFunctionBegin;
1444   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1446   t    = a->solve_work;
1447 
1448   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1449   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1450 
1451   /* copy b into temp work space according to permutation */
1452   for (i=0; i<n; i++) {
1453     ii    = bs*i; ic = bs*c[i];
1454     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1455   }
1456 
1457   /* forward solve the U^T */
1458   idx = 0;
1459   for (i=0; i<n; i++) {
1460     v = aa + bs2*diag[i];
1461     /* multiply by the inverse of the block diagonal */
1462     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1463     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1464     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1465     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1466     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1467     v -= bs2;
1468 
1469     vi = aj + diag[i] - 1;
1470     nz = diag[i] - diag[i+1] - 1;
1471     for (j=0; j>-nz; j--) {
1472       oidx       = bs*vi[j];
1473       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1474       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1475       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1476       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1477       v         -= bs2;
1478     }
1479     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1480     idx   += bs;
1481   }
1482   /* backward solve the L^T */
1483   for (i=n-1; i>=0; i--) {
1484     v   = aa + bs2*ai[i];
1485     vi  = aj + ai[i];
1486     nz  = ai[i+1] - ai[i];
1487     idt = bs*i;
1488     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1489     for (j=0; j<nz; j++) {
1490       idx       = bs*vi[j];
1491       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1492       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1493       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1494       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1495       v        += bs2;
1496     }
1497   }
1498 
1499   /* copy t into x according to permutation */
1500   for (i=0; i<n; i++) {
1501     ii    = bs*i;  ir = bs*r[i];
1502     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1503   }
1504 
1505   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1506   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1507   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1508   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1509   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1510   PetscFunctionReturn(0);
1511 }
1512 
1513 #undef __FUNCT__
1514 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1515 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1516 {
1517   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1518   IS                iscol=a->col,isrow=a->row;
1519   PetscErrorCode    ierr;
1520   const PetscInt    *r,*c,*rout,*cout;
1521   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1522   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1523   const MatScalar   *aa=a->a,*v;
1524   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1525   const PetscScalar *b;
1526 
1527   PetscFunctionBegin;
1528   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1529   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1530   t    = a->solve_work;
1531 
1532   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1533   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1534 
1535   /* copy the b into temp work space according to permutation */
1536   ii = 0;
1537   for (i=0; i<n; i++) {
1538     ic      = 5*c[i];
1539     t[ii]   = b[ic];
1540     t[ii+1] = b[ic+1];
1541     t[ii+2] = b[ic+2];
1542     t[ii+3] = b[ic+3];
1543     t[ii+4] = b[ic+4];
1544     ii     += 5;
1545   }
1546 
1547   /* forward solve the U^T */
1548   idx = 0;
1549   for (i=0; i<n; i++) {
1550 
1551     v = aa + 25*diag[i];
1552     /* multiply by the inverse of the block diagonal */
1553     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1554     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1555     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1556     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1557     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1558     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1559     v += 25;
1560 
1561     vi = aj + diag[i] + 1;
1562     nz = ai[i+1] - diag[i] - 1;
1563     while (nz--) {
1564       oidx       = 5*(*vi++);
1565       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1566       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1567       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1568       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1569       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1570       v         += 25;
1571     }
1572     t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1573     idx   += 5;
1574   }
1575   /* backward solve the L^T */
1576   for (i=n-1; i>=0; i--) {
1577     v   = aa + 25*diag[i] - 25;
1578     vi  = aj + diag[i] - 1;
1579     nz  = diag[i] - ai[i];
1580     idt = 5*i;
1581     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1582     while (nz--) {
1583       idx       = 5*(*vi--);
1584       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1585       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1586       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1587       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1588       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1589       v        -= 25;
1590     }
1591   }
1592 
1593   /* copy t into x according to permutation */
1594   ii = 0;
1595   for (i=0; i<n; i++) {
1596     ir      = 5*r[i];
1597     x[ir]   = t[ii];
1598     x[ir+1] = t[ii+1];
1599     x[ir+2] = t[ii+2];
1600     x[ir+3] = t[ii+3];
1601     x[ir+4] = t[ii+4];
1602     ii     += 5;
1603   }
1604 
1605   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1606   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1607   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1608   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1609   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1610   PetscFunctionReturn(0);
1611 }
1612 
1613 #undef __FUNCT__
1614 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1615 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1616 {
1617   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1618   PetscErrorCode    ierr;
1619   IS                iscol=a->col,isrow=a->row;
1620   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1621   const PetscInt    *r,*c,*rout,*cout;
1622   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1623   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1624   const MatScalar   *aa=a->a,*v;
1625   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1626   const PetscScalar *b;
1627 
1628   PetscFunctionBegin;
1629   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1630   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1631   t    = a->solve_work;
1632 
1633   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1634   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1635 
1636   /* copy b into temp work space according to permutation */
1637   for (i=0; i<n; i++) {
1638     ii      = bs*i; ic = bs*c[i];
1639     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1640     t[ii+4] = b[ic+4];
1641   }
1642 
1643   /* forward solve the U^T */
1644   idx = 0;
1645   for (i=0; i<n; i++) {
1646     v = aa + bs2*diag[i];
1647     /* multiply by the inverse of the block diagonal */
1648     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1649     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1650     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1651     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1652     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1653     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1654     v -= bs2;
1655 
1656     vi = aj + diag[i] - 1;
1657     nz = diag[i] - diag[i+1] - 1;
1658     for (j=0; j>-nz; j--) {
1659       oidx       = bs*vi[j];
1660       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1661       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1662       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1663       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1664       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1665       v         -= bs2;
1666     }
1667     t[idx] = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1668     idx   += bs;
1669   }
1670   /* backward solve the L^T */
1671   for (i=n-1; i>=0; i--) {
1672     v   = aa + bs2*ai[i];
1673     vi  = aj + ai[i];
1674     nz  = ai[i+1] - ai[i];
1675     idt = bs*i;
1676     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1677     for (j=0; j<nz; j++) {
1678       idx       = bs*vi[j];
1679       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1680       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1681       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1682       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1683       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1684       v        += bs2;
1685     }
1686   }
1687 
1688   /* copy t into x according to permutation */
1689   for (i=0; i<n; i++) {
1690     ii      = bs*i;  ir = bs*r[i];
1691     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1692     x[ir+4] = t[ii+4];
1693   }
1694 
1695   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1696   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1697   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1699   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1700   PetscFunctionReturn(0);
1701 }
1702 
1703 #undef __FUNCT__
1704 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1705 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1706 {
1707   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1708   IS                iscol=a->col,isrow=a->row;
1709   PetscErrorCode    ierr;
1710   const PetscInt    *r,*c,*rout,*cout;
1711   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1712   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1713   const MatScalar   *aa=a->a,*v;
1714   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1715   const PetscScalar *b;
1716 
1717   PetscFunctionBegin;
1718   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1719   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1720   t    = a->solve_work;
1721 
1722   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1723   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1724 
1725   /* copy the b into temp work space according to permutation */
1726   ii = 0;
1727   for (i=0; i<n; i++) {
1728     ic      = 6*c[i];
1729     t[ii]   = b[ic];
1730     t[ii+1] = b[ic+1];
1731     t[ii+2] = b[ic+2];
1732     t[ii+3] = b[ic+3];
1733     t[ii+4] = b[ic+4];
1734     t[ii+5] = b[ic+5];
1735     ii     += 6;
1736   }
1737 
1738   /* forward solve the U^T */
1739   idx = 0;
1740   for (i=0; i<n; i++) {
1741 
1742     v = aa + 36*diag[i];
1743     /* multiply by the inverse of the block diagonal */
1744     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1745     x6 = t[5+idx];
1746     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1747     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1748     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1749     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1750     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1751     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1752     v += 36;
1753 
1754     vi = aj + diag[i] + 1;
1755     nz = ai[i+1] - diag[i] - 1;
1756     while (nz--) {
1757       oidx       = 6*(*vi++);
1758       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1759       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1760       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1761       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1762       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1763       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1764       v         += 36;
1765     }
1766     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1767     t[5+idx] = s6;
1768     idx     += 6;
1769   }
1770   /* backward solve the L^T */
1771   for (i=n-1; i>=0; i--) {
1772     v   = aa + 36*diag[i] - 36;
1773     vi  = aj + diag[i] - 1;
1774     nz  = diag[i] - ai[i];
1775     idt = 6*i;
1776     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1777     s6  = t[5+idt];
1778     while (nz--) {
1779       idx       = 6*(*vi--);
1780       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1781       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1782       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1783       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1784       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1785       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1786       v        -= 36;
1787     }
1788   }
1789 
1790   /* copy t into x according to permutation */
1791   ii = 0;
1792   for (i=0; i<n; i++) {
1793     ir      = 6*r[i];
1794     x[ir]   = t[ii];
1795     x[ir+1] = t[ii+1];
1796     x[ir+2] = t[ii+2];
1797     x[ir+3] = t[ii+3];
1798     x[ir+4] = t[ii+4];
1799     x[ir+5] = t[ii+5];
1800     ii     += 6;
1801   }
1802 
1803   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1804   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1805   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1806   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1807   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1808   PetscFunctionReturn(0);
1809 }
1810 
1811 #undef __FUNCT__
1812 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1813 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1814 {
1815   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1816   PetscErrorCode    ierr;
1817   IS                iscol=a->col,isrow=a->row;
1818   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1819   const PetscInt    *r,*c,*rout,*cout;
1820   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1821   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1822   const MatScalar   *aa=a->a,*v;
1823   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1824   const PetscScalar *b;
1825 
1826   PetscFunctionBegin;
1827   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1828   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1829   t    = a->solve_work;
1830 
1831   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1832   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1833 
1834   /* copy b into temp work space according to permutation */
1835   for (i=0; i<n; i++) {
1836     ii      = bs*i; ic = bs*c[i];
1837     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1838     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1839   }
1840 
1841   /* forward solve the U^T */
1842   idx = 0;
1843   for (i=0; i<n; i++) {
1844     v = aa + bs2*diag[i];
1845     /* multiply by the inverse of the block diagonal */
1846     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1847     x6 = t[5+idx];
1848     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1849     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1850     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1851     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1852     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1853     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1854     v -= bs2;
1855 
1856     vi = aj + diag[i] - 1;
1857     nz = diag[i] - diag[i+1] - 1;
1858     for (j=0; j>-nz; j--) {
1859       oidx       = bs*vi[j];
1860       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1861       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1862       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1863       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1864       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1865       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1866       v         -= bs2;
1867     }
1868     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1869     t[5+idx] = s6;
1870     idx     += bs;
1871   }
1872   /* backward solve the L^T */
1873   for (i=n-1; i>=0; i--) {
1874     v   = aa + bs2*ai[i];
1875     vi  = aj + ai[i];
1876     nz  = ai[i+1] - ai[i];
1877     idt = bs*i;
1878     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1879     s6  = t[5+idt];
1880     for (j=0; j<nz; j++) {
1881       idx       = bs*vi[j];
1882       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1883       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1884       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1885       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1886       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1887       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1888       v        += bs2;
1889     }
1890   }
1891 
1892   /* copy t into x according to permutation */
1893   for (i=0; i<n; i++) {
1894     ii      = bs*i;  ir = bs*r[i];
1895     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1896     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1897   }
1898 
1899   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1900   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1901   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1902   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1903   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1904   PetscFunctionReturn(0);
1905 }
1906 
1907 #undef __FUNCT__
1908 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1909 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1910 {
1911   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1912   IS                iscol=a->col,isrow=a->row;
1913   PetscErrorCode    ierr;
1914   const PetscInt    *r,*c,*rout,*cout;
1915   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1916   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1917   const MatScalar   *aa=a->a,*v;
1918   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1919   const PetscScalar *b;
1920 
1921   PetscFunctionBegin;
1922   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1923   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1924   t    = a->solve_work;
1925 
1926   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1927   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1928 
1929   /* copy the b into temp work space according to permutation */
1930   ii = 0;
1931   for (i=0; i<n; i++) {
1932     ic      = 7*c[i];
1933     t[ii]   = b[ic];
1934     t[ii+1] = b[ic+1];
1935     t[ii+2] = b[ic+2];
1936     t[ii+3] = b[ic+3];
1937     t[ii+4] = b[ic+4];
1938     t[ii+5] = b[ic+5];
1939     t[ii+6] = b[ic+6];
1940     ii     += 7;
1941   }
1942 
1943   /* forward solve the U^T */
1944   idx = 0;
1945   for (i=0; i<n; i++) {
1946 
1947     v = aa + 49*diag[i];
1948     /* multiply by the inverse of the block diagonal */
1949     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1950     x6 = t[5+idx]; x7 = t[6+idx];
1951     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1952     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1953     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1954     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1955     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1956     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1957     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1958     v += 49;
1959 
1960     vi = aj + diag[i] + 1;
1961     nz = ai[i+1] - diag[i] - 1;
1962     while (nz--) {
1963       oidx       = 7*(*vi++);
1964       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1965       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1966       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1967       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1968       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1969       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1970       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1971       v         += 49;
1972     }
1973     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1974     t[5+idx] = s6;t[6+idx] = s7;
1975     idx     += 7;
1976   }
1977   /* backward solve the L^T */
1978   for (i=n-1; i>=0; i--) {
1979     v   = aa + 49*diag[i] - 49;
1980     vi  = aj + diag[i] - 1;
1981     nz  = diag[i] - ai[i];
1982     idt = 7*i;
1983     s1  = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1984     s6  = t[5+idt];s7 = t[6+idt];
1985     while (nz--) {
1986       idx       = 7*(*vi--);
1987       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1988       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1989       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1990       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1991       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1992       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1993       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1994       v        -= 49;
1995     }
1996   }
1997 
1998   /* copy t into x according to permutation */
1999   ii = 0;
2000   for (i=0; i<n; i++) {
2001     ir      = 7*r[i];
2002     x[ir]   = t[ii];
2003     x[ir+1] = t[ii+1];
2004     x[ir+2] = t[ii+2];
2005     x[ir+3] = t[ii+3];
2006     x[ir+4] = t[ii+4];
2007     x[ir+5] = t[ii+5];
2008     x[ir+6] = t[ii+6];
2009     ii     += 7;
2010   }
2011 
2012   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2013   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2014   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2015   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2016   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2017   PetscFunctionReturn(0);
2018 }
2019 #undef __FUNCT__
2020 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
2021 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2022 {
2023   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
2024   PetscErrorCode    ierr;
2025   IS                iscol=a->col,isrow=a->row;
2026   const PetscInt    n    =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2027   const PetscInt    *r,*c,*rout,*cout;
2028   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2029   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
2030   const MatScalar   *aa=a->a,*v;
2031   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2032   const PetscScalar *b;
2033 
2034   PetscFunctionBegin;
2035   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2036   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2037   t    = a->solve_work;
2038 
2039   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2040   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2041 
2042   /* copy b into temp work space according to permutation */
2043   for (i=0; i<n; i++) {
2044     ii      = bs*i; ic = bs*c[i];
2045     t[ii]   = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2046     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
2047   }
2048 
2049   /* forward solve the U^T */
2050   idx = 0;
2051   for (i=0; i<n; i++) {
2052     v = aa + bs2*diag[i];
2053     /* multiply by the inverse of the block diagonal */
2054     x1 = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2055     x6 = t[5+idx]; x7 = t[6+idx];
2056     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
2057     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
2058     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
2059     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
2060     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2061     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2062     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2063     v -= bs2;
2064 
2065     vi = aj + diag[i] - 1;
2066     nz = diag[i] - diag[i+1] - 1;
2067     for (j=0; j>-nz; j--) {
2068       oidx       = bs*vi[j];
2069       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2070       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2071       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2072       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2073       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2074       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2075       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2076       v         -= bs2;
2077     }
2078     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2079     t[5+idx] = s6;  t[6+idx] = s7;
2080     idx     += bs;
2081   }
2082   /* backward solve the L^T */
2083   for (i=n-1; i>=0; i--) {
2084     v   = aa + bs2*ai[i];
2085     vi  = aj + ai[i];
2086     nz  = ai[i+1] - ai[i];
2087     idt = bs*i;
2088     s1  = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2089     s6  = t[5+idt];  s7 = t[6+idt];
2090     for (j=0; j<nz; j++) {
2091       idx       = bs*vi[j];
2092       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2093       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2094       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2095       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2096       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2097       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2098       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2099       v        += bs2;
2100     }
2101   }
2102 
2103   /* copy t into x according to permutation */
2104   for (i=0; i<n; i++) {
2105     ii      = bs*i;  ir = bs*r[i];
2106     x[ir]   = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2107     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2108   }
2109 
2110   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2111   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2112   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2113   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2114   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2115   PetscFunctionReturn(0);
2116 }
2117 
2118 /* ----------------------------------------------------------- */
2119 #undef __FUNCT__
2120 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2121 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2122 {
2123   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2124   IS                iscol=a->col,isrow=a->row;
2125   PetscErrorCode    ierr;
2126   const PetscInt    *r,*c,*rout,*cout;
2127   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2128   PetscInt          i,nz;
2129   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
2130   const MatScalar   *aa=a->a,*v;
2131   PetscScalar       *x,*s,*t,*ls;
2132   const PetscScalar *b;
2133 
2134   PetscFunctionBegin;
2135   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2136   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2137   t    = a->solve_work;
2138 
2139   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2140   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2141 
2142   /* forward solve the lower triangular */
2143   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2144   for (i=1; i<n; i++) {
2145     v    = aa + bs2*ai[i];
2146     vi   = aj + ai[i];
2147     nz   = a->diag[i] - ai[i];
2148     s    = t + bs*i;
2149     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2150     while (nz--) {
2151       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2152       v += bs2;
2153     }
2154   }
2155   /* backward solve the upper triangular */
2156   ls = a->solve_work + A->cmap->n;
2157   for (i=n-1; i>=0; i--) {
2158     v    = aa + bs2*(a->diag[i] + 1);
2159     vi   = aj + a->diag[i] + 1;
2160     nz   = ai[i+1] - a->diag[i] - 1;
2161     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2162     while (nz--) {
2163       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2164       v += bs2;
2165     }
2166     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2167     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2168   }
2169 
2170   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2171   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2172   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2173   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2174   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2175   PetscFunctionReturn(0);
2176 }
2177 
2178 /* ----------------------------------------------------------- */
2179 #undef __FUNCT__
2180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2181 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2182 {
2183   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2184   IS                iscol=a->col,isrow=a->row;
2185   PetscErrorCode    ierr;
2186   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2187   PetscInt          i,nz,j;
2188   const PetscInt    n  =a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2189   const MatScalar   *aa=a->a,*v;
2190   PetscScalar       *x,*t,*ls;
2191   const PetscScalar *b;
2192 
2193   PetscFunctionBegin;
2194   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2195   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2196   t    = a->solve_work;
2197 
2198   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2199   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2200 
2201   /* copy the b into temp work space according to permutation */
2202   for (i=0; i<n; i++) {
2203     for (j=0; j<bs; j++) {
2204       t[i*bs+j] = b[c[i]*bs+j];
2205     }
2206   }
2207 
2208 
2209   /* forward solve the upper triangular transpose */
2210   ls = a->solve_work + A->cmap->n;
2211   for (i=0; i<n; i++) {
2212     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2213     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2214     v  = aa + bs2*(a->diag[i] + 1);
2215     vi = aj + a->diag[i] + 1;
2216     nz = ai[i+1] - a->diag[i] - 1;
2217     while (nz--) {
2218       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2219       v += bs2;
2220     }
2221   }
2222 
2223   /* backward solve the lower triangular transpose */
2224   for (i=n-1; i>=0; i--) {
2225     v  = aa + bs2*ai[i];
2226     vi = aj + ai[i];
2227     nz = a->diag[i] - ai[i];
2228     while (nz--) {
2229       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2230       v += bs2;
2231     }
2232   }
2233 
2234   /* copy t into x according to permutation */
2235   for (i=0; i<n; i++) {
2236     for (j=0; j<bs; j++) {
2237       x[bs*r[i]+j]   = t[bs*i+j];
2238     }
2239   }
2240 
2241   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2242   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2243   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2244   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2245   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2246   PetscFunctionReturn(0);
2247 }
2248 
2249 #undef __FUNCT__
2250 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2251 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2252 {
2253   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2254   IS                iscol=a->col,isrow=a->row;
2255   PetscErrorCode    ierr;
2256   const PetscInt    *r,*c,*rout,*cout;
2257   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2258   PetscInt          i,j,nz;
2259   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
2260   const MatScalar   *aa=a->a,*v;
2261   PetscScalar       *x,*t,*ls;
2262   const PetscScalar *b;
2263 
2264   PetscFunctionBegin;
2265   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2266   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2267   t    = a->solve_work;
2268 
2269   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2270   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2271 
2272   /* copy the b into temp work space according to permutation */
2273   for (i=0; i<n; i++) {
2274     for (j=0; j<bs; j++) {
2275       t[i*bs+j] = b[c[i]*bs+j];
2276     }
2277   }
2278 
2279 
2280   /* forward solve the upper triangular transpose */
2281   ls = a->solve_work + A->cmap->n;
2282   for (i=0; i<n; i++) {
2283     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2284     PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2285     v  = aa + bs2*(diag[i] - 1);
2286     vi = aj + diag[i] - 1;
2287     nz = diag[i] - diag[i+1] - 1;
2288     for (j=0; j>-nz; j--) {
2289       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2290       v -= bs2;
2291     }
2292   }
2293 
2294   /* backward solve the lower triangular transpose */
2295   for (i=n-1; i>=0; i--) {
2296     v  = aa + bs2*ai[i];
2297     vi = aj + ai[i];
2298     nz = ai[i+1] - ai[i];
2299     for (j=0; j<nz; j++) {
2300       PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2301       v += bs2;
2302     }
2303   }
2304 
2305   /* copy t into x according to permutation */
2306   for (i=0; i<n; i++) {
2307     for (j=0; j<bs; j++) {
2308       x[bs*r[i]+j]   = t[bs*i+j];
2309     }
2310   }
2311 
2312   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2313   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2314   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2315   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2316   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2317   PetscFunctionReturn(0);
2318 }
2319 
2320 /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */
2321 
2322 #undef __FUNCT__
2323 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2324 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2325 {
2326   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
2327   PetscErrorCode    ierr;
2328   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2329   PetscInt          i,nz,idx,idt,m;
2330   const MatScalar   *aa=a->a,*v;
2331   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2332   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2333   PetscScalar       *x;
2334   const PetscScalar *b;
2335 
2336   PetscFunctionBegin;
2337   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2338   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2339 
2340   /* forward solve the lower triangular */
2341   idx   = 0;
2342   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2343   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2344   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2345 
2346   for (i=1; i<n; i++) {
2347     v   = aa + bs2*ai[i];
2348     vi  = aj + ai[i];
2349     nz  = ai[i+1] - ai[i];
2350     idt = bs*i;
2351     s1  = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
2352     s6  = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
2353     s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2354     for (m=0; m<nz; m++) {
2355       idx = bs*vi[m];
2356       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2357       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2358       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2359 
2360 
2361       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2362       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2363       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2364       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2365       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2366       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2367       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2368       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2369       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2370       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2371       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2372       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2373       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2374       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2375       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2376 
2377       v += bs2;
2378     }
2379     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
2380     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
2381     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2382 
2383   }
2384   /* backward solve the upper triangular */
2385   for (i=n-1; i>=0; i--) {
2386     v   = aa + bs2*(adiag[i+1]+1);
2387     vi  = aj + adiag[i+1]+1;
2388     nz  = adiag[i] - adiag[i+1] - 1;
2389     idt = bs*i;
2390     s1  = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
2391     s6  = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
2392     s11 = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2393 
2394     for (m=0; m<nz; m++) {
2395       idx = bs*vi[m];
2396       x1  = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2397       x6  = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2398       x11 = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2399 
2400       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2401       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2402       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2403       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2404       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2405       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2406       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2407       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2408       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2409       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2410       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2411       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2412       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2413       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2414       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2415 
2416       v += bs2;
2417     }
2418 
2419     x[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2420     x[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2421     x[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2422     x[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2423     x[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2424     x[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2425     x[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2426     x[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2427     x[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2428     x[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2429     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2430     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2431     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2432     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2433     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2434 
2435   }
2436 
2437   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2438   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2439   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2440   PetscFunctionReturn(0);
2441 }
2442 
2443 /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2444 /* Default MatSolve for block size 15 */
2445 
2446 #undef __FUNCT__
2447 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2448 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2449 {
2450   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
2451   PetscErrorCode    ierr;
2452   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2453   PetscInt          i,k,nz,idx,idt,m;
2454   const MatScalar   *aa=a->a,*v;
2455   PetscScalar       s[15];
2456   PetscScalar       *x,xv;
2457   const PetscScalar *b;
2458 
2459   PetscFunctionBegin;
2460   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2461   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2462 
2463   /* forward solve the lower triangular */
2464   for (i=0; i<n; i++) {
2465     v         = aa + bs2*ai[i];
2466     vi        = aj + ai[i];
2467     nz        = ai[i+1] - ai[i];
2468     idt       = bs*i;
2469     x[idt]    = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2470     x[5+idt]  = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2471     x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2472     for (m=0; m<nz; m++) {
2473       idx = bs*vi[m];
2474       for (k=0; k<15; k++) {
2475         xv         = x[k + idx];
2476         x[idt]    -= v[0]*xv;
2477         x[1+idt]  -= v[1]*xv;
2478         x[2+idt]  -= v[2]*xv;
2479         x[3+idt]  -= v[3]*xv;
2480         x[4+idt]  -= v[4]*xv;
2481         x[5+idt]  -= v[5]*xv;
2482         x[6+idt]  -= v[6]*xv;
2483         x[7+idt]  -= v[7]*xv;
2484         x[8+idt]  -= v[8]*xv;
2485         x[9+idt]  -= v[9]*xv;
2486         x[10+idt] -= v[10]*xv;
2487         x[11+idt] -= v[11]*xv;
2488         x[12+idt] -= v[12]*xv;
2489         x[13+idt] -= v[13]*xv;
2490         x[14+idt] -= v[14]*xv;
2491         v         += 15;
2492       }
2493     }
2494   }
2495   /* backward solve the upper triangular */
2496   for (i=n-1; i>=0; i--) {
2497     v     = aa + bs2*(adiag[i+1]+1);
2498     vi    = aj + adiag[i+1]+1;
2499     nz    = adiag[i] - adiag[i+1] - 1;
2500     idt   = bs*i;
2501     s[0]  = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2502     s[5]  = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2503     s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2504 
2505     for (m=0; m<nz; m++) {
2506       idx = bs*vi[m];
2507       for (k=0; k<15; k++) {
2508         xv     = x[k + idx];
2509         s[0]  -= v[0]*xv;
2510         s[1]  -= v[1]*xv;
2511         s[2]  -= v[2]*xv;
2512         s[3]  -= v[3]*xv;
2513         s[4]  -= v[4]*xv;
2514         s[5]  -= v[5]*xv;
2515         s[6]  -= v[6]*xv;
2516         s[7]  -= v[7]*xv;
2517         s[8]  -= v[8]*xv;
2518         s[9]  -= v[9]*xv;
2519         s[10] -= v[10]*xv;
2520         s[11] -= v[11]*xv;
2521         s[12] -= v[12]*xv;
2522         s[13] -= v[13]*xv;
2523         s[14] -= v[14]*xv;
2524         v     += 15;
2525       }
2526     }
2527     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
2528     for (k=0; k<15; k++) {
2529       x[idt]    += v[0]*s[k];
2530       x[1+idt]  += v[1]*s[k];
2531       x[2+idt]  += v[2]*s[k];
2532       x[3+idt]  += v[3]*s[k];
2533       x[4+idt]  += v[4]*s[k];
2534       x[5+idt]  += v[5]*s[k];
2535       x[6+idt]  += v[6]*s[k];
2536       x[7+idt]  += v[7]*s[k];
2537       x[8+idt]  += v[8]*s[k];
2538       x[9+idt]  += v[9]*s[k];
2539       x[10+idt] += v[10]*s[k];
2540       x[11+idt] += v[11]*s[k];
2541       x[12+idt] += v[12]*s[k];
2542       x[13+idt] += v[13]*s[k];
2543       x[14+idt] += v[14]*s[k];
2544       v         += 15;
2545     }
2546   }
2547   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2548   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2549   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2550   PetscFunctionReturn(0);
2551 }
2552 
2553 
2554 #undef __FUNCT__
2555 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2556 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2557 {
2558   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2559   IS                iscol=a->col,isrow=a->row;
2560   PetscErrorCode    ierr;
2561   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2562   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2563   PetscInt          i,nz,idx,idt,idc;
2564   const MatScalar   *aa=a->a,*v;
2565   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2566   const PetscScalar *b;
2567 
2568   PetscFunctionBegin;
2569   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2570   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2571   t    = a->solve_work;
2572 
2573   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2574   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2575 
2576   /* forward solve the lower triangular */
2577   idx  = 7*(*r++);
2578   t[0] = b[idx];   t[1] = b[1+idx];
2579   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2580   t[5] = b[5+idx]; t[6] = b[6+idx];
2581 
2582   for (i=1; i<n; i++) {
2583     v   = aa + 49*ai[i];
2584     vi  = aj + ai[i];
2585     nz  = diag[i] - ai[i];
2586     idx = 7*(*r++);
2587     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2588     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2589     while (nz--) {
2590       idx = 7*(*vi++);
2591       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2592       x4  = t[3+idx];x5 = t[4+idx];
2593       x6  = t[5+idx];x7 = t[6+idx];
2594       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2595       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2596       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2597       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2598       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2599       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2600       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2601       v  += 49;
2602     }
2603     idx      = 7*i;
2604     t[idx]   = s1;t[1+idx] = s2;
2605     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2606     t[5+idx] = s6;t[6+idx] = s7;
2607   }
2608   /* backward solve the upper triangular */
2609   for (i=n-1; i>=0; i--) {
2610     v   = aa + 49*diag[i] + 49;
2611     vi  = aj + diag[i] + 1;
2612     nz  = ai[i+1] - diag[i] - 1;
2613     idt = 7*i;
2614     s1  = t[idt];  s2 = t[1+idt];
2615     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2616     s6  = t[5+idt];s7 = t[6+idt];
2617     while (nz--) {
2618       idx = 7*(*vi++);
2619       x1  = t[idx];   x2 = t[1+idx];
2620       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2621       x6  = t[5+idx]; x7 = t[6+idx];
2622       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2623       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2624       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2625       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2626       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2627       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2628       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2629       v  += 49;
2630     }
2631     idc    = 7*(*c--);
2632     v      = aa + 49*diag[i];
2633     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2634                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2635     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2636                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2637     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2638                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2639     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2640                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2641     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2642                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2643     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2644                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2645     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2646                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2647   }
2648 
2649   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2650   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2651   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2652   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2653   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2654   PetscFunctionReturn(0);
2655 }
2656 
2657 #undef __FUNCT__
2658 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2659 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2660 {
2661   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2662   IS                iscol=a->col,isrow=a->row;
2663   PetscErrorCode    ierr;
2664   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2665   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2666   PetscInt          i,nz,idx,idt,idc,m;
2667   const MatScalar   *aa=a->a,*v;
2668   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2669   const PetscScalar *b;
2670 
2671   PetscFunctionBegin;
2672   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2673   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2674   t    = a->solve_work;
2675 
2676   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2677   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2678 
2679   /* forward solve the lower triangular */
2680   idx  = 7*r[0];
2681   t[0] = b[idx];   t[1] = b[1+idx];
2682   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2683   t[5] = b[5+idx]; t[6] = b[6+idx];
2684 
2685   for (i=1; i<n; i++) {
2686     v   = aa + 49*ai[i];
2687     vi  = aj + ai[i];
2688     nz  = ai[i+1] - ai[i];
2689     idx = 7*r[i];
2690     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2691     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2692     for (m=0; m<nz; m++) {
2693       idx = 7*vi[m];
2694       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2695       x4  = t[3+idx];x5 = t[4+idx];
2696       x6  = t[5+idx];x7 = t[6+idx];
2697       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2698       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2699       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2700       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2701       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2702       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2703       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2704       v  += 49;
2705     }
2706     idx      = 7*i;
2707     t[idx]   = s1;t[1+idx] = s2;
2708     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2709     t[5+idx] = s6;t[6+idx] = s7;
2710   }
2711   /* backward solve the upper triangular */
2712   for (i=n-1; i>=0; i--) {
2713     v   = aa + 49*(adiag[i+1]+1);
2714     vi  = aj + adiag[i+1]+1;
2715     nz  = adiag[i] - adiag[i+1] - 1;
2716     idt = 7*i;
2717     s1  = t[idt];  s2 = t[1+idt];
2718     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2719     s6  = t[5+idt];s7 = t[6+idt];
2720     for (m=0; m<nz; m++) {
2721       idx = 7*vi[m];
2722       x1  = t[idx];   x2 = t[1+idx];
2723       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2724       x6  = t[5+idx]; x7 = t[6+idx];
2725       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2726       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2727       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2728       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2729       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2730       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2731       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2732       v  += 49;
2733     }
2734     idc    = 7*c[i];
2735     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2736                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2737     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2738                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2739     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2740                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2741     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2742                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2743     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2744                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2745     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2746                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2747     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2748                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2749   }
2750 
2751   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2752   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2753   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2754   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2755   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2756   PetscFunctionReturn(0);
2757 }
2758 
2759 #undef __FUNCT__
2760 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2761 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2762 {
2763   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
2764   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2765   PetscErrorCode    ierr;
2766   PetscInt          i,nz,idx,idt,jdx;
2767   const MatScalar   *aa=a->a,*v;
2768   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2769   const PetscScalar *b;
2770 
2771   PetscFunctionBegin;
2772   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2773   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2774   /* forward solve the lower triangular */
2775   idx  = 0;
2776   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2777   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2778   x[6] = b[6+idx];
2779   for (i=1; i<n; i++) {
2780     v   =  aa + 49*ai[i];
2781     vi  =  aj + ai[i];
2782     nz  =  diag[i] - ai[i];
2783     idx =  7*i;
2784     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2785     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2786     s7  =  b[6+idx];
2787     while (nz--) {
2788       jdx = 7*(*vi++);
2789       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2790       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2791       x7  = x[6+jdx];
2792       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2793       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2794       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2795       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2796       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2797       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2798       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2799       v  += 49;
2800     }
2801     x[idx]   = s1;
2802     x[1+idx] = s2;
2803     x[2+idx] = s3;
2804     x[3+idx] = s4;
2805     x[4+idx] = s5;
2806     x[5+idx] = s6;
2807     x[6+idx] = s7;
2808   }
2809   /* backward solve the upper triangular */
2810   for (i=n-1; i>=0; i--) {
2811     v   = aa + 49*diag[i] + 49;
2812     vi  = aj + diag[i] + 1;
2813     nz  = ai[i+1] - diag[i] - 1;
2814     idt = 7*i;
2815     s1  = x[idt];   s2 = x[1+idt];
2816     s3  = x[2+idt]; s4 = x[3+idt];
2817     s5  = x[4+idt]; s6 = x[5+idt];
2818     s7  = x[6+idt];
2819     while (nz--) {
2820       idx = 7*(*vi++);
2821       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2822       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2823       x7  = x[6+idx];
2824       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2825       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2826       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2827       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2828       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2829       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2830       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2831       v  += 49;
2832     }
2833     v      = aa + 49*diag[i];
2834     x[idt] = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2835              + v[28]*s5 + v[35]*s6 + v[42]*s7;
2836     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2837                + v[29]*s5 + v[36]*s6 + v[43]*s7;
2838     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2839                + v[30]*s5 + v[37]*s6 + v[44]*s7;
2840     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2841                + v[31]*s5 + v[38]*s6 + v[45]*s7;
2842     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2843                + v[32]*s5 + v[39]*s6 + v[46]*s7;
2844     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2845                + v[33]*s5 + v[40]*s6 + v[47]*s7;
2846     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2847                + v[34]*s5 + v[41]*s6 + v[48]*s7;
2848   }
2849 
2850   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2851   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2852   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2853   PetscFunctionReturn(0);
2854 }
2855 
2856 #undef __FUNCT__
2857 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2858 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2859 {
2860   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2861   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2862   PetscErrorCode    ierr;
2863   PetscInt          i,k,nz,idx,jdx,idt;
2864   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2865   const MatScalar   *aa=a->a,*v;
2866   PetscScalar       *x;
2867   const PetscScalar *b;
2868   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2869 
2870   PetscFunctionBegin;
2871   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2872   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2873   /* forward solve the lower triangular */
2874   idx  = 0;
2875   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2876   x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2877   for (i=1; i<n; i++) {
2878     v   = aa + bs2*ai[i];
2879     vi  = aj + ai[i];
2880     nz  = ai[i+1] - ai[i];
2881     idx = bs*i;
2882     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2883     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2884     for (k=0; k<nz; k++) {
2885       jdx = bs*vi[k];
2886       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2887       x5  = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2888       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2889       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2890       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2891       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2892       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2893       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2894       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2895       v  +=  bs2;
2896     }
2897 
2898     x[idx]   = s1;
2899     x[1+idx] = s2;
2900     x[2+idx] = s3;
2901     x[3+idx] = s4;
2902     x[4+idx] = s5;
2903     x[5+idx] = s6;
2904     x[6+idx] = s7;
2905   }
2906 
2907   /* backward solve the upper triangular */
2908   for (i=n-1; i>=0; i--) {
2909     v   = aa + bs2*(adiag[i+1]+1);
2910     vi  = aj + adiag[i+1]+1;
2911     nz  = adiag[i] - adiag[i+1]-1;
2912     idt = bs*i;
2913     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2914     s5  = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2915     for (k=0; k<nz; k++) {
2916       idx = bs*vi[k];
2917       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2918       x5  = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2919       s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2920       s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2921       s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2922       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2923       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2924       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2925       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2926       v  +=  bs2;
2927     }
2928     /* x = inv_diagonal*x */
2929     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2930     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2931     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2932     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2933     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2934     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2935     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2936   }
2937 
2938   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2939   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2940   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2941   PetscFunctionReturn(0);
2942 }
2943 
2944 #undef __FUNCT__
2945 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2946 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2947 {
2948   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
2949   IS                iscol=a->col,isrow=a->row;
2950   PetscErrorCode    ierr;
2951   const PetscInt    *r,*c,*rout,*cout;
2952   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2953   PetscInt          i,nz,idx,idt,idc;
2954   const MatScalar   *aa=a->a,*v;
2955   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2956   const PetscScalar *b;
2957 
2958   PetscFunctionBegin;
2959   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2960   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2961   t    = a->solve_work;
2962 
2963   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2964   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2965 
2966   /* forward solve the lower triangular */
2967   idx  = 6*(*r++);
2968   t[0] = b[idx];   t[1] = b[1+idx];
2969   t[2] = b[2+idx]; t[3] = b[3+idx];
2970   t[4] = b[4+idx]; t[5] = b[5+idx];
2971   for (i=1; i<n; i++) {
2972     v   = aa + 36*ai[i];
2973     vi  = aj + ai[i];
2974     nz  = diag[i] - ai[i];
2975     idx = 6*(*r++);
2976     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2977     s5  = b[4+idx]; s6 = b[5+idx];
2978     while (nz--) {
2979       idx = 6*(*vi++);
2980       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2981       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2982       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2983       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2984       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2985       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2986       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2987       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2988       v  += 36;
2989     }
2990     idx      = 6*i;
2991     t[idx]   = s1;t[1+idx] = s2;
2992     t[2+idx] = s3;t[3+idx] = s4;
2993     t[4+idx] = s5;t[5+idx] = s6;
2994   }
2995   /* backward solve the upper triangular */
2996   for (i=n-1; i>=0; i--) {
2997     v   = aa + 36*diag[i] + 36;
2998     vi  = aj + diag[i] + 1;
2999     nz  = ai[i+1] - diag[i] - 1;
3000     idt = 6*i;
3001     s1  = t[idt];  s2 = t[1+idt];
3002     s3  = t[2+idt];s4 = t[3+idt];
3003     s5  = t[4+idt];s6 = t[5+idt];
3004     while (nz--) {
3005       idx = 6*(*vi++);
3006       x1  = t[idx];   x2 = t[1+idx];
3007       x3  = t[2+idx]; x4 = t[3+idx];
3008       x5  = t[4+idx]; x6 = t[5+idx];
3009       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3010       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3011       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3012       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3013       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3014       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3015       v  += 36;
3016     }
3017     idc    = 6*(*c--);
3018     v      = aa + 36*diag[i];
3019     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3020                         v[18]*s4+v[24]*s5+v[30]*s6;
3021     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3022                           v[19]*s4+v[25]*s5+v[31]*s6;
3023     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3024                           v[20]*s4+v[26]*s5+v[32]*s6;
3025     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3026                           v[21]*s4+v[27]*s5+v[33]*s6;
3027     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3028                           v[22]*s4+v[28]*s5+v[34]*s6;
3029     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3030                           v[23]*s4+v[29]*s5+v[35]*s6;
3031   }
3032 
3033   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3034   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3035   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3036   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3037   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3038   PetscFunctionReturn(0);
3039 }
3040 
3041 #undef __FUNCT__
3042 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
3043 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
3044 {
3045   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
3046   IS                iscol=a->col,isrow=a->row;
3047   PetscErrorCode    ierr;
3048   const PetscInt    *r,*c,*rout,*cout;
3049   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3050   PetscInt          i,nz,idx,idt,idc,m;
3051   const MatScalar   *aa=a->a,*v;
3052   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3053   const PetscScalar *b;
3054 
3055   PetscFunctionBegin;
3056   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3057   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3058   t    = a->solve_work;
3059 
3060   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3061   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3062 
3063   /* forward solve the lower triangular */
3064   idx  = 6*r[0];
3065   t[0] = b[idx];   t[1] = b[1+idx];
3066   t[2] = b[2+idx]; t[3] = b[3+idx];
3067   t[4] = b[4+idx]; t[5] = b[5+idx];
3068   for (i=1; i<n; i++) {
3069     v   = aa + 36*ai[i];
3070     vi  = aj + ai[i];
3071     nz  = ai[i+1] - ai[i];
3072     idx = 6*r[i];
3073     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3074     s5  = b[4+idx]; s6 = b[5+idx];
3075     for (m=0; m<nz; m++) {
3076       idx = 6*vi[m];
3077       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3078       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3079       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3080       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3081       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3082       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3083       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3084       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3085       v  += 36;
3086     }
3087     idx      = 6*i;
3088     t[idx]   = s1;t[1+idx] = s2;
3089     t[2+idx] = s3;t[3+idx] = s4;
3090     t[4+idx] = s5;t[5+idx] = s6;
3091   }
3092   /* backward solve the upper triangular */
3093   for (i=n-1; i>=0; i--) {
3094     v   = aa + 36*(adiag[i+1]+1);
3095     vi  = aj + adiag[i+1]+1;
3096     nz  = adiag[i] - adiag[i+1] - 1;
3097     idt = 6*i;
3098     s1  = t[idt];  s2 = t[1+idt];
3099     s3  = t[2+idt];s4 = t[3+idt];
3100     s5  = t[4+idt];s6 = t[5+idt];
3101     for (m=0; m<nz; m++) {
3102       idx = 6*vi[m];
3103       x1  = t[idx];   x2 = t[1+idx];
3104       x3  = t[2+idx]; x4 = t[3+idx];
3105       x5  = t[4+idx]; x6 = t[5+idx];
3106       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3107       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3108       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3109       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3110       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3111       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3112       v  += 36;
3113     }
3114     idc    = 6*c[i];
3115     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3116                         v[18]*s4+v[24]*s5+v[30]*s6;
3117     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3118                           v[19]*s4+v[25]*s5+v[31]*s6;
3119     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3120                           v[20]*s4+v[26]*s5+v[32]*s6;
3121     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3122                           v[21]*s4+v[27]*s5+v[33]*s6;
3123     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3124                           v[22]*s4+v[28]*s5+v[34]*s6;
3125     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3126                           v[23]*s4+v[29]*s5+v[35]*s6;
3127   }
3128 
3129   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3130   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3131   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3132   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3133   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3134   PetscFunctionReturn(0);
3135 }
3136 
3137 #undef __FUNCT__
3138 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
3139 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3140 {
3141   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3142   PetscInt          i,nz,idx,idt,jdx;
3143   PetscErrorCode    ierr;
3144   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3145   const MatScalar   *aa   =a->a,*v;
3146   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3147   const PetscScalar *b;
3148 
3149   PetscFunctionBegin;
3150   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3151   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3152   /* forward solve the lower triangular */
3153   idx  = 0;
3154   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3155   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3156   for (i=1; i<n; i++) {
3157     v   =  aa + 36*ai[i];
3158     vi  =  aj + ai[i];
3159     nz  =  diag[i] - ai[i];
3160     idx =  6*i;
3161     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3162     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3163     while (nz--) {
3164       jdx = 6*(*vi++);
3165       x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3166       x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3167       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3168       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3169       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3170       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3171       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3172       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3173       v  += 36;
3174     }
3175     x[idx]   = s1;
3176     x[1+idx] = s2;
3177     x[2+idx] = s3;
3178     x[3+idx] = s4;
3179     x[4+idx] = s5;
3180     x[5+idx] = s6;
3181   }
3182   /* backward solve the upper triangular */
3183   for (i=n-1; i>=0; i--) {
3184     v   = aa + 36*diag[i] + 36;
3185     vi  = aj + diag[i] + 1;
3186     nz  = ai[i+1] - diag[i] - 1;
3187     idt = 6*i;
3188     s1  = x[idt];   s2 = x[1+idt];
3189     s3  = x[2+idt]; s4 = x[3+idt];
3190     s5  = x[4+idt]; s6 = x[5+idt];
3191     while (nz--) {
3192       idx = 6*(*vi++);
3193       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3194       x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3195       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3196       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3197       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3198       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3199       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3200       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3201       v  += 36;
3202     }
3203     v        = aa + 36*diag[i];
3204     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3205     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3206     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3207     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3208     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3209     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3210   }
3211 
3212   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3213   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3214   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3215   PetscFunctionReturn(0);
3216 }
3217 
3218 #undef __FUNCT__
3219 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3220 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3221 {
3222   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3223   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3224   PetscErrorCode    ierr;
3225   PetscInt          i,k,nz,idx,jdx,idt;
3226   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3227   const MatScalar   *aa=a->a,*v;
3228   PetscScalar       *x;
3229   const PetscScalar *b;
3230   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3231 
3232   PetscFunctionBegin;
3233   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3234   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3235   /* forward solve the lower triangular */
3236   idx  = 0;
3237   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3238   x[4] = b[4+idx];x[5] = b[5+idx];
3239   for (i=1; i<n; i++) {
3240     v   = aa + bs2*ai[i];
3241     vi  = aj + ai[i];
3242     nz  = ai[i+1] - ai[i];
3243     idx = bs*i;
3244     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3245     s5  = b[4+idx];s6 = b[5+idx];
3246     for (k=0; k<nz; k++) {
3247       jdx = bs*vi[k];
3248       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3249       x5  = x[4+jdx]; x6 = x[5+jdx];
3250       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3251       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3252       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3253       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3254       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3255       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3256       v  +=  bs2;
3257     }
3258 
3259     x[idx]   = s1;
3260     x[1+idx] = s2;
3261     x[2+idx] = s3;
3262     x[3+idx] = s4;
3263     x[4+idx] = s5;
3264     x[5+idx] = s6;
3265   }
3266 
3267   /* backward solve the upper triangular */
3268   for (i=n-1; i>=0; i--) {
3269     v   = aa + bs2*(adiag[i+1]+1);
3270     vi  = aj + adiag[i+1]+1;
3271     nz  = adiag[i] - adiag[i+1]-1;
3272     idt = bs*i;
3273     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3274     s5  = x[4+idt];s6 = x[5+idt];
3275     for (k=0; k<nz; k++) {
3276       idx = bs*vi[k];
3277       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3278       x5  = x[4+idx];x6 = x[5+idx];
3279       s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3280       s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3281       s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3282       s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3283       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3284       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3285       v  +=  bs2;
3286     }
3287     /* x = inv_diagonal*x */
3288     x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3289     x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3290     x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3291     x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3292     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3293     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3294   }
3295 
3296   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3297   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3298   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3299   PetscFunctionReturn(0);
3300 }
3301 
3302 #undef __FUNCT__
3303 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3304 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3305 {
3306   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
3307   IS                iscol=a->col,isrow=a->row;
3308   PetscErrorCode    ierr;
3309   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3310   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3311   PetscInt          i,nz,idx,idt,idc;
3312   const MatScalar   *aa=a->a,*v;
3313   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3314   const PetscScalar *b;
3315 
3316   PetscFunctionBegin;
3317   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3318   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3319   t    = a->solve_work;
3320 
3321   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3322   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3323 
3324   /* forward solve the lower triangular */
3325   idx  = 5*(*r++);
3326   t[0] = b[idx];   t[1] = b[1+idx];
3327   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3328   for (i=1; i<n; i++) {
3329     v   = aa + 25*ai[i];
3330     vi  = aj + ai[i];
3331     nz  = diag[i] - ai[i];
3332     idx = 5*(*r++);
3333     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3334     s5  = b[4+idx];
3335     while (nz--) {
3336       idx = 5*(*vi++);
3337       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3338       x4  = t[3+idx];x5 = t[4+idx];
3339       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3340       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3341       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3342       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3343       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3344       v  += 25;
3345     }
3346     idx      = 5*i;
3347     t[idx]   = s1;t[1+idx] = s2;
3348     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3349   }
3350   /* backward solve the upper triangular */
3351   for (i=n-1; i>=0; i--) {
3352     v   = aa + 25*diag[i] + 25;
3353     vi  = aj + diag[i] + 1;
3354     nz  = ai[i+1] - diag[i] - 1;
3355     idt = 5*i;
3356     s1  = t[idt];  s2 = t[1+idt];
3357     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3358     while (nz--) {
3359       idx = 5*(*vi++);
3360       x1  = t[idx];   x2 = t[1+idx];
3361       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3362       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3363       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3364       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3365       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3366       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3367       v  += 25;
3368     }
3369     idc    = 5*(*c--);
3370     v      = aa + 25*diag[i];
3371     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3372                         v[15]*s4+v[20]*s5;
3373     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3374                           v[16]*s4+v[21]*s5;
3375     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3376                           v[17]*s4+v[22]*s5;
3377     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3378                           v[18]*s4+v[23]*s5;
3379     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3380                           v[19]*s4+v[24]*s5;
3381   }
3382 
3383   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3384   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3385   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3386   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3387   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3388   PetscFunctionReturn(0);
3389 }
3390 
3391 #undef __FUNCT__
3392 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3393 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3394 {
3395   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
3396   IS                iscol=a->col,isrow=a->row;
3397   PetscErrorCode    ierr;
3398   const PetscInt    *r,*c,*rout,*cout;
3399   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3400   PetscInt          i,nz,idx,idt,idc,m;
3401   const MatScalar   *aa=a->a,*v;
3402   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3403   const PetscScalar *b;
3404 
3405   PetscFunctionBegin;
3406   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3407   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3408   t    = a->solve_work;
3409 
3410   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3411   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3412 
3413   /* forward solve the lower triangular */
3414   idx  = 5*r[0];
3415   t[0] = b[idx];   t[1] = b[1+idx];
3416   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3417   for (i=1; i<n; i++) {
3418     v   = aa + 25*ai[i];
3419     vi  = aj + ai[i];
3420     nz  = ai[i+1] - ai[i];
3421     idx = 5*r[i];
3422     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3423     s5  = b[4+idx];
3424     for (m=0; m<nz; m++) {
3425       idx = 5*vi[m];
3426       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3427       x4  = t[3+idx];x5 = t[4+idx];
3428       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3429       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3430       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3431       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3432       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3433       v  += 25;
3434     }
3435     idx      = 5*i;
3436     t[idx]   = s1;t[1+idx] = s2;
3437     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3438   }
3439   /* backward solve the upper triangular */
3440   for (i=n-1; i>=0; i--) {
3441     v   = aa + 25*(adiag[i+1]+1);
3442     vi  = aj + adiag[i+1]+1;
3443     nz  = adiag[i] - adiag[i+1] - 1;
3444     idt = 5*i;
3445     s1  = t[idt];  s2 = t[1+idt];
3446     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3447     for (m=0; m<nz; m++) {
3448       idx = 5*vi[m];
3449       x1  = t[idx];   x2 = t[1+idx];
3450       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3451       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3452       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3453       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3454       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3455       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3456       v  += 25;
3457     }
3458     idc    = 5*c[i];
3459     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3460                         v[15]*s4+v[20]*s5;
3461     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3462                           v[16]*s4+v[21]*s5;
3463     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3464                           v[17]*s4+v[22]*s5;
3465     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3466                           v[18]*s4+v[23]*s5;
3467     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3468                           v[19]*s4+v[24]*s5;
3469   }
3470 
3471   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3472   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3473   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3474   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3475   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3476   PetscFunctionReturn(0);
3477 }
3478 
3479 #undef __FUNCT__
3480 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3481 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3482 {
3483   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3484   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3485   PetscInt          i,nz,idx,idt,jdx;
3486   PetscErrorCode    ierr;
3487   const MatScalar   *aa=a->a,*v;
3488   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3489   const PetscScalar *b;
3490 
3491   PetscFunctionBegin;
3492   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3493   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3494   /* forward solve the lower triangular */
3495   idx  = 0;
3496   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3497   for (i=1; i<n; i++) {
3498     v   =  aa + 25*ai[i];
3499     vi  =  aj + ai[i];
3500     nz  =  diag[i] - ai[i];
3501     idx =  5*i;
3502     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3503     while (nz--) {
3504       jdx = 5*(*vi++);
3505       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3506       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3507       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3508       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3509       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3510       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3511       v  += 25;
3512     }
3513     x[idx]   = s1;
3514     x[1+idx] = s2;
3515     x[2+idx] = s3;
3516     x[3+idx] = s4;
3517     x[4+idx] = s5;
3518   }
3519   /* backward solve the upper triangular */
3520   for (i=n-1; i>=0; i--) {
3521     v   = aa + 25*diag[i] + 25;
3522     vi  = aj + diag[i] + 1;
3523     nz  = ai[i+1] - diag[i] - 1;
3524     idt = 5*i;
3525     s1  = x[idt];  s2 = x[1+idt];
3526     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3527     while (nz--) {
3528       idx = 5*(*vi++);
3529       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3530       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3531       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3532       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3533       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3534       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3535       v  += 25;
3536     }
3537     v        = aa + 25*diag[i];
3538     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3539     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3540     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3541     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3542     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3543   }
3544 
3545   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3546   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3547   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3548   PetscFunctionReturn(0);
3549 }
3550 
3551 #undef __FUNCT__
3552 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3553 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3554 {
3555   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3556   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3557   PetscInt          i,k,nz,idx,idt,jdx;
3558   PetscErrorCode    ierr;
3559   const MatScalar   *aa=a->a,*v;
3560   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3561   const PetscScalar *b;
3562 
3563   PetscFunctionBegin;
3564   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3565   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3566   /* forward solve the lower triangular */
3567   idx  = 0;
3568   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3569   for (i=1; i<n; i++) {
3570     v   = aa + 25*ai[i];
3571     vi  = aj + ai[i];
3572     nz  = ai[i+1] - ai[i];
3573     idx = 5*i;
3574     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3575     for (k=0; k<nz; k++) {
3576       jdx = 5*vi[k];
3577       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3578       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3579       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3580       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3581       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3582       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3583       v  += 25;
3584     }
3585     x[idx]   = s1;
3586     x[1+idx] = s2;
3587     x[2+idx] = s3;
3588     x[3+idx] = s4;
3589     x[4+idx] = s5;
3590   }
3591 
3592   /* backward solve the upper triangular */
3593   for (i=n-1; i>=0; i--) {
3594     v   = aa + 25*(adiag[i+1]+1);
3595     vi  = aj + adiag[i+1]+1;
3596     nz  = adiag[i] - adiag[i+1]-1;
3597     idt = 5*i;
3598     s1  = x[idt];  s2 = x[1+idt];
3599     s3  = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3600     for (k=0; k<nz; k++) {
3601       idx = 5*vi[k];
3602       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3603       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3604       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3605       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3606       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3607       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3608       v  += 25;
3609     }
3610     /* x = inv_diagonal*x */
3611     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3612     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3613     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3614     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3615     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3616   }
3617 
3618   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3619   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3620   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3621   PetscFunctionReturn(0);
3622 }
3623 
3624 #undef __FUNCT__
3625 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3626 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3627 {
3628   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3629   IS                iscol=a->col,isrow=a->row;
3630   PetscErrorCode    ierr;
3631   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3632   PetscInt          i,nz,idx,idt,idc;
3633   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3634   const MatScalar   *aa=a->a,*v;
3635   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3636   const PetscScalar *b;
3637 
3638   PetscFunctionBegin;
3639   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3640   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3641   t    = a->solve_work;
3642 
3643   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3644   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3645 
3646   /* forward solve the lower triangular */
3647   idx  = 4*(*r++);
3648   t[0] = b[idx];   t[1] = b[1+idx];
3649   t[2] = b[2+idx]; t[3] = b[3+idx];
3650   for (i=1; i<n; i++) {
3651     v   = aa + 16*ai[i];
3652     vi  = aj + ai[i];
3653     nz  = diag[i] - ai[i];
3654     idx = 4*(*r++);
3655     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3656     while (nz--) {
3657       idx = 4*(*vi++);
3658       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3659       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3660       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3661       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3662       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3663       v  += 16;
3664     }
3665     idx      = 4*i;
3666     t[idx]   = s1;t[1+idx] = s2;
3667     t[2+idx] = s3;t[3+idx] = s4;
3668   }
3669   /* backward solve the upper triangular */
3670   for (i=n-1; i>=0; i--) {
3671     v   = aa + 16*diag[i] + 16;
3672     vi  = aj + diag[i] + 1;
3673     nz  = ai[i+1] - diag[i] - 1;
3674     idt = 4*i;
3675     s1  = t[idt];  s2 = t[1+idt];
3676     s3  = t[2+idt];s4 = t[3+idt];
3677     while (nz--) {
3678       idx = 4*(*vi++);
3679       x1  = t[idx];   x2 = t[1+idx];
3680       x3  = t[2+idx]; x4 = t[3+idx];
3681       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3682       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3683       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3684       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3685       v  += 16;
3686     }
3687     idc      = 4*(*c--);
3688     v        = aa + 16*diag[i];
3689     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3690     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3691     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3692     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3693   }
3694 
3695   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3696   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3697   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3699   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3700   PetscFunctionReturn(0);
3701 }
3702 
3703 #undef __FUNCT__
3704 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3705 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3706 {
3707   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3708   IS                iscol=a->col,isrow=a->row;
3709   PetscErrorCode    ierr;
3710   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3711   PetscInt          i,nz,idx,idt,idc,m;
3712   const PetscInt    *r,*c,*rout,*cout;
3713   const MatScalar   *aa=a->a,*v;
3714   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3715   const PetscScalar *b;
3716 
3717   PetscFunctionBegin;
3718   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3719   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3720   t    = a->solve_work;
3721 
3722   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3723   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3724 
3725   /* forward solve the lower triangular */
3726   idx  = 4*r[0];
3727   t[0] = b[idx];   t[1] = b[1+idx];
3728   t[2] = b[2+idx]; t[3] = b[3+idx];
3729   for (i=1; i<n; i++) {
3730     v   = aa + 16*ai[i];
3731     vi  = aj + ai[i];
3732     nz  = ai[i+1] - ai[i];
3733     idx = 4*r[i];
3734     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3735     for (m=0; m<nz; m++) {
3736       idx = 4*vi[m];
3737       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3738       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3739       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3740       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3741       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3742       v  += 16;
3743     }
3744     idx      = 4*i;
3745     t[idx]   = s1;t[1+idx] = s2;
3746     t[2+idx] = s3;t[3+idx] = s4;
3747   }
3748   /* backward solve the upper triangular */
3749   for (i=n-1; i>=0; i--) {
3750     v   = aa + 16*(adiag[i+1]+1);
3751     vi  = aj + adiag[i+1]+1;
3752     nz  = adiag[i] - adiag[i+1] - 1;
3753     idt = 4*i;
3754     s1  = t[idt];  s2 = t[1+idt];
3755     s3  = t[2+idt];s4 = t[3+idt];
3756     for (m=0; m<nz; m++) {
3757       idx = 4*vi[m];
3758       x1  = t[idx];   x2 = t[1+idx];
3759       x3  = t[2+idx]; x4 = t[3+idx];
3760       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3761       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3762       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3763       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3764       v  += 16;
3765     }
3766     idc      = 4*c[i];
3767     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3768     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3769     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3770     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3771   }
3772 
3773   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3774   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3775   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3776   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3777   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3778   PetscFunctionReturn(0);
3779 }
3780 
3781 #undef __FUNCT__
3782 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3783 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3784 {
3785   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
3786   IS                iscol=a->col,isrow=a->row;
3787   PetscErrorCode    ierr;
3788   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3789   PetscInt          i,nz,idx,idt,idc;
3790   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3791   const MatScalar   *aa=a->a,*v;
3792   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3793   PetscScalar       *x;
3794   const PetscScalar *b;
3795 
3796   PetscFunctionBegin;
3797   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3799   t    = (MatScalar*)a->solve_work;
3800 
3801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3803 
3804   /* forward solve the lower triangular */
3805   idx  = 4*(*r++);
3806   t[0] = (MatScalar)b[idx];
3807   t[1] = (MatScalar)b[1+idx];
3808   t[2] = (MatScalar)b[2+idx];
3809   t[3] = (MatScalar)b[3+idx];
3810   for (i=1; i<n; i++) {
3811     v   = aa + 16*ai[i];
3812     vi  = aj + ai[i];
3813     nz  = diag[i] - ai[i];
3814     idx = 4*(*r++);
3815     s1  = (MatScalar)b[idx];
3816     s2  = (MatScalar)b[1+idx];
3817     s3  = (MatScalar)b[2+idx];
3818     s4  = (MatScalar)b[3+idx];
3819     while (nz--) {
3820       idx = 4*(*vi++);
3821       x1  = t[idx];
3822       x2  = t[1+idx];
3823       x3  = t[2+idx];
3824       x4  = t[3+idx];
3825       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3826       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3827       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3828       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3829       v  += 16;
3830     }
3831     idx      = 4*i;
3832     t[idx]   = s1;
3833     t[1+idx] = s2;
3834     t[2+idx] = s3;
3835     t[3+idx] = s4;
3836   }
3837   /* backward solve the upper triangular */
3838   for (i=n-1; i>=0; i--) {
3839     v   = aa + 16*diag[i] + 16;
3840     vi  = aj + diag[i] + 1;
3841     nz  = ai[i+1] - diag[i] - 1;
3842     idt = 4*i;
3843     s1  = t[idt];
3844     s2  = t[1+idt];
3845     s3  = t[2+idt];
3846     s4  = t[3+idt];
3847     while (nz--) {
3848       idx = 4*(*vi++);
3849       x1  = t[idx];
3850       x2  = t[1+idx];
3851       x3  = t[2+idx];
3852       x4  = t[3+idx];
3853       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3854       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3855       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3856       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3857       v  += 16;
3858     }
3859     idc      = 4*(*c--);
3860     v        = aa + 16*diag[i];
3861     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3862     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3863     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3864     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3865     x[idc]   = (PetscScalar)t[idt];
3866     x[1+idc] = (PetscScalar)t[1+idt];
3867     x[2+idc] = (PetscScalar)t[2+idt];
3868     x[3+idc] = (PetscScalar)t[3+idt];
3869   }
3870 
3871   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3872   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3873   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3874   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3875   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3876   PetscFunctionReturn(0);
3877 }
3878 
3879 #if defined(PETSC_HAVE_SSE)
3880 
3881 #include PETSC_HAVE_SSE
3882 
3883 #undef __FUNCT__
3884 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3885 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3886 {
3887   /*
3888      Note: This code uses demotion of double
3889      to float when performing the mixed-mode computation.
3890      This may not be numerically reasonable for all applications.
3891   */
3892   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
3893   IS             iscol=a->col,isrow=a->row;
3894   PetscErrorCode ierr;
3895   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3896   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3897   MatScalar      *aa=a->a,*v;
3898   PetscScalar    *x,*b,*t;
3899 
3900   /* Make space in temp stack for 16 Byte Aligned arrays */
3901   float         ssealignedspace[11],*tmps,*tmpx;
3902   unsigned long offset;
3903 
3904   PetscFunctionBegin;
3905   SSE_SCOPE_BEGIN;
3906 
3907   offset = (unsigned long)ssealignedspace % 16;
3908   if (offset) offset = (16 - offset)/4;
3909   tmps = &ssealignedspace[offset];
3910   tmpx = &ssealignedspace[offset+4];
3911   PREFETCH_NTA(aa+16*ai[1]);
3912 
3913   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3914   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3915   t    = a->solve_work;
3916 
3917   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3918   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3919 
3920   /* forward solve the lower triangular */
3921   idx  = 4*(*r++);
3922   t[0] = b[idx];   t[1] = b[1+idx];
3923   t[2] = b[2+idx]; t[3] = b[3+idx];
3924   v    =  aa + 16*ai[1];
3925 
3926   for (i=1; i<n; ) {
3927     PREFETCH_NTA(&v[8]);
3928     vi  =  aj      + ai[i];
3929     nz  =  diag[i] - ai[i];
3930     idx =  4*(*r++);
3931 
3932     /* Demote sum from double to float */
3933     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3934     LOAD_PS(tmps,XMM7);
3935 
3936     while (nz--) {
3937       PREFETCH_NTA(&v[16]);
3938       idx = 4*(*vi++);
3939 
3940       /* Demote solution (so far) from double to float */
3941       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3942 
3943       /* 4x4 Matrix-Vector product with negative accumulation: */
3944       SSE_INLINE_BEGIN_2(tmpx,v)
3945       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3946 
3947       /* First Column */
3948       SSE_COPY_PS(XMM0,XMM6)
3949       SSE_SHUFFLE(XMM0,XMM0,0x00)
3950       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3951       SSE_SUB_PS(XMM7,XMM0)
3952 
3953       /* Second Column */
3954       SSE_COPY_PS(XMM1,XMM6)
3955       SSE_SHUFFLE(XMM1,XMM1,0x55)
3956       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3957       SSE_SUB_PS(XMM7,XMM1)
3958 
3959       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3960 
3961       /* Third Column */
3962       SSE_COPY_PS(XMM2,XMM6)
3963       SSE_SHUFFLE(XMM2,XMM2,0xAA)
3964       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3965       SSE_SUB_PS(XMM7,XMM2)
3966 
3967       /* Fourth Column */
3968       SSE_COPY_PS(XMM3,XMM6)
3969       SSE_SHUFFLE(XMM3,XMM3,0xFF)
3970       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3971       SSE_SUB_PS(XMM7,XMM3)
3972       SSE_INLINE_END_2
3973 
3974       v += 16;
3975     }
3976     idx = 4*i;
3977     v   = aa + 16*ai[++i];
3978     PREFETCH_NTA(v);
3979     STORE_PS(tmps,XMM7);
3980 
3981     /* Promote result from float to double */
3982     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3983   }
3984   /* backward solve the upper triangular */
3985   idt  = 4*(n-1);
3986   ai16 = 16*diag[n-1];
3987   v    = aa + ai16 + 16;
3988   for (i=n-1; i>=0; ) {
3989     PREFETCH_NTA(&v[8]);
3990     vi = aj + diag[i] + 1;
3991     nz = ai[i+1] - diag[i] - 1;
3992 
3993     /* Demote accumulator from double to float */
3994     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3995     LOAD_PS(tmps,XMM7);
3996 
3997     while (nz--) {
3998       PREFETCH_NTA(&v[16]);
3999       idx = 4*(*vi++);
4000 
4001       /* Demote solution (so far) from double to float */
4002       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
4003 
4004       /* 4x4 Matrix-Vector Product with negative accumulation: */
4005       SSE_INLINE_BEGIN_2(tmpx,v)
4006       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4007 
4008       /* First Column */
4009       SSE_COPY_PS(XMM0,XMM6)
4010       SSE_SHUFFLE(XMM0,XMM0,0x00)
4011       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4012       SSE_SUB_PS(XMM7,XMM0)
4013 
4014       /* Second Column */
4015       SSE_COPY_PS(XMM1,XMM6)
4016       SSE_SHUFFLE(XMM1,XMM1,0x55)
4017       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4018       SSE_SUB_PS(XMM7,XMM1)
4019 
4020       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4021 
4022       /* Third Column */
4023       SSE_COPY_PS(XMM2,XMM6)
4024       SSE_SHUFFLE(XMM2,XMM2,0xAA)
4025       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4026       SSE_SUB_PS(XMM7,XMM2)
4027 
4028       /* Fourth Column */
4029       SSE_COPY_PS(XMM3,XMM6)
4030       SSE_SHUFFLE(XMM3,XMM3,0xFF)
4031       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4032       SSE_SUB_PS(XMM7,XMM3)
4033       SSE_INLINE_END_2
4034       v += 16;
4035     }
4036     v    = aa + ai16;
4037     ai16 = 16*diag[--i];
4038     PREFETCH_NTA(aa+ai16+16);
4039     /*
4040        Scale the result by the diagonal 4x4 block,
4041        which was inverted as part of the factorization
4042     */
4043     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
4044     /* First Column */
4045     SSE_COPY_PS(XMM0,XMM7)
4046     SSE_SHUFFLE(XMM0,XMM0,0x00)
4047     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4048 
4049     /* Second Column */
4050     SSE_COPY_PS(XMM1,XMM7)
4051     SSE_SHUFFLE(XMM1,XMM1,0x55)
4052     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4053     SSE_ADD_PS(XMM0,XMM1)
4054 
4055     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4056 
4057     /* Third Column */
4058     SSE_COPY_PS(XMM2,XMM7)
4059     SSE_SHUFFLE(XMM2,XMM2,0xAA)
4060     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4061     SSE_ADD_PS(XMM0,XMM2)
4062 
4063     /* Fourth Column */
4064     SSE_COPY_PS(XMM3,XMM7)
4065     SSE_SHUFFLE(XMM3,XMM3,0xFF)
4066     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4067     SSE_ADD_PS(XMM0,XMM3)
4068 
4069     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4070     SSE_INLINE_END_3
4071 
4072     /* Promote solution from float to double */
4073     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4074 
4075     /* Apply reordering to t and stream into x.    */
4076     /* This way, x doesn't pollute the cache.      */
4077     /* Be careful with size: 2 doubles = 4 floats! */
4078     idc = 4*(*c--);
4079     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
4080     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4081     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4082     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4083     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4084     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4085     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4086     SSE_INLINE_END_2
4087     v    = aa + ai16 + 16;
4088     idt -= 4;
4089   }
4090 
4091   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4092   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4093   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4094   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4095   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4096   SSE_SCOPE_END;
4097   PetscFunctionReturn(0);
4098 }
4099 
4100 #endif
4101 
4102 
4103 /*
4104       Special case where the matrix was ILU(0) factored in the natural
4105    ordering. This eliminates the need for the column and row permutation.
4106 */
4107 #undef __FUNCT__
4108 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
4109 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4110 {
4111   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4112   PetscInt          n  =a->mbs;
4113   const PetscInt    *ai=a->i,*aj=a->j;
4114   PetscErrorCode    ierr;
4115   const PetscInt    *diag = a->diag;
4116   const MatScalar   *aa   =a->a;
4117   PetscScalar       *x;
4118   const PetscScalar *b;
4119 
4120   PetscFunctionBegin;
4121   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4122   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4123 
4124 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4125   {
4126     static PetscScalar w[2000]; /* very BAD need to fix */
4127     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4128   }
4129 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4130   {
4131     static PetscScalar w[2000]; /* very BAD need to fix */
4132     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4133   }
4134 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4135   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4136 #else
4137   {
4138     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4139     const MatScalar *v;
4140     PetscInt        jdx,idt,idx,nz,i,ai16;
4141     const PetscInt  *vi;
4142 
4143     /* forward solve the lower triangular */
4144     idx  = 0;
4145     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4146     for (i=1; i<n; i++) {
4147       v    =  aa      + 16*ai[i];
4148       vi   =  aj      + ai[i];
4149       nz   =  diag[i] - ai[i];
4150       idx +=  4;
4151       s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4152       while (nz--) {
4153         jdx = 4*(*vi++);
4154         x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4155         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4156         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4157         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4158         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4159         v  += 16;
4160       }
4161       x[idx]   = s1;
4162       x[1+idx] = s2;
4163       x[2+idx] = s3;
4164       x[3+idx] = s4;
4165     }
4166     /* backward solve the upper triangular */
4167     idt = 4*(n-1);
4168     for (i=n-1; i>=0; i--) {
4169       ai16 = 16*diag[i];
4170       v    = aa + ai16 + 16;
4171       vi   = aj + diag[i] + 1;
4172       nz   = ai[i+1] - diag[i] - 1;
4173       s1   = x[idt];  s2 = x[1+idt];
4174       s3   = x[2+idt];s4 = x[3+idt];
4175       while (nz--) {
4176         idx = 4*(*vi++);
4177         x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4178         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4179         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4180         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4181         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4182         v  += 16;
4183       }
4184       v        = aa + ai16;
4185       x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4186       x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4187       x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4188       x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4189       idt     -= 4;
4190     }
4191   }
4192 #endif
4193 
4194   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4195   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4196   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4197   PetscFunctionReturn(0);
4198 }
4199 
4200 #undef __FUNCT__
4201 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4202 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4203 {
4204   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4205   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4206   PetscInt          i,k,nz,idx,jdx,idt;
4207   PetscErrorCode    ierr;
4208   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4209   const MatScalar   *aa=a->a,*v;
4210   PetscScalar       *x;
4211   const PetscScalar *b;
4212   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4213 
4214   PetscFunctionBegin;
4215   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4216   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4217   /* forward solve the lower triangular */
4218   idx  = 0;
4219   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4220   for (i=1; i<n; i++) {
4221     v   = aa + bs2*ai[i];
4222     vi  = aj + ai[i];
4223     nz  = ai[i+1] - ai[i];
4224     idx = bs*i;
4225     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4226     for (k=0; k<nz; k++) {
4227       jdx = bs*vi[k];
4228       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4229       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4230       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4231       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4232       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4233 
4234       v +=  bs2;
4235     }
4236 
4237     x[idx]   = s1;
4238     x[1+idx] = s2;
4239     x[2+idx] = s3;
4240     x[3+idx] = s4;
4241   }
4242 
4243   /* backward solve the upper triangular */
4244   for (i=n-1; i>=0; i--) {
4245     v   = aa + bs2*(adiag[i+1]+1);
4246     vi  = aj + adiag[i+1]+1;
4247     nz  = adiag[i] - adiag[i+1]-1;
4248     idt = bs*i;
4249     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4250 
4251     for (k=0; k<nz; k++) {
4252       idx = bs*vi[k];
4253       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4254       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4255       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4256       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4257       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4258 
4259       v +=  bs2;
4260     }
4261     /* x = inv_diagonal*x */
4262     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4263     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4264     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4265     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4266 
4267   }
4268 
4269   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4270   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4271   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4272   PetscFunctionReturn(0);
4273 }
4274 
4275 #undef __FUNCT__
4276 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4277 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4278 {
4279   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4280   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4281   PetscErrorCode    ierr;
4282   const MatScalar   *aa=a->a;
4283   const PetscScalar *b;
4284   PetscScalar       *x;
4285 
4286   PetscFunctionBegin;
4287   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4288   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4289 
4290   {
4291     MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4292     const MatScalar *v;
4293     MatScalar       *t=(MatScalar*)x;
4294     PetscInt        jdx,idt,idx,nz,i,ai16;
4295     const PetscInt  *vi;
4296 
4297     /* forward solve the lower triangular */
4298     idx  = 0;
4299     t[0] = (MatScalar)b[0];
4300     t[1] = (MatScalar)b[1];
4301     t[2] = (MatScalar)b[2];
4302     t[3] = (MatScalar)b[3];
4303     for (i=1; i<n; i++) {
4304       v    =  aa      + 16*ai[i];
4305       vi   =  aj      + ai[i];
4306       nz   =  diag[i] - ai[i];
4307       idx +=  4;
4308       s1   = (MatScalar)b[idx];
4309       s2   = (MatScalar)b[1+idx];
4310       s3   = (MatScalar)b[2+idx];
4311       s4   = (MatScalar)b[3+idx];
4312       while (nz--) {
4313         jdx = 4*(*vi++);
4314         x1  = t[jdx];
4315         x2  = t[1+jdx];
4316         x3  = t[2+jdx];
4317         x4  = t[3+jdx];
4318         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4319         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4320         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4321         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4322         v  += 16;
4323       }
4324       t[idx]   = s1;
4325       t[1+idx] = s2;
4326       t[2+idx] = s3;
4327       t[3+idx] = s4;
4328     }
4329     /* backward solve the upper triangular */
4330     idt = 4*(n-1);
4331     for (i=n-1; i>=0; i--) {
4332       ai16 = 16*diag[i];
4333       v    = aa + ai16 + 16;
4334       vi   = aj + diag[i] + 1;
4335       nz   = ai[i+1] - diag[i] - 1;
4336       s1   = t[idt];
4337       s2   = t[1+idt];
4338       s3   = t[2+idt];
4339       s4   = t[3+idt];
4340       while (nz--) {
4341         idx = 4*(*vi++);
4342         x1  = (MatScalar)x[idx];
4343         x2  = (MatScalar)x[1+idx];
4344         x3  = (MatScalar)x[2+idx];
4345         x4  = (MatScalar)x[3+idx];
4346         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4347         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4348         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4349         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4350         v  += 16;
4351       }
4352       v        = aa + ai16;
4353       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4354       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4355       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4356       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4357       idt     -= 4;
4358     }
4359   }
4360 
4361   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4362   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4363   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4364   PetscFunctionReturn(0);
4365 }
4366 
4367 #if defined(PETSC_HAVE_SSE)
4368 
4369 #include PETSC_HAVE_SSE
4370 #undef __FUNCT__
4371 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4372 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4373 {
4374   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
4375   unsigned short *aj=(unsigned short*)a->j;
4376   PetscErrorCode ierr;
4377   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4378   MatScalar      *aa=a->a;
4379   PetscScalar    *x,*b;
4380 
4381   PetscFunctionBegin;
4382   SSE_SCOPE_BEGIN;
4383   /*
4384      Note: This code currently uses demotion of double
4385      to float when performing the mixed-mode computation.
4386      This may not be numerically reasonable for all applications.
4387   */
4388   PREFETCH_NTA(aa+16*ai[1]);
4389 
4390   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4391   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4392   {
4393     /* x will first be computed in single precision then promoted inplace to double */
4394     MatScalar      *v,*t=(MatScalar*)x;
4395     int            nz,i,idt,ai16;
4396     unsigned int   jdx,idx;
4397     unsigned short *vi;
4398     /* Forward solve the lower triangular factor. */
4399 
4400     /* First block is the identity. */
4401     idx = 0;
4402     CONVERT_DOUBLE4_FLOAT4(t,b);
4403     v =  aa + 16*((unsigned int)ai[1]);
4404 
4405     for (i=1; i<n; ) {
4406       PREFETCH_NTA(&v[8]);
4407       vi   =  aj      + ai[i];
4408       nz   =  diag[i] - ai[i];
4409       idx +=  4;
4410 
4411       /* Demote RHS from double to float. */
4412       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4413       LOAD_PS(&t[idx],XMM7);
4414 
4415       while (nz--) {
4416         PREFETCH_NTA(&v[16]);
4417         jdx = 4*((unsigned int)(*vi++));
4418 
4419         /* 4x4 Matrix-Vector product with negative accumulation: */
4420         SSE_INLINE_BEGIN_2(&t[jdx],v)
4421         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4422 
4423         /* First Column */
4424         SSE_COPY_PS(XMM0,XMM6)
4425         SSE_SHUFFLE(XMM0,XMM0,0x00)
4426         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4427         SSE_SUB_PS(XMM7,XMM0)
4428 
4429         /* Second Column */
4430         SSE_COPY_PS(XMM1,XMM6)
4431         SSE_SHUFFLE(XMM1,XMM1,0x55)
4432         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4433         SSE_SUB_PS(XMM7,XMM1)
4434 
4435         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4436 
4437         /* Third Column */
4438         SSE_COPY_PS(XMM2,XMM6)
4439         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4440         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4441         SSE_SUB_PS(XMM7,XMM2)
4442 
4443         /* Fourth Column */
4444         SSE_COPY_PS(XMM3,XMM6)
4445         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4446         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4447         SSE_SUB_PS(XMM7,XMM3)
4448         SSE_INLINE_END_2
4449 
4450         v += 16;
4451       }
4452       v =  aa + 16*ai[++i];
4453       PREFETCH_NTA(v);
4454       STORE_PS(&t[idx],XMM7);
4455     }
4456 
4457     /* Backward solve the upper triangular factor.*/
4458 
4459     idt  = 4*(n-1);
4460     ai16 = 16*diag[n-1];
4461     v    = aa + ai16 + 16;
4462     for (i=n-1; i>=0; ) {
4463       PREFETCH_NTA(&v[8]);
4464       vi = aj + diag[i] + 1;
4465       nz = ai[i+1] - diag[i] - 1;
4466 
4467       LOAD_PS(&t[idt],XMM7);
4468 
4469       while (nz--) {
4470         PREFETCH_NTA(&v[16]);
4471         idx = 4*((unsigned int)(*vi++));
4472 
4473         /* 4x4 Matrix-Vector Product with negative accumulation: */
4474         SSE_INLINE_BEGIN_2(&t[idx],v)
4475         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4476 
4477         /* First Column */
4478         SSE_COPY_PS(XMM0,XMM6)
4479         SSE_SHUFFLE(XMM0,XMM0,0x00)
4480         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4481         SSE_SUB_PS(XMM7,XMM0)
4482 
4483         /* Second Column */
4484         SSE_COPY_PS(XMM1,XMM6)
4485         SSE_SHUFFLE(XMM1,XMM1,0x55)
4486         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4487         SSE_SUB_PS(XMM7,XMM1)
4488 
4489         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4490 
4491         /* Third Column */
4492         SSE_COPY_PS(XMM2,XMM6)
4493         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4494         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4495         SSE_SUB_PS(XMM7,XMM2)
4496 
4497         /* Fourth Column */
4498         SSE_COPY_PS(XMM3,XMM6)
4499         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4500         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4501         SSE_SUB_PS(XMM7,XMM3)
4502         SSE_INLINE_END_2
4503         v += 16;
4504       }
4505       v    = aa + ai16;
4506       ai16 = 16*diag[--i];
4507       PREFETCH_NTA(aa+ai16+16);
4508       /*
4509          Scale the result by the diagonal 4x4 block,
4510          which was inverted as part of the factorization
4511       */
4512       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4513       /* First Column */
4514       SSE_COPY_PS(XMM0,XMM7)
4515       SSE_SHUFFLE(XMM0,XMM0,0x00)
4516       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4517 
4518       /* Second Column */
4519       SSE_COPY_PS(XMM1,XMM7)
4520       SSE_SHUFFLE(XMM1,XMM1,0x55)
4521       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4522       SSE_ADD_PS(XMM0,XMM1)
4523 
4524       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4525 
4526       /* Third Column */
4527       SSE_COPY_PS(XMM2,XMM7)
4528       SSE_SHUFFLE(XMM2,XMM2,0xAA)
4529       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4530       SSE_ADD_PS(XMM0,XMM2)
4531 
4532       /* Fourth Column */
4533       SSE_COPY_PS(XMM3,XMM7)
4534       SSE_SHUFFLE(XMM3,XMM3,0xFF)
4535       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4536       SSE_ADD_PS(XMM0,XMM3)
4537 
4538       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4539       SSE_INLINE_END_3
4540 
4541       v    = aa + ai16 + 16;
4542       idt -= 4;
4543     }
4544 
4545     /* Convert t from single precision back to double precision (inplace)*/
4546     idt = 4*(n-1);
4547     for (i=n-1; i>=0; i--) {
4548       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4549       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4550       PetscScalar *xtemp=&x[idt];
4551       MatScalar   *ttemp=&t[idt];
4552       xtemp[3] = (PetscScalar)ttemp[3];
4553       xtemp[2] = (PetscScalar)ttemp[2];
4554       xtemp[1] = (PetscScalar)ttemp[1];
4555       xtemp[0] = (PetscScalar)ttemp[0];
4556       idt     -= 4;
4557     }
4558 
4559   } /* End of artificial scope. */
4560   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4561   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4562   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4563   SSE_SCOPE_END;
4564   PetscFunctionReturn(0);
4565 }
4566 
4567 #undef __FUNCT__
4568 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4569 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4570 {
4571   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
4572   int            *aj=a->j;
4573   PetscErrorCode ierr;
4574   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4575   MatScalar      *aa=a->a;
4576   PetscScalar    *x,*b;
4577 
4578   PetscFunctionBegin;
4579   SSE_SCOPE_BEGIN;
4580   /*
4581      Note: This code currently uses demotion of double
4582      to float when performing the mixed-mode computation.
4583      This may not be numerically reasonable for all applications.
4584   */
4585   PREFETCH_NTA(aa+16*ai[1]);
4586 
4587   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4588   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4589   {
4590     /* x will first be computed in single precision then promoted inplace to double */
4591     MatScalar *v,*t=(MatScalar*)x;
4592     int       nz,i,idt,ai16;
4593     int       jdx,idx;
4594     int       *vi;
4595     /* Forward solve the lower triangular factor. */
4596 
4597     /* First block is the identity. */
4598     idx = 0;
4599     CONVERT_DOUBLE4_FLOAT4(t,b);
4600     v =  aa + 16*ai[1];
4601 
4602     for (i=1; i<n; ) {
4603       PREFETCH_NTA(&v[8]);
4604       vi   =  aj      + ai[i];
4605       nz   =  diag[i] - ai[i];
4606       idx +=  4;
4607 
4608       /* Demote RHS from double to float. */
4609       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4610       LOAD_PS(&t[idx],XMM7);
4611 
4612       while (nz--) {
4613         PREFETCH_NTA(&v[16]);
4614         jdx = 4*(*vi++);
4615 /*          jdx = *vi++; */
4616 
4617         /* 4x4 Matrix-Vector product with negative accumulation: */
4618         SSE_INLINE_BEGIN_2(&t[jdx],v)
4619         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4620 
4621         /* First Column */
4622         SSE_COPY_PS(XMM0,XMM6)
4623         SSE_SHUFFLE(XMM0,XMM0,0x00)
4624         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4625         SSE_SUB_PS(XMM7,XMM0)
4626 
4627         /* Second Column */
4628         SSE_COPY_PS(XMM1,XMM6)
4629         SSE_SHUFFLE(XMM1,XMM1,0x55)
4630         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4631         SSE_SUB_PS(XMM7,XMM1)
4632 
4633         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4634 
4635         /* Third Column */
4636         SSE_COPY_PS(XMM2,XMM6)
4637         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4638         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4639         SSE_SUB_PS(XMM7,XMM2)
4640 
4641         /* Fourth Column */
4642         SSE_COPY_PS(XMM3,XMM6)
4643         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4644         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4645         SSE_SUB_PS(XMM7,XMM3)
4646         SSE_INLINE_END_2
4647 
4648         v += 16;
4649       }
4650       v =  aa + 16*ai[++i];
4651       PREFETCH_NTA(v);
4652       STORE_PS(&t[idx],XMM7);
4653     }
4654 
4655     /* Backward solve the upper triangular factor.*/
4656 
4657     idt  = 4*(n-1);
4658     ai16 = 16*diag[n-1];
4659     v    = aa + ai16 + 16;
4660     for (i=n-1; i>=0; ) {
4661       PREFETCH_NTA(&v[8]);
4662       vi = aj + diag[i] + 1;
4663       nz = ai[i+1] - diag[i] - 1;
4664 
4665       LOAD_PS(&t[idt],XMM7);
4666 
4667       while (nz--) {
4668         PREFETCH_NTA(&v[16]);
4669         idx = 4*(*vi++);
4670 /*          idx = *vi++; */
4671 
4672         /* 4x4 Matrix-Vector Product with negative accumulation: */
4673         SSE_INLINE_BEGIN_2(&t[idx],v)
4674         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4675 
4676         /* First Column */
4677         SSE_COPY_PS(XMM0,XMM6)
4678         SSE_SHUFFLE(XMM0,XMM0,0x00)
4679         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4680         SSE_SUB_PS(XMM7,XMM0)
4681 
4682         /* Second Column */
4683         SSE_COPY_PS(XMM1,XMM6)
4684         SSE_SHUFFLE(XMM1,XMM1,0x55)
4685         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4686         SSE_SUB_PS(XMM7,XMM1)
4687 
4688         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4689 
4690         /* Third Column */
4691         SSE_COPY_PS(XMM2,XMM6)
4692         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4693         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4694         SSE_SUB_PS(XMM7,XMM2)
4695 
4696         /* Fourth Column */
4697         SSE_COPY_PS(XMM3,XMM6)
4698         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4699         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4700         SSE_SUB_PS(XMM7,XMM3)
4701         SSE_INLINE_END_2
4702         v += 16;
4703       }
4704       v    = aa + ai16;
4705       ai16 = 16*diag[--i];
4706       PREFETCH_NTA(aa+ai16+16);
4707       /*
4708          Scale the result by the diagonal 4x4 block,
4709          which was inverted as part of the factorization
4710       */
4711       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4712       /* First Column */
4713       SSE_COPY_PS(XMM0,XMM7)
4714       SSE_SHUFFLE(XMM0,XMM0,0x00)
4715       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4716 
4717       /* Second Column */
4718       SSE_COPY_PS(XMM1,XMM7)
4719       SSE_SHUFFLE(XMM1,XMM1,0x55)
4720       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4721       SSE_ADD_PS(XMM0,XMM1)
4722 
4723       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4724 
4725       /* Third Column */
4726       SSE_COPY_PS(XMM2,XMM7)
4727       SSE_SHUFFLE(XMM2,XMM2,0xAA)
4728       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4729       SSE_ADD_PS(XMM0,XMM2)
4730 
4731       /* Fourth Column */
4732       SSE_COPY_PS(XMM3,XMM7)
4733       SSE_SHUFFLE(XMM3,XMM3,0xFF)
4734       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4735       SSE_ADD_PS(XMM0,XMM3)
4736 
4737       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4738       SSE_INLINE_END_3
4739 
4740       v    = aa + ai16 + 16;
4741       idt -= 4;
4742     }
4743 
4744     /* Convert t from single precision back to double precision (inplace)*/
4745     idt = 4*(n-1);
4746     for (i=n-1; i>=0; i--) {
4747       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4748       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4749       PetscScalar *xtemp=&x[idt];
4750       MatScalar   *ttemp=&t[idt];
4751       xtemp[3] = (PetscScalar)ttemp[3];
4752       xtemp[2] = (PetscScalar)ttemp[2];
4753       xtemp[1] = (PetscScalar)ttemp[1];
4754       xtemp[0] = (PetscScalar)ttemp[0];
4755       idt     -= 4;
4756     }
4757 
4758   } /* End of artificial scope. */
4759   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4760   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4761   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4762   SSE_SCOPE_END;
4763   PetscFunctionReturn(0);
4764 }
4765 
4766 #endif
4767 
4768 #undef __FUNCT__
4769 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4770 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4771 {
4772   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
4773   IS                iscol=a->col,isrow=a->row;
4774   PetscErrorCode    ierr;
4775   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4776   PetscInt          i,nz,idx,idt,idc;
4777   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4778   const MatScalar   *aa=a->a,*v;
4779   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4780   const PetscScalar *b;
4781 
4782   PetscFunctionBegin;
4783   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4784   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4785   t    = a->solve_work;
4786 
4787   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4788   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4789 
4790   /* forward solve the lower triangular */
4791   idx  = 3*(*r++);
4792   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4793   for (i=1; i<n; i++) {
4794     v   = aa + 9*ai[i];
4795     vi  = aj + ai[i];
4796     nz  = diag[i] - ai[i];
4797     idx = 3*(*r++);
4798     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4799     while (nz--) {
4800       idx = 3*(*vi++);
4801       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4802       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4803       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4804       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4805       v  += 9;
4806     }
4807     idx    = 3*i;
4808     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4809   }
4810   /* backward solve the upper triangular */
4811   for (i=n-1; i>=0; i--) {
4812     v   = aa + 9*diag[i] + 9;
4813     vi  = aj + diag[i] + 1;
4814     nz  = ai[i+1] - diag[i] - 1;
4815     idt = 3*i;
4816     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4817     while (nz--) {
4818       idx = 3*(*vi++);
4819       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4820       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4821       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4822       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4823       v  += 9;
4824     }
4825     idc      = 3*(*c--);
4826     v        = aa + 9*diag[i];
4827     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4828     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4829     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4830   }
4831   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4832   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4833   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4834   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4835   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4836   PetscFunctionReturn(0);
4837 }
4838 
4839 #undef __FUNCT__
4840 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4841 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4842 {
4843   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
4844   IS                iscol=a->col,isrow=a->row;
4845   PetscErrorCode    ierr;
4846   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4847   PetscInt          i,nz,idx,idt,idc,m;
4848   const PetscInt    *r,*c,*rout,*cout;
4849   const MatScalar   *aa=a->a,*v;
4850   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4851   const PetscScalar *b;
4852 
4853   PetscFunctionBegin;
4854   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4855   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4856   t    = a->solve_work;
4857 
4858   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4859   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4860 
4861   /* forward solve the lower triangular */
4862   idx  = 3*r[0];
4863   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4864   for (i=1; i<n; i++) {
4865     v   = aa + 9*ai[i];
4866     vi  = aj + ai[i];
4867     nz  = ai[i+1] - ai[i];
4868     idx = 3*r[i];
4869     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4870     for (m=0; m<nz; m++) {
4871       idx = 3*vi[m];
4872       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4873       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4874       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4875       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4876       v  += 9;
4877     }
4878     idx    = 3*i;
4879     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4880   }
4881   /* backward solve the upper triangular */
4882   for (i=n-1; i>=0; i--) {
4883     v   = aa + 9*(adiag[i+1]+1);
4884     vi  = aj + adiag[i+1]+1;
4885     nz  = adiag[i] - adiag[i+1] - 1;
4886     idt = 3*i;
4887     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4888     for (m=0; m<nz; m++) {
4889       idx = 3*vi[m];
4890       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4891       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4892       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4893       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4894       v  += 9;
4895     }
4896     idc      = 3*c[i];
4897     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4898     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4899     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4900   }
4901   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4902   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4903   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4904   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4905   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4906   PetscFunctionReturn(0);
4907 }
4908 
4909 /*
4910       Special case where the matrix was ILU(0) factored in the natural
4911    ordering. This eliminates the need for the column and row permutation.
4912 */
4913 #undef __FUNCT__
4914 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4915 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4916 {
4917   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4918   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j;
4919   PetscErrorCode    ierr;
4920   const PetscInt    *diag = a->diag,*vi;
4921   const MatScalar   *aa   =a->a,*v;
4922   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4923   const PetscScalar *b;
4924   PetscInt          jdx,idt,idx,nz,i;
4925 
4926   PetscFunctionBegin;
4927   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4928   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4929 
4930   /* forward solve the lower triangular */
4931   idx  = 0;
4932   x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
4933   for (i=1; i<n; i++) {
4934     v    =  aa      + 9*ai[i];
4935     vi   =  aj      + ai[i];
4936     nz   =  diag[i] - ai[i];
4937     idx +=  3;
4938     s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4939     while (nz--) {
4940       jdx = 3*(*vi++);
4941       x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4942       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4943       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4944       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4945       v  += 9;
4946     }
4947     x[idx]   = s1;
4948     x[1+idx] = s2;
4949     x[2+idx] = s3;
4950   }
4951   /* backward solve the upper triangular */
4952   for (i=n-1; i>=0; i--) {
4953     v   = aa + 9*diag[i] + 9;
4954     vi  = aj + diag[i] + 1;
4955     nz  = ai[i+1] - diag[i] - 1;
4956     idt = 3*i;
4957     s1  = x[idt];  s2 = x[1+idt];
4958     s3  = x[2+idt];
4959     while (nz--) {
4960       idx = 3*(*vi++);
4961       x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4962       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4963       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4964       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4965       v  += 9;
4966     }
4967     v        = aa +  9*diag[i];
4968     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4969     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4970     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4971   }
4972 
4973   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4974   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4975   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4976   PetscFunctionReturn(0);
4977 }
4978 
4979 #undef __FUNCT__
4980 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4981 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4982 {
4983   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
4984   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4985   PetscErrorCode    ierr;
4986   PetscInt          i,k,nz,idx,jdx,idt;
4987   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4988   const MatScalar   *aa=a->a,*v;
4989   PetscScalar       *x;
4990   const PetscScalar *b;
4991   PetscScalar       s1,s2,s3,x1,x2,x3;
4992 
4993   PetscFunctionBegin;
4994   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4995   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4996   /* forward solve the lower triangular */
4997   idx  = 0;
4998   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4999   for (i=1; i<n; i++) {
5000     v   = aa + bs2*ai[i];
5001     vi  = aj + ai[i];
5002     nz  = ai[i+1] - ai[i];
5003     idx = bs*i;
5004     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5005     for (k=0; k<nz; k++) {
5006       jdx = bs*vi[k];
5007       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5008       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5009       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5010       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5011 
5012       v +=  bs2;
5013     }
5014 
5015     x[idx]   = s1;
5016     x[1+idx] = s2;
5017     x[2+idx] = s3;
5018   }
5019 
5020   /* backward solve the upper triangular */
5021   for (i=n-1; i>=0; i--) {
5022     v   = aa + bs2*(adiag[i+1]+1);
5023     vi  = aj + adiag[i+1]+1;
5024     nz  = adiag[i] - adiag[i+1]-1;
5025     idt = bs*i;
5026     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5027 
5028     for (k=0; k<nz; k++) {
5029       idx = bs*vi[k];
5030       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5031       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5032       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5033       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5034 
5035       v +=  bs2;
5036     }
5037     /* x = inv_diagonal*x */
5038     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5039     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5040     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5041 
5042   }
5043 
5044   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5045   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5046   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5047   PetscFunctionReturn(0);
5048 }
5049 
5050 #undef __FUNCT__
5051 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
5052 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5053 {
5054   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
5055   IS                iscol=a->col,isrow=a->row;
5056   PetscErrorCode    ierr;
5057   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5058   PetscInt          i,nz,idx,idt,idc;
5059   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5060   const MatScalar   *aa=a->a,*v;
5061   PetscScalar       *x,s1,s2,x1,x2,*t;
5062   const PetscScalar *b;
5063 
5064   PetscFunctionBegin;
5065   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5066   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5067   t    = a->solve_work;
5068 
5069   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5070   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5071 
5072   /* forward solve the lower triangular */
5073   idx  = 2*(*r++);
5074   t[0] = b[idx]; t[1] = b[1+idx];
5075   for (i=1; i<n; i++) {
5076     v   = aa + 4*ai[i];
5077     vi  = aj + ai[i];
5078     nz  = diag[i] - ai[i];
5079     idx = 2*(*r++);
5080     s1  = b[idx]; s2 = b[1+idx];
5081     while (nz--) {
5082       idx = 2*(*vi++);
5083       x1  = t[idx]; x2 = t[1+idx];
5084       s1 -= v[0]*x1 + v[2]*x2;
5085       s2 -= v[1]*x1 + v[3]*x2;
5086       v  += 4;
5087     }
5088     idx    = 2*i;
5089     t[idx] = s1; t[1+idx] = s2;
5090   }
5091   /* backward solve the upper triangular */
5092   for (i=n-1; i>=0; i--) {
5093     v   = aa + 4*diag[i] + 4;
5094     vi  = aj + diag[i] + 1;
5095     nz  = ai[i+1] - diag[i] - 1;
5096     idt = 2*i;
5097     s1  = t[idt]; s2 = t[1+idt];
5098     while (nz--) {
5099       idx = 2*(*vi++);
5100       x1  = t[idx]; x2 = t[1+idx];
5101       s1 -= v[0]*x1 + v[2]*x2;
5102       s2 -= v[1]*x1 + v[3]*x2;
5103       v  += 4;
5104     }
5105     idc      = 2*(*c--);
5106     v        = aa + 4*diag[i];
5107     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5108     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5109   }
5110   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5111   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5112   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5113   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5114   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5115   PetscFunctionReturn(0);
5116 }
5117 
5118 #undef __FUNCT__
5119 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
5120 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5121 {
5122   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
5123   IS                iscol=a->col,isrow=a->row;
5124   PetscErrorCode    ierr;
5125   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5126   PetscInt          i,nz,idx,jdx,idt,idc,m;
5127   const PetscInt    *r,*c,*rout,*cout;
5128   const MatScalar   *aa=a->a,*v;
5129   PetscScalar       *x,s1,s2,x1,x2,*t;
5130   const PetscScalar *b;
5131 
5132   PetscFunctionBegin;
5133   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5135   t    = a->solve_work;
5136 
5137   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5138   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5139 
5140   /* forward solve the lower triangular */
5141   idx  = 2*r[0];
5142   t[0] = b[idx]; t[1] = b[1+idx];
5143   for (i=1; i<n; i++) {
5144     v   = aa + 4*ai[i];
5145     vi  = aj + ai[i];
5146     nz  = ai[i+1] - ai[i];
5147     idx = 2*r[i];
5148     s1  = b[idx]; s2 = b[1+idx];
5149     for (m=0; m<nz; m++) {
5150       jdx = 2*vi[m];
5151       x1  = t[jdx]; x2 = t[1+jdx];
5152       s1 -= v[0]*x1 + v[2]*x2;
5153       s2 -= v[1]*x1 + v[3]*x2;
5154       v  += 4;
5155     }
5156     idx    = 2*i;
5157     t[idx] = s1; t[1+idx] = s2;
5158   }
5159   /* backward solve the upper triangular */
5160   for (i=n-1; i>=0; i--) {
5161     v   = aa + 4*(adiag[i+1]+1);
5162     vi  = aj + adiag[i+1]+1;
5163     nz  = adiag[i] - adiag[i+1] - 1;
5164     idt = 2*i;
5165     s1  = t[idt]; s2 = t[1+idt];
5166     for (m=0; m<nz; m++) {
5167       idx = 2*vi[m];
5168       x1  = t[idx]; x2 = t[1+idx];
5169       s1 -= v[0]*x1 + v[2]*x2;
5170       s2 -= v[1]*x1 + v[3]*x2;
5171       v  += 4;
5172     }
5173     idc      = 2*c[i];
5174     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5175     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5176   }
5177   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5178   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5179   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5180   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5181   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5182   PetscFunctionReturn(0);
5183 }
5184 
5185 /*
5186       Special case where the matrix was ILU(0) factored in the natural
5187    ordering. This eliminates the need for the column and row permutation.
5188 */
5189 #undef __FUNCT__
5190 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
5191 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5192 {
5193   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5194   const PetscInt    n  =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5195   PetscErrorCode    ierr;
5196   const MatScalar   *aa=a->a,*v;
5197   PetscScalar       *x,s1,s2,x1,x2;
5198   const PetscScalar *b;
5199   PetscInt          jdx,idt,idx,nz,i;
5200 
5201   PetscFunctionBegin;
5202   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5203   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5204 
5205   /* forward solve the lower triangular */
5206   idx  = 0;
5207   x[0] = b[0]; x[1] = b[1];
5208   for (i=1; i<n; i++) {
5209     v    =  aa      + 4*ai[i];
5210     vi   =  aj      + ai[i];
5211     nz   =  diag[i] - ai[i];
5212     idx +=  2;
5213     s1   =  b[idx];s2 = b[1+idx];
5214     while (nz--) {
5215       jdx = 2*(*vi++);
5216       x1  = x[jdx];x2 = x[1+jdx];
5217       s1 -= v[0]*x1 + v[2]*x2;
5218       s2 -= v[1]*x1 + v[3]*x2;
5219       v  += 4;
5220     }
5221     x[idx]   = s1;
5222     x[1+idx] = s2;
5223   }
5224   /* backward solve the upper triangular */
5225   for (i=n-1; i>=0; i--) {
5226     v   = aa + 4*diag[i] + 4;
5227     vi  = aj + diag[i] + 1;
5228     nz  = ai[i+1] - diag[i] - 1;
5229     idt = 2*i;
5230     s1  = x[idt];  s2 = x[1+idt];
5231     while (nz--) {
5232       idx = 2*(*vi++);
5233       x1  = x[idx];   x2 = x[1+idx];
5234       s1 -= v[0]*x1 + v[2]*x2;
5235       s2 -= v[1]*x1 + v[3]*x2;
5236       v  += 4;
5237     }
5238     v        = aa +  4*diag[i];
5239     x[idt]   = v[0]*s1 + v[2]*s2;
5240     x[1+idt] = v[1]*s1 + v[3]*s2;
5241   }
5242 
5243   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5244   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5245   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5246   PetscFunctionReturn(0);
5247 }
5248 
5249 #undef __FUNCT__
5250 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5251 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5252 {
5253   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5254   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5255   PetscInt          i,k,nz,idx,idt,jdx;
5256   PetscErrorCode    ierr;
5257   const MatScalar   *aa=a->a,*v;
5258   PetscScalar       *x,s1,s2,x1,x2;
5259   const PetscScalar *b;
5260 
5261   PetscFunctionBegin;
5262   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5263   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5264   /* forward solve the lower triangular */
5265   idx  = 0;
5266   x[0] = b[idx]; x[1] = b[1+idx];
5267   for (i=1; i<n; i++) {
5268     v   = aa + 4*ai[i];
5269     vi  = aj + ai[i];
5270     nz  = ai[i+1] - ai[i];
5271     idx = 2*i;
5272     s1  = b[idx];s2 = b[1+idx];
5273     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5274     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5275     for (k=0; k<nz; k++) {
5276       jdx = 2*vi[k];
5277       x1  = x[jdx];x2 = x[1+jdx];
5278       s1 -= v[0]*x1 + v[2]*x2;
5279       s2 -= v[1]*x1 + v[3]*x2;
5280       v  +=  4;
5281     }
5282     x[idx]   = s1;
5283     x[1+idx] = s2;
5284   }
5285 
5286   /* backward solve the upper triangular */
5287   for (i=n-1; i>=0; i--) {
5288     v   = aa + 4*(adiag[i+1]+1);
5289     vi  = aj + adiag[i+1]+1;
5290     nz  = adiag[i] - adiag[i+1]-1;
5291     idt = 2*i;
5292     s1  = x[idt];  s2 = x[1+idt];
5293     PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5294     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5295     for (k=0; k<nz; k++) {
5296       idx = 2*vi[k];
5297       x1  = x[idx];   x2 = x[1+idx];
5298       s1 -= v[0]*x1 + v[2]*x2;
5299       s2 -= v[1]*x1 + v[3]*x2;
5300       v  += 4;
5301     }
5302     /* x = inv_diagonal*x */
5303     x[idt]   = v[0]*s1 + v[2]*s2;
5304     x[1+idt] = v[1]*s1 + v[3]*s2;
5305   }
5306 
5307   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5308   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5309   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5310   PetscFunctionReturn(0);
5311 }
5312 
5313 #undef __FUNCT__
5314 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5315 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5316 {
5317   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
5318   IS                iscol=a->col,isrow=a->row;
5319   PetscErrorCode    ierr;
5320   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5321   PetscInt          i,nz;
5322   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5323   const MatScalar   *aa=a->a,*v;
5324   PetscScalar       *x,s1,*t;
5325   const PetscScalar *b;
5326 
5327   PetscFunctionBegin;
5328   if (!n) PetscFunctionReturn(0);
5329 
5330   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5331   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5332   t    = a->solve_work;
5333 
5334   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5335   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5336 
5337   /* forward solve the lower triangular */
5338   t[0] = b[*r++];
5339   for (i=1; i<n; i++) {
5340     v  = aa + ai[i];
5341     vi = aj + ai[i];
5342     nz = diag[i] - ai[i];
5343     s1 = b[*r++];
5344     while (nz--) {
5345       s1 -= (*v++)*t[*vi++];
5346     }
5347     t[i] = s1;
5348   }
5349   /* backward solve the upper triangular */
5350   for (i=n-1; i>=0; i--) {
5351     v  = aa + diag[i] + 1;
5352     vi = aj + diag[i] + 1;
5353     nz = ai[i+1] - diag[i] - 1;
5354     s1 = t[i];
5355     while (nz--) {
5356       s1 -= (*v++)*t[*vi++];
5357     }
5358     x[*c--] = t[i] = aa[diag[i]]*s1;
5359   }
5360 
5361   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5362   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5363   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5364   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5365   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5366   PetscFunctionReturn(0);
5367 }
5368 
5369 #undef __FUNCT__
5370 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5371 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5372 {
5373   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
5374   IS                iscol = a->col,isrow = a->row;
5375   PetscErrorCode    ierr;
5376   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5377   const PetscInt    *rout,*cout,*r,*c;
5378   PetscScalar       *x,*tmp,sum;
5379   const PetscScalar *b;
5380   const MatScalar   *aa = a->a,*v;
5381 
5382   PetscFunctionBegin;
5383   if (!n) PetscFunctionReturn(0);
5384 
5385   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5386   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5387   tmp  = a->solve_work;
5388 
5389   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5390   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5391 
5392   /* forward solve the lower triangular */
5393   tmp[0] = b[r[0]];
5394   v      = aa;
5395   vi     = aj;
5396   for (i=1; i<n; i++) {
5397     nz  = ai[i+1] - ai[i];
5398     sum = b[r[i]];
5399     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5400     tmp[i] = sum;
5401     v     += nz; vi += nz;
5402   }
5403 
5404   /* backward solve the upper triangular */
5405   for (i=n-1; i>=0; i--) {
5406     v   = aa + adiag[i+1]+1;
5407     vi  = aj + adiag[i+1]+1;
5408     nz  = adiag[i]-adiag[i+1]-1;
5409     sum = tmp[i];
5410     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5411     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5412   }
5413 
5414   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5415   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5416   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5417   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5418   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5419   PetscFunctionReturn(0);
5420 }
5421 
5422 /*
5423       Special case where the matrix was ILU(0) factored in the natural
5424    ordering. This eliminates the need for the column and row permutation.
5425 */
5426 #undef __FUNCT__
5427 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5428 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5429 {
5430   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5431   const PetscInt    n  = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5432   PetscErrorCode    ierr;
5433   const MatScalar   *aa=a->a,*v;
5434   PetscScalar       *x;
5435   const PetscScalar *b;
5436   PetscScalar       s1,x1;
5437   PetscInt          jdx,idt,idx,nz,i;
5438 
5439   PetscFunctionBegin;
5440   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5441   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5442 
5443   /* forward solve the lower triangular */
5444   idx  = 0;
5445   x[0] = b[0];
5446   for (i=1; i<n; i++) {
5447     v    =  aa      + ai[i];
5448     vi   =  aj      + ai[i];
5449     nz   =  diag[i] - ai[i];
5450     idx +=  1;
5451     s1   =  b[idx];
5452     while (nz--) {
5453       jdx = *vi++;
5454       x1  = x[jdx];
5455       s1 -= v[0]*x1;
5456       v  += 1;
5457     }
5458     x[idx] = s1;
5459   }
5460   /* backward solve the upper triangular */
5461   for (i=n-1; i>=0; i--) {
5462     v   = aa + diag[i] + 1;
5463     vi  = aj + diag[i] + 1;
5464     nz  = ai[i+1] - diag[i] - 1;
5465     idt = i;
5466     s1  = x[idt];
5467     while (nz--) {
5468       idx = *vi++;
5469       x1  = x[idx];
5470       s1 -= v[0]*x1;
5471       v  += 1;
5472     }
5473     v      = aa +  diag[i];
5474     x[idt] = v[0]*s1;
5475   }
5476   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5477   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5478   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5479   PetscFunctionReturn(0);
5480 }
5481 
5482 
5483 #undef __FUNCT__
5484 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5485 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5486 {
5487   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5488   PetscErrorCode    ierr;
5489   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5490   PetscScalar       *x,sum;
5491   const PetscScalar *b;
5492   const MatScalar   *aa = a->a,*v;
5493   PetscInt          i,nz;
5494 
5495   PetscFunctionBegin;
5496   if (!n) PetscFunctionReturn(0);
5497 
5498   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5499   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5500 
5501   /* forward solve the lower triangular */
5502   x[0] = b[0];
5503   v    = aa;
5504   vi   = aj;
5505   for (i=1; i<n; i++) {
5506     nz  = ai[i+1] - ai[i];
5507     sum = b[i];
5508     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5509     v   += nz;
5510     vi  += nz;
5511     x[i] = sum;
5512   }
5513 
5514   /* backward solve the upper triangular */
5515   for (i=n-1; i>=0; i--) {
5516     v   = aa + adiag[i+1] + 1;
5517     vi  = aj + adiag[i+1] + 1;
5518     nz  = adiag[i] - adiag[i+1]-1;
5519     sum = x[i];
5520     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5521     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5522   }
5523 
5524   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
5525   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5526   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5527   PetscFunctionReturn(0);
5528 }
5529 
5530 /* ----------------------------------------------------------------*/
5531 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool);
5532 
5533 #undef __FUNCT__
5534 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5535 /*
5536    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5537 */
5538 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5539 {
5540   Mat             C =B;
5541   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
5542   PetscErrorCode  ierr;
5543   PetscInt        i,j,k,ipvt[15];
5544   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5545   PetscInt        nz,nzL,row;
5546   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5547   const MatScalar *v,*aa=a->a;
5548   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5549   PetscInt        sol_ver;
5550 
5551   PetscFunctionBegin;
5552   ierr = PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
5553 
5554   /* generate work space needed by the factorization */
5555   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5556   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5557 
5558   for (i=0; i<n; i++) {
5559     /* zero rtmp */
5560     /* L part */
5561     nz    = bi[i+1] - bi[i];
5562     bjtmp = bj + bi[i];
5563     for  (j=0; j<nz; j++) {
5564       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5565     }
5566 
5567     /* U part */
5568     nz    = bdiag[i] - bdiag[i+1];
5569     bjtmp = bj + bdiag[i+1]+1;
5570     for  (j=0; j<nz; j++) {
5571       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5572     }
5573 
5574     /* load in initial (unfactored row) */
5575     nz    = ai[i+1] - ai[i];
5576     ajtmp = aj + ai[i];
5577     v     = aa + bs2*ai[i];
5578     for (j=0; j<nz; j++) {
5579       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5580     }
5581 
5582     /* elimination */
5583     bjtmp = bj + bi[i];
5584     nzL   = bi[i+1] - bi[i];
5585     for (k=0; k < nzL; k++) {
5586       row = bjtmp[k];
5587       pc  = rtmp + bs2*row;
5588       for (flg=0,j=0; j<bs2; j++) {
5589         if (pc[j]!=0.0) {
5590           flg = 1;
5591           break;
5592         }
5593       }
5594       if (flg) {
5595         pv = b->a + bs2*bdiag[row];
5596         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
5597         /*ierr = PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5598         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5599         pv = b->a + bs2*(bdiag[row+1]+1);
5600         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5601         for (j=0; j<nz; j++) {
5602           vv = rtmp + bs2*pj[j];
5603           PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5604           /* ierr = PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5605           pv += bs2;
5606         }
5607         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5608       }
5609     }
5610 
5611     /* finished row so stick it into b->a */
5612     /* L part */
5613     pv = b->a + bs2*bi[i];
5614     pj = b->j + bi[i];
5615     nz = bi[i+1] - bi[i];
5616     for (j=0; j<nz; j++) {
5617       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5618     }
5619 
5620     /* Mark diagonal and invert diagonal for simplier triangular solves */
5621     pv   = b->a + bs2*bdiag[i];
5622     pj   = b->j + bdiag[i];
5623     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5624     /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */
5625     ierr = PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
5626 
5627     /* U part */
5628     pv = b->a + bs2*(bdiag[i+1]+1);
5629     pj = b->j + bdiag[i+1]+1;
5630     nz = bdiag[i] - bdiag[i+1] - 1;
5631     for (j=0; j<nz; j++) {
5632       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5633     }
5634   }
5635 
5636   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5637 
5638   C->ops->solve          = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5639   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5640   C->assembled           = PETSC_TRUE;
5641 
5642   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5643   PetscFunctionReturn(0);
5644 }
5645 
5646 #undef __FUNCT__
5647 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5648 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5649 {
5650   Mat            C     =B;
5651   Mat_SeqBAIJ    *a    =(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
5652   IS             isrow = b->row,isicol = b->icol;
5653   PetscErrorCode ierr;
5654   const PetscInt *r,*ic;
5655   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5656   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5657   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5658   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5659   MatScalar      *v_work;
5660   PetscBool      col_identity,row_identity,both_identity;
5661 
5662   PetscFunctionBegin;
5663   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5664   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5665 
5666   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5667   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5668 
5669   /* generate work space needed by dense LU factorization */
5670   ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5671 
5672   for (i=0; i<n; i++) {
5673     /* zero rtmp */
5674     /* L part */
5675     nz    = bi[i+1] - bi[i];
5676     bjtmp = bj + bi[i];
5677     for  (j=0; j<nz; j++) {
5678       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5679     }
5680 
5681     /* U part */
5682     nz    = bdiag[i] - bdiag[i+1];
5683     bjtmp = bj + bdiag[i+1]+1;
5684     for  (j=0; j<nz; j++) {
5685       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5686     }
5687 
5688     /* load in initial (unfactored row) */
5689     nz    = ai[r[i]+1] - ai[r[i]];
5690     ajtmp = aj + ai[r[i]];
5691     v     = aa + bs2*ai[r[i]];
5692     for (j=0; j<nz; j++) {
5693       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5694     }
5695 
5696     /* elimination */
5697     bjtmp = bj + bi[i];
5698     nzL   = bi[i+1] - bi[i];
5699     for (k=0; k < nzL; k++) {
5700       row = bjtmp[k];
5701       pc  = rtmp + bs2*row;
5702       for (flg=0,j=0; j<bs2; j++) {
5703         if (pc[j]!=0.0) {
5704           flg = 1;
5705           break;
5706         }
5707       }
5708       if (flg) {
5709         pv = b->a + bs2*bdiag[row];
5710         PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5711         pj = b->j + bdiag[row+1]+1;         /* begining of U(row,:) */
5712         pv = b->a + bs2*(bdiag[row+1]+1);
5713         nz = bdiag[row] - bdiag[row+1] - 1;         /* num of entries inU(row,:), excluding diag */
5714         for (j=0; j<nz; j++) {
5715           PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5716         }
5717         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5718       }
5719     }
5720 
5721     /* finished row so stick it into b->a */
5722     /* L part */
5723     pv = b->a + bs2*bi[i];
5724     pj = b->j + bi[i];
5725     nz = bi[i+1] - bi[i];
5726     for (j=0; j<nz; j++) {
5727       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5728     }
5729 
5730     /* Mark diagonal and invert diagonal for simplier triangular solves */
5731     pv = b->a + bs2*bdiag[i];
5732     pj = b->j + bdiag[i];
5733     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5734     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5735     ierr = PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5736 
5737     /* U part */
5738     pv = b->a + bs2*(bdiag[i+1]+1);
5739     pj = b->j + bdiag[i+1]+1;
5740     nz = bdiag[i] - bdiag[i+1] - 1;
5741     for (j=0; j<nz; j++) {
5742       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5743     }
5744   }
5745 
5746   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5747   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5748   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5749   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5750 
5751   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5752   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5753 
5754   both_identity = (PetscBool) (row_identity && col_identity);
5755   if (both_identity) {
5756     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5757   } else {
5758     C->ops->solve = MatSolve_SeqBAIJ_N;
5759   }
5760   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5761 
5762   C->assembled = PETSC_TRUE;
5763 
5764   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5765   PetscFunctionReturn(0);
5766 }
5767 
5768 /*
5769    ilu(0) with natural ordering under new data structure.
5770    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5771    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5772 */
5773 
5774 #undef __FUNCT__
5775 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5776 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5777 {
5778 
5779   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5780   PetscErrorCode ierr;
5781   PetscInt       n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5782   PetscInt       i,j,nz,*bi,*bj,*bdiag,bi_temp;
5783 
5784   PetscFunctionBegin;
5785   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5786   b    = (Mat_SeqBAIJ*)(fact)->data;
5787 
5788   /* allocate matrix arrays for new data structure */
5789   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5790   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5791 
5792   b->singlemalloc    = PETSC_TRUE;
5793   b->free_a          = PETSC_TRUE;
5794   b->free_ij         = PETSC_TRUE;
5795   fact->preallocated = PETSC_TRUE;
5796   fact->assembled    = PETSC_TRUE;
5797   if (!b->diag) {
5798     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5799     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5800   }
5801   bdiag = b->diag;
5802 
5803   if (n > 0) {
5804     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5805   }
5806 
5807   /* set bi and bj with new data structure */
5808   bi = b->i;
5809   bj = b->j;
5810 
5811   /* L part */
5812   bi[0] = 0;
5813   for (i=0; i<n; i++) {
5814     nz      = adiag[i] - ai[i];
5815     bi[i+1] = bi[i] + nz;
5816     aj      = a->j + ai[i];
5817     for (j=0; j<nz; j++) {
5818       *bj = aj[j]; bj++;
5819     }
5820   }
5821 
5822   /* U part */
5823   bi_temp  = bi[n];
5824   bdiag[n] = bi[n]-1;
5825   for (i=n-1; i>=0; i--) {
5826     nz      = ai[i+1] - adiag[i] - 1;
5827     bi_temp = bi_temp + nz + 1;
5828     aj      = a->j + adiag[i] + 1;
5829     for (j=0; j<nz; j++) {
5830       *bj = aj[j]; bj++;
5831     }
5832     /* diag[i] */
5833     *bj      = i; bj++;
5834     bdiag[i] = bi_temp - 1;
5835   }
5836   PetscFunctionReturn(0);
5837 }
5838 
5839 #undef __FUNCT__
5840 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5841 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5842 {
5843   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5844   IS                 isicol;
5845   PetscErrorCode     ierr;
5846   const PetscInt     *r,*ic;
5847   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5848   PetscInt           *bi,*cols,nnz,*cols_lvl;
5849   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5850   PetscInt           i,levels,diagonal_fill;
5851   PetscBool          col_identity,row_identity,both_identity;
5852   PetscReal          f;
5853   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5854   PetscBT            lnkbt;
5855   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5856   PetscFreeSpaceList free_space    =PETSC_NULL,current_space=PETSC_NULL;
5857   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5858   PetscBool          missing;
5859   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5860 
5861   PetscFunctionBegin;
5862   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5863   if (bs>1) {  /* check shifttype */
5864     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5865       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5866   }
5867 
5868   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5869   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5870 
5871   f             = info->fill;
5872   levels        = (PetscInt)info->levels;
5873   diagonal_fill = (PetscInt)info->diagonal_fill;
5874 
5875   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5876 
5877   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5878   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5879 
5880   both_identity = (PetscBool) (row_identity && col_identity);
5881 
5882   if (!levels && both_identity) {
5883     /* special case: ilu(0) with natural ordering */
5884     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5885     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5886 
5887     fact->factortype               = MAT_FACTOR_ILU;
5888     (fact)->info.factor_mallocs    = 0;
5889     (fact)->info.fill_ratio_given  = info->fill;
5890     (fact)->info.fill_ratio_needed = 1.0;
5891 
5892     b                = (Mat_SeqBAIJ*)(fact)->data;
5893     b->row           = isrow;
5894     b->col           = iscol;
5895     b->icol          = isicol;
5896     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5897     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5898     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5899 
5900     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5901     PetscFunctionReturn(0);
5902   }
5903 
5904   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5905   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5906 
5907   /* get new row pointers */
5908   ierr  = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5909   bi[0] = 0;
5910   /* bdiag is location of diagonal in factor */
5911   ierr     = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5912   bdiag[0] = 0;
5913 
5914   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5915 
5916   /* create a linked list for storing column indices of the active row */
5917   nlnk = n + 1;
5918   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5919 
5920   /* initial FreeSpace size is f*(ai[n]+1) */
5921   ierr              = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5922   current_space     = free_space;
5923   ierr              = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5924   current_space_lvl = free_space_lvl;
5925 
5926   for (i=0; i<n; i++) {
5927     nzi = 0;
5928     /* copy current row into linked list */
5929     nnz = ai[r[i]+1] - ai[r[i]];
5930     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5931     cols   = aj + ai[r[i]];
5932     lnk[i] = -1; /* marker to indicate if diagonal exists */
5933     ierr   = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5934     nzi   += nlnk;
5935 
5936     /* make sure diagonal entry is included */
5937     if (diagonal_fill && lnk[i] == -1) {
5938       fm = n;
5939       while (lnk[fm] < i) fm = lnk[fm];
5940       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5941       lnk[fm]    = i;
5942       lnk_lvl[i] = 0;
5943       nzi++; dcount++;
5944     }
5945 
5946     /* add pivot rows into the active row */
5947     nzbd = 0;
5948     prow = lnk[n];
5949     while (prow < i) {
5950       nnz      = bdiag[prow];
5951       cols     = bj_ptr[prow] + nnz + 1;
5952       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5953       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5954 
5955       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5956       nzi += nlnk;
5957       prow = lnk[prow];
5958       nzbd++;
5959     }
5960     bdiag[i] = nzbd;
5961     bi[i+1]  = bi[i] + nzi;
5962 
5963     /* if free space is not available, make more free space */
5964     if (current_space->local_remaining<nzi) {
5965       nnz  = 2*nzi*(n - i); /* estimated and max additional space needed */
5966       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5967       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5968       reallocs++;
5969     }
5970 
5971     /* copy data into free_space and free_space_lvl, then initialize lnk */
5972     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5973 
5974     bj_ptr[i]    = current_space->array;
5975     bjlvl_ptr[i] = current_space_lvl->array;
5976 
5977     /* make sure the active row i has diagonal entry */
5978     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5979 
5980     current_space->array           += nzi;
5981     current_space->local_used      += nzi;
5982     current_space->local_remaining -= nzi;
5983 
5984     current_space_lvl->array           += nzi;
5985     current_space_lvl->local_used      += nzi;
5986     current_space_lvl->local_remaining -= nzi;
5987   }
5988 
5989   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5990   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5991 
5992   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5993   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5994   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5995 
5996   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5997   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5998   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5999 
6000 #if defined(PETSC_USE_INFO)
6001   {
6002     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6003     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
6004     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6005     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
6006     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6007     if (diagonal_fill) {
6008       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
6009     }
6010   }
6011 #endif
6012 
6013   /* put together the new matrix */
6014   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6015   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6016 
6017   b               = (Mat_SeqBAIJ*)(fact)->data;
6018   b->free_a       = PETSC_TRUE;
6019   b->free_ij      = PETSC_TRUE;
6020   b->singlemalloc = PETSC_FALSE;
6021 
6022   ierr = PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6023 
6024   b->j          = bj;
6025   b->i          = bi;
6026   b->diag       = bdiag;
6027   b->free_diag  = PETSC_TRUE;
6028   b->ilen       = 0;
6029   b->imax       = 0;
6030   b->row        = isrow;
6031   b->col        = iscol;
6032   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6033   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6034   b->icol       = isicol;
6035 
6036   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6037   /* In b structure:  Free imax, ilen, old a, old j.
6038      Allocate bdiag, solve_work, new a, new j */
6039   ierr     = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
6040   b->maxnz = b->nz = bdiag[0]+1;
6041 
6042   fact->info.factor_mallocs    = reallocs;
6043   fact->info.fill_ratio_given  = f;
6044   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6045 
6046   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
6047   PetscFunctionReturn(0);
6048 }
6049 
6050 /*
6051      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6052    except that the data structure of Mat_SeqAIJ is slightly different.
6053    Not a good example of code reuse.
6054 */
6055 #undef __FUNCT__
6056 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
6057 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
6058 {
6059   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
6060   IS             isicol;
6061   PetscErrorCode ierr;
6062   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
6063   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6064   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6065   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6066   PetscBool      col_identity,row_identity,both_identity,flg;
6067   PetscReal      f;
6068 
6069   PetscFunctionBegin;
6070   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6071   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
6072 
6073   f             = info->fill;
6074   levels        = (PetscInt)info->levels;
6075   diagonal_fill = (PetscInt)info->diagonal_fill;
6076 
6077   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
6078 
6079   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6080   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6081   both_identity = (PetscBool) (row_identity && col_identity);
6082 
6083   if (!levels && both_identity) {  /* special case copy the nonzero structure */
6084     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
6085     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6086 
6087     fact->factortype = MAT_FACTOR_ILU;
6088     b                = (Mat_SeqBAIJ*)fact->data;
6089     b->row           = isrow;
6090     b->col           = iscol;
6091     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6092     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6093     b->icol          = isicol;
6094     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6095 
6096     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6097     PetscFunctionReturn(0);
6098   }
6099 
6100   /* general case perform the symbolic factorization */
6101   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
6102   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
6103 
6104   /* get new row pointers */
6105   ierr     = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
6106   ainew[0] = 0;
6107   /* don't know how many column pointers are needed so estimate */
6108   jmax = (PetscInt)(f*ai[n] + 1);
6109   ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
6110   /* ajfill is level of fill for each fill entry */
6111   ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
6112   /* fill is a linked list of nonzeros in active row */
6113   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
6114   /* im is level for each filled value */
6115   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
6116   /* dloc is location of diagonal in factor */
6117   ierr    = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
6118   dloc[0] = 0;
6119   for (prow=0; prow<n; prow++) {
6120 
6121     /* copy prow into linked list */
6122     nzf = nz = ai[r[prow]+1] - ai[r[prow]];
6123     if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
6124     xi         = aj + ai[r[prow]];
6125     fill[n]    = n;
6126     fill[prow] = -1;   /* marker for diagonal entry */
6127     while (nz--) {
6128       fm  = n;
6129       idx = ic[*xi++];
6130       do {
6131         m  = fm;
6132         fm = fill[m];
6133       } while (fm < idx);
6134       fill[m]   = idx;
6135       fill[idx] = fm;
6136       im[idx]   = 0;
6137     }
6138 
6139     /* make sure diagonal entry is included */
6140     if (diagonal_fill && fill[prow] == -1) {
6141       fm = n;
6142       while (fill[fm] < prow) fm = fill[fm];
6143       fill[prow] = fill[fm];    /* insert diagonal into linked list */
6144       fill[fm]   = prow;
6145       im[prow]   = 0;
6146       nzf++;
6147       dcount++;
6148     }
6149 
6150     nzi = 0;
6151     row = fill[n];
6152     while (row < prow) {
6153       incrlev = im[row] + 1;
6154       nz      = dloc[row];
6155       xi      = ajnew  + ainew[row] + nz + 1;
6156       flev    = ajfill + ainew[row] + nz + 1;
6157       nnz     = ainew[row+1] - ainew[row] - nz - 1;
6158       fm      = row;
6159       while (nnz-- > 0) {
6160         idx = *xi++;
6161         if (*flev + incrlev > levels) {
6162           flev++;
6163           continue;
6164         }
6165         do {
6166           m  = fm;
6167           fm = fill[m];
6168         } while (fm < idx);
6169         if (fm != idx) {
6170           im[idx]   = *flev + incrlev;
6171           fill[m]   = idx;
6172           fill[idx] = fm;
6173           fm        = idx;
6174           nzf++;
6175         } else if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
6176         flev++;
6177       }
6178       row = fill[row];
6179       nzi++;
6180     }
6181     /* copy new filled row into permanent storage */
6182     ainew[prow+1] = ainew[prow] + nzf;
6183     if (ainew[prow+1] > jmax) {
6184 
6185       /* estimate how much additional space we will need */
6186       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6187       /* just double the memory each time */
6188       PetscInt maxadd = jmax;
6189       /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
6190       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6191       jmax += maxadd;
6192 
6193       /* allocate a longer ajnew and ajfill */
6194       ierr   = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6195       ierr   = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6196       ierr   = PetscFree(ajnew);CHKERRQ(ierr);
6197       ajnew  = xitmp;
6198       ierr   = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6199       ierr   = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6200       ierr   = PetscFree(ajfill);CHKERRQ(ierr);
6201       ajfill = xitmp;
6202       reallocate++;   /* count how many reallocations are needed */
6203     }
6204     xitmp      = ajnew + ainew[prow];
6205     flev       = ajfill + ainew[prow];
6206     dloc[prow] = nzi;
6207     fm         = fill[n];
6208     while (nzf--) {
6209       *xitmp++ = fm;
6210       *flev++  = im[fm];
6211       fm       = fill[fm];
6212     }
6213     /* make sure row has diagonal entry */
6214     if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6215                                                         try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6216   }
6217   ierr = PetscFree(ajfill);CHKERRQ(ierr);
6218   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6219   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6220   ierr = PetscFree(fill);CHKERRQ(ierr);
6221   ierr = PetscFree(im);CHKERRQ(ierr);
6222 
6223 #if defined(PETSC_USE_INFO)
6224   {
6225     PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6226     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6227     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6228     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6229     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6230     if (diagonal_fill) {
6231       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6232     }
6233   }
6234 #endif
6235 
6236   /* put together the new matrix */
6237   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6238   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6239   b    = (Mat_SeqBAIJ*)fact->data;
6240 
6241   b->free_a       = PETSC_TRUE;
6242   b->free_ij      = PETSC_TRUE;
6243   b->singlemalloc = PETSC_FALSE;
6244 
6245   ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6246 
6247   b->j          = ajnew;
6248   b->i          = ainew;
6249   for (i=0; i<n; i++) dloc[i] += ainew[i];
6250   b->diag          = dloc;
6251   b->free_diag     = PETSC_TRUE;
6252   b->ilen          = 0;
6253   b->imax          = 0;
6254   b->row           = isrow;
6255   b->col           = iscol;
6256   b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6257 
6258   ierr    = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6259   ierr    = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6260   b->icol = isicol;
6261   ierr    = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6262   /* In b structure:  Free imax, ilen, old a, old j.
6263      Allocate dloc, solve_work, new a, new j */
6264   ierr     = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
6265   b->maxnz = b->nz = ainew[n];
6266 
6267   fact->info.factor_mallocs    = reallocate;
6268   fact->info.fill_ratio_given  = f;
6269   fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6270 
6271   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6272   PetscFunctionReturn(0);
6273 }
6274 
6275 #undef __FUNCT__
6276 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6277 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6278 {
6279   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */
6280   /* int i,*AJ=a->j,nz=a->nz; */
6281 
6282   PetscFunctionBegin;
6283   /* Undo Column scaling */
6284   /*    while (nz--) { */
6285   /*      AJ[i] = AJ[i]/4; */
6286   /*    } */
6287   /* This should really invoke a push/pop logic, but we don't have that yet. */
6288   A->ops->setunfactored = PETSC_NULL;
6289   PetscFunctionReturn(0);
6290 }
6291 
6292 #undef __FUNCT__
6293 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6294 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6295 {
6296   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
6297   PetscInt       *AJ=a->j,nz=a->nz;
6298   unsigned short *aj=(unsigned short*)AJ;
6299 
6300   PetscFunctionBegin;
6301   /* Is this really necessary? */
6302   while (nz--) {
6303     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6304   }
6305   A->ops->setunfactored = PETSC_NULL;
6306   PetscFunctionReturn(0);
6307 }
6308 
6309 
6310