xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision f3fe499b4cc4d64bf04aa4f5e4963dcc4eb56541)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "../src/mat/impls/baij/seq/baij.h"
8 #include "../src/mat/blockinvert.h"
9 #include "petscbt.h"
10 #include "../src/mat/utils/freespace.h"
11 
12 #undef __FUNCT__
13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
17   PetscErrorCode    ierr;
18   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
19   PetscInt          i,n = a->mbs,j;
20   PetscInt          nz;
21   PetscScalar       *x,*tmp,s1;
22   const MatScalar   *aa = a->a,*v;
23   const PetscScalar *b;
24 
25   PetscFunctionBegin;
26   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28   tmp  = a->solve_work;
29 
30 
31   /* copy the b into temp work space according to permutation */
32   for (i=0; i<n; i++) tmp[i] = b[i];
33 
34   /* forward solve the U^T */
35   for (i=0; i<n; i++) {
36     v   = aa + adiag[i+1] + 1;
37     vi  = aj + adiag[i+1] + 1;
38     nz  = adiag[i] - adiag[i+1] - 1;
39     s1  = tmp[i];
40     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
41     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
42     tmp[i] = s1;
43   }
44 
45   /* backward solve the L^T */
46   for (i=n-1; i>=0; i--){
47     v   = aa + ai[i];
48     vi  = aj + ai[i];
49     nz  = ai[i+1] - ai[i];
50     s1  = tmp[i];
51     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
52   }
53 
54   /* copy tmp into x according to permutation */
55   for (i=0; i<n; i++) x[i] = tmp[i];
56 
57   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
58   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
59 
60   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
61   PetscFunctionReturn(0);
62 }
63 
64 #undef __FUNCT__
65 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
66 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
67 {
68   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
69   PetscErrorCode    ierr;
70   PetscInt          i,nz;
71   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
72   const MatScalar   *aa=a->a,*v;
73   PetscScalar       s1,*x;
74   const PetscScalar *b;
75 
76   PetscFunctionBegin;
77   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
78   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
79   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
80 
81   /* forward solve the U^T */
82   for (i=0; i<n; i++) {
83 
84     v     = aa + diag[i];
85     /* multiply by the inverse of the block diagonal */
86     s1    = (*v++)*x[i];
87     vi    = aj + diag[i] + 1;
88     nz    = ai[i+1] - diag[i] - 1;
89     while (nz--) {
90       x[*vi++]  -= (*v++)*s1;
91     }
92     x[i]   = s1;
93   }
94   /* backward solve the L^T */
95   for (i=n-1; i>=0; i--){
96     v    = aa + diag[i] - 1;
97     vi   = aj + diag[i] - 1;
98     nz   = diag[i] - ai[i];
99     s1   = x[i];
100     while (nz--) {
101       x[*vi--]   -=  (*v--)*s1;
102     }
103   }
104   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
105   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
106   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
107   PetscFunctionReturn(0);
108 }
109 
110 #undef __FUNCT__
111 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
112 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
113 {
114   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
115   PetscErrorCode    ierr;
116   PetscInt          i,nz,idx,idt,oidx;
117   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
118   const MatScalar   *aa=a->a,*v;
119   PetscScalar       s1,s2,x1,x2,*x;
120   const PetscScalar *b;
121 
122   PetscFunctionBegin;
123   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
124   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
125   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126 
127   /* forward solve the U^T */
128   idx = 0;
129   for (i=0; i<n; i++) {
130 
131     v     = aa + 4*diag[i];
132     /* multiply by the inverse of the block diagonal */
133     x1 = x[idx];   x2 = x[1+idx];
134     s1 = v[0]*x1  +  v[1]*x2;
135     s2 = v[2]*x1  +  v[3]*x2;
136     v += 4;
137 
138     vi    = aj + diag[i] + 1;
139     nz    = ai[i+1] - diag[i] - 1;
140     while (nz--) {
141       oidx = 2*(*vi++);
142       x[oidx]   -= v[0]*s1  +  v[1]*s2;
143       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
144       v  += 4;
145     }
146     x[idx]   = s1;x[1+idx] = s2;
147     idx += 2;
148   }
149   /* backward solve the L^T */
150   for (i=n-1; i>=0; i--){
151     v    = aa + 4*diag[i] - 4;
152     vi   = aj + diag[i] - 1;
153     nz   = diag[i] - ai[i];
154     idt  = 2*i;
155     s1   = x[idt];  s2 = x[1+idt];
156     while (nz--) {
157       idx   = 2*(*vi--);
158       x[idx]   -=  v[0]*s1 +  v[1]*s2;
159       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
160       v -= 4;
161     }
162   }
163   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
164   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
165   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
166   PetscFunctionReturn(0);
167 }
168 
169 #undef __FUNCT__
170 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
171 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
172 {
173   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
174   PetscErrorCode    ierr;
175   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
176   PetscInt          nz,idx,idt,j,i,oidx;
177   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
178   const MatScalar   *aa=a->a,*v;
179   PetscScalar       s1,s2,x1,x2,*x;
180   const PetscScalar *b;
181 
182   PetscFunctionBegin;
183   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
184   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
185   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
186 
187   /* forward solve the U^T */
188   idx = 0;
189   for (i=0; i<n; i++) {
190     v     = aa + bs2*diag[i];
191     /* multiply by the inverse of the block diagonal */
192     x1 = x[idx];   x2 = x[1+idx];
193     s1 = v[0]*x1  +  v[1]*x2;
194     s2 = v[2]*x1  +  v[3]*x2;
195     v -= bs2;
196 
197     vi    = aj + diag[i] - 1;
198     nz    = diag[i] - diag[i+1] - 1;
199     for(j=0;j>-nz;j--){
200       oidx = bs*vi[j];
201       x[oidx]   -= v[0]*s1  +  v[1]*s2;
202       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
203       v  -= bs2;
204     }
205     x[idx]   = s1;x[1+idx] = s2;
206     idx += bs;
207   }
208   /* backward solve the L^T */
209   for (i=n-1; i>=0; i--){
210     v    = aa + bs2*ai[i];
211     vi   = aj + ai[i];
212     nz   = ai[i+1] - ai[i];
213     idt  = bs*i;
214     s1   = x[idt];  s2 = x[1+idt];
215     for(j=0;j<nz;j++){
216       idx   = bs*vi[j];
217       x[idx]   -=  v[0]*s1 +  v[1]*s2;
218       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
219       v += bs2;
220     }
221   }
222   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
223   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
224   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
230 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
231 {
232   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
233   PetscErrorCode    ierr;
234   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
235   PetscInt          i,nz,idx,idt,oidx;
236   const MatScalar   *aa=a->a,*v;
237   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
238   const PetscScalar *b;
239 
240   PetscFunctionBegin;
241   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
242   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
243   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
244 
245   /* forward solve the U^T */
246   idx = 0;
247   for (i=0; i<n; i++) {
248 
249     v     = aa + 9*diag[i];
250     /* multiply by the inverse of the block diagonal */
251     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
252     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
253     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
254     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
255     v += 9;
256 
257     vi    = aj + diag[i] + 1;
258     nz    = ai[i+1] - diag[i] - 1;
259     while (nz--) {
260       oidx = 3*(*vi++);
261       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
262       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
263       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
264       v  += 9;
265     }
266     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
267     idx += 3;
268   }
269   /* backward solve the L^T */
270   for (i=n-1; i>=0; i--){
271     v    = aa + 9*diag[i] - 9;
272     vi   = aj + diag[i] - 1;
273     nz   = diag[i] - ai[i];
274     idt  = 3*i;
275     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
276     while (nz--) {
277       idx   = 3*(*vi--);
278       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
279       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
280       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
281       v -= 9;
282     }
283   }
284   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
285   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
286   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
287   PetscFunctionReturn(0);
288 }
289 
290 #undef __FUNCT__
291 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
292 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
293 {
294   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
295   PetscErrorCode    ierr;
296   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
297   PetscInt          nz,idx,idt,j,i,oidx;
298   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
299   const MatScalar   *aa=a->a,*v;
300   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
301   const PetscScalar *b;
302 
303   PetscFunctionBegin;
304   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
305   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
306   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
307 
308   /* forward solve the U^T */
309   idx = 0;
310   for (i=0; i<n; i++) {
311     v     = aa + bs2*diag[i];
312     /* multiply by the inverse of the block diagonal */
313     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
314     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
315     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
316     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
317     v -= bs2;
318 
319     vi    = aj + diag[i] - 1;
320     nz    = diag[i] - diag[i+1] - 1;
321     for(j=0;j>-nz;j--){
322       oidx = bs*vi[j];
323       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
324       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
325       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
326       v  -= bs2;
327     }
328     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
329     idx += bs;
330   }
331   /* backward solve the L^T */
332   for (i=n-1; i>=0; i--){
333     v    = aa + bs2*ai[i];
334     vi   = aj + ai[i];
335     nz   = ai[i+1] - ai[i];
336     idt  = bs*i;
337     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
338     for(j=0;j<nz;j++){
339       idx   = bs*vi[j];
340       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
341       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
342       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
343       v += bs2;
344     }
345   }
346   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
347   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
348   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
349   PetscFunctionReturn(0);
350 }
351 
352 #undef __FUNCT__
353 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
354 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
355 {
356   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
357   PetscErrorCode    ierr;
358   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
359   PetscInt          i,nz,idx,idt,oidx;
360   const MatScalar   *aa=a->a,*v;
361   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
362   const PetscScalar *b;
363 
364   PetscFunctionBegin;
365   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
366   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
367   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
368 
369   /* forward solve the U^T */
370   idx = 0;
371   for (i=0; i<n; i++) {
372 
373     v     = aa + 16*diag[i];
374     /* multiply by the inverse of the block diagonal */
375     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
376     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
377     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
378     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
379     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
380     v += 16;
381 
382     vi    = aj + diag[i] + 1;
383     nz    = ai[i+1] - diag[i] - 1;
384     while (nz--) {
385       oidx = 4*(*vi++);
386       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
387       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
388       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
389       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
390       v  += 16;
391     }
392     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
393     idx += 4;
394   }
395   /* backward solve the L^T */
396   for (i=n-1; i>=0; i--){
397     v    = aa + 16*diag[i] - 16;
398     vi   = aj + diag[i] - 1;
399     nz   = diag[i] - ai[i];
400     idt  = 4*i;
401     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
402     while (nz--) {
403       idx   = 4*(*vi--);
404       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
405       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
406       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
407       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
408       v -= 16;
409     }
410   }
411   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
412   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
413   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
414   PetscFunctionReturn(0);
415 }
416 
417 #undef __FUNCT__
418 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
419 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
420 {
421   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
422   PetscErrorCode    ierr;
423   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
424   PetscInt          nz,idx,idt,j,i,oidx;
425   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
426   const MatScalar   *aa=a->a,*v;
427   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
428   const PetscScalar *b;
429 
430   PetscFunctionBegin;
431   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
432   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
433   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
434 
435   /* forward solve the U^T */
436   idx = 0;
437   for (i=0; i<n; i++) {
438     v     = aa + bs2*diag[i];
439     /* multiply by the inverse of the block diagonal */
440     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
441     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
442     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
443     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
444     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
445     v -= bs2;
446 
447     vi    = aj + diag[i] - 1;
448     nz    = diag[i] - diag[i+1] - 1;
449     for(j=0;j>-nz;j--){
450       oidx = bs*vi[j];
451       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
452       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
453       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
454       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
455       v  -= bs2;
456     }
457     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
458     idx += bs;
459   }
460   /* backward solve the L^T */
461   for (i=n-1; i>=0; i--){
462     v    = aa + bs2*ai[i];
463     vi   = aj + ai[i];
464     nz   = ai[i+1] - ai[i];
465     idt  = bs*i;
466     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
467     for(j=0;j<nz;j++){
468       idx   = bs*vi[j];
469       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
470       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
471       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
472       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
473       v += bs2;
474     }
475   }
476   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
477   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
478   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
479   PetscFunctionReturn(0);
480 }
481 
482 #undef __FUNCT__
483 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
484 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
485 {
486   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
487   PetscErrorCode    ierr;
488   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
489   PetscInt          i,nz,idx,idt,oidx;
490   const MatScalar   *aa=a->a,*v;
491   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
492   const PetscScalar *b;
493 
494   PetscFunctionBegin;
495   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
496   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
497   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
498 
499   /* forward solve the U^T */
500   idx = 0;
501   for (i=0; i<n; i++) {
502 
503     v     = aa + 25*diag[i];
504     /* multiply by the inverse of the block diagonal */
505     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
506     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
507     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
508     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
509     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
510     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
511     v += 25;
512 
513     vi    = aj + diag[i] + 1;
514     nz    = ai[i+1] - diag[i] - 1;
515     while (nz--) {
516       oidx = 5*(*vi++);
517       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
518       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
519       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
520       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
521       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
522       v  += 25;
523     }
524     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
525     idx += 5;
526   }
527   /* backward solve the L^T */
528   for (i=n-1; i>=0; i--){
529     v    = aa + 25*diag[i] - 25;
530     vi   = aj + diag[i] - 1;
531     nz   = diag[i] - ai[i];
532     idt  = 5*i;
533     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
534     while (nz--) {
535       idx   = 5*(*vi--);
536       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
537       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
538       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
539       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
540       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
541       v -= 25;
542     }
543   }
544   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
545   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
546   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
547   PetscFunctionReturn(0);
548 }
549 
550 #undef __FUNCT__
551 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
552 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
553 {
554   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
555   PetscErrorCode ierr;
556   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
557   PetscInt       nz,idx,idt,j,i,oidx;
558   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
559   const MatScalar      *aa=a->a,*v;
560   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
561   const PetscScalar    *b;
562 
563   PetscFunctionBegin;
564   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
565   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
566   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
567 
568   /* forward solve the U^T */
569   idx = 0;
570   for (i=0; i<n; i++) {
571     v     = aa + bs2*diag[i];
572     /* multiply by the inverse of the block diagonal */
573     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
574     x5 = x[4+idx];
575     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
576     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
577     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
578     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
579     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
580     v -= bs2;
581 
582     vi    = aj + diag[i] - 1;
583     nz    = diag[i] - diag[i+1] - 1;
584     for(j=0;j>-nz;j--){
585       oidx = bs*vi[j];
586       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
587       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
588       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
589       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
590       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
591       v  -= bs2;
592     }
593     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
594     idx += bs;
595   }
596   /* backward solve the L^T */
597   for (i=n-1; i>=0; i--){
598     v    = aa + bs2*ai[i];
599     vi   = aj + ai[i];
600     nz   = ai[i+1] - ai[i];
601     idt  = bs*i;
602     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
603     for(j=0;j<nz;j++){
604       idx   = bs*vi[j];
605       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
606       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
607       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
608       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
609       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
610       v += bs2;
611     }
612   }
613   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
614   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
615   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
616   PetscFunctionReturn(0);
617 }
618 
619 #undef __FUNCT__
620 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
621 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
622 {
623   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
624   PetscErrorCode    ierr;
625   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
626   PetscInt          i,nz,idx,idt,oidx;
627   const MatScalar   *aa=a->a,*v;
628   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
629   const PetscScalar *b;
630 
631   PetscFunctionBegin;
632   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
633   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
634   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
635 
636   /* forward solve the U^T */
637   idx = 0;
638   for (i=0; i<n; i++) {
639 
640     v     = aa + 36*diag[i];
641     /* multiply by the inverse of the block diagonal */
642     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
643     x6    = x[5+idx];
644     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
645     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
646     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
647     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
648     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
649     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
650     v += 36;
651 
652     vi    = aj + diag[i] + 1;
653     nz    = ai[i+1] - diag[i] - 1;
654     while (nz--) {
655       oidx = 6*(*vi++);
656       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
657       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
658       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
659       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
660       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
661       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
662       v  += 36;
663     }
664     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
665     x[5+idx] = s6;
666     idx += 6;
667   }
668   /* backward solve the L^T */
669   for (i=n-1; i>=0; i--){
670     v    = aa + 36*diag[i] - 36;
671     vi   = aj + diag[i] - 1;
672     nz   = diag[i] - ai[i];
673     idt  = 6*i;
674     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
675     s6 = x[5+idt];
676     while (nz--) {
677       idx   = 6*(*vi--);
678       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684       v -= 36;
685     }
686   }
687   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
688   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
689   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
690   PetscFunctionReturn(0);
691 }
692 
693 #undef __FUNCT__
694 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
695 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
696 {
697   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
698   PetscErrorCode    ierr;
699   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
700   PetscInt          nz,idx,idt,j,i,oidx;
701   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
702   const MatScalar   *aa=a->a,*v;
703   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
704   const PetscScalar *b;
705 
706   PetscFunctionBegin;
707   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
708   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
709   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
710 
711   /* forward solve the U^T */
712   idx = 0;
713   for (i=0; i<n; i++) {
714     v     = aa + bs2*diag[i];
715     /* multiply by the inverse of the block diagonal */
716     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
717     x5 = x[4+idx]; x6 = x[5+idx];
718     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
719     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
720     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
721     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
722     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
723     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
724     v -= bs2;
725 
726     vi    = aj + diag[i] - 1;
727     nz    = diag[i] - diag[i+1] - 1;
728     for(j=0;j>-nz;j--){
729       oidx = bs*vi[j];
730       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
731       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
732       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
733       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
734       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
735       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
736       v  -= bs2;
737     }
738     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
739     x[5+idx] = s6;
740     idx += bs;
741   }
742   /* backward solve the L^T */
743   for (i=n-1; i>=0; i--){
744     v    = aa + bs2*ai[i];
745     vi   = aj + ai[i];
746     nz   = ai[i+1] - ai[i];
747     idt  = bs*i;
748     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
749     s6   = x[5+idt];
750     for(j=0;j<nz;j++){
751       idx   = bs*vi[j];
752       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
753       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
754       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
755       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
756       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
757       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
758       v += bs2;
759     }
760   }
761   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
762   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
763   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
764   PetscFunctionReturn(0);
765 }
766 
767 #undef __FUNCT__
768 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
769 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
770 {
771   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
772   PetscErrorCode    ierr;
773   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
774   PetscInt          i,nz,idx,idt,oidx;
775   const MatScalar   *aa=a->a,*v;
776   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
777   const PetscScalar *b;
778 
779   PetscFunctionBegin;
780   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
781   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
782   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
783 
784   /* forward solve the U^T */
785   idx = 0;
786   for (i=0; i<n; i++) {
787 
788     v     = aa + 49*diag[i];
789     /* multiply by the inverse of the block diagonal */
790     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
791     x6    = x[5+idx]; x7 = x[6+idx];
792     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
793     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
794     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
795     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
796     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
797     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
798     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
799     v += 49;
800 
801     vi    = aj + diag[i] + 1;
802     nz    = ai[i+1] - diag[i] - 1;
803     while (nz--) {
804       oidx = 7*(*vi++);
805       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
806       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
807       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
808       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
809       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
810       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
811       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
812       v  += 49;
813     }
814     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
815     x[5+idx] = s6;x[6+idx] = s7;
816     idx += 7;
817   }
818   /* backward solve the L^T */
819   for (i=n-1; i>=0; i--){
820     v    = aa + 49*diag[i] - 49;
821     vi   = aj + diag[i] - 1;
822     nz   = diag[i] - ai[i];
823     idt  = 7*i;
824     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
825     s6 = x[5+idt];s7 = x[6+idt];
826     while (nz--) {
827       idx   = 7*(*vi--);
828       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835       v -= 49;
836     }
837   }
838   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
839   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
840   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
841   PetscFunctionReturn(0);
842 }
843 #undef __FUNCT__
844 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
/*
   MatSolveTranspose_SeqBAIJ_7_NaturalOrdering - the two transpose triangular sweeps
   for a factored SeqBAIJ matrix, with the block arithmetic unrolled for 7 entries per
   block row (49 values per block).

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), the index arrays
          i and j, the diag array, the values a and bs2; bs is read from A->rmap->bs.
     bb - right-hand side; it is copied into xx with VecCopy() before any arithmetic.
   Output:
     xx - overwritten in place by the forward and backward sweeps below.

   No row/column index set is consulted, so vector entries are used in stored order.
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().

   NOTE(review): the unrolled expressions always touch 7 entries and v[0..48] while the
   strides use bs and bs2 -- presumably bs == 7 and bs2 == 49 here; confirm at the caller.
   NOTE(review): every output entry consumes 7 consecutive values of v; presumably this
   is the transposed application of a block -- confirm against the block storage order.
*/
845 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
846 {
847   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
848   PetscErrorCode    ierr;
849   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
850   PetscInt          nz,idx,idt,j,i,oidx;
851   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
852   const MatScalar   *aa=a->a,*v;
853   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
854   const PetscScalar *b;
855 
856   PetscFunctionBegin;
857   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
      /* b is acquired here and restored at the end but never read in between:
         the right-hand side already reached x through the VecCopy() above */
858   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
859   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
860 
861   /* forward solve the U^T */
862   idx = 0;
863   for (i=0; i<n; i++) {
864     v     = aa + bs2*diag[i];
865     /* multiply by the inverse of the block diagonal */
866     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
867     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
868     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
869     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
870     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
871     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
872     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
873     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
874     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
        /* step to the block stored just before the diagonal block; the loop below
           keeps moving toward lower positions (j and v both decrease) */
875     v -= bs2;
876     vi    = aj + diag[i] - 1;
        /* block count comes from the gap between consecutive diag[] entries; this
           reads diag[i+1] for i == n-1, so diag presumably holds n+1 entries -- TODO confirm */
877     nz    = diag[i] - diag[i+1] - 1;
878     for(j=0;j>-nz;j--){
879       oidx = bs*vi[j];
880       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
881       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
882       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
883       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
884       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
885       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
886       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
887       v  -= bs2;
888     }
        /* the scaled values are written back only after the updates above, which
           use s1..s7 rather than x[idx..idx+6] */
889     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
890     x[5+idx] = s6;  x[6+idx] = s7;
891     idx += bs;
892   }
893   /* backward solve the L^T */
894   for (i=n-1; i>=0; i--){
        /* row i's blocks at positions ai[i] .. ai[i+1]-1 are visited in increasing order */
895     v    = aa + bs2*ai[i];
896     vi   = aj + ai[i];
897     nz   = ai[i+1] - ai[i];
898     idt  = bs*i;
899     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
900     s6   = x[5+idt];  s7 = x[6+idt];
901     for(j=0;j<nz;j++){
902       idx   = bs*vi[j];
903       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
904       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
905       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
906       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
907       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
908       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
909       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
910       v += bs2;
911     }
912   }
913   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
914   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
      /* NOTE(review): bs*A->cmap->n is formed as an integer product before it is
         subtracted from the floating-point term */
915   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
916   PetscFunctionReturn(0);
917 }
918 
919 /*---------------------------------------------------------------------------------------------*/
920 #undef __FUNCT__
921 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
/*
   MatSolveTranspose_SeqBAIJ_1 - the two transpose triangular sweeps for a factored
   SeqBAIJ matrix using one scalar per block, with index permutations applied.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), i, j, diag, the
          values a, the work array solve_work and the index sets row and col.
     bb - right-hand side, read through VecGetArrayRead().
   Output:
     xx - receives the result through VecGetArray().

   The right-hand side is gathered as tmp[i] = b[c[i]] (c from a->col), both sweeps run
   on tmp, and the result is scattered as x[r[i]] = tmp[i] (r from a->row).
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().

   NOTE(review): tmp aliases a->solve_work, so it must hold at least n scalars; that
   is not checked here.
*/
922 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
923 {
924   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
925   IS                iscol = a->col,isrow = a->row;
926   PetscErrorCode    ierr;
927   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
928   PetscInt          i,n = a->mbs,j;
929   PetscInt          nz;
930   PetscScalar       *x,*tmp,s1;
931   const MatScalar   *aa = a->a,*v;
932   const PetscScalar *b;
933 
934   PetscFunctionBegin;
935   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
936   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
937   tmp  = a->solve_work;
938 
939   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
940   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
941 
942   /* copy the b into temp work space according to permutation */
943   for (i=0; i<n; i++) tmp[i] = b[c[i]];
944 
945   /* forward solve the U^T */
946   for (i=0; i<n; i++) {
        /* v and vi start one past adiag[i+1]; the nz entries that follow are the
           off-diagonal ones and v[nz] (position adiag[i]) is used as the diagonal factor.
           Reads adiag[i+1] for i == n-1, so adiag presumably holds n+1 entries -- TODO confirm */
947     v   = aa + adiag[i+1] + 1;
948     vi  = aj + adiag[i+1] + 1;
949     nz  = adiag[i] - adiag[i+1] - 1;
950     s1  = tmp[i];
951     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
952     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
953     tmp[i] = s1;
954   }
955 
956   /* backward solve the L^T */
957   for (i=n-1; i>=0; i--){
        /* row i's entries at positions ai[i] .. ai[i+1]-1; tmp[i] itself is not rescaled here */
958     v   = aa + ai[i];
959     vi  = aj + ai[i];
960     nz  = ai[i+1] - ai[i];
961     s1  = tmp[i];
962     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
963   }
964 
965   /* copy tmp into x according to permutation */
966   for (i=0; i<n; i++) x[r[i]] = tmp[i];
967 
968   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
969   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
970   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
971   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
972 
973   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
974   PetscFunctionReturn(0);
975 }
976 
977 #undef __FUNCT__
978 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
/*
   MatSolveTranspose_SeqBAIJ_1_inplace - the two transpose triangular sweeps for a
   factored SeqBAIJ matrix using one scalar per block, with index permutations applied.
   Here diag[i] is used as the position of row i's diagonal entry inside the span
   ai[i] .. ai[i+1]-1: entries after it are used by the forward sweep, entries before
   it by the backward sweep.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), i, j, diag, the
          values a, the work array solve_work and the index sets row and col.
     bb - right-hand side, read through VecGetArrayRead().
   Output:
     xx - receives the result through VecGetArray().

   The right-hand side is gathered as t[i] = b[c[i]] (c from a->col) and the result is
   scattered as x[r[i]] = t[i] (r from a->row).
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().
*/
979 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
980 {
981   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
982   IS                iscol=a->col,isrow=a->row;
983   PetscErrorCode    ierr;
984   const PetscInt    *r,*c,*rout,*cout;
985   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
986   PetscInt          i,nz;
987   const MatScalar   *aa=a->a,*v;
988   PetscScalar       s1,*x,*t;
989   const PetscScalar *b;
990 
991   PetscFunctionBegin;
992   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
993   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
      /* NOTE(review): t aliases a->solve_work and must hold at least n scalars; not checked here */
994   t  = a->solve_work;
995 
996   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
997   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
998 
999   /* copy the b into temp work space according to permutation */
1000   for (i=0; i<n; i++) {
1001     t[i] = b[c[i]];
1002   }
1003 
1004   /* forward solve the U^T */
1005   for (i=0; i<n; i++) {
1006 
1007     v     = aa + diag[i];
1008     /* multiply by the inverse of the block diagonal */
         /* the post-increment leaves v on the first entry after the diagonal */
1009     s1    = (*v++)*t[i];
1010     vi    = aj + diag[i] + 1;
1011     nz    = ai[i+1] - diag[i] - 1;
1012     while (nz--) {
1013       t[*vi++]  -= (*v++)*s1;
1014     }
1015     t[i]   = s1;
1016   }
1017   /* backward solve the L^T */
1018   for (i=n-1; i>=0; i--){
         /* start on the entry just before the diagonal and walk down to position ai[i] */
1019     v    = aa + diag[i] - 1;
1020     vi   = aj + diag[i] - 1;
1021     nz   = diag[i] - ai[i];
1022     s1   = t[i];
1023     while (nz--) {
1024       t[*vi--]   -=  (*v--)*s1;
1025     }
1026   }
1027 
1028   /* copy t into x according to permutation */
1029   for (i=0; i<n; i++) {
1030     x[r[i]]   = t[i];
1031   }
1032 
1033   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1034   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1035   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1036   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1037   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 #undef __FUNCT__
1042 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
/*
   MatSolveTranspose_SeqBAIJ_2_inplace - the two transpose triangular sweeps for a
   factored SeqBAIJ matrix, hard-coded for 2 entries per block row (4 values per
   block), with index permutations applied. diag[i] is used as the position of row i's
   diagonal block inside the span ai[i] .. ai[i+1]-1.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), i, j, diag, the
          values a, the work array solve_work and the index sets row and col.
     bb - right-hand side, read through VecGetArrayRead().
   Output:
     xx - receives the result through VecGetArray().

   The right-hand side is gathered block-wise as t[2*i+k] = b[2*c[i]+k] and the result
   is scattered as x[2*r[i]+k] = t[2*i+k], k = 0,1 (c from a->col, r from a->row).
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().

   NOTE(review): every output entry consumes 2 consecutive values of v; presumably this
   is the transposed application of a block -- confirm against the block storage order.
*/
1043 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1044 {
1045   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1046   IS                iscol=a->col,isrow=a->row;
1047   PetscErrorCode    ierr;
1048   const PetscInt    *r,*c,*rout,*cout;
1049   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1050   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1051   const MatScalar   *aa=a->a,*v;
1052   PetscScalar       s1,s2,x1,x2,*x,*t;
1053   const PetscScalar *b;
1054 
1055   PetscFunctionBegin;
1056   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1057   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): t aliases a->solve_work and must hold at least 2*n scalars; not checked here */
1058   t  = a->solve_work;
1059 
1060   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1061   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1062 
1063   /* copy the b into temp work space according to permutation */
1064   ii = 0;
1065   for (i=0; i<n; i++) {
1066     ic      = 2*c[i];
1067     t[ii]   = b[ic];
1068     t[ii+1] = b[ic+1];
1069     ii += 2;
1070   }
1071 
1072   /* forward solve the U^T */
1073   idx = 0;
1074   for (i=0; i<n; i++) {
1075 
1076     v     = aa + 4*diag[i];
1077     /* multiply by the inverse of the block diagonal */
1078     x1    = t[idx];   x2 = t[1+idx];
1079     s1 = v[0]*x1  +  v[1]*x2;
1080     s2 = v[2]*x1  +  v[3]*x2;
         /* advance past the diagonal block to the first block after it */
1081     v += 4;
1082 
1083     vi    = aj + diag[i] + 1;
1084     nz    = ai[i+1] - diag[i] - 1;
1085     while (nz--) {
1086       oidx = 2*(*vi++);
1087       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1088       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1089       v  += 4;
1090     }
         /* written back only after the updates above, which use s1,s2 rather than t[idx..] */
1091     t[idx]   = s1;t[1+idx] = s2;
1092     idx += 2;
1093   }
1094   /* backward solve the L^T */
1095   for (i=n-1; i>=0; i--){
         /* start on the block just before the diagonal and walk down to position ai[i] */
1096     v    = aa + 4*diag[i] - 4;
1097     vi   = aj + diag[i] - 1;
1098     nz   = diag[i] - ai[i];
1099     idt  = 2*i;
1100     s1 = t[idt];  s2 = t[1+idt];
1101     while (nz--) {
1102       idx   = 2*(*vi--);
1103       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1104       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1105       v -= 4;
1106     }
1107   }
1108 
1109   /* copy t into x according to permutation */
1110   ii = 0;
1111   for (i=0; i<n; i++) {
1112     ir      = 2*r[i];
1113     x[ir]   = t[ii];
1114     x[ir+1] = t[ii+1];
1115     ii += 2;
1116   }
1117 
1118   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1119   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1120   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1121   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1122   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1123   PetscFunctionReturn(0);
1124 }
1125 
1126 #undef __FUNCT__
1127 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
/*
   MatSolveTranspose_SeqBAIJ_2 - the two transpose triangular sweeps for a factored
   SeqBAIJ matrix, with the block arithmetic unrolled for 2 entries per block row
   (4 values per block) and index permutations applied.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), i, j, diag, the
          values a, bs2, the work array solve_work and the index sets row and col;
          bs is read from A->rmap->bs.
     bb - right-hand side, read through VecGetArrayRead().
   Output:
     xx - receives the result through VecGetArray().

   The right-hand side is gathered block-wise as t[bs*i+k] = b[bs*c[i]+k] and the result
   is scattered as x[bs*r[i]+k] = t[bs*i+k], k = 0,1 (c from a->col, r from a->row).
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().

   NOTE(review): the unrolled expressions always touch 2 entries and v[0..3] while the
   strides use bs and bs2 -- presumably bs == 2 and bs2 == 4 here; confirm at the caller.
   NOTE(review): every output entry consumes 2 consecutive values of v; presumably this
   is the transposed application of a block -- confirm against the block storage order.
*/
1128 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1129 {
1130   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1131   PetscErrorCode    ierr;
1132   IS                iscol=a->col,isrow=a->row;
1133   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1134   const PetscInt    *r,*c,*rout,*cout;
1135   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1136   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1137   const MatScalar   *aa=a->a,*v;
1138   PetscScalar       s1,s2,x1,x2,*x,*t;
1139   const PetscScalar *b;
1140 
1141   PetscFunctionBegin;
1142   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1143   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): t aliases a->solve_work and must hold at least bs*n scalars; not checked here */
1144   t = a->solve_work;
1145 
1146   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1147   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1148 
1149   /* copy b into temp work space according to permutation */
1150   for(i=0;i<n;i++){
1151     ii = bs*i; ic = bs*c[i];
1152     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1153   }
1154 
1155   /* forward solve the U^T */
1156   idx = 0;
1157   for (i=0; i<n; i++) {
1158     v     = aa + bs2*diag[i];
1159     /* multiply by the inverse of the block diagonal */
1160     x1 = t[idx];   x2 = t[1+idx];
1161     s1 = v[0]*x1  +  v[1]*x2;
1162     s2 = v[2]*x1  +  v[3]*x2;
         /* step to the block stored just before the diagonal block; the loop below
            keeps moving toward lower positions (j and v both decrease) */
1163     v -= bs2;
1164 
1165     vi    = aj + diag[i] - 1;
         /* block count comes from the gap between consecutive diag[] entries; this
            reads diag[i+1] for i == n-1, so diag presumably holds n+1 entries -- TODO confirm */
1166     nz    = diag[i] - diag[i+1] - 1;
1167     for(j=0;j>-nz;j--){
1168       oidx = bs*vi[j];
1169       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1170       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1171       v  -= bs2;
1172     }
         /* written back only after the updates above, which use s1,s2 rather than t[idx..] */
1173     t[idx]   = s1;t[1+idx] = s2;
1174     idx += bs;
1175   }
1176   /* backward solve the L^T */
1177   for (i=n-1; i>=0; i--){
         /* row i's blocks at positions ai[i] .. ai[i+1]-1 are visited in increasing order */
1178     v    = aa + bs2*ai[i];
1179     vi   = aj + ai[i];
1180     nz   = ai[i+1] - ai[i];
1181     idt  = bs*i;
1182     s1   = t[idt];  s2 = t[1+idt];
1183     for(j=0;j<nz;j++){
1184       idx   = bs*vi[j];
1185       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1186       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1187       v += bs2;
1188     }
1189   }
1190 
1191   /* copy t into x according to permutation */
1192   for(i=0;i<n;i++){
1193     ii = bs*i;  ir = bs*r[i];
1194     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1195   }
1196 
1197   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1198   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1199   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1200   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): bs*A->cmap->n is formed as an integer product before it is
          subtracted from the floating-point term */
1201   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1202   PetscFunctionReturn(0);
1203 }
1204 
1205 #undef __FUNCT__
1206 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
/*
   MatSolveTranspose_SeqBAIJ_3_inplace - the two transpose triangular sweeps for a
   factored SeqBAIJ matrix, hard-coded for 3 entries per block row (9 values per
   block), with index permutations applied. diag[i] is used as the position of row i's
   diagonal block inside the span ai[i] .. ai[i+1]-1.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), i, j, diag, the
          values a, the work array solve_work and the index sets row and col.
     bb - right-hand side, read through VecGetArrayRead().
   Output:
     xx - receives the result through VecGetArray().

   The right-hand side is gathered block-wise as t[3*i+k] = b[3*c[i]+k] and the result
   is scattered as x[3*r[i]+k] = t[3*i+k], k = 0..2 (c from a->col, r from a->row).
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().

   NOTE(review): every output entry consumes 3 consecutive values of v; presumably this
   is the transposed application of a block -- confirm against the block storage order.
*/
1207 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1208 {
1209   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1210   IS                iscol=a->col,isrow=a->row;
1211   PetscErrorCode    ierr;
1212   const PetscInt    *r,*c,*rout,*cout;
1213   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1214   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1215   const MatScalar   *aa=a->a,*v;
1216   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1217   const PetscScalar *b;
1218 
1219   PetscFunctionBegin;
1220   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1221   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): t aliases a->solve_work and must hold at least 3*n scalars; not checked here */
1222   t  = a->solve_work;
1223 
1224   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1225   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1226 
1227   /* copy the b into temp work space according to permutation */
1228   ii = 0;
1229   for (i=0; i<n; i++) {
1230     ic      = 3*c[i];
1231     t[ii]   = b[ic];
1232     t[ii+1] = b[ic+1];
1233     t[ii+2] = b[ic+2];
1234     ii += 3;
1235   }
1236 
1237   /* forward solve the U^T */
1238   idx = 0;
1239   for (i=0; i<n; i++) {
1240 
1241     v     = aa + 9*diag[i];
1242     /* multiply by the inverse of the block diagonal */
1243     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1244     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1245     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1246     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
         /* advance past the diagonal block to the first block after it */
1247     v += 9;
1248 
1249     vi    = aj + diag[i] + 1;
1250     nz    = ai[i+1] - diag[i] - 1;
1251     while (nz--) {
1252       oidx = 3*(*vi++);
1253       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1254       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1255       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1256       v  += 9;
1257     }
         /* written back only after the updates above, which use s1..s3 rather than t[idx..] */
1258     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1259     idx += 3;
1260   }
1261   /* backward solve the L^T */
1262   for (i=n-1; i>=0; i--){
         /* start on the block just before the diagonal and walk down to position ai[i] */
1263     v    = aa + 9*diag[i] - 9;
1264     vi   = aj + diag[i] - 1;
1265     nz   = diag[i] - ai[i];
1266     idt  = 3*i;
1267     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1268     while (nz--) {
1269       idx   = 3*(*vi--);
1270       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1271       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1272       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1273       v -= 9;
1274     }
1275   }
1276 
1277   /* copy t into x according to permutation */
1278   ii = 0;
1279   for (i=0; i<n; i++) {
1280     ir      = 3*r[i];
1281     x[ir]   = t[ii];
1282     x[ir+1] = t[ii+1];
1283     x[ir+2] = t[ii+2];
1284     ii += 3;
1285   }
1286 
1287   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1288   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1289   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1290   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1291   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1292   PetscFunctionReturn(0);
1293 }
1294 
1295 #undef __FUNCT__
1296 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
/*
   MatSolveTranspose_SeqBAIJ_3 - the two transpose triangular sweeps for a factored
   SeqBAIJ matrix, with the block arithmetic unrolled for 3 entries per block row
   (9 values per block) and index permutations applied.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), i, j, diag, the
          values a, bs2, the work array solve_work and the index sets row and col;
          bs is read from A->rmap->bs.
     bb - right-hand side, read through VecGetArrayRead().
   Output:
     xx - receives the result through VecGetArray().

   The right-hand side is gathered block-wise as t[bs*i+k] = b[bs*c[i]+k] and the result
   is scattered as x[bs*r[i]+k] = t[bs*i+k], k = 0..2 (c from a->col, r from a->row).
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().

   NOTE(review): the unrolled expressions always touch 3 entries and v[0..8] while the
   strides use bs and bs2 -- presumably bs == 3 and bs2 == 9 here; confirm at the caller.
   NOTE(review): every output entry consumes 3 consecutive values of v; presumably this
   is the transposed application of a block -- confirm against the block storage order.
*/
1297 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1298 {
1299   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1300   PetscErrorCode    ierr;
1301   IS                iscol=a->col,isrow=a->row;
1302   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1303   const PetscInt    *r,*c,*rout,*cout;
1304   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1305   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1306   const MatScalar   *aa=a->a,*v;
1307   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1308   const PetscScalar *b;
1309 
1310   PetscFunctionBegin;
1311   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1312   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): t aliases a->solve_work and must hold at least bs*n scalars; not checked here */
1313   t = a->solve_work;
1314 
1315   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1316   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1317 
1318   /* copy b into temp work space according to permutation */
1319   for(i=0;i<n;i++){
1320     ii = bs*i; ic = bs*c[i];
1321     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1322   }
1323 
1324   /* forward solve the U^T */
1325   idx = 0;
1326   for (i=0; i<n; i++) {
1327     v     = aa + bs2*diag[i];
1328     /* multiply by the inverse of the block diagonal */
1329     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1330     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1331     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1332     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
         /* step to the block stored just before the diagonal block; the loop below
            keeps moving toward lower positions (j and v both decrease) */
1333     v -= bs2;
1334 
1335     vi    = aj + diag[i] - 1;
         /* block count comes from the gap between consecutive diag[] entries; this
            reads diag[i+1] for i == n-1, so diag presumably holds n+1 entries -- TODO confirm */
1336     nz    = diag[i] - diag[i+1] - 1;
1337     for(j=0;j>-nz;j--){
1338       oidx = bs*vi[j];
1339       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1340       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1341       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1342       v  -= bs2;
1343     }
         /* written back only after the updates above, which use s1..s3 rather than t[idx..] */
1344     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1345     idx += bs;
1346   }
1347   /* backward solve the L^T */
1348   for (i=n-1; i>=0; i--){
         /* row i's blocks at positions ai[i] .. ai[i+1]-1 are visited in increasing order */
1349     v    = aa + bs2*ai[i];
1350     vi   = aj + ai[i];
1351     nz   = ai[i+1] - ai[i];
1352     idt  = bs*i;
1353     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1354     for(j=0;j<nz;j++){
1355       idx   = bs*vi[j];
1356       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1357       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1358       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1359       v += bs2;
1360     }
1361   }
1362 
1363   /* copy t into x according to permutation */
1364   for(i=0;i<n;i++){
1365     ii = bs*i;  ir = bs*r[i];
1366     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1367   }
1368 
1369   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1370   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1371   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1372   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): bs*A->cmap->n is formed as an integer product before it is
          subtracted from the floating-point term */
1373   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1374   PetscFunctionReturn(0);
1375 }
1376 
1377 #undef __FUNCT__
1378 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
/*
   MatSolveTranspose_SeqBAIJ_4_inplace - the two transpose triangular sweeps for a
   factored SeqBAIJ matrix, hard-coded for 4 entries per block row (16 values per
   block), with index permutations applied. diag[i] is used as the position of row i's
   diagonal block inside the span ai[i] .. ai[i+1]-1.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), i, j, diag, the
          values a, the work array solve_work and the index sets row and col.
     bb - right-hand side, read through VecGetArrayRead().
   Output:
     xx - receives the result through VecGetArray().

   The right-hand side is gathered block-wise as t[4*i+k] = b[4*c[i]+k] and the result
   is scattered as x[4*r[i]+k] = t[4*i+k], k = 0..3 (c from a->col, r from a->row).
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().

   NOTE(review): every output entry consumes 4 consecutive values of v; presumably this
   is the transposed application of a block -- confirm against the block storage order.
*/
1379 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1380 {
1381   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1382   IS                iscol=a->col,isrow=a->row;
1383   PetscErrorCode    ierr;
1384   const PetscInt    *r,*c,*rout,*cout;
1385   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1386   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1387   const MatScalar   *aa=a->a,*v;
1388   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1389   const PetscScalar *b;
1390 
1391   PetscFunctionBegin;
1392   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1393   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): t aliases a->solve_work and must hold at least 4*n scalars; not checked here */
1394   t  = a->solve_work;
1395 
1396   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1397   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1398 
1399   /* copy the b into temp work space according to permutation */
1400   ii = 0;
1401   for (i=0; i<n; i++) {
1402     ic      = 4*c[i];
1403     t[ii]   = b[ic];
1404     t[ii+1] = b[ic+1];
1405     t[ii+2] = b[ic+2];
1406     t[ii+3] = b[ic+3];
1407     ii += 4;
1408   }
1409 
1410   /* forward solve the U^T */
1411   idx = 0;
1412   for (i=0; i<n; i++) {
1413 
1414     v     = aa + 16*diag[i];
1415     /* multiply by the inverse of the block diagonal */
1416     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1417     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1418     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1419     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1420     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
         /* advance past the diagonal block to the first block after it */
1421     v += 16;
1422 
1423     vi    = aj + diag[i] + 1;
1424     nz    = ai[i+1] - diag[i] - 1;
1425     while (nz--) {
1426       oidx = 4*(*vi++);
1427       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1428       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1429       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1430       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1431       v  += 16;
1432     }
         /* written back only after the updates above, which use s1..s4 rather than t[idx..] */
1433     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1434     idx += 4;
1435   }
1436   /* backward solve the L^T */
1437   for (i=n-1; i>=0; i--){
         /* start on the block just before the diagonal and walk down to position ai[i] */
1438     v    = aa + 16*diag[i] - 16;
1439     vi   = aj + diag[i] - 1;
1440     nz   = diag[i] - ai[i];
1441     idt  = 4*i;
1442     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1443     while (nz--) {
1444       idx   = 4*(*vi--);
1445       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1446       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1447       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1448       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1449       v -= 16;
1450     }
1451   }
1452 
1453   /* copy t into x according to permutation */
1454   ii = 0;
1455   for (i=0; i<n; i++) {
1456     ir      = 4*r[i];
1457     x[ir]   = t[ii];
1458     x[ir+1] = t[ii+1];
1459     x[ir+2] = t[ii+2];
1460     x[ir+3] = t[ii+3];
1461     ii += 4;
1462   }
1463 
1464   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1465   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1466   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1467   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1468   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1469   PetscFunctionReturn(0);
1470 }
1471 
1472 #undef __FUNCT__
1473 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
/*
   MatSolveTranspose_SeqBAIJ_4 - the two transpose triangular sweeps for a factored
   SeqBAIJ matrix, with the block arithmetic unrolled for 4 entries per block row
   (16 values per block) and index permutations applied.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), i, j, diag, the
          values a, bs2, the work array solve_work and the index sets row and col;
          bs is read from A->rmap->bs.
     bb - right-hand side, read through VecGetArrayRead().
   Output:
     xx - receives the result through VecGetArray().

   The right-hand side is gathered block-wise as t[bs*i+k] = b[bs*c[i]+k] and the result
   is scattered as x[bs*r[i]+k] = t[bs*i+k], k = 0..3 (c from a->col, r from a->row).
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().

   NOTE(review): the unrolled expressions always touch 4 entries and v[0..15] while the
   strides use bs and bs2 -- presumably bs == 4 and bs2 == 16 here; confirm at the caller.
   NOTE(review): every output entry consumes 4 consecutive values of v; presumably this
   is the transposed application of a block -- confirm against the block storage order.
*/
1474 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1475 {
1476   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1477   PetscErrorCode    ierr;
1478   IS                iscol=a->col,isrow=a->row;
1479   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1480   const PetscInt    *r,*c,*rout,*cout;
1481   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1482   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1483   const MatScalar   *aa=a->a,*v;
1484   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1485   const PetscScalar *b;
1486 
1487   PetscFunctionBegin;
1488   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1489   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): t aliases a->solve_work and must hold at least bs*n scalars; not checked here */
1490   t = a->solve_work;
1491 
1492   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1493   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1494 
1495   /* copy b into temp work space according to permutation */
1496   for(i=0;i<n;i++){
1497     ii = bs*i; ic = bs*c[i];
1498     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1499   }
1500 
1501   /* forward solve the U^T */
1502   idx = 0;
1503   for (i=0; i<n; i++) {
1504     v     = aa + bs2*diag[i];
1505     /* multiply by the inverse of the block diagonal */
1506     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1507     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1508     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1509     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1510     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
         /* step to the block stored just before the diagonal block; the loop below
            keeps moving toward lower positions (j and v both decrease) */
1511     v -= bs2;
1512 
1513     vi    = aj + diag[i] - 1;
         /* block count comes from the gap between consecutive diag[] entries; this
            reads diag[i+1] for i == n-1, so diag presumably holds n+1 entries -- TODO confirm */
1514     nz    = diag[i] - diag[i+1] - 1;
1515     for(j=0;j>-nz;j--){
1516       oidx = bs*vi[j];
1517       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1518       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1519       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1520       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1521       v  -= bs2;
1522     }
         /* written back only after the updates above, which use s1..s4 rather than t[idx..] */
1523     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1524     idx += bs;
1525   }
1526   /* backward solve the L^T */
1527   for (i=n-1; i>=0; i--){
         /* row i's blocks at positions ai[i] .. ai[i+1]-1 are visited in increasing order */
1528     v    = aa + bs2*ai[i];
1529     vi   = aj + ai[i];
1530     nz   = ai[i+1] - ai[i];
1531     idt  = bs*i;
1532     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1533     for(j=0;j<nz;j++){
1534       idx   = bs*vi[j];
1535       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1536       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1537       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1538       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1539       v += bs2;
1540     }
1541   }
1542 
1543   /* copy t into x according to permutation */
1544   for(i=0;i<n;i++){
1545     ii = bs*i;  ir = bs*r[i];
1546     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1547   }
1548 
1549   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1550   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1551   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1552   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): bs*A->cmap->n is formed as an integer product before it is
          subtracted from the floating-point term */
1553   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1554   PetscFunctionReturn(0);
1555 }
1556 
1557 #undef __FUNCT__
1558 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
/*
   MatSolveTranspose_SeqBAIJ_5_inplace - the two transpose triangular sweeps for a
   factored SeqBAIJ matrix, hard-coded for 5 entries per block row (25 values per
   block), with index permutations applied. diag[i] is used as the position of row i's
   diagonal block inside the span ai[i] .. ai[i+1]-1.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies mbs (loop count n), i, j, diag, the
          values a, the work array solve_work and the index sets row and col.
     bb - right-hand side, read through VecGetArrayRead().
   Output:
     xx - receives the result through VecGetArray().

   The right-hand side is gathered block-wise as t[5*i+k] = b[5*c[i]+k] and the result
   is scattered as x[5*r[i]+k] = t[5*i+k], k = 0..4 (c from a->col, r from a->row).
   Returns 0 via PetscFunctionReturn(); each PETSc call's code is handed to CHKERRQ().

   NOTE(review): every output entry consumes 5 consecutive values of v; presumably this
   is the transposed application of a block -- confirm against the block storage order.
*/
1559 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1560 {
1561   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1562   IS                iscol=a->col,isrow=a->row;
1563   PetscErrorCode    ierr;
1564   const PetscInt    *r,*c,*rout,*cout;
1565   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1566   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1567   const MatScalar   *aa=a->a,*v;
1568   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1569   const PetscScalar *b;
1570 
1571   PetscFunctionBegin;
1572   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1573   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* NOTE(review): t aliases a->solve_work and must hold at least 5*n scalars; not checked here */
1574   t  = a->solve_work;
1575 
1576   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1577   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1578 
1579   /* copy the b into temp work space according to permutation */
1580   ii = 0;
1581   for (i=0; i<n; i++) {
1582     ic      = 5*c[i];
1583     t[ii]   = b[ic];
1584     t[ii+1] = b[ic+1];
1585     t[ii+2] = b[ic+2];
1586     t[ii+3] = b[ic+3];
1587     t[ii+4] = b[ic+4];
1588     ii += 5;
1589   }
1590 
1591   /* forward solve the U^T */
1592   idx = 0;
1593   for (i=0; i<n; i++) {
1594 
1595     v     = aa + 25*diag[i];
1596     /* multiply by the inverse of the block diagonal */
1597     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1598     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1599     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1600     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1601     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1602     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
         /* advance past the diagonal block to the first block after it */
1603     v += 25;
1604 
1605     vi    = aj + diag[i] + 1;
1606     nz    = ai[i+1] - diag[i] - 1;
1607     while (nz--) {
1608       oidx = 5*(*vi++);
1609       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1610       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1611       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1612       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1613       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1614       v  += 25;
1615     }
         /* written back only after the updates above, which use s1..s5 rather than t[idx..] */
1616     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1617     idx += 5;
1618   }
1619   /* backward solve the L^T */
1620   for (i=n-1; i>=0; i--){
         /* start on the block just before the diagonal and walk down to position ai[i] */
1621     v    = aa + 25*diag[i] - 25;
1622     vi   = aj + diag[i] - 1;
1623     nz   = diag[i] - ai[i];
1624     idt  = 5*i;
1625     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1626     while (nz--) {
1627       idx   = 5*(*vi--);
1628       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1629       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1630       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1631       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1632       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1633       v -= 25;
1634     }
1635   }
1636 
1637   /* copy t into x according to permutation */
1638   ii = 0;
1639   for (i=0; i<n; i++) {
1640     ir      = 5*r[i];
1641     x[ir]   = t[ii];
1642     x[ir+1] = t[ii+1];
1643     x[ir+2] = t[ii+2];
1644     x[ir+3] = t[ii+3];
1645     x[ir+4] = t[ii+4];
1646     ii += 5;
1647   }
1648 
1649   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1650   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1651   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1652   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1653   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1654   PetscFunctionReturn(0);
1655 }
1656 
1657 #undef __FUNCT__
1658 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
     /*
        MatSolveTranspose_SeqBAIJ_5 - transposed triangular solve with the factor
        data held in a Mat_SeqBAIJ, hand-unrolled for 5 scalars per block row
        (25 scalars per block).

        Input:
          A  - matrix; its Mat_SeqBAIJ data supplies i, j, diag, a, the row and
               column index sets and the solve_work scratch array
          bb - right-hand side, accessed read-only
        Output:
          xx - solution, overwritten
        Returns 0 through PetscFunctionReturn; every PETSc call is checked with
        CHKERRQ.

        Steps: gather bb into the scratch array t through the column index set,
        sweep forward over the U^T part, sweep backward over the L^T part, then
        scatter t into xx through the row index set.

        NOTE(review): the unrolled arithmetic touches exactly 5 entries per block
        row and 25 per block although bs and bs2 are read from A at run time -
        presumably the caller only installs this routine when bs == 5; confirm.
     */
1659 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1660 {
1661   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1662   PetscErrorCode    ierr;
1663   IS                iscol=a->col,isrow=a->row;
1664   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1665   const PetscInt    *r,*c,*rout,*cout;
1666   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1667   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1668   const MatScalar   *aa=a->a,*v;
1669   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1670   const PetscScalar *b;
1671 
1672   PetscFunctionBegin;
1673   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1674   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* all intermediate results live in the matrix-owned scratch array */
1675   t = a->solve_work;
1676 
1677   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1678   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1679 
1680   /* copy b into temp work space according to permutation */
       /* block i of t receives block c[i] of b */
1681   for(i=0;i<n;i++){
1682     ii = bs*i; ic = bs*c[i];
1683     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1684     t[ii+4] = b[ic+4];
1685   }
1686 
1687   /* forward solve the U^T */
1688   idx = 0;
1689   for (i=0; i<n; i++) {
         /* v points at the block stored at position diag[i] */
1690     v     = aa + bs2*diag[i];
1691     /* multiply by the inverse of the block diagonal */
         /* s_k combines the five consecutive entries v[5(k-1)] .. v[5(k-1)+4] with x1..x5 */
1692     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1693     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1694     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1695     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1696     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1697     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
         /* step back to the block stored just before position diag[i] */
1698     v -= bs2;
1699 
         /* the nz blocks at positions diag[i]-1 down to diag[i+1]+1 are visited
            with a non-positive index j, so both vi[j] and v move toward lower addresses */
1700     vi    = aj + diag[i] - 1;
1701     nz    = diag[i] - diag[i+1] - 1;
1702     for(j=0;j>-nz;j--){
1703       oidx = bs*vi[j];
1704       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1705       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1706       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1707       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1708       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1709       v  -= bs2;
1710     }
         /* store the scaled block back only after it has been applied to the other block rows */
1711     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1712     idx += bs;
1713   }
1714   /* backward solve the L^T */
1715   for (i=n-1; i>=0; i--){
         /* blocks at positions ai[i] .. ai[i+1]-1 are visited in increasing order */
1716     v    = aa + bs2*ai[i];
1717     vi   = aj + ai[i];
1718     nz   = ai[i+1] - ai[i];
1719     idt  = bs*i;
1720     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1721     for(j=0;j<nz;j++){
1722       idx   = bs*vi[j];
1723       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1724       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1725       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1726       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1727       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1728       v += bs2;
1729     }
1730   }
1731 
1732   /* copy t into x according to permutation */
       /* block i of t lands in block r[i] of x */
1733   for(i=0;i<n;i++){
1734     ii = bs*i;  ir = bs*r[i];
1735     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1736     x[ir+4] = t[ii+4];
1737   }
1738 
1739   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1740   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1741   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1742   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1743   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1744   PetscFunctionReturn(0);
1745 }
1746 
1747 #undef __FUNCT__
1748 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
     /*
        MatSolveTranspose_SeqBAIJ_6_inplace - transposed triangular solve with the
        factor data held in a Mat_SeqBAIJ, hand-unrolled with the literal block
        size 6 (36 scalars per block).

        Input:
          A  - matrix; its Mat_SeqBAIJ data supplies i, j, diag, a, the row and
               column index sets and the solve_work scratch array
          bb - right-hand side, accessed read-only
        Output:
          xx - solution, overwritten
        Returns 0 through PetscFunctionReturn; every PETSc call is checked with
        CHKERRQ.

        Indexing used here: within block row i the positions ai[i] .. diag[i]-1
        are used by the L^T sweep, position diag[i] by the diagonal multiply and
        positions diag[i]+1 .. ai[i+1]-1 by the U^T sweep.
     */
1749 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1750 {
1751   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1752   IS                iscol=a->col,isrow=a->row;
1753   PetscErrorCode    ierr;
1754   const PetscInt    *r,*c,*rout,*cout;
1755   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1756   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1757   const MatScalar   *aa=a->a,*v;
1758   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1759   const PetscScalar *b;
1760 
1761   PetscFunctionBegin;
1762   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* all intermediate results live in the matrix-owned scratch array */
1764   t  = a->solve_work;
1765 
1766   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1767   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1768 
1769   /* copy the b into temp work space according to permutation */
       /* block i of t receives block c[i] of b; ii advances by 6 per block */
1770   ii = 0;
1771   for (i=0; i<n; i++) {
1772     ic      = 6*c[i];
1773     t[ii]   = b[ic];
1774     t[ii+1] = b[ic+1];
1775     t[ii+2] = b[ic+2];
1776     t[ii+3] = b[ic+3];
1777     t[ii+4] = b[ic+4];
1778     t[ii+5] = b[ic+5];
1779     ii += 6;
1780   }
1781 
1782   /* forward solve the U^T */
1783   idx = 0;
1784   for (i=0; i<n; i++) {
1785 
1786     v     = aa + 36*diag[i];
1787     /* multiply by the inverse of the block diagonal */
         /* s_k combines the six consecutive entries v[6(k-1)] .. v[6(k-1)+5] with x1..x6 */
1788     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1789     x6    = t[5+idx];
1790     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1791     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1792     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1793     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1794     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1795     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
         /* advance to the block stored right after the diagonal block */
1796     v += 36;
1797 
         /* remaining blocks of row i after the diagonal: positions diag[i]+1 .. ai[i+1]-1 */
1798     vi    = aj + diag[i] + 1;
1799     nz    = ai[i+1] - diag[i] - 1;
1800     while (nz--) {
1801       oidx = 6*(*vi++);
1802       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1803       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1804       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1805       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1806       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1807       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1808       v  += 36;
1809     }
         /* store the scaled block back only after it has been applied to the other block rows */
1810     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1811     t[5+idx] = s6;
1812     idx += 6;
1813   }
1814   /* backward solve the L^T */
1815   for (i=n-1; i>=0; i--){
         /* start at the block just before the diagonal and walk back to position ai[i] */
1816     v    = aa + 36*diag[i] - 36;
1817     vi   = aj + diag[i] - 1;
1818     nz   = diag[i] - ai[i];
1819     idt  = 6*i;
1820     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1821     s6 = t[5+idt];
1822     while (nz--) {
1823       idx   = 6*(*vi--);
1824       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1825       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1826       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1827       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1828       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1829       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1830       v -= 36;
1831     }
1832   }
1833 
1834   /* copy t into x according to permutation */
       /* block i of t lands in block r[i] of x */
1835   ii = 0;
1836   for (i=0; i<n; i++) {
1837     ir      = 6*r[i];
1838     x[ir]   = t[ii];
1839     x[ir+1] = t[ii+1];
1840     x[ir+2] = t[ii+2];
1841     x[ir+3] = t[ii+3];
1842     x[ir+4] = t[ii+4];
1843     x[ir+5] = t[ii+5];
1844     ii += 6;
1845   }
1846 
1847   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1848   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1849   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1850   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1851   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1852   PetscFunctionReturn(0);
1853 }
1854 
1855 #undef __FUNCT__
1856 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
     /*
        MatSolveTranspose_SeqBAIJ_6 - transposed triangular solve with the factor
        data held in a Mat_SeqBAIJ, hand-unrolled for 6 scalars per block row
        (36 scalars per block).

        Input:
          A  - matrix; its Mat_SeqBAIJ data supplies i, j, diag, a, the row and
               column index sets and the solve_work scratch array
          bb - right-hand side, accessed read-only
        Output:
          xx - solution, overwritten
        Returns 0 through PetscFunctionReturn; every PETSc call is checked with
        CHKERRQ.

        Indexing used here: the L^T sweep reads positions ai[i] .. ai[i+1]-1,
        while the U^T sweep reads position diag[i] for the diagonal multiply and
        then positions diag[i]-1 down to diag[i+1]+1.

        NOTE(review): the unrolled arithmetic touches exactly 6 entries per block
        row and 36 per block although bs and bs2 are read from A at run time -
        presumably the caller only installs this routine when bs == 6; confirm.
     */
1857 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1858 {
1859   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1860   PetscErrorCode    ierr;
1861   IS                iscol=a->col,isrow=a->row;
1862   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1863   const PetscInt    *r,*c,*rout,*cout;
1864   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1865   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1866   const MatScalar   *aa=a->a,*v;
1867   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1868   const PetscScalar *b;
1869 
1870   PetscFunctionBegin;
1871   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1872   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* all intermediate results live in the matrix-owned scratch array */
1873   t = a->solve_work;
1874 
1875   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1876   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1877 
1878   /* copy b into temp work space according to permutation */
       /* block i of t receives block c[i] of b */
1879   for(i=0;i<n;i++){
1880     ii = bs*i; ic = bs*c[i];
1881     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1882     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1883   }
1884 
1885   /* forward solve the U^T */
1886   idx = 0;
1887   for (i=0; i<n; i++) {
         /* v points at the block stored at position diag[i] */
1888     v     = aa + bs2*diag[i];
1889     /* multiply by the inverse of the block diagonal */
         /* s_k combines the six consecutive entries v[6(k-1)] .. v[6(k-1)+5] with x1..x6 */
1890     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1891     x6    = t[5+idx];
1892     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1893     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1894     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1895     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1896     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1897     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
         /* step back to the block stored just before position diag[i] */
1898     v -= bs2;
1899 
         /* the nz blocks at positions diag[i]-1 down to diag[i+1]+1 are visited
            with a non-positive index j, so both vi[j] and v move toward lower addresses */
1900     vi    = aj + diag[i] - 1;
1901     nz    = diag[i] - diag[i+1] - 1;
1902     for(j=0;j>-nz;j--){
1903       oidx = bs*vi[j];
1904       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1905       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1906       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1907       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1908       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1909       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1910       v  -= bs2;
1911     }
         /* store the scaled block back only after it has been applied to the other block rows */
1912     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1913     t[5+idx] = s6;
1914     idx += bs;
1915   }
1916   /* backward solve the L^T */
1917   for (i=n-1; i>=0; i--){
         /* blocks at positions ai[i] .. ai[i+1]-1 are visited in increasing order */
1918     v    = aa + bs2*ai[i];
1919     vi   = aj + ai[i];
1920     nz   = ai[i+1] - ai[i];
1921     idt  = bs*i;
1922     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1923     s6   = t[5+idt];
1924    for(j=0;j<nz;j++){
1925       idx   = bs*vi[j];
1926       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1927       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1928       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1929       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1930       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1931       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1932       v += bs2;
1933     }
1934   }
1935 
1936   /* copy t into x according to permutation */
       /* block i of t lands in block r[i] of x */
1937   for(i=0;i<n;i++){
1938     ii = bs*i;  ir = bs*r[i];
1939     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1940     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1941   }
1942 
1943   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1944   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1945   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1946   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1947   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1948   PetscFunctionReturn(0);
1949 }
1950 
1951 #undef __FUNCT__
1952 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
     /*
        MatSolveTranspose_SeqBAIJ_7_inplace - transposed triangular solve with the
        factor data held in a Mat_SeqBAIJ, hand-unrolled with the literal block
        size 7 (49 scalars per block).

        Input:
          A  - matrix; its Mat_SeqBAIJ data supplies i, j, diag, a, the row and
               column index sets and the solve_work scratch array
          bb - right-hand side, accessed read-only
        Output:
          xx - solution, overwritten
        Returns 0 through PetscFunctionReturn; every PETSc call is checked with
        CHKERRQ.

        Indexing used here: within block row i the positions ai[i] .. diag[i]-1
        are used by the L^T sweep, position diag[i] by the diagonal multiply and
        positions diag[i]+1 .. ai[i+1]-1 by the U^T sweep.
     */
1953 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1954 {
1955   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1956   IS                iscol=a->col,isrow=a->row;
1957   PetscErrorCode    ierr;
1958   const PetscInt    *r,*c,*rout,*cout;
1959   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1960   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1961   const MatScalar   *aa=a->a,*v;
1962   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1963   const PetscScalar *b;
1964 
1965   PetscFunctionBegin;
1966   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1967   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* all intermediate results live in the matrix-owned scratch array */
1968   t  = a->solve_work;
1969 
1970   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1971   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1972 
1973   /* copy the b into temp work space according to permutation */
       /* block i of t receives block c[i] of b; ii advances by 7 per block */
1974   ii = 0;
1975   for (i=0; i<n; i++) {
1976     ic      = 7*c[i];
1977     t[ii]   = b[ic];
1978     t[ii+1] = b[ic+1];
1979     t[ii+2] = b[ic+2];
1980     t[ii+3] = b[ic+3];
1981     t[ii+4] = b[ic+4];
1982     t[ii+5] = b[ic+5];
1983     t[ii+6] = b[ic+6];
1984     ii += 7;
1985   }
1986 
1987   /* forward solve the U^T */
1988   idx = 0;
1989   for (i=0; i<n; i++) {
1990 
1991     v     = aa + 49*diag[i];
1992     /* multiply by the inverse of the block diagonal */
         /* s_k combines the seven consecutive entries v[7(k-1)] .. v[7(k-1)+6] with x1..x7 */
1993     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1994     x6    = t[5+idx]; x7 = t[6+idx];
1995     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1996     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1997     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1998     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1999     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2000     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2001     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
         /* advance to the block stored right after the diagonal block */
2002     v += 49;
2003 
         /* remaining blocks of row i after the diagonal: positions diag[i]+1 .. ai[i+1]-1 */
2004     vi    = aj + diag[i] + 1;
2005     nz    = ai[i+1] - diag[i] - 1;
2006     while (nz--) {
2007       oidx = 7*(*vi++);
2008       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2009       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2010       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2011       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2012       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2013       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2014       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2015       v  += 49;
2016     }
         /* store the scaled block back only after it has been applied to the other block rows */
2017     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2018     t[5+idx] = s6;t[6+idx] = s7;
2019     idx += 7;
2020   }
2021   /* backward solve the L^T */
2022   for (i=n-1; i>=0; i--){
         /* start at the block just before the diagonal and walk back to position ai[i] */
2023     v    = aa + 49*diag[i] - 49;
2024     vi   = aj + diag[i] - 1;
2025     nz   = diag[i] - ai[i];
2026     idt  = 7*i;
2027     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2028     s6 = t[5+idt];s7 = t[6+idt];
2029     while (nz--) {
2030       idx   = 7*(*vi--);
2031       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2032       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2033       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2034       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2035       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2036       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2037       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2038       v -= 49;
2039     }
2040   }
2041 
2042   /* copy t into x according to permutation */
       /* block i of t lands in block r[i] of x */
2043   ii = 0;
2044   for (i=0; i<n; i++) {
2045     ir      = 7*r[i];
2046     x[ir]   = t[ii];
2047     x[ir+1] = t[ii+1];
2048     x[ir+2] = t[ii+2];
2049     x[ir+3] = t[ii+3];
2050     x[ir+4] = t[ii+4];
2051     x[ir+5] = t[ii+5];
2052     x[ir+6] = t[ii+6];
2053     ii += 7;
2054   }
2055 
2056   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2057   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2058   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2059   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2060   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2061   PetscFunctionReturn(0);
2062 }
2063 #undef __FUNCT__
2064 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
     /*
        MatSolveTranspose_SeqBAIJ_7 - transposed triangular solve with the factor
        data held in a Mat_SeqBAIJ, hand-unrolled for 7 scalars per block row
        (49 scalars per block).

        Input:
          A  - matrix; its Mat_SeqBAIJ data supplies i, j, diag, a, the row and
               column index sets and the solve_work scratch array
          bb - right-hand side, accessed read-only
        Output:
          xx - solution, overwritten
        Returns 0 through PetscFunctionReturn; every PETSc call is checked with
        CHKERRQ.

        Indexing used here: the L^T sweep reads positions ai[i] .. ai[i+1]-1,
        while the U^T sweep reads position diag[i] for the diagonal multiply and
        then positions diag[i]-1 down to diag[i+1]+1.

        NOTE(review): the unrolled arithmetic touches exactly 7 entries per block
        row and 49 per block although bs and bs2 are read from A at run time -
        presumably the caller only installs this routine when bs == 7; confirm.
     */
2065 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2066 {
2067   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2068   PetscErrorCode    ierr;
2069   IS                iscol=a->col,isrow=a->row;
2070   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2071   const PetscInt    *r,*c,*rout,*cout;
2072   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2073   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2074   const MatScalar   *aa=a->a,*v;
2075   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2076   const PetscScalar *b;
2077 
2078   PetscFunctionBegin;
2079   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2080   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* all intermediate results live in the matrix-owned scratch array */
2081   t = a->solve_work;
2082 
2083   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2084   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2085 
2086   /* copy b into temp work space according to permutation */
       /* block i of t receives block c[i] of b */
2087   for(i=0;i<n;i++){
2088     ii = bs*i; ic = bs*c[i];
2089     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2090     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
2091   }
2092 
2093   /* forward solve the U^T */
2094   idx = 0;
2095   for (i=0; i<n; i++) {
         /* v points at the block stored at position diag[i] */
2096     v     = aa + bs2*diag[i];
2097     /* multiply by the inverse of the block diagonal */
         /* s_k combines the seven consecutive entries v[7(k-1)] .. v[7(k-1)+6] with x1..x7 */
2098     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2099     x6    = t[5+idx]; x7 = t[6+idx];
2100     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
2101     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
2102     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
2103     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
2104     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2105     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2106     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
         /* step back to the block stored just before position diag[i] */
2107     v -= bs2;
2108 
         /* the nz blocks at positions diag[i]-1 down to diag[i+1]+1 are visited
            with a non-positive index j, so both vi[j] and v move toward lower addresses */
2109     vi    = aj + diag[i] - 1;
2110     nz    = diag[i] - diag[i+1] - 1;
2111     for(j=0;j>-nz;j--){
2112       oidx = bs*vi[j];
2113       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2114       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2115       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2116       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2117       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2118       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2119       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2120       v  -= bs2;
2121     }
         /* store the scaled block back only after it has been applied to the other block rows */
2122     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2123     t[5+idx] = s6;  t[6+idx] = s7;
2124     idx += bs;
2125   }
2126   /* backward solve the L^T */
2127   for (i=n-1; i>=0; i--){
         /* blocks at positions ai[i] .. ai[i+1]-1 are visited in increasing order */
2128     v    = aa + bs2*ai[i];
2129     vi   = aj + ai[i];
2130     nz   = ai[i+1] - ai[i];
2131     idt  = bs*i;
2132     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2133     s6   = t[5+idt];  s7 = t[6+idt];
2134    for(j=0;j<nz;j++){
2135       idx   = bs*vi[j];
2136       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2137       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2138       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2139       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2140       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2141       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2142       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2143       v += bs2;
2144     }
2145   }
2146 
2147   /* copy t into x according to permutation */
       /* block i of t lands in block r[i] of x */
2148   for(i=0;i<n;i++){
2149     ii = bs*i;  ir = bs*r[i];
2150     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2151     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2152   }
2153 
2154   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2155   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2156   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2157   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2158   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2159   PetscFunctionReturn(0);
2160 }
2161 
2162 /* ----------------------------------------------------------- */
2163 #undef __FUNCT__
2164 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
     /*
        MatSolve_SeqBAIJ_N_inplace - forward/backward triangular solve with the
        factor data held in a Mat_SeqBAIJ, for a block size bs read at run time.

        Input:
          A  - matrix; its Mat_SeqBAIJ data supplies i, j, diag, a, the row and
               column index sets and the solve_work scratch array
          bb - right-hand side, accessed read-only
        Output:
          xx - solution, overwritten
        Returns 0 through PetscFunctionReturn; every PETSc call is checked with
        CHKERRQ.

        The forward sweep reads bb through the row index set (r walks forward);
        the backward sweep writes xx through the column index set (c walks from
        its last entry to its first).  Block arithmetic is delegated to the
        Kernel_* macros.

        NOTE(review): the Kernel_* macros are not defined in this part of the
        file; their argument roles are inferred from their names only - confirm
        against their definitions.
     */
2165 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2166 {
2167   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2168   IS                iscol=a->col,isrow=a->row;
2169   PetscErrorCode    ierr;
2170   const PetscInt    *r,*c,*rout,*cout;
2171   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2172   PetscInt          i,nz;
2173   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2174   const MatScalar   *aa=a->a,*v;
2175   PetscScalar       *x,*s,*t,*ls;
2176   const PetscScalar *b;
2177 
2178   PetscFunctionBegin;
2179   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2180   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2181   t  = a->solve_work;
2182 
2183   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
       /* c starts at the last entry of the column index set because the backward sweep consumes it with *c-- */
2184   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2185 
2186   /* forward solve the lower triangular */
       /* block 0 of t is just the permuted right-hand-side block; the loop below starts at i=1 */
2187   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2188   for (i=1; i<n; i++) {
2189     v   = aa + bs2*ai[i];
2190     vi  = aj + ai[i];
         /* only the blocks at positions ai[i] .. diag[i]-1 take part in this sweep */
2191     nz  = a->diag[i] - ai[i];
2192     s = t + bs*i;
2193     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2194     while (nz--) {
           /* presumably s -= (block at v) * (block *vi of t) - confirm against the macro */
2195       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2196       v += bs2;
2197     }
2198   }
2199   /* backward solve the upper triangular */
       /* ls is bs-sized scratch located A->cmap->n entries into solve_work
          (assumes solve_work is allocated with room for it - TODO confirm) */
2200   ls = a->solve_work + A->cmap->n;
2201   for (i=n-1; i>=0; i--){
         /* blocks at positions diag[i]+1 .. ai[i+1]-1 take part in this sweep */
2202     v   = aa + bs2*(a->diag[i] + 1);
2203     vi  = aj + a->diag[i] + 1;
2204     nz  = ai[i+1] - a->diag[i] - 1;
         /* accumulate in ls so that block i of t is rewritten only by the diagonal multiply below */
2205     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2206     while (nz--) {
2207       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2208       v += bs2;
2209     }
         /* presumably block i of t = (block at diag[i]) * ls - confirm against the macro */
2210     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
         /* block i of t is copied into block *c of x, then c steps back one entry */
2211     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2212   }
2213 
2214   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2215   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2216   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2217   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2218   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2219   PetscFunctionReturn(0);
2220 }
2221 
2222 /* ----------------------------------------------------------- */
2223 #undef __FUNCT__
2224 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
     /*
        MatSolveTranspose_SeqBAIJ_N_inplace - transposed triangular solve with the
        factor data held in a Mat_SeqBAIJ, for a block size bs read at run time.

        Input:
          A  - matrix; its Mat_SeqBAIJ data supplies i, j, diag, a, the row and
               column index sets and the solve_work scratch array
          bb - right-hand side, accessed read-only
        Output:
          xx - solution, overwritten
        Returns 0 through PetscFunctionReturn; every PETSc call is checked with
        CHKERRQ.

        Steps: gather bb into t through the column index set, sweep forward using
        the diagonal block and the blocks at positions diag[i]+1 .. ai[i+1]-1,
        sweep backward using the blocks at positions ai[i] .. diag[i]-1, then
        scatter t into xx through the row index set.

        NOTE(review): the Kernel_* macros are not defined in this part of the
        file; their argument roles are inferred from their names only - confirm
        against their definitions.
     */
2225 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2226 {
2227   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2228   IS                iscol=a->col,isrow=a->row;
2229   PetscErrorCode    ierr;
2230   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2231   PetscInt          i,nz,j;
2232   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2233   const MatScalar   *aa=a->a,*v;
2234   PetscScalar       *x,*t,*ls;
2235   const PetscScalar *b;
2236   PetscFunctionBegin;
2237   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2238   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2239   t    = a->solve_work;
2240 
2241   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2242   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2243 
2244   /* copy the b into temp work space according to permutation */
       /* block i of t receives block c[i] of b */
2245   for (i=0; i<n; i++) {
2246     for (j=0; j<bs; j++) {
2247       t[i*bs+j] = b[c[i]*bs+j];
2248     }
2249   }
2250 
2251 
2252   /* forward solve the upper triangular transpose */
       /* ls is bs-sized scratch located A->cmap->n entries into solve_work
          (assumes solve_work is allocated with room for it - TODO confirm) */
2253   ls = a->solve_work + A->cmap->n;
2254   for (i=0; i<n; i++){
         /* keep a copy of block i in ls; presumably the macro then overwrites
            block i of t with the transposed diagonal block applied to ls - confirm */
2255     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2256     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2257     v   = aa + bs2*(a->diag[i] + 1);
2258     vi  = aj + a->diag[i] + 1;
2259     nz  = ai[i+1] - a->diag[i] - 1;
2260     while (nz--) {
           /* presumably block *vi of t -= transpose(block at v) * block i of t - confirm */
2261       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2262       v += bs2;
2263     }
2264   }
2265 
2266   /* backward solve the lower triangular transpose */
2267   for (i=n-1; i>=0; i--) {
         /* blocks at positions ai[i] .. diag[i]-1, visited in increasing order */
2268     v   = aa + bs2*ai[i];
2269     vi  = aj + ai[i];
2270     nz  = a->diag[i] - ai[i];
2271     while (nz--) {
2272       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2273       v += bs2;
2274     }
2275   }
2276 
2277   /* copy t into x according to permutation */
       /* block i of t lands in block r[i] of x */
2278   for (i=0; i<n; i++) {
2279     for (j=0; j<bs; j++) {
2280       x[bs*r[i]+j]   = t[bs*i+j];
2281     }
2282   }
2283 
2284   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2285   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2286   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2287   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2288   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2289   PetscFunctionReturn(0);
2290 }
2291 
2292 #undef __FUNCT__
2293 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
     /*
        MatSolveTranspose_SeqBAIJ_N - transposed triangular solve with the factor
        data held in a Mat_SeqBAIJ, for a block size bs read at run time.

        Input:
          A  - matrix; its Mat_SeqBAIJ data supplies i, j, diag, a, the row and
               column index sets and the solve_work scratch array
          bb - right-hand side, accessed read-only
        Output:
          xx - solution, overwritten
        Returns 0 through PetscFunctionReturn; every PETSc call is checked with
        CHKERRQ.

        Steps: gather bb into t through the column index set, sweep forward using
        the block at position diag[i] and then positions diag[i]-1 down to
        diag[i+1]+1, sweep backward using positions ai[i] .. ai[i+1]-1, then
        scatter t into xx through the row index set.

        NOTE(review): the Kernel_* macros are not defined in this part of the
        file; their argument roles are inferred from their names only - confirm
        against their definitions.
     */
2294 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2295 {
2296   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2297   IS                iscol=a->col,isrow=a->row;
2298   PetscErrorCode    ierr;
2299   const PetscInt    *r,*c,*rout,*cout;
2300   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2301   PetscInt          i,j,nz;
2302   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2303   const MatScalar   *aa=a->a,*v;
2304   PetscScalar       *x,*t,*ls;
2305   const PetscScalar *b;
2306 
2307   PetscFunctionBegin;
2308   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2309   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2310   t    = a->solve_work;
2311 
2312   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2313   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2314 
2315   /* copy the b into temp work space according to permutation */
       /* block i of t receives block c[i] of b */
2316   for (i=0; i<n; i++) {
2317     for (j=0; j<bs; j++) {
2318       t[i*bs+j] = b[c[i]*bs+j];
2319     }
2320   }
2321 
2322 
2323   /* forward solve the upper triangular transpose */
       /* ls is bs-sized scratch located A->cmap->n entries into solve_work
          (assumes solve_work is allocated with room for it - TODO confirm) */
2324   ls = a->solve_work + A->cmap->n;
2325   for (i=0; i<n; i++){
         /* keep a copy of block i in ls; presumably the macro then overwrites
            block i of t with the transposed diagonal block applied to ls - confirm */
2326     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2327     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
         /* start at the block just before position diag[i]; vi[j] is read with
            j = 0,-1,... and v moves toward lower addresses in step */
2328     v   = aa + bs2*(diag[i] - 1);
2329     vi  = aj + diag[i] - 1;
2330     nz  = diag[i] - diag[i+1] - 1;
2331     for(j=0;j>-nz;j--){
           /* presumably block vi[j] of t -= transpose(block at v) * block i of t - confirm */
2332       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2333       v -= bs2;
2334     }
2335   }
2336 
2337   /* backward solve the lower triangular transpose */
2338   for (i=n-1; i>=0; i--) {
         /* blocks at positions ai[i] .. ai[i+1]-1, visited in increasing order */
2339     v   = aa + bs2*ai[i];
2340     vi  = aj + ai[i];
2341     nz  = ai[i+1] - ai[i];
2342     for(j=0;j<nz;j++){
2343       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2344       v += bs2;
2345     }
2346   }
2347 
2348   /* copy t into x according to permutation */
       /* block i of t lands in block r[i] of x */
2349   for (i=0; i<n; i++) {
2350     for (j=0; j<bs; j++) {
2351       x[bs*r[i]+j]   = t[bs*i+j];
2352     }
2353   }
2354 
2355   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2356   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2357   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2358   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2359   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2360   PetscFunctionReturn(0);
2361 }
2362 
2363 /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */
2364 
2365 #undef __FUNCT__
2366 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
/*
   MatSolve_SeqBAIJ_15_NaturalOrdering_ver2 - Fills xx from bb by a forward sweep over
   the L part and a backward sweep over the U part of the factor stored in A, with every
   15x15 block product written out by hand (one 15-term expression per block row).

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ; a->i/a->j index the L blocks of each block
        row and a->diag indexes the U blocks (see the backward sweep below)
-  bb - right-hand side, only read

   Output Parameter:
.  xx - result vector; it also serves as the work space of both sweeps

   Notes:
   No row or column permutation is applied: block row i of bb and xx starts at bs*i.
   Entry (r,c) of a block is read from v[r + 15*c].
   The vector strides use bs = A->rmap->bs and the block stride uses bs2 = a->bs2, while
   the offsets inside a block are hard-coded for 15 -- the routine is only meaningful
   when bs == 15 and bs2 == 225.
   Block row 0 is copied unconditionally, so at least one block row is assumed.

   Returns 0 on success; a failing Vec/logging call is handed to CHKERRQ.
*/
PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
  PetscInt          i,nz,idx,idt,m;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
  PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  /* block row 0 is taken from b unchanged; the sweep below starts at block row 1 */
  idx    = 0;
  x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
  x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
  x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];

  for (i=1; i<n; i++) {
    /* L blocks of block row i: entries ai[i] .. ai[i+1]-1, block columns in aj */
    v     = aa + bs2*ai[i];
    vi    = aj + ai[i];
    nz    = ai[i+1] - ai[i];
    idt   = bs*i;
    s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
    s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
    s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
    for(m=0;m<nz;m++){
      /* s -= (block m) * (block vi[m] of x, already computed by this sweep) */
      idx   = bs*vi[m];
      x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
      x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
      x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];


      s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
      s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
      s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
      s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
      s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
      s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
      s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
      s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
      s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
      s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
      s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
      s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
      s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
      s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
      s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

      v += bs2;
    }
    /* no diagonal block is applied in the forward sweep */
    x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
    x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
    x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;

  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    /* off-diagonal U blocks of block row i: entries adiag[i+1]+1 .. adiag[i]-1 */
    v    = aa + bs2*(adiag[i+1]+1);
    vi   = aj + adiag[i+1]+1;
    nz   = adiag[i] - adiag[i+1] - 1;
    idt  = bs*i;
    s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
    s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
    s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];

    for(m=0;m<nz;m++){
      idx   = bs*vi[m];
      x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
      x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
      x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];

      s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
      s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
      s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
      s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
      s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
      s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
      s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
      s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
      s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
      s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
      s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
      s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
      s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
      s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
      s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

      v += bs2;
    }

    /* v now addresses block adiag[i]; it is applied by multiplication, so it is
       presumably stored already inverted (the factorization is not visible here) */
    x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
    x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
    x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
    x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
    x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
    x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
    x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
    x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
    x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
    x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
    x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
    x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
    x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
    x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
    x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;

  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2485 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2487 /* Default MatSolve for block size 15 */
2488 
2489 #undef __FUNCT__
2490 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2491 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2492 {
2493   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2494   PetscErrorCode    ierr;
2495   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2496   PetscInt          i,k,nz,idx,idt,m;
2497   const MatScalar   *aa=a->a,*v;
2498   PetscScalar       s[15];
2499   PetscScalar       *x,xv;
2500   const PetscScalar *b;
2501 
2502   PetscFunctionBegin;
2503   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2504   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2505 
2506   /* forward solve the lower triangular */
2507   for (i=0; i<n; i++) {
2508     v     = aa + bs2*ai[i];
2509     vi    = aj + ai[i];
2510     nz    = ai[i+1] - ai[i];
2511     idt   = bs*i;
2512     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2513     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2514     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2515     for(m=0;m<nz;m++){
2516       idx   = bs*vi[m];
2517       for(k=0;k<15;k++){
2518 	xv        = x[k + idx];
2519 	x[idt]    -= v[0]*xv;
2520 	x[1+idt]  -= v[1]*xv;
2521 	x[2+idt]  -= v[2]*xv;
2522         x[3+idt]  -= v[3]*xv;
2523 	x[4+idt]  -= v[4]*xv;
2524 	x[5+idt]  -= v[5]*xv;
2525 	x[6+idt]  -= v[6]*xv;
2526         x[7+idt]  -= v[7]*xv;
2527 	x[8+idt]  -= v[8]*xv;
2528 	x[9+idt]  -= v[9]*xv;
2529 	x[10+idt] -= v[10]*xv;
2530         x[11+idt] -= v[11]*xv;
2531 	x[12+idt] -= v[12]*xv;
2532 	x[13+idt] -= v[13]*xv;
2533 	x[14+idt] -= v[14]*xv;
2534 	v += 15;
2535       }
2536     }
2537   }
2538   /* backward solve the upper triangular */
2539   for (i=n-1; i>=0; i--){
2540     v    = aa + bs2*(adiag[i+1]+1);
2541     vi   = aj + adiag[i+1]+1;
2542     nz   = adiag[i] - adiag[i+1] - 1;
2543     idt  = bs*i;
2544     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2545     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2546     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2547 
2548     for(m=0;m<nz;m++){
2549       idx   = bs*vi[m];
2550       for(k=0;k<15;k++){
2551 	xv = x[k + idx];
2552 	s[0]  -= v[0]*xv;
2553 	s[1]  -= v[1]*xv;
2554 	s[2]  -= v[2]*xv;
2555         s[3]  -= v[3]*xv;
2556 	s[4]  -= v[4]*xv;
2557 	s[5]  -= v[5]*xv;
2558 	s[6]  -= v[6]*xv;
2559         s[7]  -= v[7]*xv;
2560 	s[8]  -= v[8]*xv;
2561 	s[9]  -= v[9]*xv;
2562 	s[10] -= v[10]*xv;
2563         s[11] -= v[11]*xv;
2564 	s[12] -= v[12]*xv;
2565 	s[13] -= v[13]*xv;
2566 	s[14] -= v[14]*xv;
2567 	v += 15;
2568       }
2569     }
2570     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
2571     for(k=0;k<15;k++){
2572       x[idt]    += v[0]*s[k];
2573       x[1+idt]  += v[1]*s[k];
2574       x[2+idt]  += v[2]*s[k];
2575       x[3+idt]  += v[3]*s[k];
2576       x[4+idt]  += v[4]*s[k];
2577       x[5+idt]  += v[5]*s[k];
2578       x[6+idt]  += v[6]*s[k];
2579       x[7+idt]  += v[7]*s[k];
2580       x[8+idt]  += v[8]*s[k];
2581       x[9+idt]  += v[9]*s[k];
2582       x[10+idt] += v[10]*s[k];
2583       x[11+idt] += v[11]*s[k];
2584       x[12+idt] += v[12]*s[k];
2585       x[13+idt] += v[13]*s[k];
2586       x[14+idt] += v[14]*s[k];
2587       v += 15;
2588     }
2589   }
2590   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2591   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2592   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2593   PetscFunctionReturn(0);
2594 }
2595 
2596 
2597 #undef __FUNCT__
2598 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2599 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2600 {
2601   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2602   IS                iscol=a->col,isrow=a->row;
2603   PetscErrorCode    ierr;
2604   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2605   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2606   PetscInt          i,nz,idx,idt,idc;
2607   const MatScalar   *aa=a->a,*v;
2608   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2609   const PetscScalar *b;
2610 
2611   PetscFunctionBegin;
2612   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2613   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2614   t  = a->solve_work;
2615 
2616   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2617   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2618 
2619   /* forward solve the lower triangular */
2620   idx    = 7*(*r++);
2621   t[0] = b[idx];   t[1] = b[1+idx];
2622   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2623   t[5] = b[5+idx]; t[6] = b[6+idx];
2624 
2625   for (i=1; i<n; i++) {
2626     v     = aa + 49*ai[i];
2627     vi    = aj + ai[i];
2628     nz    = diag[i] - ai[i];
2629     idx   = 7*(*r++);
2630     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2631     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2632     while (nz--) {
2633       idx   = 7*(*vi++);
2634       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2635       x4    = t[3+idx];x5 = t[4+idx];
2636       x6    = t[5+idx];x7 = t[6+idx];
2637       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2638       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2639       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2640       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2641       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2642       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2643       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2644       v += 49;
2645     }
2646     idx = 7*i;
2647     t[idx]   = s1;t[1+idx] = s2;
2648     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2649     t[5+idx] = s6;t[6+idx] = s7;
2650   }
2651   /* backward solve the upper triangular */
2652   for (i=n-1; i>=0; i--){
2653     v    = aa + 49*diag[i] + 49;
2654     vi   = aj + diag[i] + 1;
2655     nz   = ai[i+1] - diag[i] - 1;
2656     idt  = 7*i;
2657     s1 = t[idt];  s2 = t[1+idt];
2658     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2659     s6 = t[5+idt];s7 = t[6+idt];
2660     while (nz--) {
2661       idx   = 7*(*vi++);
2662       x1    = t[idx];   x2 = t[1+idx];
2663       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2664       x6    = t[5+idx]; x7 = t[6+idx];
2665       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2666       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2667       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2668       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2669       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2670       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2671       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2672       v += 49;
2673     }
2674     idc = 7*(*c--);
2675     v   = aa + 49*diag[i];
2676     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2677                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2678     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2679                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2680     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2681                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2682     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2683                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2684     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2685                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2686     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2687                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2688     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2689                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2690   }
2691 
2692   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2693   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2694   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2695   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2696   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2697   PetscFunctionReturn(0);
2698 }
2699 
2700 #undef __FUNCT__
2701 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2702 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2703 {
2704   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2705   IS                iscol=a->col,isrow=a->row;
2706   PetscErrorCode    ierr;
2707   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2708   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2709   PetscInt          i,nz,idx,idt,idc,m;
2710   const MatScalar   *aa=a->a,*v;
2711   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2712   const PetscScalar *b;
2713 
2714   PetscFunctionBegin;
2715   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2716   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2717   t  = a->solve_work;
2718 
2719   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2720   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2721 
2722   /* forward solve the lower triangular */
2723   idx    = 7*r[0];
2724   t[0] = b[idx];   t[1] = b[1+idx];
2725   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2726   t[5] = b[5+idx]; t[6] = b[6+idx];
2727 
2728   for (i=1; i<n; i++) {
2729     v     = aa + 49*ai[i];
2730     vi    = aj + ai[i];
2731     nz    = ai[i+1] - ai[i];
2732     idx   = 7*r[i];
2733     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2734     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2735     for(m=0;m<nz;m++){
2736       idx   = 7*vi[m];
2737       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2738       x4    = t[3+idx];x5 = t[4+idx];
2739       x6    = t[5+idx];x7 = t[6+idx];
2740       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2741       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2742       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2743       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2744       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2745       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2746       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2747       v += 49;
2748     }
2749     idx = 7*i;
2750     t[idx]   = s1;t[1+idx] = s2;
2751     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2752     t[5+idx] = s6;t[6+idx] = s7;
2753   }
2754   /* backward solve the upper triangular */
2755   for (i=n-1; i>=0; i--){
2756     v    = aa + 49*(adiag[i+1]+1);
2757     vi   = aj + adiag[i+1]+1;
2758     nz   = adiag[i] - adiag[i+1] - 1;
2759     idt  = 7*i;
2760     s1 = t[idt];  s2 = t[1+idt];
2761     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2762     s6 = t[5+idt];s7 = t[6+idt];
2763     for(m=0;m<nz;m++){
2764       idx   = 7*vi[m];
2765       x1    = t[idx];   x2 = t[1+idx];
2766       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2767       x6    = t[5+idx]; x7 = t[6+idx];
2768       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2769       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2770       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2771       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2772       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2773       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2774       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2775       v += 49;
2776     }
2777     idc = 7*c[i];
2778     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2779                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2780     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2781                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2782     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2783                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2784     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2785                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2786     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2787                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2788     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2789                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2790     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2791                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2792   }
2793 
2794   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2795   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2796   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2797   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2798   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2799   PetscFunctionReturn(0);
2800 }
2801 
2802 #undef __FUNCT__
2803 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2804 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2805 {
2806   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2807   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2808   PetscErrorCode    ierr;
2809   PetscInt          i,nz,idx,idt,jdx;
2810   const MatScalar   *aa=a->a,*v;
2811   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2812   const PetscScalar *b;
2813 
2814   PetscFunctionBegin;
2815   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2816   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2817   /* forward solve the lower triangular */
2818   idx    = 0;
2819   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2820   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2821   x[6] = b[6+idx];
2822   for (i=1; i<n; i++) {
2823     v     =  aa + 49*ai[i];
2824     vi    =  aj + ai[i];
2825     nz    =  diag[i] - ai[i];
2826     idx   =  7*i;
2827     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2828     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2829     s7  =  b[6+idx];
2830     while (nz--) {
2831       jdx   = 7*(*vi++);
2832       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2833       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2834       x7    = x[6+jdx];
2835       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2836       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2837       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2838       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2839       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2840       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2841       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2842       v += 49;
2843      }
2844     x[idx]   = s1;
2845     x[1+idx] = s2;
2846     x[2+idx] = s3;
2847     x[3+idx] = s4;
2848     x[4+idx] = s5;
2849     x[5+idx] = s6;
2850     x[6+idx] = s7;
2851   }
2852   /* backward solve the upper triangular */
2853   for (i=n-1; i>=0; i--){
2854     v    = aa + 49*diag[i] + 49;
2855     vi   = aj + diag[i] + 1;
2856     nz   = ai[i+1] - diag[i] - 1;
2857     idt  = 7*i;
2858     s1 = x[idt];   s2 = x[1+idt];
2859     s3 = x[2+idt]; s4 = x[3+idt];
2860     s5 = x[4+idt]; s6 = x[5+idt];
2861     s7 = x[6+idt];
2862     while (nz--) {
2863       idx   = 7*(*vi++);
2864       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2865       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2866       x7    = x[6+idx];
2867       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2868       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2869       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2870       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2871       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2872       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2873       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2874       v += 49;
2875     }
2876     v        = aa + 49*diag[i];
2877     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2878                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2879     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2880                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2881     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2882                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2883     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2884                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2885     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2886                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2887     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2888                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2889     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2890                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2891   }
2892 
2893   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2894   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2895   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2896   PetscFunctionReturn(0);
2897 }
2898 
2899 #undef __FUNCT__
2900 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2901 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2902 {
2903     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2904     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2905     PetscErrorCode    ierr;
2906     PetscInt          i,k,nz,idx,jdx,idt;
2907     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2908     const MatScalar   *aa=a->a,*v;
2909     PetscScalar       *x;
2910     const PetscScalar *b;
2911     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2912 
2913     PetscFunctionBegin;
2914     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2915     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2916     /* forward solve the lower triangular */
2917     idx    = 0;
2918     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2919     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2920     for (i=1; i<n; i++) {
2921        v    = aa + bs2*ai[i];
2922        vi   = aj + ai[i];
2923        nz   = ai[i+1] - ai[i];
2924       idx   = bs*i;
2925        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2926        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2927        for(k=0;k<nz;k++) {
2928           jdx   = bs*vi[k];
2929           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2930 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2931           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2932           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2933           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2934 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2935           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2936 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2937 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2938           v   +=  bs2;
2939         }
2940 
2941        x[idx]   = s1;
2942        x[1+idx] = s2;
2943        x[2+idx] = s3;
2944        x[3+idx] = s4;
2945        x[4+idx] = s5;
2946        x[5+idx] = s6;
2947        x[6+idx] = s7;
2948     }
2949 
2950    /* backward solve the upper triangular */
2951   for (i=n-1; i>=0; i--){
2952     v   = aa + bs2*(adiag[i+1]+1);
2953      vi  = aj + adiag[i+1]+1;
2954      nz  = adiag[i] - adiag[i+1]-1;
2955      idt = bs*i;
2956      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2957      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2958     for(k=0;k<nz;k++) {
2959       idx   = bs*vi[k];
2960        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2961        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2962        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2963        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2964        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2965        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2966        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2967        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2968        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2969         v   +=  bs2;
2970     }
2971     /* x = inv_diagonal*x */
2972     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2973     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2974     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2975     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2976     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2977     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2978     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2979   }
2980 
2981   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2982   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2983   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2984   PetscFunctionReturn(0);
2985 }
2986 
2987 #undef __FUNCT__
2988 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2989 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2990 {
2991   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2992   IS                iscol=a->col,isrow=a->row;
2993   PetscErrorCode    ierr;
2994   const PetscInt    *r,*c,*rout,*cout;
2995   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2996   PetscInt          i,nz,idx,idt,idc;
2997   const MatScalar   *aa=a->a,*v;
2998   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2999   const PetscScalar *b;
3000 
3001   PetscFunctionBegin;
3002   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3003   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3004   t  = a->solve_work;
3005 
3006   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3007   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3008 
3009   /* forward solve the lower triangular */
3010   idx    = 6*(*r++);
3011   t[0] = b[idx];   t[1] = b[1+idx];
3012   t[2] = b[2+idx]; t[3] = b[3+idx];
3013   t[4] = b[4+idx]; t[5] = b[5+idx];
3014   for (i=1; i<n; i++) {
3015     v     = aa + 36*ai[i];
3016     vi    = aj + ai[i];
3017     nz    = diag[i] - ai[i];
3018     idx   = 6*(*r++);
3019     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3020     s5  = b[4+idx]; s6 = b[5+idx];
3021     while (nz--) {
3022       idx   = 6*(*vi++);
3023       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3024       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3025       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3026       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3027       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3028       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3029       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3030       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3031       v += 36;
3032     }
3033     idx = 6*i;
3034     t[idx]   = s1;t[1+idx] = s2;
3035     t[2+idx] = s3;t[3+idx] = s4;
3036     t[4+idx] = s5;t[5+idx] = s6;
3037   }
3038   /* backward solve the upper triangular */
3039   for (i=n-1; i>=0; i--){
3040     v    = aa + 36*diag[i] + 36;
3041     vi   = aj + diag[i] + 1;
3042     nz   = ai[i+1] - diag[i] - 1;
3043     idt  = 6*i;
3044     s1 = t[idt];  s2 = t[1+idt];
3045     s3 = t[2+idt];s4 = t[3+idt];
3046     s5 = t[4+idt];s6 = t[5+idt];
3047     while (nz--) {
3048       idx   = 6*(*vi++);
3049       x1    = t[idx];   x2 = t[1+idx];
3050       x3    = t[2+idx]; x4 = t[3+idx];
3051       x5    = t[4+idx]; x6 = t[5+idx];
3052       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3053       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3054       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3055       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3056       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3057       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3058       v += 36;
3059     }
3060     idc = 6*(*c--);
3061     v   = aa + 36*diag[i];
3062     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3063                                  v[18]*s4+v[24]*s5+v[30]*s6;
3064     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3065                                  v[19]*s4+v[25]*s5+v[31]*s6;
3066     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3067                                  v[20]*s4+v[26]*s5+v[32]*s6;
3068     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3069                                  v[21]*s4+v[27]*s5+v[33]*s6;
3070     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3071                                  v[22]*s4+v[28]*s5+v[34]*s6;
3072     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3073                                  v[23]*s4+v[29]*s5+v[35]*s6;
3074   }
3075 
3076   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3077   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3078   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3079   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3080   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3081   PetscFunctionReturn(0);
3082 }
3083 
3084 #undef __FUNCT__
3085 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
3086 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
3087 {
3088   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3089   IS                iscol=a->col,isrow=a->row;
3090   PetscErrorCode    ierr;
3091   const PetscInt    *r,*c,*rout,*cout;
3092   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3093   PetscInt          i,nz,idx,idt,idc,m;
3094   const MatScalar   *aa=a->a,*v;
3095   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3096   const PetscScalar *b;
3097 
3098   PetscFunctionBegin;
3099   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3100   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3101   t  = a->solve_work;
3102 
3103   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3104   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3105 
3106   /* forward solve the lower triangular */
3107   idx    = 6*r[0];
3108   t[0] = b[idx];   t[1] = b[1+idx];
3109   t[2] = b[2+idx]; t[3] = b[3+idx];
3110   t[4] = b[4+idx]; t[5] = b[5+idx];
3111   for (i=1; i<n; i++) {
3112     v     = aa + 36*ai[i];
3113     vi    = aj + ai[i];
3114     nz    = ai[i+1] - ai[i];
3115     idx   = 6*r[i];
3116     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3117     s5  = b[4+idx]; s6 = b[5+idx];
3118     for(m=0;m<nz;m++){
3119       idx   = 6*vi[m];
3120       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3121       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3122       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3123       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3124       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3125       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3126       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3127       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3128       v += 36;
3129     }
3130     idx = 6*i;
3131     t[idx]   = s1;t[1+idx] = s2;
3132     t[2+idx] = s3;t[3+idx] = s4;
3133     t[4+idx] = s5;t[5+idx] = s6;
3134   }
3135   /* backward solve the upper triangular */
3136   for (i=n-1; i>=0; i--){
3137     v    = aa + 36*(adiag[i+1]+1);
3138     vi   = aj + adiag[i+1]+1;
3139     nz   = adiag[i] - adiag[i+1] - 1;
3140     idt  = 6*i;
3141     s1 = t[idt];  s2 = t[1+idt];
3142     s3 = t[2+idt];s4 = t[3+idt];
3143     s5 = t[4+idt];s6 = t[5+idt];
3144     for(m=0;m<nz;m++){
3145       idx   = 6*vi[m];
3146       x1    = t[idx];   x2 = t[1+idx];
3147       x3    = t[2+idx]; x4 = t[3+idx];
3148       x5    = t[4+idx]; x6 = t[5+idx];
3149       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3150       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3151       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3152       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3153       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3154       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3155       v += 36;
3156     }
3157     idc = 6*c[i];
3158     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3159                                  v[18]*s4+v[24]*s5+v[30]*s6;
3160     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3161                                  v[19]*s4+v[25]*s5+v[31]*s6;
3162     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3163                                  v[20]*s4+v[26]*s5+v[32]*s6;
3164     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3165                                  v[21]*s4+v[27]*s5+v[33]*s6;
3166     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3167                                  v[22]*s4+v[28]*s5+v[34]*s6;
3168     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3169                                  v[23]*s4+v[29]*s5+v[35]*s6;
3170   }
3171 
3172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3174   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3176   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3177   PetscFunctionReturn(0);
3178 }
3179 
3180 #undef __FUNCT__
3181 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
3182 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3183 {
3184   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3185   PetscInt          i,nz,idx,idt,jdx;
3186   PetscErrorCode    ierr;
3187   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3188   const MatScalar   *aa=a->a,*v;
3189   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3190   const PetscScalar *b;
3191 
3192   PetscFunctionBegin;
3193   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3195   /* forward solve the lower triangular */
3196   idx    = 0;
3197   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3198   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3199   for (i=1; i<n; i++) {
3200     v     =  aa + 36*ai[i];
3201     vi    =  aj + ai[i];
3202     nz    =  diag[i] - ai[i];
3203     idx   =  6*i;
3204     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3205     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3206     while (nz--) {
3207       jdx   = 6*(*vi++);
3208       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3209       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3210       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3211       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3212       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3213       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3214       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3215       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3216       v += 36;
3217      }
3218     x[idx]   = s1;
3219     x[1+idx] = s2;
3220     x[2+idx] = s3;
3221     x[3+idx] = s4;
3222     x[4+idx] = s5;
3223     x[5+idx] = s6;
3224   }
3225   /* backward solve the upper triangular */
3226   for (i=n-1; i>=0; i--){
3227     v    = aa + 36*diag[i] + 36;
3228     vi   = aj + diag[i] + 1;
3229     nz   = ai[i+1] - diag[i] - 1;
3230     idt  = 6*i;
3231     s1 = x[idt];   s2 = x[1+idt];
3232     s3 = x[2+idt]; s4 = x[3+idt];
3233     s5 = x[4+idt]; s6 = x[5+idt];
3234     while (nz--) {
3235       idx   = 6*(*vi++);
3236       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3237       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3238       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3239       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3240       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3241       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3242       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3243       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3244       v += 36;
3245     }
3246     v        = aa + 36*diag[i];
3247     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3248     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3249     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3250     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3251     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3252     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3253   }
3254 
3255   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3256   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3257   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3258   PetscFunctionReturn(0);
3259 }
3260 
3261 #undef __FUNCT__
3262 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3263 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3264 {
3265     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3266     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3267     PetscErrorCode    ierr;
3268     PetscInt          i,k,nz,idx,jdx,idt;
3269     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3270     const MatScalar   *aa=a->a,*v;
3271     PetscScalar       *x;
3272     const PetscScalar *b;
3273     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3274 
3275     PetscFunctionBegin;
3276     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3277     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3278     /* forward solve the lower triangular */
3279     idx    = 0;
3280     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3281     x[4] = b[4+idx];x[5] = b[5+idx];
3282     for (i=1; i<n; i++) {
3283        v    = aa + bs2*ai[i];
3284        vi   = aj + ai[i];
3285        nz   = ai[i+1] - ai[i];
3286       idx   = bs*i;
3287        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3288        s5   = b[4+idx];s6 = b[5+idx];
3289        for(k=0;k<nz;k++){
3290           jdx   = bs*vi[k];
3291           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3292 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3293           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3294           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3295           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3296 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3297           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3298 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3299           v   +=  bs2;
3300         }
3301 
3302        x[idx]   = s1;
3303        x[1+idx] = s2;
3304        x[2+idx] = s3;
3305        x[3+idx] = s4;
3306        x[4+idx] = s5;
3307        x[5+idx] = s6;
3308     }
3309 
3310    /* backward solve the upper triangular */
3311   for (i=n-1; i>=0; i--){
3312     v   = aa + bs2*(adiag[i+1]+1);
3313      vi  = aj + adiag[i+1]+1;
3314      nz  = adiag[i] - adiag[i+1]-1;
3315      idt = bs*i;
3316      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3317      s5 = x[4+idt];s6 = x[5+idt];
3318      for(k=0;k<nz;k++){
3319       idx   = bs*vi[k];
3320        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3321        x5    = x[4+idx];x6 = x[5+idx];
3322        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3323        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3324        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3325        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3326        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3327        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3328         v   +=  bs2;
3329     }
3330     /* x = inv_diagonal*x */
3331    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3332    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3333    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3334    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3335    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3336    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3337   }
3338 
3339   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3340   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3341   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3342   PetscFunctionReturn(0);
3343 }
3344 
3345 #undef __FUNCT__
3346 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3347 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3348 {
3349   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3350   IS                iscol=a->col,isrow=a->row;
3351   PetscErrorCode    ierr;
3352   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3353   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3354   PetscInt          i,nz,idx,idt,idc;
3355   const MatScalar   *aa=a->a,*v;
3356   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3357   const PetscScalar *b;
3358 
3359   PetscFunctionBegin;
3360   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3361   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3362   t  = a->solve_work;
3363 
3364   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3365   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3366 
3367   /* forward solve the lower triangular */
3368   idx    = 5*(*r++);
3369   t[0] = b[idx];   t[1] = b[1+idx];
3370   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3371   for (i=1; i<n; i++) {
3372     v     = aa + 25*ai[i];
3373     vi    = aj + ai[i];
3374     nz    = diag[i] - ai[i];
3375     idx   = 5*(*r++);
3376     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3377     s5  = b[4+idx];
3378     while (nz--) {
3379       idx   = 5*(*vi++);
3380       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3381       x4    = t[3+idx];x5 = t[4+idx];
3382       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3383       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3384       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3385       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3386       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3387       v += 25;
3388     }
3389     idx = 5*i;
3390     t[idx]   = s1;t[1+idx] = s2;
3391     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3392   }
3393   /* backward solve the upper triangular */
3394   for (i=n-1; i>=0; i--){
3395     v    = aa + 25*diag[i] + 25;
3396     vi   = aj + diag[i] + 1;
3397     nz   = ai[i+1] - diag[i] - 1;
3398     idt  = 5*i;
3399     s1 = t[idt];  s2 = t[1+idt];
3400     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3401     while (nz--) {
3402       idx   = 5*(*vi++);
3403       x1    = t[idx];   x2 = t[1+idx];
3404       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3405       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3406       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3407       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3408       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3409       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3410       v += 25;
3411     }
3412     idc = 5*(*c--);
3413     v   = aa + 25*diag[i];
3414     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3415                                  v[15]*s4+v[20]*s5;
3416     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3417                                  v[16]*s4+v[21]*s5;
3418     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3419                                  v[17]*s4+v[22]*s5;
3420     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3421                                  v[18]*s4+v[23]*s5;
3422     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3423                                  v[19]*s4+v[24]*s5;
3424   }
3425 
3426   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3427   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3428   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3429   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3430   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3431   PetscFunctionReturn(0);
3432 }
3433 
3434 #undef __FUNCT__
3435 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3436 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3437 {
3438   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3439   IS                iscol=a->col,isrow=a->row;
3440   PetscErrorCode    ierr;
3441   const PetscInt    *r,*c,*rout,*cout;
3442   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3443   PetscInt          i,nz,idx,idt,idc,m;
3444   const MatScalar   *aa=a->a,*v;
3445   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3446   const PetscScalar *b;
3447 
3448   PetscFunctionBegin;
3449   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3450   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3451   t  = a->solve_work;
3452 
3453   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3454   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3455 
3456   /* forward solve the lower triangular */
3457   idx    = 5*r[0];
3458   t[0] = b[idx];   t[1] = b[1+idx];
3459   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3460   for (i=1; i<n; i++) {
3461     v     = aa + 25*ai[i];
3462     vi    = aj + ai[i];
3463     nz    = ai[i+1] - ai[i];
3464     idx   = 5*r[i];
3465     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3466     s5  = b[4+idx];
3467     for(m=0;m<nz;m++){
3468       idx   = 5*vi[m];
3469       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3470       x4    = t[3+idx];x5 = t[4+idx];
3471       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3472       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3473       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3474       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3475       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3476       v += 25;
3477     }
3478     idx = 5*i;
3479     t[idx]   = s1;t[1+idx] = s2;
3480     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3481   }
3482   /* backward solve the upper triangular */
3483   for (i=n-1; i>=0; i--){
3484     v    = aa + 25*(adiag[i+1]+1);
3485     vi   = aj + adiag[i+1]+1;
3486     nz   = adiag[i] - adiag[i+1] - 1;
3487     idt  = 5*i;
3488     s1 = t[idt];  s2 = t[1+idt];
3489     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3490     for(m=0;m<nz;m++){
3491       idx   = 5*vi[m];
3492       x1    = t[idx];   x2 = t[1+idx];
3493       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3494       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3495       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3496       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3497       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3498       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3499       v += 25;
3500     }
3501     idc = 5*c[i];
3502     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3503                                  v[15]*s4+v[20]*s5;
3504     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3505                                  v[16]*s4+v[21]*s5;
3506     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3507                                  v[17]*s4+v[22]*s5;
3508     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3509                                  v[18]*s4+v[23]*s5;
3510     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3511                                  v[19]*s4+v[24]*s5;
3512   }
3513 
3514   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3515   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3516   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3517   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3518   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3519   PetscFunctionReturn(0);
3520 }
3521 
3522 #undef __FUNCT__
3523 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3524 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3525 {
3526   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3527   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3528   PetscInt          i,nz,idx,idt,jdx;
3529   PetscErrorCode    ierr;
3530   const MatScalar   *aa=a->a,*v;
3531   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3532   const PetscScalar *b;
3533 
3534   PetscFunctionBegin;
3535   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3536   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3537   /* forward solve the lower triangular */
3538   idx    = 0;
3539   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3540   for (i=1; i<n; i++) {
3541     v     =  aa + 25*ai[i];
3542     vi    =  aj + ai[i];
3543     nz    =  diag[i] - ai[i];
3544     idx   =  5*i;
3545     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3546     while (nz--) {
3547       jdx   = 5*(*vi++);
3548       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3549       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3550       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3551       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3552       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3553       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3554       v    += 25;
3555     }
3556     x[idx]   = s1;
3557     x[1+idx] = s2;
3558     x[2+idx] = s3;
3559     x[3+idx] = s4;
3560     x[4+idx] = s5;
3561   }
3562   /* backward solve the upper triangular */
3563   for (i=n-1; i>=0; i--){
3564     v    = aa + 25*diag[i] + 25;
3565     vi   = aj + diag[i] + 1;
3566     nz   = ai[i+1] - diag[i] - 1;
3567     idt  = 5*i;
3568     s1 = x[idt];  s2 = x[1+idt];
3569     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3570     while (nz--) {
3571       idx   = 5*(*vi++);
3572       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3573       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3574       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3575       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3576       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3577       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3578       v    += 25;
3579     }
3580     v        = aa + 25*diag[i];
3581     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3582     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3583     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3584     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3585     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3586   }
3587 
3588   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3589   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3590   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3591   PetscFunctionReturn(0);
3592 }
3593 
3594 #undef __FUNCT__
3595 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3596 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3597 {
3598   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3599   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3600   PetscInt          i,k,nz,idx,idt,jdx;
3601   PetscErrorCode    ierr;
3602   const MatScalar   *aa=a->a,*v;
3603   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3604   const PetscScalar *b;
3605 
3606   PetscFunctionBegin;
3607   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3608   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3609   /* forward solve the lower triangular */
3610   idx    = 0;
3611   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3612   for (i=1; i<n; i++) {
3613     v   = aa + 25*ai[i];
3614     vi  = aj + ai[i];
3615     nz  = ai[i+1] - ai[i];
3616     idx = 5*i;
3617     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3618     for(k=0;k<nz;k++) {
3619       jdx   = 5*vi[k];
3620       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3621       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3622       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3623       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3624       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3625       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3626       v    += 25;
3627     }
3628     x[idx]   = s1;
3629     x[1+idx] = s2;
3630     x[2+idx] = s3;
3631     x[3+idx] = s4;
3632     x[4+idx] = s5;
3633   }
3634 
3635   /* backward solve the upper triangular */
3636   for (i=n-1; i>=0; i--){
3637     v   = aa + 25*(adiag[i+1]+1);
3638     vi  = aj + adiag[i+1]+1;
3639     nz  = adiag[i] - adiag[i+1]-1;
3640     idt = 5*i;
3641     s1 = x[idt];  s2 = x[1+idt];
3642     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3643     for(k=0;k<nz;k++){
3644       idx   = 5*vi[k];
3645       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3646       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3647       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3648       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3649       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3650       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3651       v    += 25;
3652     }
3653     /* x = inv_diagonal*x */
3654     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3655     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3656     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3657     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3658     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3659   }
3660 
3661   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3662   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3663   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3664   PetscFunctionReturn(0);
3665 }
3666 
3667 #undef __FUNCT__
3668 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3669 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3670 {
3671   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3672   IS                iscol=a->col,isrow=a->row;
3673   PetscErrorCode    ierr;
3674   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3675   PetscInt          i,nz,idx,idt,idc;
3676   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3677   const MatScalar   *aa=a->a,*v;
3678   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3679   const PetscScalar *b;
3680 
3681   PetscFunctionBegin;
3682   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3683   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3684   t  = a->solve_work;
3685 
3686   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3687   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3688 
3689   /* forward solve the lower triangular */
3690   idx    = 4*(*r++);
3691   t[0] = b[idx];   t[1] = b[1+idx];
3692   t[2] = b[2+idx]; t[3] = b[3+idx];
3693   for (i=1; i<n; i++) {
3694     v     = aa + 16*ai[i];
3695     vi    = aj + ai[i];
3696     nz    = diag[i] - ai[i];
3697     idx   = 4*(*r++);
3698     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3699     while (nz--) {
3700       idx   = 4*(*vi++);
3701       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3702       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3703       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3704       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3705       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3706       v    += 16;
3707     }
3708     idx        = 4*i;
3709     t[idx]   = s1;t[1+idx] = s2;
3710     t[2+idx] = s3;t[3+idx] = s4;
3711   }
3712   /* backward solve the upper triangular */
3713   for (i=n-1; i>=0; i--){
3714     v    = aa + 16*diag[i] + 16;
3715     vi   = aj + diag[i] + 1;
3716     nz   = ai[i+1] - diag[i] - 1;
3717     idt  = 4*i;
3718     s1 = t[idt];  s2 = t[1+idt];
3719     s3 = t[2+idt];s4 = t[3+idt];
3720     while (nz--) {
3721       idx   = 4*(*vi++);
3722       x1    = t[idx];   x2 = t[1+idx];
3723       x3    = t[2+idx]; x4 = t[3+idx];
3724       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3725       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3726       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3727       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3728       v += 16;
3729     }
3730     idc      = 4*(*c--);
3731     v        = aa + 16*diag[i];
3732     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3733     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3734     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3735     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3736   }
3737 
3738   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3739   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3740   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3742   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3743   PetscFunctionReturn(0);
3744 }
3745 
3746 #undef __FUNCT__
3747 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3748 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3749 {
3750   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3751   IS                iscol=a->col,isrow=a->row;
3752   PetscErrorCode    ierr;
3753   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3754   PetscInt          i,nz,idx,idt,idc,m;
3755   const PetscInt    *r,*c,*rout,*cout;
3756   const MatScalar   *aa=a->a,*v;
3757   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3758   const PetscScalar *b;
3759 
3760   PetscFunctionBegin;
3761   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3763   t  = a->solve_work;
3764 
3765   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3766   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3767 
3768   /* forward solve the lower triangular */
3769   idx    = 4*r[0];
3770   t[0] = b[idx];   t[1] = b[1+idx];
3771   t[2] = b[2+idx]; t[3] = b[3+idx];
3772   for (i=1; i<n; i++) {
3773     v     = aa + 16*ai[i];
3774     vi    = aj + ai[i];
3775     nz    = ai[i+1] - ai[i];
3776     idx   = 4*r[i];
3777     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3778     for(m=0;m<nz;m++){
3779       idx   = 4*vi[m];
3780       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3781       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3782       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3783       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3784       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3785       v    += 16;
3786     }
3787     idx        = 4*i;
3788     t[idx]   = s1;t[1+idx] = s2;
3789     t[2+idx] = s3;t[3+idx] = s4;
3790   }
3791   /* backward solve the upper triangular */
3792   for (i=n-1; i>=0; i--){
3793     v    = aa + 16*(adiag[i+1]+1);
3794     vi   = aj + adiag[i+1]+1;
3795     nz   = adiag[i] - adiag[i+1] - 1;
3796     idt  = 4*i;
3797     s1 = t[idt];  s2 = t[1+idt];
3798     s3 = t[2+idt];s4 = t[3+idt];
3799     for(m=0;m<nz;m++){
3800       idx   = 4*vi[m];
3801       x1    = t[idx];   x2 = t[1+idx];
3802       x3    = t[2+idx]; x4 = t[3+idx];
3803       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3804       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3805       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3806       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3807       v += 16;
3808     }
3809     idc      = 4*c[i];
3810     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3811     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3812     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3813     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3814   }
3815 
3816   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3817   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3818   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3819   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3820   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3821   PetscFunctionReturn(0);
3822 }
3823 
3824 #undef __FUNCT__
3825 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3826 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3827 {
3828   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3829   IS                iscol=a->col,isrow=a->row;
3830   PetscErrorCode    ierr;
3831   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3832   PetscInt          i,nz,idx,idt,idc;
3833   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3834   const MatScalar   *aa=a->a,*v;
3835   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3836   PetscScalar       *x;
3837   const PetscScalar *b;
3838 
3839   PetscFunctionBegin;
3840   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3841   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3842   t  = (MatScalar *)a->solve_work;
3843 
3844   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3845   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3846 
3847   /* forward solve the lower triangular */
3848   idx    = 4*(*r++);
3849   t[0] = (MatScalar)b[idx];
3850   t[1] = (MatScalar)b[1+idx];
3851   t[2] = (MatScalar)b[2+idx];
3852   t[3] = (MatScalar)b[3+idx];
3853   for (i=1; i<n; i++) {
3854     v     = aa + 16*ai[i];
3855     vi    = aj + ai[i];
3856     nz    = diag[i] - ai[i];
3857     idx   = 4*(*r++);
3858     s1 = (MatScalar)b[idx];
3859     s2 = (MatScalar)b[1+idx];
3860     s3 = (MatScalar)b[2+idx];
3861     s4 = (MatScalar)b[3+idx];
3862     while (nz--) {
3863       idx   = 4*(*vi++);
3864       x1  = t[idx];
3865       x2  = t[1+idx];
3866       x3  = t[2+idx];
3867       x4  = t[3+idx];
3868       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3869       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3870       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3871       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3872       v    += 16;
3873     }
3874     idx        = 4*i;
3875     t[idx]   = s1;
3876     t[1+idx] = s2;
3877     t[2+idx] = s3;
3878     t[3+idx] = s4;
3879   }
3880   /* backward solve the upper triangular */
3881   for (i=n-1; i>=0; i--){
3882     v    = aa + 16*diag[i] + 16;
3883     vi   = aj + diag[i] + 1;
3884     nz   = ai[i+1] - diag[i] - 1;
3885     idt  = 4*i;
3886     s1 = t[idt];
3887     s2 = t[1+idt];
3888     s3 = t[2+idt];
3889     s4 = t[3+idt];
3890     while (nz--) {
3891       idx   = 4*(*vi++);
3892       x1  = t[idx];
3893       x2  = t[1+idx];
3894       x3  = t[2+idx];
3895       x4  = t[3+idx];
3896       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3897       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3898       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3899       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3900       v += 16;
3901     }
3902     idc      = 4*(*c--);
3903     v        = aa + 16*diag[i];
3904     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3905     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3906     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3907     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3908     x[idc]   = (PetscScalar)t[idt];
3909     x[1+idc] = (PetscScalar)t[1+idt];
3910     x[2+idc] = (PetscScalar)t[2+idt];
3911     x[3+idc] = (PetscScalar)t[3+idt];
3912  }
3913 
3914   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3915   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3916   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3917   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3918   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3919   PetscFunctionReturn(0);
3920 }
3921 
3922 #if defined (PETSC_HAVE_SSE)
3923 
3924 #include PETSC_HAVE_SSE
3925 
3926 #undef __FUNCT__
3927 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3928 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3929 {
3930   /*
3931      Note: This code uses demotion of double
3932      to float when performing the mixed-mode computation.
3933      This may not be numerically reasonable for all applications.
3934   */
3935   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3936   IS             iscol=a->col,isrow=a->row;
3937   PetscErrorCode ierr;
3938   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3939   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3940   MatScalar      *aa=a->a,*v;
3941   PetscScalar    *x,*b,*t;
3942 
3943   /* Make space in temp stack for 16 Byte Aligned arrays */
3944   float           ssealignedspace[11],*tmps,*tmpx;
3945   unsigned long   offset;
3946 
3947   PetscFunctionBegin;
3948   SSE_SCOPE_BEGIN;
3949 
3950     offset = (unsigned long)ssealignedspace % 16;
3951     if (offset) offset = (16 - offset)/4;
3952     tmps = &ssealignedspace[offset];
3953     tmpx = &ssealignedspace[offset+4];
3954     PREFETCH_NTA(aa+16*ai[1]);
3955 
3956     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3957     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3958     t  = a->solve_work;
3959 
3960     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3961     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3962 
3963     /* forward solve the lower triangular */
3964     idx  = 4*(*r++);
3965     t[0] = b[idx];   t[1] = b[1+idx];
3966     t[2] = b[2+idx]; t[3] = b[3+idx];
3967     v    =  aa + 16*ai[1];
3968 
3969     for (i=1; i<n;) {
3970       PREFETCH_NTA(&v[8]);
3971       vi   =  aj      + ai[i];
3972       nz   =  diag[i] - ai[i];
3973       idx  =  4*(*r++);
3974 
3975       /* Demote sum from double to float */
3976       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3977       LOAD_PS(tmps,XMM7);
3978 
3979       while (nz--) {
3980         PREFETCH_NTA(&v[16]);
3981         idx = 4*(*vi++);
3982 
3983         /* Demote solution (so far) from double to float */
3984         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3985 
3986         /* 4x4 Matrix-Vector product with negative accumulation: */
3987         SSE_INLINE_BEGIN_2(tmpx,v)
3988           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3989 
3990           /* First Column */
3991           SSE_COPY_PS(XMM0,XMM6)
3992           SSE_SHUFFLE(XMM0,XMM0,0x00)
3993           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3994           SSE_SUB_PS(XMM7,XMM0)
3995 
3996           /* Second Column */
3997           SSE_COPY_PS(XMM1,XMM6)
3998           SSE_SHUFFLE(XMM1,XMM1,0x55)
3999           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4000           SSE_SUB_PS(XMM7,XMM1)
4001 
4002           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4003 
4004           /* Third Column */
4005           SSE_COPY_PS(XMM2,XMM6)
4006           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4007           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4008           SSE_SUB_PS(XMM7,XMM2)
4009 
4010           /* Fourth Column */
4011           SSE_COPY_PS(XMM3,XMM6)
4012           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4013           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4014           SSE_SUB_PS(XMM7,XMM3)
4015         SSE_INLINE_END_2
4016 
4017         v  += 16;
4018       }
4019       idx = 4*i;
4020       v   = aa + 16*ai[++i];
4021       PREFETCH_NTA(v);
4022       STORE_PS(tmps,XMM7);
4023 
4024       /* Promote result from float to double */
4025       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
4026     }
4027     /* backward solve the upper triangular */
4028     idt  = 4*(n-1);
4029     ai16 = 16*diag[n-1];
4030     v    = aa + ai16 + 16;
4031     for (i=n-1; i>=0;){
4032       PREFETCH_NTA(&v[8]);
4033       vi = aj + diag[i] + 1;
4034       nz = ai[i+1] - diag[i] - 1;
4035 
4036       /* Demote accumulator from double to float */
4037       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
4038       LOAD_PS(tmps,XMM7);
4039 
4040       while (nz--) {
4041         PREFETCH_NTA(&v[16]);
4042         idx = 4*(*vi++);
4043 
4044         /* Demote solution (so far) from double to float */
4045         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
4046 
4047         /* 4x4 Matrix-Vector Product with negative accumulation: */
4048         SSE_INLINE_BEGIN_2(tmpx,v)
4049           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4050 
4051           /* First Column */
4052           SSE_COPY_PS(XMM0,XMM6)
4053           SSE_SHUFFLE(XMM0,XMM0,0x00)
4054           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4055           SSE_SUB_PS(XMM7,XMM0)
4056 
4057           /* Second Column */
4058           SSE_COPY_PS(XMM1,XMM6)
4059           SSE_SHUFFLE(XMM1,XMM1,0x55)
4060           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4061           SSE_SUB_PS(XMM7,XMM1)
4062 
4063           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4064 
4065           /* Third Column */
4066           SSE_COPY_PS(XMM2,XMM6)
4067           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4068           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4069           SSE_SUB_PS(XMM7,XMM2)
4070 
4071           /* Fourth Column */
4072           SSE_COPY_PS(XMM3,XMM6)
4073           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4074           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4075           SSE_SUB_PS(XMM7,XMM3)
4076         SSE_INLINE_END_2
4077         v  += 16;
4078       }
4079       v    = aa + ai16;
4080       ai16 = 16*diag[--i];
4081       PREFETCH_NTA(aa+ai16+16);
4082       /*
4083          Scale the result by the diagonal 4x4 block,
4084          which was inverted as part of the factorization
4085       */
4086       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
4087         /* First Column */
4088         SSE_COPY_PS(XMM0,XMM7)
4089         SSE_SHUFFLE(XMM0,XMM0,0x00)
4090         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4091 
4092         /* Second Column */
4093         SSE_COPY_PS(XMM1,XMM7)
4094         SSE_SHUFFLE(XMM1,XMM1,0x55)
4095         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4096         SSE_ADD_PS(XMM0,XMM1)
4097 
4098         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4099 
4100         /* Third Column */
4101         SSE_COPY_PS(XMM2,XMM7)
4102         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4103         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4104         SSE_ADD_PS(XMM0,XMM2)
4105 
4106         /* Fourth Column */
4107         SSE_COPY_PS(XMM3,XMM7)
4108         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4109         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4110         SSE_ADD_PS(XMM0,XMM3)
4111 
4112         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4113       SSE_INLINE_END_3
4114 
4115       /* Promote solution from float to double */
4116       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4117 
4118       /* Apply reordering to t and stream into x.    */
4119       /* This way, x doesn't pollute the cache.      */
4120       /* Be careful with size: 2 doubles = 4 floats! */
4121       idc  = 4*(*c--);
4122       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4123         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4124         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4125         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4126         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4127         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4128         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4129       SSE_INLINE_END_2
4130       v    = aa + ai16 + 16;
4131       idt -= 4;
4132     }
4133 
4134     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4135     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4136     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4137     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4138     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4139   SSE_SCOPE_END;
4140   PetscFunctionReturn(0);
4141 }
4142 
4143 #endif
4144 
4145 
4146 /*
4147       Special case where the matrix was ILU(0) factored in the natural
4148    ordering. This eliminates the need for the column and row permutation.
4149 */
4150 #undef __FUNCT__
4151 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
4152 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4153 {
4154   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4155   PetscInt          n=a->mbs;
4156   const PetscInt    *ai=a->i,*aj=a->j;
4157   PetscErrorCode    ierr;
4158   const PetscInt    *diag = a->diag;
4159   const MatScalar   *aa=a->a;
4160   PetscScalar       *x;
4161   const PetscScalar *b;
4162 
4163   PetscFunctionBegin;
4164   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4165   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4166 
4167 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4168   {
4169     static PetscScalar w[2000]; /* very BAD need to fix */
4170     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4171   }
4172 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4173   {
4174     static PetscScalar w[2000]; /* very BAD need to fix */
4175     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4176   }
4177 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4178   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4179 #else
4180   {
4181     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4182     const MatScalar *v;
4183     PetscInt        jdx,idt,idx,nz,i,ai16;
4184     const PetscInt  *vi;
4185 
4186   /* forward solve the lower triangular */
4187   idx    = 0;
4188   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4189   for (i=1; i<n; i++) {
4190     v     =  aa      + 16*ai[i];
4191     vi    =  aj      + ai[i];
4192     nz    =  diag[i] - ai[i];
4193     idx   +=  4;
4194     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4195     while (nz--) {
4196       jdx   = 4*(*vi++);
4197       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4198       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4199       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4200       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4201       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4202       v    += 16;
4203     }
4204     x[idx]   = s1;
4205     x[1+idx] = s2;
4206     x[2+idx] = s3;
4207     x[3+idx] = s4;
4208   }
4209   /* backward solve the upper triangular */
4210   idt = 4*(n-1);
4211   for (i=n-1; i>=0; i--){
4212     ai16 = 16*diag[i];
4213     v    = aa + ai16 + 16;
4214     vi   = aj + diag[i] + 1;
4215     nz   = ai[i+1] - diag[i] - 1;
4216     s1 = x[idt];  s2 = x[1+idt];
4217     s3 = x[2+idt];s4 = x[3+idt];
4218     while (nz--) {
4219       idx   = 4*(*vi++);
4220       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4221       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4222       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4223       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4224       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4225       v    += 16;
4226     }
4227     v        = aa + ai16;
4228     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4229     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4230     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4231     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4232     idt -= 4;
4233   }
4234   }
4235 #endif
4236 
4237   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4238   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4239   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4240   PetscFunctionReturn(0);
4241 }
4242 
4243 #undef __FUNCT__
4244 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4245 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4246 {
4247     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4248     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4249     PetscInt          i,k,nz,idx,jdx,idt;
4250     PetscErrorCode    ierr;
4251     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4252     const MatScalar   *aa=a->a,*v;
4253     PetscScalar       *x;
4254     const PetscScalar *b;
4255     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4256 
4257     PetscFunctionBegin;
4258     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4259     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4260     /* forward solve the lower triangular */
4261     idx    = 0;
4262     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4263     for (i=1; i<n; i++) {
4264        v    = aa + bs2*ai[i];
4265        vi   = aj + ai[i];
4266        nz   = ai[i+1] - ai[i];
4267       idx   = bs*i;
4268        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4269       for(k=0;k<nz;k++) {
4270           jdx   = bs*vi[k];
4271           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4272           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4273           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4274           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4275 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4276 
4277           v   +=  bs2;
4278         }
4279 
4280        x[idx]   = s1;
4281        x[1+idx] = s2;
4282        x[2+idx] = s3;
4283        x[3+idx] = s4;
4284     }
4285 
4286    /* backward solve the upper triangular */
4287   for (i=n-1; i>=0; i--){
4288     v   = aa + bs2*(adiag[i+1]+1);
4289      vi  = aj + adiag[i+1]+1;
4290      nz  = adiag[i] - adiag[i+1]-1;
4291      idt = bs*i;
4292      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4293 
4294     for(k=0;k<nz;k++){
4295       idx   = bs*vi[k];
4296        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4297        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4298        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4299        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4300        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4301 
4302         v   +=  bs2;
4303     }
4304     /* x = inv_diagonal*x */
4305    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4306    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4307    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4308    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4309 
4310   }
4311 
4312   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4313   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4314   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4315   PetscFunctionReturn(0);
4316 }
4317 
4318 #undef __FUNCT__
4319 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4320 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4321 {
4322   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4323   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4324   PetscErrorCode    ierr;
4325   const MatScalar   *aa=a->a;
4326   const PetscScalar *b;
4327   PetscScalar       *x;
4328 
4329   PetscFunctionBegin;
4330   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4331   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4332 
4333   {
4334     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4335     const MatScalar  *v;
4336     MatScalar        *t=(MatScalar *)x;
4337     PetscInt         jdx,idt,idx,nz,i,ai16;
4338     const PetscInt   *vi;
4339 
4340     /* forward solve the lower triangular */
4341     idx  = 0;
4342     t[0] = (MatScalar)b[0];
4343     t[1] = (MatScalar)b[1];
4344     t[2] = (MatScalar)b[2];
4345     t[3] = (MatScalar)b[3];
4346     for (i=1; i<n; i++) {
4347       v     =  aa      + 16*ai[i];
4348       vi    =  aj      + ai[i];
4349       nz    =  diag[i] - ai[i];
4350       idx   +=  4;
4351       s1 = (MatScalar)b[idx];
4352       s2 = (MatScalar)b[1+idx];
4353       s3 = (MatScalar)b[2+idx];
4354       s4 = (MatScalar)b[3+idx];
4355       while (nz--) {
4356         jdx = 4*(*vi++);
4357         x1  = t[jdx];
4358         x2  = t[1+jdx];
4359         x3  = t[2+jdx];
4360         x4  = t[3+jdx];
4361         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4362         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4363         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4364         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4365         v    += 16;
4366       }
4367       t[idx]   = s1;
4368       t[1+idx] = s2;
4369       t[2+idx] = s3;
4370       t[3+idx] = s4;
4371     }
4372     /* backward solve the upper triangular */
4373     idt = 4*(n-1);
4374     for (i=n-1; i>=0; i--){
4375       ai16 = 16*diag[i];
4376       v    = aa + ai16 + 16;
4377       vi   = aj + diag[i] + 1;
4378       nz   = ai[i+1] - diag[i] - 1;
4379       s1   = t[idt];
4380       s2   = t[1+idt];
4381       s3   = t[2+idt];
4382       s4   = t[3+idt];
4383       while (nz--) {
4384         idx = 4*(*vi++);
4385         x1  = (MatScalar)x[idx];
4386         x2  = (MatScalar)x[1+idx];
4387         x3  = (MatScalar)x[2+idx];
4388         x4  = (MatScalar)x[3+idx];
4389         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4390         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4391         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4392         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4393         v    += 16;
4394       }
4395       v        = aa + ai16;
4396       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4397       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4398       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4399       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4400       idt -= 4;
4401     }
4402   }
4403 
4404   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4405   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4406   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4407   PetscFunctionReturn(0);
4408 }
4409 
4410 #if defined (PETSC_HAVE_SSE)
4411 
4412 #include PETSC_HAVE_SSE
4413 #undef __FUNCT__
4414 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4415 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4416 {
4417   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4418   unsigned short *aj=(unsigned short *)a->j;
4419   PetscErrorCode ierr;
4420   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4421   MatScalar      *aa=a->a;
4422   PetscScalar    *x,*b;
4423 
4424   PetscFunctionBegin;
4425   SSE_SCOPE_BEGIN;
4426   /*
4427      Note: This code currently uses demotion of double
4428      to float when performing the mixed-mode computation.
4429      This may not be numerically reasonable for all applications.
4430   */
4431   PREFETCH_NTA(aa+16*ai[1]);
4432 
4433   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4434   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4435   {
4436     /* x will first be computed in single precision then promoted inplace to double */
4437     MatScalar      *v,*t=(MatScalar *)x;
4438     int            nz,i,idt,ai16;
4439     unsigned int   jdx,idx;
4440     unsigned short *vi;
4441     /* Forward solve the lower triangular factor. */
4442 
4443     /* First block is the identity. */
4444     idx  = 0;
4445     CONVERT_DOUBLE4_FLOAT4(t,b);
4446     v    =  aa + 16*((unsigned int)ai[1]);
4447 
4448     for (i=1; i<n;) {
4449       PREFETCH_NTA(&v[8]);
4450       vi   =  aj      + ai[i];
4451       nz   =  diag[i] - ai[i];
4452       idx +=  4;
4453 
4454       /* Demote RHS from double to float. */
4455       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4456       LOAD_PS(&t[idx],XMM7);
4457 
4458       while (nz--) {
4459         PREFETCH_NTA(&v[16]);
4460         jdx = 4*((unsigned int)(*vi++));
4461 
4462         /* 4x4 Matrix-Vector product with negative accumulation: */
4463         SSE_INLINE_BEGIN_2(&t[jdx],v)
4464           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4465 
4466           /* First Column */
4467           SSE_COPY_PS(XMM0,XMM6)
4468           SSE_SHUFFLE(XMM0,XMM0,0x00)
4469           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4470           SSE_SUB_PS(XMM7,XMM0)
4471 
4472           /* Second Column */
4473           SSE_COPY_PS(XMM1,XMM6)
4474           SSE_SHUFFLE(XMM1,XMM1,0x55)
4475           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4476           SSE_SUB_PS(XMM7,XMM1)
4477 
4478           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4479 
4480           /* Third Column */
4481           SSE_COPY_PS(XMM2,XMM6)
4482           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4483           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4484           SSE_SUB_PS(XMM7,XMM2)
4485 
4486           /* Fourth Column */
4487           SSE_COPY_PS(XMM3,XMM6)
4488           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4489           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4490           SSE_SUB_PS(XMM7,XMM3)
4491         SSE_INLINE_END_2
4492 
4493         v  += 16;
4494       }
4495       v    =  aa + 16*ai[++i];
4496       PREFETCH_NTA(v);
4497       STORE_PS(&t[idx],XMM7);
4498     }
4499 
4500     /* Backward solve the upper triangular factor.*/
4501 
4502     idt  = 4*(n-1);
4503     ai16 = 16*diag[n-1];
4504     v    = aa + ai16 + 16;
4505     for (i=n-1; i>=0;){
4506       PREFETCH_NTA(&v[8]);
4507       vi = aj + diag[i] + 1;
4508       nz = ai[i+1] - diag[i] - 1;
4509 
4510       LOAD_PS(&t[idt],XMM7);
4511 
4512       while (nz--) {
4513         PREFETCH_NTA(&v[16]);
4514         idx = 4*((unsigned int)(*vi++));
4515 
4516         /* 4x4 Matrix-Vector Product with negative accumulation: */
4517         SSE_INLINE_BEGIN_2(&t[idx],v)
4518           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4519 
4520           /* First Column */
4521           SSE_COPY_PS(XMM0,XMM6)
4522           SSE_SHUFFLE(XMM0,XMM0,0x00)
4523           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4524           SSE_SUB_PS(XMM7,XMM0)
4525 
4526           /* Second Column */
4527           SSE_COPY_PS(XMM1,XMM6)
4528           SSE_SHUFFLE(XMM1,XMM1,0x55)
4529           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4530           SSE_SUB_PS(XMM7,XMM1)
4531 
4532           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4533 
4534           /* Third Column */
4535           SSE_COPY_PS(XMM2,XMM6)
4536           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4537           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4538           SSE_SUB_PS(XMM7,XMM2)
4539 
4540           /* Fourth Column */
4541           SSE_COPY_PS(XMM3,XMM6)
4542           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4543           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4544           SSE_SUB_PS(XMM7,XMM3)
4545         SSE_INLINE_END_2
4546         v  += 16;
4547       }
4548       v    = aa + ai16;
4549       ai16 = 16*diag[--i];
4550       PREFETCH_NTA(aa+ai16+16);
4551       /*
4552          Scale the result by the diagonal 4x4 block,
4553          which was inverted as part of the factorization
4554       */
4555       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4556         /* First Column */
4557         SSE_COPY_PS(XMM0,XMM7)
4558         SSE_SHUFFLE(XMM0,XMM0,0x00)
4559         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4560 
4561         /* Second Column */
4562         SSE_COPY_PS(XMM1,XMM7)
4563         SSE_SHUFFLE(XMM1,XMM1,0x55)
4564         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4565         SSE_ADD_PS(XMM0,XMM1)
4566 
4567         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4568 
4569         /* Third Column */
4570         SSE_COPY_PS(XMM2,XMM7)
4571         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4572         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4573         SSE_ADD_PS(XMM0,XMM2)
4574 
4575         /* Fourth Column */
4576         SSE_COPY_PS(XMM3,XMM7)
4577         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4578         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4579         SSE_ADD_PS(XMM0,XMM3)
4580 
4581         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4582       SSE_INLINE_END_3
4583 
4584       v    = aa + ai16 + 16;
4585       idt -= 4;
4586     }
4587 
4588     /* Convert t from single precision back to double precision (inplace)*/
4589     idt = 4*(n-1);
4590     for (i=n-1;i>=0;i--) {
4591       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4592       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4593       PetscScalar *xtemp=&x[idt];
4594       MatScalar   *ttemp=&t[idt];
4595       xtemp[3] = (PetscScalar)ttemp[3];
4596       xtemp[2] = (PetscScalar)ttemp[2];
4597       xtemp[1] = (PetscScalar)ttemp[1];
4598       xtemp[0] = (PetscScalar)ttemp[0];
4599       idt -= 4;
4600     }
4601 
4602   } /* End of artificial scope. */
4603   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4604   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4605   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4606   SSE_SCOPE_END;
4607   PetscFunctionReturn(0);
4608 }
4609 
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - forward/backward substitution
   for 4x4 blocks without row/column permutations, carried out with the SSE macros
   in MatScalar precision.

   Input Parameters:
+  A  - the factored matrix (A->data is a Mat_SeqBAIJ)
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   Notes:
   The array of xx doubles as the work vector: t aliases x as MatScalar, each
   block of b is demoted into t before use, and t is promoted back into x in
   place once both sweeps are finished.

   The block-row counter i is advanced inside both sweep bodies (ai[++i] and
   diag[--i]), not in the for() headers.

   NOTE(review): there is no early return for n == 0; in that case the first
   conversion and diag[n-1] would touch storage that is not there - confirm
   callers never pass an empty matrix.
*/
4610 #undef __FUNCT__
4611 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4612 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4613 {
4614   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4615   int            *aj=a->j;
4616   PetscErrorCode ierr;
4617   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4618   MatScalar      *aa=a->a;
4619   PetscScalar    *x,*b;
4620 
4621   PetscFunctionBegin;
4622   SSE_SCOPE_BEGIN;
4623   /*
4624      Note: This code currently uses demotion of double
4625      to float when performing the mixed-mode computation.
4626      This may not be numerically reasonable for all applications.
4627   */
4628   PREFETCH_NTA(aa+16*ai[1]);
4629 
4630   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4631   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4632   {
4633     /* x will first be computed in single precision then promoted inplace to double */
4634     MatScalar *v,*t=(MatScalar *)x;
4635     int       nz,i,idt,ai16;
4636     int       jdx,idx;
4637     int       *vi;
4638     /* Forward solve the lower triangular factor. */
4639 
4640     /* First block is the identity. */
4641     idx  = 0;
4642     CONVERT_DOUBLE4_FLOAT4(t,b);
4643     v    =  aa + 16*ai[1];
4644 
    /* i is incremented at the bottom of the body (ai[++i]), where v is also moved to the next block row */
4645     for (i=1; i<n;) {
4646       PREFETCH_NTA(&v[8]);
4647       vi   =  aj      + ai[i];
4648       nz   =  diag[i] - ai[i];
4649       idx +=  4;
4650 
4651       /* Demote RHS from double to float. */
4652       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      /* XMM7 holds the running result for this block row until the STORE_PS below */
4653       LOAD_PS(&t[idx],XMM7);
4654 
4655       while (nz--) {
4656         PREFETCH_NTA(&v[16]);
4657         jdx = 4*(*vi++);
4658 /*          jdx = *vi++; */
4659 
4660         /* 4x4 Matrix-Vector product with negative accumulation: */
4661         SSE_INLINE_BEGIN_2(&t[jdx],v)
4662           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4663 
4664           /* First Column */
4665           SSE_COPY_PS(XMM0,XMM6)
4666           SSE_SHUFFLE(XMM0,XMM0,0x00)
4667           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4668           SSE_SUB_PS(XMM7,XMM0)
4669 
4670           /* Second Column */
4671           SSE_COPY_PS(XMM1,XMM6)
4672           SSE_SHUFFLE(XMM1,XMM1,0x55)
4673           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4674           SSE_SUB_PS(XMM7,XMM1)
4675 
4676           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4677 
4678           /* Third Column */
4679           SSE_COPY_PS(XMM2,XMM6)
4680           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4681           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4682           SSE_SUB_PS(XMM7,XMM2)
4683 
4684           /* Fourth Column */
4685           SSE_COPY_PS(XMM3,XMM6)
4686           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4687           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4688           SSE_SUB_PS(XMM7,XMM3)
4689         SSE_INLINE_END_2
4690 
4691         v  += 16;
4692       }
4693       v    =  aa + 16*ai[++i];
4694       PREFETCH_NTA(v);
4695       STORE_PS(&t[idx],XMM7);
4696     }
4697 
4698     /* Backward solve the upper triangular factor.*/
4699 
4700     idt  = 4*(n-1);
4701     ai16 = 16*diag[n-1];
4702     v    = aa + ai16 + 16;
    /* i is decremented inside the body (diag[--i]); idt tracks the block row being written */
4703     for (i=n-1; i>=0;){
4704       PREFETCH_NTA(&v[8]);
4705       vi = aj + diag[i] + 1;
4706       nz = ai[i+1] - diag[i] - 1;
4707 
4708       LOAD_PS(&t[idt],XMM7);
4709 
4710       while (nz--) {
4711         PREFETCH_NTA(&v[16]);
4712         idx = 4*(*vi++);
4713 /*          idx = *vi++; */
4714 
4715         /* 4x4 Matrix-Vector Product with negative accumulation: */
4716         SSE_INLINE_BEGIN_2(&t[idx],v)
4717           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4718 
4719           /* First Column */
4720           SSE_COPY_PS(XMM0,XMM6)
4721           SSE_SHUFFLE(XMM0,XMM0,0x00)
4722           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4723           SSE_SUB_PS(XMM7,XMM0)
4724 
4725           /* Second Column */
4726           SSE_COPY_PS(XMM1,XMM6)
4727           SSE_SHUFFLE(XMM1,XMM1,0x55)
4728           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4729           SSE_SUB_PS(XMM7,XMM1)
4730 
4731           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4732 
4733           /* Third Column */
4734           SSE_COPY_PS(XMM2,XMM6)
4735           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4736           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4737           SSE_SUB_PS(XMM7,XMM2)
4738 
4739           /* Fourth Column */
4740           SSE_COPY_PS(XMM3,XMM6)
4741           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4742           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4743           SSE_SUB_PS(XMM7,XMM3)
4744         SSE_INLINE_END_2
4745         v  += 16;
4746       }
      /* v = diagonal block of the current row; ai16 is then advanced to the previous row's diagonal */
      /* NOTE(review): when i was 0 this reads diag[-1]; the value is only used to form prefetch addresses and v, which is not dereferenced after the loop exits */
4747       v    = aa + ai16;
4748       ai16 = 16*diag[--i];
4749       PREFETCH_NTA(aa+ai16+16);
4750       /*
4751          Scale the result by the diagonal 4x4 block,
4752          which was inverted as part of the factorization
4753       */
4754       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4755         /* First Column */
4756         SSE_COPY_PS(XMM0,XMM7)
4757         SSE_SHUFFLE(XMM0,XMM0,0x00)
4758         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4759 
4760         /* Second Column */
4761         SSE_COPY_PS(XMM1,XMM7)
4762         SSE_SHUFFLE(XMM1,XMM1,0x55)
4763         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4764         SSE_ADD_PS(XMM0,XMM1)
4765 
4766         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4767 
4768         /* Third Column */
4769         SSE_COPY_PS(XMM2,XMM7)
4770         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4771         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4772         SSE_ADD_PS(XMM0,XMM2)
4773 
4774         /* Fourth Column */
4775         SSE_COPY_PS(XMM3,XMM7)
4776         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4777         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4778         SSE_ADD_PS(XMM0,XMM3)
4779 
4780         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4781       SSE_INLINE_END_3
4782 
4783       v    = aa + ai16 + 16;
4784       idt -= 4;
4785     }
4786 
4787     /* Convert t from single precision back to double precision (inplace)*/
    /* t and x share storage, so entries are promoted from the highest index downwards;
       assuming MatScalar is narrower than PetscScalar, writing x[k] then only overwrites
       t entries that have already been read - TODO confirm the type widths for this build */
4788     idt = 4*(n-1);
4789     for (i=n-1;i>=0;i--) {
4790       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4791       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4792       PetscScalar *xtemp=&x[idt];
4793       MatScalar   *ttemp=&t[idt];
4794       xtemp[3] = (PetscScalar)ttemp[3];
4795       xtemp[2] = (PetscScalar)ttemp[2];
4796       xtemp[1] = (PetscScalar)ttemp[1];
4797       xtemp[0] = (PetscScalar)ttemp[0];
4798       idt -= 4;
4799     }
4800 
4801   } /* End of artificial scope. */
4802   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4803   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4804   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4805   SSE_SCOPE_END;
4806   PetscFunctionReturn(0);
4807 }
4808 
4809 #endif
4810 
4811 #undef __FUNCT__
4812 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4813 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4814 {
4815   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4816   IS                iscol=a->col,isrow=a->row;
4817   PetscErrorCode    ierr;
4818   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4819   PetscInt          i,nz,idx,idt,idc;
4820   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4821   const MatScalar   *aa=a->a,*v;
4822   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4823   const PetscScalar *b;
4824 
4825   PetscFunctionBegin;
4826   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4827   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4828   t  = a->solve_work;
4829 
4830   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4831   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4832 
4833   /* forward solve the lower triangular */
4834   idx    = 3*(*r++);
4835   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4836   for (i=1; i<n; i++) {
4837     v     = aa + 9*ai[i];
4838     vi    = aj + ai[i];
4839     nz    = diag[i] - ai[i];
4840     idx   = 3*(*r++);
4841     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4842     while (nz--) {
4843       idx   = 3*(*vi++);
4844       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4845       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4846       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4847       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4848       v += 9;
4849     }
4850     idx = 3*i;
4851     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4852   }
4853   /* backward solve the upper triangular */
4854   for (i=n-1; i>=0; i--){
4855     v    = aa + 9*diag[i] + 9;
4856     vi   = aj + diag[i] + 1;
4857     nz   = ai[i+1] - diag[i] - 1;
4858     idt  = 3*i;
4859     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4860     while (nz--) {
4861       idx   = 3*(*vi++);
4862       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4863       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4864       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4865       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4866       v += 9;
4867     }
4868     idc = 3*(*c--);
4869     v   = aa + 9*diag[i];
4870     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4871     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4872     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4873   }
4874   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4875   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4876   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4877   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4878   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4879   PetscFunctionReturn(0);
4880 }
4881 
4882 #undef __FUNCT__
4883 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4884 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4885 {
4886   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4887   IS                iscol=a->col,isrow=a->row;
4888   PetscErrorCode    ierr;
4889   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4890   PetscInt          i,nz,idx,idt,idc,m;
4891   const PetscInt    *r,*c,*rout,*cout;
4892   const MatScalar   *aa=a->a,*v;
4893   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4894   const PetscScalar *b;
4895 
4896   PetscFunctionBegin;
4897   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4898   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4899   t  = a->solve_work;
4900 
4901   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4902   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4903 
4904   /* forward solve the lower triangular */
4905   idx    = 3*r[0];
4906   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4907   for (i=1; i<n; i++) {
4908     v     = aa + 9*ai[i];
4909     vi    = aj + ai[i];
4910     nz    = ai[i+1] - ai[i];
4911     idx   = 3*r[i];
4912     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4913     for(m=0;m<nz;m++){
4914       idx   = 3*vi[m];
4915       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4916       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4917       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4918       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4919       v += 9;
4920     }
4921     idx = 3*i;
4922     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4923   }
4924   /* backward solve the upper triangular */
4925   for (i=n-1; i>=0; i--){
4926     v    = aa + 9*(adiag[i+1]+1);
4927     vi   = aj + adiag[i+1]+1;
4928     nz   = adiag[i] - adiag[i+1] - 1;
4929     idt  = 3*i;
4930     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4931     for(m=0;m<nz;m++){
4932       idx   = 3*vi[m];
4933       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4934       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4935       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4936       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4937       v += 9;
4938     }
4939     idc = 3*c[i];
4940     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4941     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4942     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4943   }
4944   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4945   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4946   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4947   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4948   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4949   PetscFunctionReturn(0);
4950 }
4951 
4952 /*
4953       Special case where the matrix was ILU(0) factored in the natural
4954    ordering. This eliminates the need for the column and row permutation.
4955 */
4956 #undef __FUNCT__
4957 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4958 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4959 {
4960   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4961   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4962   PetscErrorCode    ierr;
4963   const PetscInt    *diag = a->diag,*vi;
4964   const MatScalar   *aa=a->a,*v;
4965   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4966   const PetscScalar *b;
4967   PetscInt          jdx,idt,idx,nz,i;
4968 
4969   PetscFunctionBegin;
4970   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4971   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4972 
4973   /* forward solve the lower triangular */
4974   idx    = 0;
4975   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4976   for (i=1; i<n; i++) {
4977     v     =  aa      + 9*ai[i];
4978     vi    =  aj      + ai[i];
4979     nz    =  diag[i] - ai[i];
4980     idx   +=  3;
4981     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4982     while (nz--) {
4983       jdx   = 3*(*vi++);
4984       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4985       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4986       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4987       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4988       v    += 9;
4989     }
4990     x[idx]   = s1;
4991     x[1+idx] = s2;
4992     x[2+idx] = s3;
4993   }
4994   /* backward solve the upper triangular */
4995   for (i=n-1; i>=0; i--){
4996     v    = aa + 9*diag[i] + 9;
4997     vi   = aj + diag[i] + 1;
4998     nz   = ai[i+1] - diag[i] - 1;
4999     idt  = 3*i;
5000     s1 = x[idt];  s2 = x[1+idt];
5001     s3 = x[2+idt];
5002     while (nz--) {
5003       idx   = 3*(*vi++);
5004       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
5005       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5006       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5007       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5008       v    += 9;
5009     }
5010     v        = aa +  9*diag[i];
5011     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5012     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5013     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5014   }
5015 
5016   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5017   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5018   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
5019   PetscFunctionReturn(0);
5020 }
5021 
5022 #undef __FUNCT__
5023 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
5024 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
5025 {
5026     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5027     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5028     PetscErrorCode    ierr;
5029     PetscInt          i,k,nz,idx,jdx,idt;
5030     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
5031     const MatScalar   *aa=a->a,*v;
5032     PetscScalar       *x;
5033     const PetscScalar *b;
5034     PetscScalar        s1,s2,s3,x1,x2,x3;
5035 
5036     PetscFunctionBegin;
5037     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5038     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5039     /* forward solve the lower triangular */
5040     idx    = 0;
5041     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5042     for (i=1; i<n; i++) {
5043        v    = aa + bs2*ai[i];
5044        vi   = aj + ai[i];
5045        nz   = ai[i+1] - ai[i];
5046       idx   = bs*i;
5047        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5048       for(k=0;k<nz;k++){
5049          jdx   = bs*vi[k];
5050           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5051           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5052           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5053           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5054 
5055           v   +=  bs2;
5056         }
5057 
5058        x[idx]   = s1;
5059        x[1+idx] = s2;
5060        x[2+idx] = s3;
5061     }
5062 
5063    /* backward solve the upper triangular */
5064   for (i=n-1; i>=0; i--){
5065     v   = aa + bs2*(adiag[i+1]+1);
5066      vi  = aj + adiag[i+1]+1;
5067      nz  = adiag[i] - adiag[i+1]-1;
5068      idt = bs*i;
5069      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5070 
5071      for(k=0;k<nz;k++){
5072        idx   = bs*vi[k];
5073        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5074        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5075        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5076        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5077 
5078         v   +=  bs2;
5079     }
5080     /* x = inv_diagonal*x */
5081    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5082    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5083    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5084 
5085   }
5086 
5087   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5088   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5089   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5090   PetscFunctionReturn(0);
5091 }
5092 
5093 #undef __FUNCT__
5094 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
5095 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5096 {
5097   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5098   IS                iscol=a->col,isrow=a->row;
5099   PetscErrorCode    ierr;
5100   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5101   PetscInt          i,nz,idx,idt,idc;
5102   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5103   const MatScalar   *aa=a->a,*v;
5104   PetscScalar       *x,s1,s2,x1,x2,*t;
5105   const PetscScalar *b;
5106 
5107   PetscFunctionBegin;
5108   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5109   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5110   t  = a->solve_work;
5111 
5112   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5113   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5114 
5115   /* forward solve the lower triangular */
5116   idx    = 2*(*r++);
5117   t[0] = b[idx]; t[1] = b[1+idx];
5118   for (i=1; i<n; i++) {
5119     v     = aa + 4*ai[i];
5120     vi    = aj + ai[i];
5121     nz    = diag[i] - ai[i];
5122     idx   = 2*(*r++);
5123     s1  = b[idx]; s2 = b[1+idx];
5124     while (nz--) {
5125       idx   = 2*(*vi++);
5126       x1    = t[idx]; x2 = t[1+idx];
5127       s1 -= v[0]*x1 + v[2]*x2;
5128       s2 -= v[1]*x1 + v[3]*x2;
5129       v += 4;
5130     }
5131     idx = 2*i;
5132     t[idx] = s1; t[1+idx] = s2;
5133   }
5134   /* backward solve the upper triangular */
5135   for (i=n-1; i>=0; i--){
5136     v    = aa + 4*diag[i] + 4;
5137     vi   = aj + diag[i] + 1;
5138     nz   = ai[i+1] - diag[i] - 1;
5139     idt  = 2*i;
5140     s1 = t[idt]; s2 = t[1+idt];
5141     while (nz--) {
5142       idx   = 2*(*vi++);
5143       x1    = t[idx]; x2 = t[1+idx];
5144       s1 -= v[0]*x1 + v[2]*x2;
5145       s2 -= v[1]*x1 + v[3]*x2;
5146       v += 4;
5147     }
5148     idc = 2*(*c--);
5149     v   = aa + 4*diag[i];
5150     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5151     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5152   }
5153   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5154   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5155   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5156   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5157   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5158   PetscFunctionReturn(0);
5159 }
5160 
5161 #undef __FUNCT__
5162 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
5163 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5164 {
5165   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5166   IS                iscol=a->col,isrow=a->row;
5167   PetscErrorCode    ierr;
5168   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5169   PetscInt          i,nz,idx,jdx,idt,idc,m;
5170   const PetscInt    *r,*c,*rout,*cout;
5171   const MatScalar   *aa=a->a,*v;
5172   PetscScalar       *x,s1,s2,x1,x2,*t;
5173   const PetscScalar *b;
5174 
5175   PetscFunctionBegin;
5176   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5177   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5178   t  = a->solve_work;
5179 
5180   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5181   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5182 
5183   /* forward solve the lower triangular */
5184   idx    = 2*r[0];
5185   t[0] = b[idx]; t[1] = b[1+idx];
5186   for (i=1; i<n; i++) {
5187     v     = aa + 4*ai[i];
5188     vi    = aj + ai[i];
5189     nz    = ai[i+1] - ai[i];
5190     idx   = 2*r[i];
5191     s1  = b[idx]; s2 = b[1+idx];
5192     for(m=0;m<nz;m++){
5193       jdx   = 2*vi[m];
5194       x1    = t[jdx]; x2 = t[1+jdx];
5195       s1 -= v[0]*x1 + v[2]*x2;
5196       s2 -= v[1]*x1 + v[3]*x2;
5197       v += 4;
5198     }
5199     idx = 2*i;
5200     t[idx] = s1; t[1+idx] = s2;
5201   }
5202   /* backward solve the upper triangular */
5203   for (i=n-1; i>=0; i--){
5204     v    = aa + 4*(adiag[i+1]+1);
5205     vi   = aj + adiag[i+1]+1;
5206     nz   = adiag[i] - adiag[i+1] - 1;
5207     idt  = 2*i;
5208     s1 = t[idt]; s2 = t[1+idt];
5209     for(m=0;m<nz;m++){
5210       idx   = 2*vi[m];
5211       x1    = t[idx]; x2 = t[1+idx];
5212       s1 -= v[0]*x1 + v[2]*x2;
5213       s2 -= v[1]*x1 + v[3]*x2;
5214       v += 4;
5215     }
5216     idc = 2*c[i];
5217     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5218     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5219   }
5220   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5221   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5222   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5223   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5224   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5225   PetscFunctionReturn(0);
5226 }
5227 
5228 /*
5229       Special case where the matrix was ILU(0) factored in the natural
5230    ordering. This eliminates the need for the column and row permutation.
5231 */
5232 #undef __FUNCT__
5233 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
5234 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5235 {
5236   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5237   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5238   PetscErrorCode    ierr;
5239   const MatScalar   *aa=a->a,*v;
5240   PetscScalar       *x,s1,s2,x1,x2;
5241   const PetscScalar *b;
5242   PetscInt          jdx,idt,idx,nz,i;
5243 
5244   PetscFunctionBegin;
5245   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5246   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5247 
5248   /* forward solve the lower triangular */
5249   idx    = 0;
5250   x[0]   = b[0]; x[1] = b[1];
5251   for (i=1; i<n; i++) {
5252     v     =  aa      + 4*ai[i];
5253     vi    =  aj      + ai[i];
5254     nz    =  diag[i] - ai[i];
5255     idx   +=  2;
5256     s1  =  b[idx];s2 = b[1+idx];
5257     while (nz--) {
5258       jdx   = 2*(*vi++);
5259       x1    = x[jdx];x2 = x[1+jdx];
5260       s1 -= v[0]*x1 + v[2]*x2;
5261       s2 -= v[1]*x1 + v[3]*x2;
5262       v    += 4;
5263     }
5264     x[idx]   = s1;
5265     x[1+idx] = s2;
5266   }
5267   /* backward solve the upper triangular */
5268   for (i=n-1; i>=0; i--){
5269     v    = aa + 4*diag[i] + 4;
5270     vi   = aj + diag[i] + 1;
5271     nz   = ai[i+1] - diag[i] - 1;
5272     idt  = 2*i;
5273     s1 = x[idt];  s2 = x[1+idt];
5274     while (nz--) {
5275       idx   = 2*(*vi++);
5276       x1    = x[idx];   x2 = x[1+idx];
5277       s1 -= v[0]*x1 + v[2]*x2;
5278       s2 -= v[1]*x1 + v[3]*x2;
5279       v    += 4;
5280     }
5281     v        = aa +  4*diag[i];
5282     x[idt]   = v[0]*s1 + v[2]*s2;
5283     x[1+idt] = v[1]*s1 + v[3]*s2;
5284   }
5285 
5286   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5287   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5288   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5289   PetscFunctionReturn(0);
5290 }
5291 
5292 #undef __FUNCT__
5293 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5294 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5295 {
5296     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5297     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5298     PetscInt          i,k,nz,idx,idt,jdx;
5299     PetscErrorCode    ierr;
5300     const MatScalar   *aa=a->a,*v;
5301     PetscScalar       *x,s1,s2,x1,x2;
5302     const PetscScalar *b;
5303 
5304     PetscFunctionBegin;
5305     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5306     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5307     /* forward solve the lower triangular */
5308     idx    = 0;
5309     x[0] = b[idx]; x[1] = b[1+idx];
5310     for (i=1; i<n; i++) {
5311         v   = aa + 4*ai[i];
5312        vi   = aj + ai[i];
5313        nz   = ai[i+1] - ai[i];
5314        idx  = 2*i;
5315        s1   = b[idx];s2 = b[1+idx];
5316       for(k=0;k<nz;k++){
5317          jdx   = 2*vi[k];
5318           x1    = x[jdx];x2 = x[1+jdx];
5319           s1   -= v[0]*x1 + v[2]*x2;
5320           s2   -= v[1]*x1 + v[3]*x2;
5321            v   +=  4;
5322         }
5323        x[idx]   = s1;
5324        x[1+idx] = s2;
5325     }
5326 
5327    /* backward solve the upper triangular */
5328   for (i=n-1; i>=0; i--){
5329      v   = aa + 4*(adiag[i+1]+1);
5330      vi  = aj + adiag[i+1]+1;
5331      nz  = adiag[i] - adiag[i+1]-1;
5332      idt = 2*i;
5333      s1 = x[idt];  s2 = x[1+idt];
5334      for(k=0;k<nz;k++){
5335       idx   = 2*vi[k];
5336        x1    = x[idx];   x2 = x[1+idx];
5337        s1 -= v[0]*x1 + v[2]*x2;
5338        s2 -= v[1]*x1 + v[3]*x2;
5339          v    += 4;
5340     }
5341     /* x = inv_diagonal*x */
5342    x[idt]   = v[0]*s1 + v[2]*s2;
5343    x[1+idt] = v[1]*s1 + v[3]*s2;
5344   }
5345 
5346   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5347   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5348   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5349   PetscFunctionReturn(0);
5350 }
5351 
5352 #undef __FUNCT__
5353 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5354 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5355 {
5356   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5357   IS                iscol=a->col,isrow=a->row;
5358   PetscErrorCode    ierr;
5359   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5360   PetscInt          i,nz;
5361   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5362   const MatScalar   *aa=a->a,*v;
5363   PetscScalar       *x,s1,*t;
5364   const PetscScalar *b;
5365 
5366   PetscFunctionBegin;
5367   if (!n) PetscFunctionReturn(0);
5368 
5369   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5370   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5371   t  = a->solve_work;
5372 
5373   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5374   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5375 
5376   /* forward solve the lower triangular */
5377   t[0] = b[*r++];
5378   for (i=1; i<n; i++) {
5379     v     = aa + ai[i];
5380     vi    = aj + ai[i];
5381     nz    = diag[i] - ai[i];
5382     s1  = b[*r++];
5383     while (nz--) {
5384       s1 -= (*v++)*t[*vi++];
5385     }
5386     t[i] = s1;
5387   }
5388   /* backward solve the upper triangular */
5389   for (i=n-1; i>=0; i--){
5390     v    = aa + diag[i] + 1;
5391     vi   = aj + diag[i] + 1;
5392     nz   = ai[i+1] - diag[i] - 1;
5393     s1 = t[i];
5394     while (nz--) {
5395       s1 -= (*v++)*t[*vi++];
5396     }
5397     x[*c--] = t[i] = aa[diag[i]]*s1;
5398   }
5399 
5400   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5401   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5402   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5403   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5404   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5405   PetscFunctionReturn(0);
5406 }
5407 
5408 #undef __FUNCT__
5409 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5410 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5411 {
5412   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5413   IS                iscol = a->col,isrow = a->row;
5414   PetscErrorCode    ierr;
5415   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5416   const PetscInt    *rout,*cout,*r,*c;
5417   PetscScalar       *x,*tmp,sum;
5418   const PetscScalar *b;
5419   const MatScalar   *aa = a->a,*v;
5420 
5421   PetscFunctionBegin;
5422   if (!n) PetscFunctionReturn(0);
5423 
5424   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5425   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5426   tmp  = a->solve_work;
5427 
5428   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5429   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5430 
5431   /* forward solve the lower triangular */
5432   tmp[0] = b[r[0]];
5433   v      = aa;
5434   vi     = aj;
5435   for (i=1; i<n; i++) {
5436     nz  = ai[i+1] - ai[i];
5437     sum = b[r[i]];
5438     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5439     tmp[i] = sum;
5440     v += nz; vi += nz;
5441   }
5442 
5443   /* backward solve the upper triangular */
5444   for (i=n-1; i>=0; i--){
5445     v   = aa + adiag[i+1]+1;
5446     vi  = aj + adiag[i+1]+1;
5447     nz  = adiag[i]-adiag[i+1]-1;
5448     sum = tmp[i];
5449     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5450     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5451   }
5452 
5453   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5454   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5455   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5457   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5458   PetscFunctionReturn(0);
5459 }
5460 
5461 /*
5462       Special case where the matrix was ILU(0) factored in the natural
5463    ordering. This eliminates the need for the column and row permutation.
5464 */
5465 #undef __FUNCT__
5466 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5467 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5468 {
5469   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5470   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5471   PetscErrorCode    ierr;
5472   const MatScalar   *aa=a->a,*v;
5473   PetscScalar       *x;
5474   const PetscScalar *b;
5475   PetscScalar       s1,x1;
5476   PetscInt          jdx,idt,idx,nz,i;
5477 
5478   PetscFunctionBegin;
5479   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5480   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5481 
5482   /* forward solve the lower triangular */
5483   idx    = 0;
5484   x[0]   = b[0];
5485   for (i=1; i<n; i++) {
5486     v     =  aa      + ai[i];
5487     vi    =  aj      + ai[i];
5488     nz    =  diag[i] - ai[i];
5489     idx   +=  1;
5490     s1  =  b[idx];
5491     while (nz--) {
5492       jdx   = *vi++;
5493       x1    = x[jdx];
5494       s1 -= v[0]*x1;
5495       v    += 1;
5496     }
5497     x[idx]   = s1;
5498   }
5499   /* backward solve the upper triangular */
5500   for (i=n-1; i>=0; i--){
5501     v    = aa + diag[i] + 1;
5502     vi   = aj + diag[i] + 1;
5503     nz   = ai[i+1] - diag[i] - 1;
5504     idt  = i;
5505     s1 = x[idt];
5506     while (nz--) {
5507       idx   = *vi++;
5508       x1    = x[idx];
5509       s1 -= v[0]*x1;
5510       v    += 1;
5511     }
5512     v        = aa +  diag[i];
5513     x[idt]   = v[0]*s1;
5514   }
5515   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5516   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5517   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5518   PetscFunctionReturn(0);
5519 }
5520 
5521 
5522 #undef __FUNCT__
5523 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5524 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5525 {
5526   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5527   PetscErrorCode    ierr;
5528   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5529   PetscScalar       *x,sum;
5530   const PetscScalar *b;
5531   const MatScalar   *aa = a->a,*v;
5532   PetscInt          i,nz;
5533 
5534   PetscFunctionBegin;
5535   if (!n) PetscFunctionReturn(0);
5536 
5537   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5538   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5539 
5540   /* forward solve the lower triangular */
5541   x[0] = b[0];
5542   v    = aa;
5543   vi   = aj;
5544   for (i=1; i<n; i++) {
5545     nz  = ai[i+1] - ai[i];
5546     sum = b[i];
5547     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5548     v  += nz;
5549     vi += nz;
5550     x[i] = sum;
5551   }
5552 
5553   /* backward solve the upper triangular */
5554   for (i=n-1; i>=0; i--){
5555     v   = aa + adiag[i+1] + 1;
5556     vi  = aj + adiag[i+1] + 1;
5557     nz = adiag[i] - adiag[i+1]-1;
5558     sum = x[i];
5559     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5560     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5561   }
5562 
5563   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
5564   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5565   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5566   PetscFunctionReturn(0);
5567 }
5568 
5569 /* ----------------------------------------------------------------*/
5570 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5571 
5572 #undef __FUNCT__
5573 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5574 /*
5575    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5576 */
5577 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5578 {
5579   Mat             C=B;
5580   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5581   PetscErrorCode  ierr;
5582   PetscInt        i,j,k,ipvt[15];
5583   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5584   PetscInt        nz,nzL,row;
5585   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5586   const MatScalar *v,*aa=a->a;
5587   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5588   PetscInt        sol_ver;
5589 
5590   PetscFunctionBegin;
5591 
5592   ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
5593 
5594   /* generate work space needed by the factorization */
5595   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5596   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5597 
5598   for (i=0; i<n; i++){
5599     /* zero rtmp */
5600     /* L part */
5601     nz    = bi[i+1] - bi[i];
5602     bjtmp = bj + bi[i];
5603     for  (j=0; j<nz; j++){
5604       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5605     }
5606 
5607     /* U part */
5608     nz = bdiag[i] - bdiag[i+1];
5609     bjtmp = bj + bdiag[i+1]+1;
5610     for  (j=0; j<nz; j++){
5611       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5612     }
5613 
5614     /* load in initial (unfactored row) */
5615     nz    = ai[i+1] - ai[i];
5616     ajtmp = aj + ai[i];
5617     v     = aa + bs2*ai[i];
5618     for (j=0; j<nz; j++) {
5619       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5620     }
5621 
5622     /* elimination */
5623     bjtmp = bj + bi[i];
5624     nzL   = bi[i+1] - bi[i];
5625     for(k=0;k < nzL;k++) {
5626       row = bjtmp[k];
5627       pc = rtmp + bs2*row;
5628       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5629       if (flg) {
5630         pv = b->a + bs2*bdiag[row];
5631 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5632 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5633 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5634         pv = b->a + bs2*(bdiag[row+1]+1);
5635         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5636         for (j=0; j<nz; j++) {
5637           vv   = rtmp + bs2*pj[j];
5638           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5639 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5640 	  pv  += bs2;
5641         }
5642         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5643       }
5644     }
5645 
5646     /* finished row so stick it into b->a */
5647     /* L part */
5648     pv   = b->a + bs2*bi[i] ;
5649     pj   = b->j + bi[i] ;
5650     nz   = bi[i+1] - bi[i];
5651     for (j=0; j<nz; j++) {
5652       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5653     }
5654 
5655     /* Mark diagonal and invert diagonal for simplier triangular solves */
5656     pv   = b->a + bs2*bdiag[i];
5657     pj   = b->j + bdiag[i];
5658     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5659     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5660     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
5661 
5662     /* U part */
5663     pv = b->a + bs2*(bdiag[i+1]+1);
5664     pj = b->j + bdiag[i+1]+1;
5665     nz = bdiag[i] - bdiag[i+1] - 1;
5666     for (j=0; j<nz; j++){
5667       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5668     }
5669   }
5670 
5671   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5672   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5673   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5674   C->assembled = PETSC_TRUE;
5675   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5676   PetscFunctionReturn(0);
5677 }
5678 
5679 #undef __FUNCT__
5680 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5681 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5682 {
5683   Mat            C=B;
5684   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5685   IS             isrow = b->row,isicol = b->icol;
5686   PetscErrorCode ierr;
5687   const PetscInt *r,*ic;
5688   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5689   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5690   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5691   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5692   MatScalar      *v_work;
5693   PetscTruth     col_identity,row_identity,both_identity;
5694 
5695   PetscFunctionBegin;
5696   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5697   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5698 
5699   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5700   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5701 
5702   /* generate work space needed by dense LU factorization */
5703   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5704 
5705   for (i=0; i<n; i++){
5706     /* zero rtmp */
5707     /* L part */
5708     nz    = bi[i+1] - bi[i];
5709     bjtmp = bj + bi[i];
5710     for  (j=0; j<nz; j++){
5711       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5712     }
5713 
5714     /* U part */
5715     nz = bdiag[i] - bdiag[i+1];
5716     bjtmp = bj + bdiag[i+1]+1;
5717     for  (j=0; j<nz; j++){
5718       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5719     }
5720 
5721     /* load in initial (unfactored row) */
5722     nz    = ai[r[i]+1] - ai[r[i]];
5723     ajtmp = aj + ai[r[i]];
5724     v     = aa + bs2*ai[r[i]];
5725     for (j=0; j<nz; j++) {
5726       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5727     }
5728 
5729     /* elimination */
5730     bjtmp = bj + bi[i];
5731     nzL   = bi[i+1] - bi[i];
5732     for(k=0;k < nzL;k++) {
5733       row = bjtmp[k];
5734       pc = rtmp + bs2*row;
5735       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5736       if (flg) {
5737         pv         = b->a + bs2*bdiag[row];
5738         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5739         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5740         pv         = b->a + bs2*(bdiag[row+1]+1);
5741         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5742         for (j=0; j<nz; j++) {
5743           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5744         }
5745         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5746       }
5747     }
5748 
5749     /* finished row so stick it into b->a */
5750     /* L part */
5751     pv   = b->a + bs2*bi[i] ;
5752     pj   = b->j + bi[i] ;
5753     nz   = bi[i+1] - bi[i];
5754     for (j=0; j<nz; j++) {
5755       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5756     }
5757 
5758     /* Mark diagonal and invert diagonal for simplier triangular solves */
5759     pv  = b->a + bs2*bdiag[i];
5760     pj  = b->j + bdiag[i];
5761     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5762     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5763     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5764 
5765     /* U part */
5766     pv = b->a + bs2*(bdiag[i+1]+1);
5767     pj = b->j + bdiag[i+1]+1;
5768     nz = bdiag[i] - bdiag[i+1] - 1;
5769     for (j=0; j<nz; j++){
5770       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5771     }
5772   }
5773 
5774   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5775   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5776   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5777   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5778 
5779   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5780   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5781   both_identity = (PetscTruth) (row_identity && col_identity);
5782   if (both_identity){
5783     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5784   } else {
5785     C->ops->solve = MatSolve_SeqBAIJ_N;
5786   }
5787   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5788 
5789   C->assembled = PETSC_TRUE;
5790   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5791   PetscFunctionReturn(0);
5792 }
5793 
5794 /*
5795    ilu(0) with natural ordering under new data structure.
5796    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5797    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5798 */
5799 
5800 #undef __FUNCT__
5801 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5802 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5803 {
5804 
5805   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5806   PetscErrorCode     ierr;
5807   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5808   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5809 
5810   PetscFunctionBegin;
5811   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5812   b    = (Mat_SeqBAIJ*)(fact)->data;
5813 
5814   /* allocate matrix arrays for new data structure */
5815   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5816   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5817   b->singlemalloc = PETSC_TRUE;
5818   if (!b->diag){
5819     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5820     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5821   }
5822   bdiag = b->diag;
5823 
5824   if (n > 0) {
5825     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5826   }
5827 
5828   /* set bi and bj with new data structure */
5829   bi = b->i;
5830   bj = b->j;
5831 
5832   /* L part */
5833   bi[0] = 0;
5834   for (i=0; i<n; i++){
5835     nz = adiag[i] - ai[i];
5836     bi[i+1] = bi[i] + nz;
5837     aj = a->j + ai[i];
5838     for (j=0; j<nz; j++){
5839       *bj = aj[j]; bj++;
5840     }
5841   }
5842 
5843   /* U part */
5844   bi_temp = bi[n];
5845   bdiag[n] = bi[n]-1;
5846   for (i=n-1; i>=0; i--){
5847     nz = ai[i+1] - adiag[i] - 1;
5848     bi_temp = bi_temp + nz + 1;
5849     aj = a->j + adiag[i] + 1;
5850     for (j=0; j<nz; j++){
5851       *bj = aj[j]; bj++;
5852     }
5853     /* diag[i] */
5854     *bj = i; bj++;
5855     bdiag[i] = bi_temp - 1;
5856   }
5857   PetscFunctionReturn(0);
5858 }
5859 
5860 #undef __FUNCT__
5861 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5862 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5863 {
5864   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5865   IS                 isicol;
5866   PetscErrorCode     ierr;
5867   const PetscInt     *r,*ic;
5868   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5869   PetscInt           *bi,*cols,nnz,*cols_lvl;
5870   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5871   PetscInt           i,levels,diagonal_fill;
5872   PetscTruth         col_identity,row_identity,both_identity;
5873   PetscReal          f;
5874   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5875   PetscBT            lnkbt;
5876   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5877   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5878   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5879   PetscTruth         missing;
5880   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5881 
5882   PetscFunctionBegin;
5883   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5884   if (bs>1){  /* check shifttype */
5885     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5886       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5887   }
5888 
5889   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5890   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5891 
5892   f             = info->fill;
5893   levels        = (PetscInt)info->levels;
5894   diagonal_fill = (PetscInt)info->diagonal_fill;
5895   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5896 
5897   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5898   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5899   both_identity = (PetscTruth) (row_identity && col_identity);
5900 
5901   if (!levels && both_identity) {
5902     /* special case: ilu(0) with natural ordering */
5903     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5904     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5905 
5906     fact->factortype               = MAT_FACTOR_ILU;
5907     (fact)->info.factor_mallocs    = 0;
5908     (fact)->info.fill_ratio_given  = info->fill;
5909     (fact)->info.fill_ratio_needed = 1.0;
5910     b                = (Mat_SeqBAIJ*)(fact)->data;
5911     b->row           = isrow;
5912     b->col           = iscol;
5913     b->icol          = isicol;
5914     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5915     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5916     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5917     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5918     PetscFunctionReturn(0);
5919   }
5920 
5921   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5922   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5923 
5924   /* get new row pointers */
5925   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5926   bi[0] = 0;
5927   /* bdiag is location of diagonal in factor */
5928   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5929   bdiag[0]  = 0;
5930 
5931   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5932 
5933   /* create a linked list for storing column indices of the active row */
5934   nlnk = n + 1;
5935   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5936 
5937   /* initial FreeSpace size is f*(ai[n]+1) */
5938   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5939   current_space = free_space;
5940   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5941   current_space_lvl = free_space_lvl;
5942 
5943   for (i=0; i<n; i++) {
5944     nzi = 0;
5945     /* copy current row into linked list */
5946     nnz  = ai[r[i]+1] - ai[r[i]];
5947     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5948     cols = aj + ai[r[i]];
5949     lnk[i] = -1; /* marker to indicate if diagonal exists */
5950     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5951     nzi += nlnk;
5952 
5953     /* make sure diagonal entry is included */
5954     if (diagonal_fill && lnk[i] == -1) {
5955       fm = n;
5956       while (lnk[fm] < i) fm = lnk[fm];
5957       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5958       lnk[fm]    = i;
5959       lnk_lvl[i] = 0;
5960       nzi++; dcount++;
5961     }
5962 
5963     /* add pivot rows into the active row */
5964     nzbd = 0;
5965     prow = lnk[n];
5966     while (prow < i) {
5967       nnz      = bdiag[prow];
5968       cols     = bj_ptr[prow] + nnz + 1;
5969       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5970       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5971       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5972       nzi += nlnk;
5973       prow = lnk[prow];
5974       nzbd++;
5975     }
5976     bdiag[i] = nzbd;
5977     bi[i+1]  = bi[i] + nzi;
5978 
5979     /* if free space is not available, make more free space */
5980     if (current_space->local_remaining<nzi) {
5981       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5982       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5983       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5984       reallocs++;
5985     }
5986 
5987     /* copy data into free_space and free_space_lvl, then initialize lnk */
5988     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5989     bj_ptr[i]    = current_space->array;
5990     bjlvl_ptr[i] = current_space_lvl->array;
5991 
5992     /* make sure the active row i has diagonal entry */
5993     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5994 
5995     current_space->array           += nzi;
5996     current_space->local_used      += nzi;
5997     current_space->local_remaining -= nzi;
5998     current_space_lvl->array           += nzi;
5999     current_space_lvl->local_used      += nzi;
6000     current_space_lvl->local_remaining -= nzi;
6001   }
6002 
6003   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6004   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6005 
6006   /* destroy list of free space and other temporary arrays */
6007   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
6008 
6009   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
6010   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
6011 
6012   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
6013   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
6014   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
6015 
6016 #if defined(PETSC_USE_INFO)
6017   {
6018     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6019     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
6020     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6021     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
6022     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6023     if (diagonal_fill) {
6024       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
6025     }
6026   }
6027 #endif
6028 
6029   /* put together the new matrix */
6030   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6031   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6032   b = (Mat_SeqBAIJ*)(fact)->data;
6033   b->free_a       = PETSC_TRUE;
6034   b->free_ij      = PETSC_TRUE;
6035   b->singlemalloc = PETSC_FALSE;
6036   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6037   b->j          = bj;
6038   b->i          = bi;
6039   b->diag       = bdiag;
6040   b->free_diag  = PETSC_TRUE;
6041   b->ilen       = 0;
6042   b->imax       = 0;
6043   b->row        = isrow;
6044   b->col        = iscol;
6045   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6046   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6047   b->icol       = isicol;
6048   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6049   /* In b structure:  Free imax, ilen, old a, old j.
6050      Allocate bdiag, solve_work, new a, new j */
6051   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
6052   b->maxnz = b->nz = bdiag[0]+1;
6053   fact->info.factor_mallocs    = reallocs;
6054   fact->info.fill_ratio_given  = f;
6055   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6056   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
6057   PetscFunctionReturn(0);
6058 }
6059 
6060 /*
6061      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6062    except that the data structure of Mat_SeqAIJ is slightly different.
6063    Not a good example of code reuse.
6064 */
6065 #undef __FUNCT__
6066 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
6067 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
6068 {
6069   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
6070   IS             isicol;
6071   PetscErrorCode ierr;
6072   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
6073   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6074   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6075   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6076   PetscTruth     col_identity,row_identity,both_identity,flg;
6077   PetscReal      f;
6078 
6079   PetscFunctionBegin;
6080   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6081   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
6082 
6083   f             = info->fill;
6084   levels        = (PetscInt)info->levels;
6085   diagonal_fill = (PetscInt)info->diagonal_fill;
6086   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
6087 
6088   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6089   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6090   both_identity = (PetscTruth) (row_identity && col_identity);
6091 
6092   if (!levels && both_identity) {  /* special case copy the nonzero structure */
6093     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
6094     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6095 
6096     fact->factortype = MAT_FACTOR_ILU;
6097     b            = (Mat_SeqBAIJ*)fact->data;
6098     b->row       = isrow;
6099     b->col       = iscol;
6100     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6101     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6102     b->icol      = isicol;
6103     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6104     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6105     PetscFunctionReturn(0);
6106   }
6107 
6108   /* general case perform the symbolic factorization */
6109     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
6110     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
6111 
6112     /* get new row pointers */
6113     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
6114     ainew[0] = 0;
6115     /* don't know how many column pointers are needed so estimate */
6116     jmax = (PetscInt)(f*ai[n] + 1);
6117     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
6118     /* ajfill is level of fill for each fill entry */
6119     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
6120     /* fill is a linked list of nonzeros in active row */
6121     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
6122     /* im is level for each filled value */
6123     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
6124     /* dloc is location of diagonal in factor */
6125     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
6126     dloc[0]  = 0;
6127     for (prow=0; prow<n; prow++) {
6128 
6129       /* copy prow into linked list */
6130       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6131       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
6132       xi         = aj + ai[r[prow]];
6133       fill[n]    = n;
6134       fill[prow] = -1; /* marker for diagonal entry */
6135       while (nz--) {
6136 	fm  = n;
6137 	idx = ic[*xi++];
6138 	do {
6139 	  m  = fm;
6140 	  fm = fill[m];
6141 	} while (fm < idx);
6142 	fill[m]   = idx;
6143 	fill[idx] = fm;
6144 	im[idx]   = 0;
6145       }
6146 
6147       /* make sure diagonal entry is included */
6148       if (diagonal_fill && fill[prow] == -1) {
6149 	fm = n;
6150 	while (fill[fm] < prow) fm = fill[fm];
6151 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
6152 	fill[fm]   = prow;
6153 	im[prow]   = 0;
6154 	nzf++;
6155 	dcount++;
6156       }
6157 
6158       nzi = 0;
6159       row = fill[n];
6160       while (row < prow) {
6161 	incrlev = im[row] + 1;
6162 	nz      = dloc[row];
6163 	xi      = ajnew  + ainew[row] + nz + 1;
6164 	flev    = ajfill + ainew[row] + nz + 1;
6165 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
6166 	fm      = row;
6167 	while (nnz-- > 0) {
6168 	  idx = *xi++;
6169 	  if (*flev + incrlev > levels) {
6170 	    flev++;
6171 	    continue;
6172 	  }
6173 	  do {
6174 	    m  = fm;
6175 	    fm = fill[m];
6176 	  } while (fm < idx);
6177 	  if (fm != idx) {
6178 	    im[idx]   = *flev + incrlev;
6179 	    fill[m]   = idx;
6180 	    fill[idx] = fm;
6181 	    fm        = idx;
6182 	    nzf++;
6183 	  } else {
6184 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
6185 	  }
6186 	  flev++;
6187 	}
6188 	row = fill[row];
6189 	nzi++;
6190       }
6191       /* copy new filled row into permanent storage */
6192       ainew[prow+1] = ainew[prow] + nzf;
6193       if (ainew[prow+1] > jmax) {
6194 
6195 	/* estimate how much additional space we will need */
6196 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6197 	/* just double the memory each time */
6198 	PetscInt maxadd = jmax;
6199 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
6200 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6201 	jmax += maxadd;
6202 
6203 	/* allocate a longer ajnew and ajfill */
6204 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6205 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6206 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
6207 	ajnew = xitmp;
6208 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6209 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6210 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
6211 	ajfill = xitmp;
6212 	reallocate++; /* count how many reallocations are needed */
6213       }
6214       xitmp       = ajnew + ainew[prow];
6215       flev        = ajfill + ainew[prow];
6216       dloc[prow]  = nzi;
6217       fm          = fill[n];
6218       while (nzf--) {
6219 	*xitmp++ = fm;
6220 	*flev++ = im[fm];
6221 	fm      = fill[fm];
6222       }
6223       /* make sure row has diagonal entry */
6224       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6225 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6226     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6227       }
6228     }
6229     ierr = PetscFree(ajfill);CHKERRQ(ierr);
6230     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6231     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6232     ierr = PetscFree(fill);CHKERRQ(ierr);
6233     ierr = PetscFree(im);CHKERRQ(ierr);
6234 
6235 #if defined(PETSC_USE_INFO)
6236     {
6237       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6238       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6239       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6240       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6241       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6242       if (diagonal_fill) {
6243 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6244       }
6245     }
6246 #endif
6247 
6248     /* put together the new matrix */
6249     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6250     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6251     b    = (Mat_SeqBAIJ*)fact->data;
6252     b->free_a       = PETSC_TRUE;
6253     b->free_ij      = PETSC_TRUE;
6254     b->singlemalloc = PETSC_FALSE;
6255     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6256     b->j          = ajnew;
6257     b->i          = ainew;
6258     for (i=0; i<n; i++) dloc[i] += ainew[i];
6259     b->diag       = dloc;
6260     b->free_diag  = PETSC_TRUE;
6261     b->ilen       = 0;
6262     b->imax       = 0;
6263     b->row        = isrow;
6264     b->col        = iscol;
6265     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6266     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6267     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6268     b->icol       = isicol;
6269     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6270     /* In b structure:  Free imax, ilen, old a, old j.
6271        Allocate dloc, solve_work, new a, new j */
6272     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
6273     b->maxnz          = b->nz = ainew[n];
6274 
6275     fact->info.factor_mallocs    = reallocate;
6276     fact->info.fill_ratio_given  = f;
6277     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6278 
6279   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6280   PetscFunctionReturn(0);
6281 }
6282 
6283 #undef __FUNCT__
6284 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6285 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6286 {
6287   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6288   /* int i,*AJ=a->j,nz=a->nz; */
6289   PetscFunctionBegin;
6290   /* Undo Column scaling */
6291 /*    while (nz--) { */
6292 /*      AJ[i] = AJ[i]/4; */
6293 /*    } */
6294   /* This should really invoke a push/pop logic, but we don't have that yet. */
6295   A->ops->setunfactored = PETSC_NULL;
6296   PetscFunctionReturn(0);
6297 }
6298 
6299 #undef __FUNCT__
6300 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6301 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6302 {
6303   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6304   PetscInt       *AJ=a->j,nz=a->nz;
6305   unsigned short *aj=(unsigned short *)AJ;
6306   PetscFunctionBegin;
6307   /* Is this really necessary? */
6308   while (nz--) {
6309     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6310   }
6311   A->ops->setunfactored = PETSC_NULL;
6312   PetscFunctionReturn(0);
6313 }
6314 
6315 
6316