xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 7d0a6c19129e7069c8a40e210b34ed62989173db)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 #include "petscbt.h"
9 #include "../src/mat/utils/freespace.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
16   PetscErrorCode    ierr;
17   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
18   PetscInt          i,n = a->mbs,j;
19   PetscInt          nz;
20   PetscScalar       *x,*tmp,s1;
21   const MatScalar   *aa = a->a,*v;
22   const PetscScalar *b;
23 
24   PetscFunctionBegin;
25   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
26   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27   tmp  = a->solve_work;
28 
29 
30   /* copy the b into temp work space according to permutation */
31   for (i=0; i<n; i++) tmp[i] = b[i];
32 
33   /* forward solve the U^T */
34   for (i=0; i<n; i++) {
35     v   = aa + adiag[i+1] + 1;
36     vi  = aj + adiag[i+1] + 1;
37     nz  = adiag[i] - adiag[i+1] - 1;
38     s1  = tmp[i];
39     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
40     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
41     tmp[i] = s1;
42   }
43 
44   /* backward solve the L^T */
45   for (i=n-1; i>=0; i--){
46     v   = aa + ai[i];
47     vi  = aj + ai[i];
48     nz  = ai[i+1] - ai[i];
49     s1  = tmp[i];
50     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
51   }
52 
53   /* copy tmp into x according to permutation */
54   for (i=0; i<n; i++) x[i] = tmp[i];
55 
56   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
57   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
58 
59   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
60   PetscFunctionReturn(0);
61 }
62 
63 #undef __FUNCT__
64 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
65 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66 {
67   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
68   PetscErrorCode    ierr;
69   PetscInt          i,nz;
70   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
71   const MatScalar   *aa=a->a,*v;
72   PetscScalar       s1,*x;
73   const PetscScalar *b;
74 
75   PetscFunctionBegin;
76   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
77   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
78   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
79 
80   /* forward solve the U^T */
81   for (i=0; i<n; i++) {
82 
83     v     = aa + diag[i];
84     /* multiply by the inverse of the block diagonal */
85     s1    = (*v++)*x[i];
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       x[*vi++]  -= (*v++)*s1;
90     }
91     x[i]   = s1;
92   }
93   /* backward solve the L^T */
94   for (i=n-1; i>=0; i--){
95     v    = aa + diag[i] - 1;
96     vi   = aj + diag[i] - 1;
97     nz   = diag[i] - ai[i];
98     s1   = x[i];
99     while (nz--) {
100       x[*vi--]   -=  (*v--)*s1;
101     }
102   }
103   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
104   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
105   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
106   PetscFunctionReturn(0);
107 }
108 
109 #undef __FUNCT__
110 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
111 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
112 {
113   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
114   PetscErrorCode    ierr;
115   PetscInt          i,nz,idx,idt,oidx;
116   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
117   const MatScalar   *aa=a->a,*v;
118   PetscScalar       s1,s2,x1,x2,*x;
119   const PetscScalar *b;
120 
121   PetscFunctionBegin;
122   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
123   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
124   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
125 
126   /* forward solve the U^T */
127   idx = 0;
128   for (i=0; i<n; i++) {
129 
130     v     = aa + 4*diag[i];
131     /* multiply by the inverse of the block diagonal */
132     x1 = x[idx];   x2 = x[1+idx];
133     s1 = v[0]*x1  +  v[1]*x2;
134     s2 = v[2]*x1  +  v[3]*x2;
135     v += 4;
136 
137     vi    = aj + diag[i] + 1;
138     nz    = ai[i+1] - diag[i] - 1;
139     while (nz--) {
140       oidx = 2*(*vi++);
141       x[oidx]   -= v[0]*s1  +  v[1]*s2;
142       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
143       v  += 4;
144     }
145     x[idx]   = s1;x[1+idx] = s2;
146     idx += 2;
147   }
148   /* backward solve the L^T */
149   for (i=n-1; i>=0; i--){
150     v    = aa + 4*diag[i] - 4;
151     vi   = aj + diag[i] - 1;
152     nz   = diag[i] - ai[i];
153     idt  = 2*i;
154     s1   = x[idt];  s2 = x[1+idt];
155     while (nz--) {
156       idx   = 2*(*vi--);
157       x[idx]   -=  v[0]*s1 +  v[1]*s2;
158       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
159       v -= 4;
160     }
161   }
162   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
163   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
164   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
165   PetscFunctionReturn(0);
166 }
167 
168 #undef __FUNCT__
169 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
170 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
171 {
172   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
173   PetscErrorCode    ierr;
174   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
175   PetscInt          nz,idx,idt,j,i,oidx;
176   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
177   const MatScalar   *aa=a->a,*v;
178   PetscScalar       s1,s2,x1,x2,*x;
179   const PetscScalar *b;
180 
181   PetscFunctionBegin;
182   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
183   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
184   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
185 
186   /* forward solve the U^T */
187   idx = 0;
188   for (i=0; i<n; i++) {
189     v     = aa + bs2*diag[i];
190     /* multiply by the inverse of the block diagonal */
191     x1 = x[idx];   x2 = x[1+idx];
192     s1 = v[0]*x1  +  v[1]*x2;
193     s2 = v[2]*x1  +  v[3]*x2;
194     v -= bs2;
195 
196     vi    = aj + diag[i] - 1;
197     nz    = diag[i] - diag[i+1] - 1;
198     for(j=0;j>-nz;j--){
199       oidx = bs*vi[j];
200       x[oidx]   -= v[0]*s1  +  v[1]*s2;
201       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
202       v  -= bs2;
203     }
204     x[idx]   = s1;x[1+idx] = s2;
205     idx += bs;
206   }
207   /* backward solve the L^T */
208   for (i=n-1; i>=0; i--){
209     v    = aa + bs2*ai[i];
210     vi   = aj + ai[i];
211     nz   = ai[i+1] - ai[i];
212     idt  = bs*i;
213     s1   = x[idt];  s2 = x[1+idt];
214     for(j=0;j<nz;j++){
215       idx   = bs*vi[j];
216       x[idx]   -=  v[0]*s1 +  v[1]*s2;
217       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
218       v += bs2;
219     }
220   }
221   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
222   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
223   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
224   PetscFunctionReturn(0);
225 }
226 
227 #undef __FUNCT__
228 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
229 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
230 {
231   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
232   PetscErrorCode    ierr;
233   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
234   PetscInt          i,nz,idx,idt,oidx;
235   const MatScalar   *aa=a->a,*v;
236   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
237   const PetscScalar *b;
238 
239   PetscFunctionBegin;
240   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
241   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
242   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
243 
244   /* forward solve the U^T */
245   idx = 0;
246   for (i=0; i<n; i++) {
247 
248     v     = aa + 9*diag[i];
249     /* multiply by the inverse of the block diagonal */
250     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
251     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
252     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
253     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
254     v += 9;
255 
256     vi    = aj + diag[i] + 1;
257     nz    = ai[i+1] - diag[i] - 1;
258     while (nz--) {
259       oidx = 3*(*vi++);
260       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
261       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
262       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
263       v  += 9;
264     }
265     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
266     idx += 3;
267   }
268   /* backward solve the L^T */
269   for (i=n-1; i>=0; i--){
270     v    = aa + 9*diag[i] - 9;
271     vi   = aj + diag[i] - 1;
272     nz   = diag[i] - ai[i];
273     idt  = 3*i;
274     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
275     while (nz--) {
276       idx   = 3*(*vi--);
277       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
278       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
279       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
280       v -= 9;
281     }
282   }
283   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
284   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
285   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
286   PetscFunctionReturn(0);
287 }
288 
289 #undef __FUNCT__
290 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
291 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
292 {
293   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
294   PetscErrorCode    ierr;
295   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
296   PetscInt          nz,idx,idt,j,i,oidx;
297   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
298   const MatScalar   *aa=a->a,*v;
299   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
300   const PetscScalar *b;
301 
302   PetscFunctionBegin;
303   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
304   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
305   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
306 
307   /* forward solve the U^T */
308   idx = 0;
309   for (i=0; i<n; i++) {
310     v     = aa + bs2*diag[i];
311     /* multiply by the inverse of the block diagonal */
312     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
313     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
314     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
315     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
316     v -= bs2;
317 
318     vi    = aj + diag[i] - 1;
319     nz    = diag[i] - diag[i+1] - 1;
320     for(j=0;j>-nz;j--){
321       oidx = bs*vi[j];
322       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
323       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
324       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
325       v  -= bs2;
326     }
327     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
328     idx += bs;
329   }
330   /* backward solve the L^T */
331   for (i=n-1; i>=0; i--){
332     v    = aa + bs2*ai[i];
333     vi   = aj + ai[i];
334     nz   = ai[i+1] - ai[i];
335     idt  = bs*i;
336     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
337     for(j=0;j<nz;j++){
338       idx   = bs*vi[j];
339       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
340       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
341       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
342       v += bs2;
343     }
344   }
345   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
346   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
347   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 #undef __FUNCT__
352 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
353 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
354 {
355   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
356   PetscErrorCode    ierr;
357   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
358   PetscInt          i,nz,idx,idt,oidx;
359   const MatScalar   *aa=a->a,*v;
360   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
361   const PetscScalar *b;
362 
363   PetscFunctionBegin;
364   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
365   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
366   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
367 
368   /* forward solve the U^T */
369   idx = 0;
370   for (i=0; i<n; i++) {
371 
372     v     = aa + 16*diag[i];
373     /* multiply by the inverse of the block diagonal */
374     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
375     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
376     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
377     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
378     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
379     v += 16;
380 
381     vi    = aj + diag[i] + 1;
382     nz    = ai[i+1] - diag[i] - 1;
383     while (nz--) {
384       oidx = 4*(*vi++);
385       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
386       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
387       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
388       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
389       v  += 16;
390     }
391     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
392     idx += 4;
393   }
394   /* backward solve the L^T */
395   for (i=n-1; i>=0; i--){
396     v    = aa + 16*diag[i] - 16;
397     vi   = aj + diag[i] - 1;
398     nz   = diag[i] - ai[i];
399     idt  = 4*i;
400     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
401     while (nz--) {
402       idx   = 4*(*vi--);
403       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
404       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
405       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
406       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
407       v -= 16;
408     }
409   }
410   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
411   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
412   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
413   PetscFunctionReturn(0);
414 }
415 
416 #undef __FUNCT__
417 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
418 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
419 {
420   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
421   PetscErrorCode    ierr;
422   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
423   PetscInt          nz,idx,idt,j,i,oidx;
424   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
425   const MatScalar   *aa=a->a,*v;
426   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
427   const PetscScalar *b;
428 
429   PetscFunctionBegin;
430   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
431   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
432   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
433 
434   /* forward solve the U^T */
435   idx = 0;
436   for (i=0; i<n; i++) {
437     v     = aa + bs2*diag[i];
438     /* multiply by the inverse of the block diagonal */
439     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
440     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
441     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
442     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
443     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
444     v -= bs2;
445 
446     vi    = aj + diag[i] - 1;
447     nz    = diag[i] - diag[i+1] - 1;
448     for(j=0;j>-nz;j--){
449       oidx = bs*vi[j];
450       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
451       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
452       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
453       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
454       v  -= bs2;
455     }
456     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
457     idx += bs;
458   }
459   /* backward solve the L^T */
460   for (i=n-1; i>=0; i--){
461     v    = aa + bs2*ai[i];
462     vi   = aj + ai[i];
463     nz   = ai[i+1] - ai[i];
464     idt  = bs*i;
465     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
466     for(j=0;j<nz;j++){
467       idx   = bs*vi[j];
468       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
469       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
470       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
471       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
472       v += bs2;
473     }
474   }
475   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
476   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
477   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
478   PetscFunctionReturn(0);
479 }
480 
481 #undef __FUNCT__
482 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
483 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
484 {
485   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
486   PetscErrorCode    ierr;
487   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
488   PetscInt          i,nz,idx,idt,oidx;
489   const MatScalar   *aa=a->a,*v;
490   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
491   const PetscScalar *b;
492 
493   PetscFunctionBegin;
494   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
495   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
496   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
497 
498   /* forward solve the U^T */
499   idx = 0;
500   for (i=0; i<n; i++) {
501 
502     v     = aa + 25*diag[i];
503     /* multiply by the inverse of the block diagonal */
504     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
505     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
506     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
507     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
508     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
509     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
510     v += 25;
511 
512     vi    = aj + diag[i] + 1;
513     nz    = ai[i+1] - diag[i] - 1;
514     while (nz--) {
515       oidx = 5*(*vi++);
516       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
517       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
518       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
519       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
520       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
521       v  += 25;
522     }
523     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
524     idx += 5;
525   }
526   /* backward solve the L^T */
527   for (i=n-1; i>=0; i--){
528     v    = aa + 25*diag[i] - 25;
529     vi   = aj + diag[i] - 1;
530     nz   = diag[i] - ai[i];
531     idt  = 5*i;
532     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
533     while (nz--) {
534       idx   = 5*(*vi--);
535       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
536       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
537       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
538       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
539       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
540       v -= 25;
541     }
542   }
543   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
544   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
545   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
546   PetscFunctionReturn(0);
547 }
548 
549 #undef __FUNCT__
550 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
551 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
552 {
553   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
554   PetscErrorCode ierr;
555   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
556   PetscInt       nz,idx,idt,j,i,oidx;
557   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
558   const MatScalar      *aa=a->a,*v;
559   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
560   const PetscScalar    *b;
561 
562   PetscFunctionBegin;
563   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
564   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
565   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
566 
567   /* forward solve the U^T */
568   idx = 0;
569   for (i=0; i<n; i++) {
570     v     = aa + bs2*diag[i];
571     /* multiply by the inverse of the block diagonal */
572     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
573     x5 = x[4+idx];
574     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
575     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
576     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
577     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
578     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
579     v -= bs2;
580 
581     vi    = aj + diag[i] - 1;
582     nz    = diag[i] - diag[i+1] - 1;
583     for(j=0;j>-nz;j--){
584       oidx = bs*vi[j];
585       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
586       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
587       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
588       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
589       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
590       v  -= bs2;
591     }
592     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
593     idx += bs;
594   }
595   /* backward solve the L^T */
596   for (i=n-1; i>=0; i--){
597     v    = aa + bs2*ai[i];
598     vi   = aj + ai[i];
599     nz   = ai[i+1] - ai[i];
600     idt  = bs*i;
601     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
602     for(j=0;j<nz;j++){
603       idx   = bs*vi[j];
604       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
605       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
606       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
607       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
608       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
609       v += bs2;
610     }
611   }
612   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
613   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
614   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
615   PetscFunctionReturn(0);
616 }
617 
618 #undef __FUNCT__
619 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
620 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
621 {
622   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
623   PetscErrorCode    ierr;
624   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
625   PetscInt          i,nz,idx,idt,oidx;
626   const MatScalar   *aa=a->a,*v;
627   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
628   const PetscScalar *b;
629 
630   PetscFunctionBegin;
631   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
632   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
633   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
634 
635   /* forward solve the U^T */
636   idx = 0;
637   for (i=0; i<n; i++) {
638 
639     v     = aa + 36*diag[i];
640     /* multiply by the inverse of the block diagonal */
641     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
642     x6    = x[5+idx];
643     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
644     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
645     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
646     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
647     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
648     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
649     v += 36;
650 
651     vi    = aj + diag[i] + 1;
652     nz    = ai[i+1] - diag[i] - 1;
653     while (nz--) {
654       oidx = 6*(*vi++);
655       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
656       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
657       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
658       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
659       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
660       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
661       v  += 36;
662     }
663     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
664     x[5+idx] = s6;
665     idx += 6;
666   }
667   /* backward solve the L^T */
668   for (i=n-1; i>=0; i--){
669     v    = aa + 36*diag[i] - 36;
670     vi   = aj + diag[i] - 1;
671     nz   = diag[i] - ai[i];
672     idt  = 6*i;
673     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
674     s6 = x[5+idt];
675     while (nz--) {
676       idx   = 6*(*vi--);
677       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
678       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
679       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
680       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
681       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
682       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
683       v -= 36;
684     }
685   }
686   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
687   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
688   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
689   PetscFunctionReturn(0);
690 }
691 
692 #undef __FUNCT__
693 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
694 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
695 {
696   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
697   PetscErrorCode    ierr;
698   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
699   PetscInt          nz,idx,idt,j,i,oidx;
700   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
701   const MatScalar   *aa=a->a,*v;
702   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
703   const PetscScalar *b;
704 
705   PetscFunctionBegin;
706   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
707   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
708   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
709 
710   /* forward solve the U^T */
711   idx = 0;
712   for (i=0; i<n; i++) {
713     v     = aa + bs2*diag[i];
714     /* multiply by the inverse of the block diagonal */
715     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
716     x5 = x[4+idx]; x6 = x[5+idx];
717     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
718     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
719     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
720     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
721     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
722     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
723     v -= bs2;
724 
725     vi    = aj + diag[i] - 1;
726     nz    = diag[i] - diag[i+1] - 1;
727     for(j=0;j>-nz;j--){
728       oidx = bs*vi[j];
729       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
730       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
731       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
732       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
733       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
734       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
735       v  -= bs2;
736     }
737     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
738     x[5+idx] = s6;
739     idx += bs;
740   }
741   /* backward solve the L^T */
742   for (i=n-1; i>=0; i--){
743     v    = aa + bs2*ai[i];
744     vi   = aj + ai[i];
745     nz   = ai[i+1] - ai[i];
746     idt  = bs*i;
747     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
748     s6   = x[5+idt];
749     for(j=0;j<nz;j++){
750       idx   = bs*vi[j];
751       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
752       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
753       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
754       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
755       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
756       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
757       v += bs2;
758     }
759   }
760   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
761   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
762   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
763   PetscFunctionReturn(0);
764 }
765 
766 #undef __FUNCT__
767 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
768 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
769 {
770   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
771   PetscErrorCode    ierr;
772   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
773   PetscInt          i,nz,idx,idt,oidx;
774   const MatScalar   *aa=a->a,*v;
775   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
776   const PetscScalar *b;
777 
778   PetscFunctionBegin;
779   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
780   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
781   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
782 
783   /* forward solve the U^T */
784   idx = 0;
785   for (i=0; i<n; i++) {
786 
787     v     = aa + 49*diag[i];
788     /* multiply by the inverse of the block diagonal */
789     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
790     x6    = x[5+idx]; x7 = x[6+idx];
791     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
792     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
793     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
794     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
795     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
796     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
797     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
798     v += 49;
799 
800     vi    = aj + diag[i] + 1;
801     nz    = ai[i+1] - diag[i] - 1;
802     while (nz--) {
803       oidx = 7*(*vi++);
804       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
805       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
806       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
807       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
808       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
809       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
810       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
811       v  += 49;
812     }
813     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
814     x[5+idx] = s6;x[6+idx] = s7;
815     idx += 7;
816   }
817   /* backward solve the L^T */
818   for (i=n-1; i>=0; i--){
819     v    = aa + 49*diag[i] - 49;
820     vi   = aj + diag[i] - 1;
821     nz   = diag[i] - ai[i];
822     idt  = 7*i;
823     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
824     s6 = x[5+idt];s7 = x[6+idt];
825     while (nz--) {
826       idx   = 7*(*vi--);
827       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
828       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
829       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
830       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
831       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
832       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
833       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
834       v -= 49;
835     }
836   }
837   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
838   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
839   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
840   PetscFunctionReturn(0);
841 }
842 #undef __FUNCT__
843 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
844 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
845 {
846   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
847   PetscErrorCode    ierr;
848   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
849   PetscInt          nz,idx,idt,j,i,oidx;
850   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
851   const MatScalar   *aa=a->a,*v;
852   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
853   const PetscScalar *b;
854 
855   PetscFunctionBegin;
856   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
857   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
858   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
859 
860   /* forward solve the U^T */
861   idx = 0;
862   for (i=0; i<n; i++) {
863     v     = aa + bs2*diag[i];
864     /* multiply by the inverse of the block diagonal */
865     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
866     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
867     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
868     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
869     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
870     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
871     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
872     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
873     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
874     v -= bs2;
875     vi    = aj + diag[i] - 1;
876     nz    = diag[i] - diag[i+1] - 1;
877     for(j=0;j>-nz;j--){
878       oidx = bs*vi[j];
879       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
880       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
881       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
882       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
883       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
884       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
885       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
886       v  -= bs2;
887     }
888     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
889     x[5+idx] = s6;  x[6+idx] = s7;
890     idx += bs;
891   }
892   /* backward solve the L^T */
893   for (i=n-1; i>=0; i--){
894     v    = aa + bs2*ai[i];
895     vi   = aj + ai[i];
896     nz   = ai[i+1] - ai[i];
897     idt  = bs*i;
898     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
899     s6   = x[5+idt];  s7 = x[6+idt];
900     for(j=0;j<nz;j++){
901       idx   = bs*vi[j];
902       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
903       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
904       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
905       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
906       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
907       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
908       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
909       v += bs2;
910     }
911   }
912   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
913   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
914   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
915   PetscFunctionReturn(0);
916 }
917 
918 /*---------------------------------------------------------------------------------------------*/
919 #undef __FUNCT__
920 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
921 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
922 {
923   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
924   IS                iscol = a->col,isrow = a->row;
925   PetscErrorCode    ierr;
926   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
927   PetscInt          i,n = a->mbs,j;
928   PetscInt          nz;
929   PetscScalar       *x,*tmp,s1;
930   const MatScalar   *aa = a->a,*v;
931   const PetscScalar *b;
932 
933   PetscFunctionBegin;
934   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
935   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
936   tmp  = a->solve_work;
937 
938   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
939   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
940 
941   /* copy the b into temp work space according to permutation */
942   for (i=0; i<n; i++) tmp[i] = b[c[i]];
943 
944   /* forward solve the U^T */
945   for (i=0; i<n; i++) {
946     v   = aa + adiag[i+1] + 1;
947     vi  = aj + adiag[i+1] + 1;
948     nz  = adiag[i] - adiag[i+1] - 1;
949     s1  = tmp[i];
950     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
951     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
952     tmp[i] = s1;
953   }
954 
955   /* backward solve the L^T */
956   for (i=n-1; i>=0; i--){
957     v   = aa + ai[i];
958     vi  = aj + ai[i];
959     nz  = ai[i+1] - ai[i];
960     s1  = tmp[i];
961     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
962   }
963 
964   /* copy tmp into x according to permutation */
965   for (i=0; i<n; i++) x[r[i]] = tmp[i];
966 
967   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
968   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
969   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
970   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
971 
972   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
973   PetscFunctionReturn(0);
974 }
975 
976 #undef __FUNCT__
977 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
978 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
979 {
980   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
981   IS                iscol=a->col,isrow=a->row;
982   PetscErrorCode    ierr;
983   const PetscInt    *r,*c,*rout,*cout;
984   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
985   PetscInt          i,nz;
986   const MatScalar   *aa=a->a,*v;
987   PetscScalar       s1,*x,*t;
988   const PetscScalar *b;
989 
990   PetscFunctionBegin;
991   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
992   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
993   t  = a->solve_work;
994 
995   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
996   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
997 
998   /* copy the b into temp work space according to permutation */
999   for (i=0; i<n; i++) {
1000     t[i] = b[c[i]];
1001   }
1002 
1003   /* forward solve the U^T */
1004   for (i=0; i<n; i++) {
1005 
1006     v     = aa + diag[i];
1007     /* multiply by the inverse of the block diagonal */
1008     s1    = (*v++)*t[i];
1009     vi    = aj + diag[i] + 1;
1010     nz    = ai[i+1] - diag[i] - 1;
1011     while (nz--) {
1012       t[*vi++]  -= (*v++)*s1;
1013     }
1014     t[i]   = s1;
1015   }
1016   /* backward solve the L^T */
1017   for (i=n-1; i>=0; i--){
1018     v    = aa + diag[i] - 1;
1019     vi   = aj + diag[i] - 1;
1020     nz   = diag[i] - ai[i];
1021     s1   = t[i];
1022     while (nz--) {
1023       t[*vi--]   -=  (*v--)*s1;
1024     }
1025   }
1026 
1027   /* copy t into x according to permutation */
1028   for (i=0; i<n; i++) {
1029     x[r[i]]   = t[i];
1030   }
1031 
1032   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1033   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1034   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1035   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1036   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
1037   PetscFunctionReturn(0);
1038 }
1039 
1040 #undef __FUNCT__
1041 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
1042 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1043 {
1044   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1045   IS                iscol=a->col,isrow=a->row;
1046   PetscErrorCode    ierr;
1047   const PetscInt    *r,*c,*rout,*cout;
1048   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1049   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1050   const MatScalar   *aa=a->a,*v;
1051   PetscScalar       s1,s2,x1,x2,*x,*t;
1052   const PetscScalar *b;
1053 
1054   PetscFunctionBegin;
1055   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1056   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1057   t  = a->solve_work;
1058 
1059   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1060   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1061 
1062   /* copy the b into temp work space according to permutation */
1063   ii = 0;
1064   for (i=0; i<n; i++) {
1065     ic      = 2*c[i];
1066     t[ii]   = b[ic];
1067     t[ii+1] = b[ic+1];
1068     ii += 2;
1069   }
1070 
1071   /* forward solve the U^T */
1072   idx = 0;
1073   for (i=0; i<n; i++) {
1074 
1075     v     = aa + 4*diag[i];
1076     /* multiply by the inverse of the block diagonal */
1077     x1    = t[idx];   x2 = t[1+idx];
1078     s1 = v[0]*x1  +  v[1]*x2;
1079     s2 = v[2]*x1  +  v[3]*x2;
1080     v += 4;
1081 
1082     vi    = aj + diag[i] + 1;
1083     nz    = ai[i+1] - diag[i] - 1;
1084     while (nz--) {
1085       oidx = 2*(*vi++);
1086       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1087       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1088       v  += 4;
1089     }
1090     t[idx]   = s1;t[1+idx] = s2;
1091     idx += 2;
1092   }
1093   /* backward solve the L^T */
1094   for (i=n-1; i>=0; i--){
1095     v    = aa + 4*diag[i] - 4;
1096     vi   = aj + diag[i] - 1;
1097     nz   = diag[i] - ai[i];
1098     idt  = 2*i;
1099     s1 = t[idt];  s2 = t[1+idt];
1100     while (nz--) {
1101       idx   = 2*(*vi--);
1102       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1103       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1104       v -= 4;
1105     }
1106   }
1107 
1108   /* copy t into x according to permutation */
1109   ii = 0;
1110   for (i=0; i<n; i++) {
1111     ir      = 2*r[i];
1112     x[ir]   = t[ii];
1113     x[ir+1] = t[ii+1];
1114     ii += 2;
1115   }
1116 
1117   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1118   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1119   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1120   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1121   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1122   PetscFunctionReturn(0);
1123 }
1124 
1125 #undef __FUNCT__
1126 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1127 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1128 {
1129   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1130   PetscErrorCode    ierr;
1131   IS                iscol=a->col,isrow=a->row;
1132   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1133   const PetscInt    *r,*c,*rout,*cout;
1134   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1135   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1136   const MatScalar   *aa=a->a,*v;
1137   PetscScalar       s1,s2,x1,x2,*x,*t;
1138   const PetscScalar *b;
1139 
1140   PetscFunctionBegin;
1141   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1142   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1143   t = a->solve_work;
1144 
1145   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1146   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1147 
1148   /* copy b into temp work space according to permutation */
1149   for(i=0;i<n;i++){
1150     ii = bs*i; ic = bs*c[i];
1151     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1152   }
1153 
1154   /* forward solve the U^T */
1155   idx = 0;
1156   for (i=0; i<n; i++) {
1157     v     = aa + bs2*diag[i];
1158     /* multiply by the inverse of the block diagonal */
1159     x1 = t[idx];   x2 = t[1+idx];
1160     s1 = v[0]*x1  +  v[1]*x2;
1161     s2 = v[2]*x1  +  v[3]*x2;
1162     v -= bs2;
1163 
1164     vi    = aj + diag[i] - 1;
1165     nz    = diag[i] - diag[i+1] - 1;
1166     for(j=0;j>-nz;j--){
1167       oidx = bs*vi[j];
1168       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1169       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1170       v  -= bs2;
1171     }
1172     t[idx]   = s1;t[1+idx] = s2;
1173     idx += bs;
1174   }
1175   /* backward solve the L^T */
1176   for (i=n-1; i>=0; i--){
1177     v    = aa + bs2*ai[i];
1178     vi   = aj + ai[i];
1179     nz   = ai[i+1] - ai[i];
1180     idt  = bs*i;
1181     s1   = t[idt];  s2 = t[1+idt];
1182     for(j=0;j<nz;j++){
1183       idx   = bs*vi[j];
1184       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1185       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1186       v += bs2;
1187     }
1188   }
1189 
1190   /* copy t into x according to permutation */
1191   for(i=0;i<n;i++){
1192     ii = bs*i;  ir = bs*r[i];
1193     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1194   }
1195 
1196   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1197   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1198   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1199   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1200   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1201   PetscFunctionReturn(0);
1202 }
1203 
1204 #undef __FUNCT__
1205 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1206 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1207 {
1208   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1209   IS                iscol=a->col,isrow=a->row;
1210   PetscErrorCode    ierr;
1211   const PetscInt    *r,*c,*rout,*cout;
1212   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1213   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1214   const MatScalar   *aa=a->a,*v;
1215   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1216   const PetscScalar *b;
1217 
1218   PetscFunctionBegin;
1219   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1220   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1221   t  = a->solve_work;
1222 
1223   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1224   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1225 
1226   /* copy the b into temp work space according to permutation */
1227   ii = 0;
1228   for (i=0; i<n; i++) {
1229     ic      = 3*c[i];
1230     t[ii]   = b[ic];
1231     t[ii+1] = b[ic+1];
1232     t[ii+2] = b[ic+2];
1233     ii += 3;
1234   }
1235 
1236   /* forward solve the U^T */
1237   idx = 0;
1238   for (i=0; i<n; i++) {
1239 
1240     v     = aa + 9*diag[i];
1241     /* multiply by the inverse of the block diagonal */
1242     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1243     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1244     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1245     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1246     v += 9;
1247 
1248     vi    = aj + diag[i] + 1;
1249     nz    = ai[i+1] - diag[i] - 1;
1250     while (nz--) {
1251       oidx = 3*(*vi++);
1252       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1253       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1254       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1255       v  += 9;
1256     }
1257     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1258     idx += 3;
1259   }
1260   /* backward solve the L^T */
1261   for (i=n-1; i>=0; i--){
1262     v    = aa + 9*diag[i] - 9;
1263     vi   = aj + diag[i] - 1;
1264     nz   = diag[i] - ai[i];
1265     idt  = 3*i;
1266     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1267     while (nz--) {
1268       idx   = 3*(*vi--);
1269       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1270       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1271       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1272       v -= 9;
1273     }
1274   }
1275 
1276   /* copy t into x according to permutation */
1277   ii = 0;
1278   for (i=0; i<n; i++) {
1279     ir      = 3*r[i];
1280     x[ir]   = t[ii];
1281     x[ir+1] = t[ii+1];
1282     x[ir+2] = t[ii+2];
1283     ii += 3;
1284   }
1285 
1286   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1287   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1288   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1289   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1290   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1291   PetscFunctionReturn(0);
1292 }
1293 
1294 #undef __FUNCT__
1295 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1296 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1297 {
1298   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1299   PetscErrorCode    ierr;
1300   IS                iscol=a->col,isrow=a->row;
1301   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1302   const PetscInt    *r,*c,*rout,*cout;
1303   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1304   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1305   const MatScalar   *aa=a->a,*v;
1306   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1307   const PetscScalar *b;
1308 
1309   PetscFunctionBegin;
1310   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1311   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1312   t = a->solve_work;
1313 
1314   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1315   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1316 
1317   /* copy b into temp work space according to permutation */
1318   for(i=0;i<n;i++){
1319     ii = bs*i; ic = bs*c[i];
1320     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1321   }
1322 
1323   /* forward solve the U^T */
1324   idx = 0;
1325   for (i=0; i<n; i++) {
1326     v     = aa + bs2*diag[i];
1327     /* multiply by the inverse of the block diagonal */
1328     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1329     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1330     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1331     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1332     v -= bs2;
1333 
1334     vi    = aj + diag[i] - 1;
1335     nz    = diag[i] - diag[i+1] - 1;
1336     for(j=0;j>-nz;j--){
1337       oidx = bs*vi[j];
1338       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1339       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1340       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1341       v  -= bs2;
1342     }
1343     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1344     idx += bs;
1345   }
1346   /* backward solve the L^T */
1347   for (i=n-1; i>=0; i--){
1348     v    = aa + bs2*ai[i];
1349     vi   = aj + ai[i];
1350     nz   = ai[i+1] - ai[i];
1351     idt  = bs*i;
1352     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1353     for(j=0;j<nz;j++){
1354       idx   = bs*vi[j];
1355       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1356       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1357       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1358       v += bs2;
1359     }
1360   }
1361 
1362   /* copy t into x according to permutation */
1363   for(i=0;i<n;i++){
1364     ii = bs*i;  ir = bs*r[i];
1365     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1366   }
1367 
1368   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1369   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1370   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1371   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1372   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1373   PetscFunctionReturn(0);
1374 }
1375 
1376 #undef __FUNCT__
1377 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1378 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1379 {
1380   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1381   IS                iscol=a->col,isrow=a->row;
1382   PetscErrorCode    ierr;
1383   const PetscInt    *r,*c,*rout,*cout;
1384   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1385   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1386   const MatScalar   *aa=a->a,*v;
1387   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1388   const PetscScalar *b;
1389 
1390   PetscFunctionBegin;
1391   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1392   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1393   t  = a->solve_work;
1394 
1395   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1396   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1397 
1398   /* copy the b into temp work space according to permutation */
1399   ii = 0;
1400   for (i=0; i<n; i++) {
1401     ic      = 4*c[i];
1402     t[ii]   = b[ic];
1403     t[ii+1] = b[ic+1];
1404     t[ii+2] = b[ic+2];
1405     t[ii+3] = b[ic+3];
1406     ii += 4;
1407   }
1408 
1409   /* forward solve the U^T */
1410   idx = 0;
1411   for (i=0; i<n; i++) {
1412 
1413     v     = aa + 16*diag[i];
1414     /* multiply by the inverse of the block diagonal */
1415     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1416     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1417     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1418     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1419     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1420     v += 16;
1421 
1422     vi    = aj + diag[i] + 1;
1423     nz    = ai[i+1] - diag[i] - 1;
1424     while (nz--) {
1425       oidx = 4*(*vi++);
1426       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1427       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1428       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1429       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1430       v  += 16;
1431     }
1432     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1433     idx += 4;
1434   }
1435   /* backward solve the L^T */
1436   for (i=n-1; i>=0; i--){
1437     v    = aa + 16*diag[i] - 16;
1438     vi   = aj + diag[i] - 1;
1439     nz   = diag[i] - ai[i];
1440     idt  = 4*i;
1441     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1442     while (nz--) {
1443       idx   = 4*(*vi--);
1444       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1445       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1446       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1447       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1448       v -= 16;
1449     }
1450   }
1451 
1452   /* copy t into x according to permutation */
1453   ii = 0;
1454   for (i=0; i<n; i++) {
1455     ir      = 4*r[i];
1456     x[ir]   = t[ii];
1457     x[ir+1] = t[ii+1];
1458     x[ir+2] = t[ii+2];
1459     x[ir+3] = t[ii+3];
1460     ii += 4;
1461   }
1462 
1463   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1464   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1465   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1466   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1467   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1468   PetscFunctionReturn(0);
1469 }
1470 
1471 #undef __FUNCT__
1472 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1473 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1474 {
1475   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1476   PetscErrorCode    ierr;
1477   IS                iscol=a->col,isrow=a->row;
1478   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1479   const PetscInt    *r,*c,*rout,*cout;
1480   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1481   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1482   const MatScalar   *aa=a->a,*v;
1483   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1484   const PetscScalar *b;
1485 
1486   PetscFunctionBegin;
1487   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1488   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1489   t = a->solve_work;
1490 
1491   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1492   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1493 
1494   /* copy b into temp work space according to permutation */
1495   for(i=0;i<n;i++){
1496     ii = bs*i; ic = bs*c[i];
1497     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1498   }
1499 
1500   /* forward solve the U^T */
1501   idx = 0;
1502   for (i=0; i<n; i++) {
1503     v     = aa + bs2*diag[i];
1504     /* multiply by the inverse of the block diagonal */
1505     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1506     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1507     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1508     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1509     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1510     v -= bs2;
1511 
1512     vi    = aj + diag[i] - 1;
1513     nz    = diag[i] - diag[i+1] - 1;
1514     for(j=0;j>-nz;j--){
1515       oidx = bs*vi[j];
1516       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1517       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1518       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1519       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1520       v  -= bs2;
1521     }
1522     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1523     idx += bs;
1524   }
1525   /* backward solve the L^T */
1526   for (i=n-1; i>=0; i--){
1527     v    = aa + bs2*ai[i];
1528     vi   = aj + ai[i];
1529     nz   = ai[i+1] - ai[i];
1530     idt  = bs*i;
1531     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1532     for(j=0;j<nz;j++){
1533       idx   = bs*vi[j];
1534       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1535       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1536       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1537       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1538       v += bs2;
1539     }
1540   }
1541 
1542   /* copy t into x according to permutation */
1543   for(i=0;i<n;i++){
1544     ii = bs*i;  ir = bs*r[i];
1545     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1546   }
1547 
1548   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1549   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1550   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1551   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1552   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1553   PetscFunctionReturn(0);
1554 }
1555 
1556 #undef __FUNCT__
1557 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1558 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1559 {
1560   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1561   IS                iscol=a->col,isrow=a->row;
1562   PetscErrorCode    ierr;
1563   const PetscInt    *r,*c,*rout,*cout;
1564   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1565   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1566   const MatScalar   *aa=a->a,*v;
1567   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1568   const PetscScalar *b;
1569 
1570   PetscFunctionBegin;
1571   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1572   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1573   t  = a->solve_work;
1574 
1575   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1576   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1577 
1578   /* copy the b into temp work space according to permutation */
1579   ii = 0;
1580   for (i=0; i<n; i++) {
1581     ic      = 5*c[i];
1582     t[ii]   = b[ic];
1583     t[ii+1] = b[ic+1];
1584     t[ii+2] = b[ic+2];
1585     t[ii+3] = b[ic+3];
1586     t[ii+4] = b[ic+4];
1587     ii += 5;
1588   }
1589 
1590   /* forward solve the U^T */
1591   idx = 0;
1592   for (i=0; i<n; i++) {
1593 
1594     v     = aa + 25*diag[i];
1595     /* multiply by the inverse of the block diagonal */
1596     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1597     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1598     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1599     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1600     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1601     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1602     v += 25;
1603 
1604     vi    = aj + diag[i] + 1;
1605     nz    = ai[i+1] - diag[i] - 1;
1606     while (nz--) {
1607       oidx = 5*(*vi++);
1608       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1609       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1610       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1611       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1612       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1613       v  += 25;
1614     }
1615     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1616     idx += 5;
1617   }
1618   /* backward solve the L^T */
1619   for (i=n-1; i>=0; i--){
1620     v    = aa + 25*diag[i] - 25;
1621     vi   = aj + diag[i] - 1;
1622     nz   = diag[i] - ai[i];
1623     idt  = 5*i;
1624     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1625     while (nz--) {
1626       idx   = 5*(*vi--);
1627       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1628       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1629       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1630       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1631       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1632       v -= 25;
1633     }
1634   }
1635 
1636   /* copy t into x according to permutation */
1637   ii = 0;
1638   for (i=0; i<n; i++) {
1639     ir      = 5*r[i];
1640     x[ir]   = t[ii];
1641     x[ir+1] = t[ii+1];
1642     x[ir+2] = t[ii+2];
1643     x[ir+3] = t[ii+3];
1644     x[ir+4] = t[ii+4];
1645     ii += 5;
1646   }
1647 
1648   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1649   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1650   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1651   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1652   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1653   PetscFunctionReturn(0);
1654 }
1655 
1656 #undef __FUNCT__
1657 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1658 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1659 {
1660   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1661   PetscErrorCode    ierr;
1662   IS                iscol=a->col,isrow=a->row;
1663   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1664   const PetscInt    *r,*c,*rout,*cout;
1665   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1666   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1667   const MatScalar   *aa=a->a,*v;
1668   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1669   const PetscScalar *b;
1670 
1671   PetscFunctionBegin;
1672   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1673   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1674   t = a->solve_work;
1675 
1676   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1677   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1678 
1679   /* copy b into temp work space according to permutation */
1680   for(i=0;i<n;i++){
1681     ii = bs*i; ic = bs*c[i];
1682     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1683     t[ii+4] = b[ic+4];
1684   }
1685 
1686   /* forward solve the U^T */
1687   idx = 0;
1688   for (i=0; i<n; i++) {
1689     v     = aa + bs2*diag[i];
1690     /* multiply by the inverse of the block diagonal */
1691     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1692     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1693     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1694     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1695     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1696     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1697     v -= bs2;
1698 
1699     vi    = aj + diag[i] - 1;
1700     nz    = diag[i] - diag[i+1] - 1;
1701     for(j=0;j>-nz;j--){
1702       oidx = bs*vi[j];
1703       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1704       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1705       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1706       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1707       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1708       v  -= bs2;
1709     }
1710     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1711     idx += bs;
1712   }
1713   /* backward solve the L^T */
1714   for (i=n-1; i>=0; i--){
1715     v    = aa + bs2*ai[i];
1716     vi   = aj + ai[i];
1717     nz   = ai[i+1] - ai[i];
1718     idt  = bs*i;
1719     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1720     for(j=0;j<nz;j++){
1721       idx   = bs*vi[j];
1722       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1723       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1724       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1725       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1726       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1727       v += bs2;
1728     }
1729   }
1730 
1731   /* copy t into x according to permutation */
1732   for(i=0;i<n;i++){
1733     ii = bs*i;  ir = bs*r[i];
1734     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1735     x[ir+4] = t[ii+4];
1736   }
1737 
1738   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1739   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1740   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1742   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1743   PetscFunctionReturn(0);
1744 }
1745 
1746 #undef __FUNCT__
1747 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1748 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1749 {
1750   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1751   IS                iscol=a->col,isrow=a->row;
1752   PetscErrorCode    ierr;
1753   const PetscInt    *r,*c,*rout,*cout;
1754   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1755   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1756   const MatScalar   *aa=a->a,*v;
1757   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1758   const PetscScalar *b;
1759 
1760   PetscFunctionBegin;
1761   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763   t  = a->solve_work;
1764 
1765   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1766   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1767 
1768   /* copy the b into temp work space according to permutation */
1769   ii = 0;
1770   for (i=0; i<n; i++) {
1771     ic      = 6*c[i];
1772     t[ii]   = b[ic];
1773     t[ii+1] = b[ic+1];
1774     t[ii+2] = b[ic+2];
1775     t[ii+3] = b[ic+3];
1776     t[ii+4] = b[ic+4];
1777     t[ii+5] = b[ic+5];
1778     ii += 6;
1779   }
1780 
1781   /* forward solve the U^T */
1782   idx = 0;
1783   for (i=0; i<n; i++) {
1784 
1785     v     = aa + 36*diag[i];
1786     /* multiply by the inverse of the block diagonal */
1787     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1788     x6    = t[5+idx];
1789     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1790     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1791     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1792     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1793     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1794     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1795     v += 36;
1796 
1797     vi    = aj + diag[i] + 1;
1798     nz    = ai[i+1] - diag[i] - 1;
1799     while (nz--) {
1800       oidx = 6*(*vi++);
1801       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1802       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1803       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1804       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1805       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1806       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1807       v  += 36;
1808     }
1809     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1810     t[5+idx] = s6;
1811     idx += 6;
1812   }
1813   /* backward solve the L^T */
1814   for (i=n-1; i>=0; i--){
1815     v    = aa + 36*diag[i] - 36;
1816     vi   = aj + diag[i] - 1;
1817     nz   = diag[i] - ai[i];
1818     idt  = 6*i;
1819     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1820     s6 = t[5+idt];
1821     while (nz--) {
1822       idx   = 6*(*vi--);
1823       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1824       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1825       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1826       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1827       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1828       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1829       v -= 36;
1830     }
1831   }
1832 
1833   /* copy t into x according to permutation */
1834   ii = 0;
1835   for (i=0; i<n; i++) {
1836     ir      = 6*r[i];
1837     x[ir]   = t[ii];
1838     x[ir+1] = t[ii+1];
1839     x[ir+2] = t[ii+2];
1840     x[ir+3] = t[ii+3];
1841     x[ir+4] = t[ii+4];
1842     x[ir+5] = t[ii+5];
1843     ii += 6;
1844   }
1845 
1846   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1847   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1848   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1849   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1850   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1851   PetscFunctionReturn(0);
1852 }
1853 
1854 #undef __FUNCT__
1855 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1856 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1857 {
1858   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1859   PetscErrorCode    ierr;
1860   IS                iscol=a->col,isrow=a->row;
1861   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1862   const PetscInt    *r,*c,*rout,*cout;
1863   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1864   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1865   const MatScalar   *aa=a->a,*v;
1866   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1867   const PetscScalar *b;
1868 
1869   PetscFunctionBegin;
1870   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1871   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1872   t = a->solve_work;
1873 
1874   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1875   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1876 
1877   /* copy b into temp work space according to permutation */
1878   for(i=0;i<n;i++){
1879     ii = bs*i; ic = bs*c[i];
1880     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1881     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1882   }
1883 
1884   /* forward solve the U^T */
1885   idx = 0;
1886   for (i=0; i<n; i++) {
1887     v     = aa + bs2*diag[i];
1888     /* multiply by the inverse of the block diagonal */
1889     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1890     x6    = t[5+idx];
1891     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1892     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1893     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1894     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1895     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1896     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1897     v -= bs2;
1898 
1899     vi    = aj + diag[i] - 1;
1900     nz    = diag[i] - diag[i+1] - 1;
1901     for(j=0;j>-nz;j--){
1902       oidx = bs*vi[j];
1903       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1904       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1905       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1906       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1907       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1908       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1909       v  -= bs2;
1910     }
1911     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1912     t[5+idx] = s6;
1913     idx += bs;
1914   }
1915   /* backward solve the L^T */
1916   for (i=n-1; i>=0; i--){
1917     v    = aa + bs2*ai[i];
1918     vi   = aj + ai[i];
1919     nz   = ai[i+1] - ai[i];
1920     idt  = bs*i;
1921     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1922     s6   = t[5+idt];
1923    for(j=0;j<nz;j++){
1924       idx   = bs*vi[j];
1925       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1926       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1927       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1928       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1929       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1930       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1931       v += bs2;
1932     }
1933   }
1934 
1935   /* copy t into x according to permutation */
1936   for(i=0;i<n;i++){
1937     ii = bs*i;  ir = bs*r[i];
1938     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1939     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1940   }
1941 
1942   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1943   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1944   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1945   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1946   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1947   PetscFunctionReturn(0);
1948 }
1949 
1950 #undef __FUNCT__
1951 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1952 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1953 {
1954   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1955   IS                iscol=a->col,isrow=a->row;
1956   PetscErrorCode    ierr;
1957   const PetscInt    *r,*c,*rout,*cout;
1958   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1959   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1960   const MatScalar   *aa=a->a,*v;
1961   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1962   const PetscScalar *b;
1963 
1964   PetscFunctionBegin;
1965   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1966   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1967   t  = a->solve_work;
1968 
1969   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1970   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1971 
1972   /* copy the b into temp work space according to permutation */
1973   ii = 0;
1974   for (i=0; i<n; i++) {
1975     ic      = 7*c[i];
1976     t[ii]   = b[ic];
1977     t[ii+1] = b[ic+1];
1978     t[ii+2] = b[ic+2];
1979     t[ii+3] = b[ic+3];
1980     t[ii+4] = b[ic+4];
1981     t[ii+5] = b[ic+5];
1982     t[ii+6] = b[ic+6];
1983     ii += 7;
1984   }
1985 
1986   /* forward solve the U^T */
1987   idx = 0;
1988   for (i=0; i<n; i++) {
1989 
1990     v     = aa + 49*diag[i];
1991     /* multiply by the inverse of the block diagonal */
1992     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1993     x6    = t[5+idx]; x7 = t[6+idx];
1994     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1995     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1996     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1997     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1998     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1999     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2000     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2001     v += 49;
2002 
2003     vi    = aj + diag[i] + 1;
2004     nz    = ai[i+1] - diag[i] - 1;
2005     while (nz--) {
2006       oidx = 7*(*vi++);
2007       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2008       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2009       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2010       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2011       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2012       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2013       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2014       v  += 49;
2015     }
2016     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2017     t[5+idx] = s6;t[6+idx] = s7;
2018     idx += 7;
2019   }
2020   /* backward solve the L^T */
2021   for (i=n-1; i>=0; i--){
2022     v    = aa + 49*diag[i] - 49;
2023     vi   = aj + diag[i] - 1;
2024     nz   = diag[i] - ai[i];
2025     idt  = 7*i;
2026     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2027     s6 = t[5+idt];s7 = t[6+idt];
2028     while (nz--) {
2029       idx   = 7*(*vi--);
2030       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2031       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2032       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2033       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2034       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2035       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2036       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2037       v -= 49;
2038     }
2039   }
2040 
2041   /* copy t into x according to permutation */
2042   ii = 0;
2043   for (i=0; i<n; i++) {
2044     ir      = 7*r[i];
2045     x[ir]   = t[ii];
2046     x[ir+1] = t[ii+1];
2047     x[ir+2] = t[ii+2];
2048     x[ir+3] = t[ii+3];
2049     x[ir+4] = t[ii+4];
2050     x[ir+5] = t[ii+5];
2051     x[ir+6] = t[ii+6];
2052     ii += 7;
2053   }
2054 
2055   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2056   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2057   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2058   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2059   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2060   PetscFunctionReturn(0);
2061 }
2062 #undef __FUNCT__
2063 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
2064 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2065 {
2066   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2067   PetscErrorCode    ierr;
2068   IS                iscol=a->col,isrow=a->row;
2069   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2070   const PetscInt    *r,*c,*rout,*cout;
2071   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2072   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2073   const MatScalar   *aa=a->a,*v;
2074   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2075   const PetscScalar *b;
2076 
2077   PetscFunctionBegin;
2078   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2079   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2080   t = a->solve_work;
2081 
2082   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2083   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2084 
2085   /* copy b into temp work space according to permutation */
2086   for(i=0;i<n;i++){
2087     ii = bs*i; ic = bs*c[i];
2088     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2089     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
2090   }
2091 
2092   /* forward solve the U^T */
2093   idx = 0;
2094   for (i=0; i<n; i++) {
2095     v     = aa + bs2*diag[i];
2096     /* multiply by the inverse of the block diagonal */
2097     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2098     x6    = t[5+idx]; x7 = t[6+idx];
2099     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
2100     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
2101     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
2102     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
2103     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
2104     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2105     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2106     v -= bs2;
2107 
2108     vi    = aj + diag[i] - 1;
2109     nz    = diag[i] - diag[i+1] - 1;
2110     for(j=0;j>-nz;j--){
2111       oidx = bs*vi[j];
2112       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2113       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2114       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2115       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2116       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2117       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2118       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2119       v  -= bs2;
2120     }
2121     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2122     t[5+idx] = s6;  t[6+idx] = s7;
2123     idx += bs;
2124   }
2125   /* backward solve the L^T */
2126   for (i=n-1; i>=0; i--){
2127     v    = aa + bs2*ai[i];
2128     vi   = aj + ai[i];
2129     nz   = ai[i+1] - ai[i];
2130     idt  = bs*i;
2131     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2132     s6   = t[5+idt];  s7 = t[6+idt];
2133    for(j=0;j<nz;j++){
2134       idx   = bs*vi[j];
2135       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2136       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2137       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2138       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2139       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2140       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2141       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2142       v += bs2;
2143     }
2144   }
2145 
2146   /* copy t into x according to permutation */
2147   for(i=0;i<n;i++){
2148     ii = bs*i;  ir = bs*r[i];
2149     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2150     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2151   }
2152 
2153   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2154   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2155   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2156   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2157   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2158   PetscFunctionReturn(0);
2159 }
2160 
2161 /* ----------------------------------------------------------- */
2162 #undef __FUNCT__
2163 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2164 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2165 {
2166   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2167   IS                iscol=a->col,isrow=a->row;
2168   PetscErrorCode    ierr;
2169   const PetscInt    *r,*c,*rout,*cout;
2170   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2171   PetscInt          i,nz;
2172   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2173   const MatScalar   *aa=a->a,*v;
2174   PetscScalar       *x,*s,*t,*ls;
2175   const PetscScalar *b;
2176 
2177   PetscFunctionBegin;
2178   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2179   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2180   t  = a->solve_work;
2181 
2182   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2183   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2184 
2185   /* forward solve the lower triangular */
2186   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2187   for (i=1; i<n; i++) {
2188     v   = aa + bs2*ai[i];
2189     vi  = aj + ai[i];
2190     nz  = a->diag[i] - ai[i];
2191     s = t + bs*i;
2192     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2193     while (nz--) {
2194       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2195       v += bs2;
2196     }
2197   }
2198   /* backward solve the upper triangular */
2199   ls = a->solve_work + A->cmap->n;
2200   for (i=n-1; i>=0; i--){
2201     v   = aa + bs2*(a->diag[i] + 1);
2202     vi  = aj + a->diag[i] + 1;
2203     nz  = ai[i+1] - a->diag[i] - 1;
2204     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2205     while (nz--) {
2206       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2207       v += bs2;
2208     }
2209     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2210     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2211   }
2212 
2213   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2214   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2215   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2216   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2217   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2218   PetscFunctionReturn(0);
2219 }
2220 
2221 /* ----------------------------------------------------------- */
2222 #undef __FUNCT__
2223 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2224 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2225 {
2226   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2227   IS                iscol=a->col,isrow=a->row;
2228   PetscErrorCode    ierr;
2229   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2230   PetscInt          i,nz,j;
2231   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2232   const MatScalar   *aa=a->a,*v;
2233   PetscScalar       *x,*t,*ls;
2234   const PetscScalar *b;
2235   PetscFunctionBegin;
2236   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2237   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2238   t    = a->solve_work;
2239 
2240   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2241   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2242 
2243   /* copy the b into temp work space according to permutation */
2244   for (i=0; i<n; i++) {
2245     for (j=0; j<bs; j++) {
2246       t[i*bs+j] = b[c[i]*bs+j];
2247     }
2248   }
2249 
2250 
2251   /* forward solve the upper triangular transpose */
2252   ls = a->solve_work + A->cmap->n;
2253   for (i=0; i<n; i++){
2254     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2255     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2256     v   = aa + bs2*(a->diag[i] + 1);
2257     vi  = aj + a->diag[i] + 1;
2258     nz  = ai[i+1] - a->diag[i] - 1;
2259     while (nz--) {
2260       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2261       v += bs2;
2262     }
2263   }
2264 
2265   /* backward solve the lower triangular transpose */
2266   for (i=n-1; i>=0; i--) {
2267     v   = aa + bs2*ai[i];
2268     vi  = aj + ai[i];
2269     nz  = a->diag[i] - ai[i];
2270     while (nz--) {
2271       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2272       v += bs2;
2273     }
2274   }
2275 
2276   /* copy t into x according to permutation */
2277   for (i=0; i<n; i++) {
2278     for (j=0; j<bs; j++) {
2279       x[bs*r[i]+j]   = t[bs*i+j];
2280     }
2281   }
2282 
2283   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2284   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2285   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2286   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2287   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2288   PetscFunctionReturn(0);
2289 }
2290 
2291 #undef __FUNCT__
2292 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2293 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2294 {
2295   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2296   IS                iscol=a->col,isrow=a->row;
2297   PetscErrorCode    ierr;
2298   const PetscInt    *r,*c,*rout,*cout;
2299   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2300   PetscInt          i,j,nz;
2301   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2302   const MatScalar   *aa=a->a,*v;
2303   PetscScalar       *x,*t,*ls;
2304   const PetscScalar *b;
2305 
2306   PetscFunctionBegin;
2307   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2308   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2309   t    = a->solve_work;
2310 
2311   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2312   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2313 
2314   /* copy the b into temp work space according to permutation */
2315   for (i=0; i<n; i++) {
2316     for (j=0; j<bs; j++) {
2317       t[i*bs+j] = b[c[i]*bs+j];
2318     }
2319   }
2320 
2321 
2322   /* forward solve the upper triangular transpose */
2323   ls = a->solve_work + A->cmap->n;
2324   for (i=0; i<n; i++){
2325     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2326     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2327     v   = aa + bs2*(diag[i] - 1);
2328     vi  = aj + diag[i] - 1;
2329     nz  = diag[i] - diag[i+1] - 1;
2330     for(j=0;j>-nz;j--){
2331       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2332       v -= bs2;
2333     }
2334   }
2335 
2336   /* backward solve the lower triangular transpose */
2337   for (i=n-1; i>=0; i--) {
2338     v   = aa + bs2*ai[i];
2339     vi  = aj + ai[i];
2340     nz  = ai[i+1] - ai[i];
2341     for(j=0;j<nz;j++){
2342       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2343       v += bs2;
2344     }
2345   }
2346 
2347   /* copy t into x according to permutation */
2348   for (i=0; i<n; i++) {
2349     for (j=0; j<bs; j++) {
2350       x[bs*r[i]+j]   = t[bs*i+j];
2351     }
2352   }
2353 
2354   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2355   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2356   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2357   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2358   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2359   PetscFunctionReturn(0);
2360 }
2361 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once. */
2363 
2364 #undef __FUNCT__
2365 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2366 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2367 {
2368   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2369   PetscErrorCode    ierr;
2370   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2371   PetscInt          i,nz,idx,idt,m;
2372   const MatScalar   *aa=a->a,*v;
2373   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2374   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2375   PetscScalar       *x;
2376   const PetscScalar *b;
2377 
2378   PetscFunctionBegin;
2379   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2380   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2381 
2382   /* forward solve the lower triangular */
2383   idx    = 0;
2384   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2385   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2386   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2387 
2388   for (i=1; i<n; i++) {
2389     v     = aa + bs2*ai[i];
2390     vi    = aj + ai[i];
2391     nz    = ai[i+1] - ai[i];
2392     idt   = bs*i;
2393     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
2394     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
2395     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2396     for(m=0;m<nz;m++){
2397       idx   = bs*vi[m];
2398       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2399       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2400       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2401 
2402 
2403       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2404       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2405       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2406       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2407       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2408       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2409       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2410       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2411       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2412       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2413       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2414       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2415       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2416       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2417       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2418 
2419       v += bs2;
2420     }
2421     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
2422     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
2423     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2424 
2425   }
2426   /* backward solve the upper triangular */
2427   for (i=n-1; i>=0; i--){
2428     v    = aa + bs2*(adiag[i+1]+1);
2429     vi   = aj + adiag[i+1]+1;
2430     nz   = adiag[i] - adiag[i+1] - 1;
2431     idt  = bs*i;
2432     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
2433     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
2434     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2435 
2436     for(m=0;m<nz;m++){
2437       idx   = bs*vi[m];
2438       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2439       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2440       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2441 
2442       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2443       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2444       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2445       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2446       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2447       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2448       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2449       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2450       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2451       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2452       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2453       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2454       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2455       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2456       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2457 
2458       v += bs2;
2459     }
2460 
2461     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2462     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2463     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2464     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2465     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2466     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2467     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2468     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2469     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2470     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2471     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2472     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2473     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2474     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2475     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2476 
2477   }
2478 
2479   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2480   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2481   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2482   PetscFunctionReturn(0);
2483 }
2484 
2485 /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2486 /* Default MatSolve for block size 15 */
2487 
2488 #undef __FUNCT__
2489 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2490 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2491 {
2492   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2493   PetscErrorCode    ierr;
2494   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2495   PetscInt          i,k,nz,idx,idt,m;
2496   const MatScalar   *aa=a->a,*v;
2497   PetscScalar       s[15];
2498   PetscScalar       *x,xv;
2499   const PetscScalar *b;
2500 
2501   PetscFunctionBegin;
2502   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2503   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2504 
2505   /* forward solve the lower triangular */
2506   for (i=0; i<n; i++) {
2507     v     = aa + bs2*ai[i];
2508     vi    = aj + ai[i];
2509     nz    = ai[i+1] - ai[i];
2510     idt   = bs*i;
2511     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2512     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2513     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2514     for(m=0;m<nz;m++){
2515       idx   = bs*vi[m];
2516       for(k=0;k<15;k++){
2517 	xv        = x[k + idx];
2518 	x[idt]    -= v[0]*xv;
2519 	x[1+idt]  -= v[1]*xv;
2520 	x[2+idt]  -= v[2]*xv;
2521         x[3+idt]  -= v[3]*xv;
2522 	x[4+idt]  -= v[4]*xv;
2523 	x[5+idt]  -= v[5]*xv;
2524 	x[6+idt]  -= v[6]*xv;
2525         x[7+idt]  -= v[7]*xv;
2526 	x[8+idt]  -= v[8]*xv;
2527 	x[9+idt]  -= v[9]*xv;
2528 	x[10+idt] -= v[10]*xv;
2529         x[11+idt] -= v[11]*xv;
2530 	x[12+idt] -= v[12]*xv;
2531 	x[13+idt] -= v[13]*xv;
2532 	x[14+idt] -= v[14]*xv;
2533 	v += 15;
2534       }
2535     }
2536   }
2537   /* backward solve the upper triangular */
2538   for (i=n-1; i>=0; i--){
2539     v    = aa + bs2*(adiag[i+1]+1);
2540     vi   = aj + adiag[i+1]+1;
2541     nz   = adiag[i] - adiag[i+1] - 1;
2542     idt  = bs*i;
2543     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2544     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2545     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2546 
2547     for(m=0;m<nz;m++){
2548       idx   = bs*vi[m];
2549       for(k=0;k<15;k++){
2550 	xv = x[k + idx];
2551 	s[0]  -= v[0]*xv;
2552 	s[1]  -= v[1]*xv;
2553 	s[2]  -= v[2]*xv;
2554         s[3]  -= v[3]*xv;
2555 	s[4]  -= v[4]*xv;
2556 	s[5]  -= v[5]*xv;
2557 	s[6]  -= v[6]*xv;
2558         s[7]  -= v[7]*xv;
2559 	s[8]  -= v[8]*xv;
2560 	s[9]  -= v[9]*xv;
2561 	s[10] -= v[10]*xv;
2562         s[11] -= v[11]*xv;
2563 	s[12] -= v[12]*xv;
2564 	s[13] -= v[13]*xv;
2565 	s[14] -= v[14]*xv;
2566 	v += 15;
2567       }
2568     }
2569     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
2570     for(k=0;k<15;k++){
2571       x[idt]    += v[0]*s[k];
2572       x[1+idt]  += v[1]*s[k];
2573       x[2+idt]  += v[2]*s[k];
2574       x[3+idt]  += v[3]*s[k];
2575       x[4+idt]  += v[4]*s[k];
2576       x[5+idt]  += v[5]*s[k];
2577       x[6+idt]  += v[6]*s[k];
2578       x[7+idt]  += v[7]*s[k];
2579       x[8+idt]  += v[8]*s[k];
2580       x[9+idt]  += v[9]*s[k];
2581       x[10+idt] += v[10]*s[k];
2582       x[11+idt] += v[11]*s[k];
2583       x[12+idt] += v[12]*s[k];
2584       x[13+idt] += v[13]*s[k];
2585       x[14+idt] += v[14]*s[k];
2586       v += 15;
2587     }
2588   }
2589   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2590   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2591   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2592   PetscFunctionReturn(0);
2593 }
2594 
2595 
2596 #undef __FUNCT__
2597 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2598 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2599 {
2600   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2601   IS                iscol=a->col,isrow=a->row;
2602   PetscErrorCode    ierr;
2603   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2604   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2605   PetscInt          i,nz,idx,idt,idc;
2606   const MatScalar   *aa=a->a,*v;
2607   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2608   const PetscScalar *b;
2609 
2610   PetscFunctionBegin;
2611   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2612   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2613   t  = a->solve_work;
2614 
2615   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2616   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2617 
2618   /* forward solve the lower triangular */
2619   idx    = 7*(*r++);
2620   t[0] = b[idx];   t[1] = b[1+idx];
2621   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2622   t[5] = b[5+idx]; t[6] = b[6+idx];
2623 
2624   for (i=1; i<n; i++) {
2625     v     = aa + 49*ai[i];
2626     vi    = aj + ai[i];
2627     nz    = diag[i] - ai[i];
2628     idx   = 7*(*r++);
2629     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2630     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2631     while (nz--) {
2632       idx   = 7*(*vi++);
2633       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2634       x4    = t[3+idx];x5 = t[4+idx];
2635       x6    = t[5+idx];x7 = t[6+idx];
2636       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2637       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2638       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2639       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2640       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2641       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2642       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2643       v += 49;
2644     }
2645     idx = 7*i;
2646     t[idx]   = s1;t[1+idx] = s2;
2647     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2648     t[5+idx] = s6;t[6+idx] = s7;
2649   }
2650   /* backward solve the upper triangular */
2651   for (i=n-1; i>=0; i--){
2652     v    = aa + 49*diag[i] + 49;
2653     vi   = aj + diag[i] + 1;
2654     nz   = ai[i+1] - diag[i] - 1;
2655     idt  = 7*i;
2656     s1 = t[idt];  s2 = t[1+idt];
2657     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2658     s6 = t[5+idt];s7 = t[6+idt];
2659     while (nz--) {
2660       idx   = 7*(*vi++);
2661       x1    = t[idx];   x2 = t[1+idx];
2662       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2663       x6    = t[5+idx]; x7 = t[6+idx];
2664       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2665       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2666       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2667       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2668       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2669       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2670       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2671       v += 49;
2672     }
2673     idc = 7*(*c--);
2674     v   = aa + 49*diag[i];
2675     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2676                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2677     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2678                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2679     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2680                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2681     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2682                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2683     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2684                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2685     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2686                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2687     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2688                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2689   }
2690 
2691   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2692   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2693   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2694   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2695   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2696   PetscFunctionReturn(0);
2697 }
2698 
2699 #undef __FUNCT__
2700 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2701 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2702 {
2703   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2704   IS                iscol=a->col,isrow=a->row;
2705   PetscErrorCode    ierr;
2706   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2707   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2708   PetscInt          i,nz,idx,idt,idc,m;
2709   const MatScalar   *aa=a->a,*v;
2710   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2711   const PetscScalar *b;
2712 
2713   PetscFunctionBegin;
2714   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2715   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2716   t  = a->solve_work;
2717 
2718   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2719   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2720 
2721   /* forward solve the lower triangular */
2722   idx    = 7*r[0];
2723   t[0] = b[idx];   t[1] = b[1+idx];
2724   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2725   t[5] = b[5+idx]; t[6] = b[6+idx];
2726 
2727   for (i=1; i<n; i++) {
2728     v     = aa + 49*ai[i];
2729     vi    = aj + ai[i];
2730     nz    = ai[i+1] - ai[i];
2731     idx   = 7*r[i];
2732     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2733     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2734     for(m=0;m<nz;m++){
2735       idx   = 7*vi[m];
2736       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2737       x4    = t[3+idx];x5 = t[4+idx];
2738       x6    = t[5+idx];x7 = t[6+idx];
2739       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2740       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2741       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2742       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2743       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2744       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2745       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2746       v += 49;
2747     }
2748     idx = 7*i;
2749     t[idx]   = s1;t[1+idx] = s2;
2750     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2751     t[5+idx] = s6;t[6+idx] = s7;
2752   }
2753   /* backward solve the upper triangular */
2754   for (i=n-1; i>=0; i--){
2755     v    = aa + 49*(adiag[i+1]+1);
2756     vi   = aj + adiag[i+1]+1;
2757     nz   = adiag[i] - adiag[i+1] - 1;
2758     idt  = 7*i;
2759     s1 = t[idt];  s2 = t[1+idt];
2760     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2761     s6 = t[5+idt];s7 = t[6+idt];
2762     for(m=0;m<nz;m++){
2763       idx   = 7*vi[m];
2764       x1    = t[idx];   x2 = t[1+idx];
2765       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2766       x6    = t[5+idx]; x7 = t[6+idx];
2767       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2768       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2769       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2770       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2771       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2772       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2773       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2774       v += 49;
2775     }
2776     idc = 7*c[i];
2777     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2778                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2779     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2780                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2781     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2782                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2783     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2784                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2785     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2786                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2787     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2788                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2789     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2790                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2791   }
2792 
2793   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2794   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2795   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2796   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2797   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2798   PetscFunctionReturn(0);
2799 }
2800 
2801 #undef __FUNCT__
2802 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2803 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2804 {
2805   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2806   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2807   PetscErrorCode    ierr;
2808   PetscInt          i,nz,idx,idt,jdx;
2809   const MatScalar   *aa=a->a,*v;
2810   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2811   const PetscScalar *b;
2812 
2813   PetscFunctionBegin;
2814   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2815   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2816   /* forward solve the lower triangular */
2817   idx    = 0;
2818   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2819   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2820   x[6] = b[6+idx];
2821   for (i=1; i<n; i++) {
2822     v     =  aa + 49*ai[i];
2823     vi    =  aj + ai[i];
2824     nz    =  diag[i] - ai[i];
2825     idx   =  7*i;
2826     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2827     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2828     s7  =  b[6+idx];
2829     while (nz--) {
2830       jdx   = 7*(*vi++);
2831       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2832       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2833       x7    = x[6+jdx];
2834       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2835       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2836       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2837       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2838       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2839       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2840       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2841       v += 49;
2842      }
2843     x[idx]   = s1;
2844     x[1+idx] = s2;
2845     x[2+idx] = s3;
2846     x[3+idx] = s4;
2847     x[4+idx] = s5;
2848     x[5+idx] = s6;
2849     x[6+idx] = s7;
2850   }
2851   /* backward solve the upper triangular */
2852   for (i=n-1; i>=0; i--){
2853     v    = aa + 49*diag[i] + 49;
2854     vi   = aj + diag[i] + 1;
2855     nz   = ai[i+1] - diag[i] - 1;
2856     idt  = 7*i;
2857     s1 = x[idt];   s2 = x[1+idt];
2858     s3 = x[2+idt]; s4 = x[3+idt];
2859     s5 = x[4+idt]; s6 = x[5+idt];
2860     s7 = x[6+idt];
2861     while (nz--) {
2862       idx   = 7*(*vi++);
2863       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2864       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2865       x7    = x[6+idx];
2866       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2867       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2868       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2869       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2870       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2871       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2872       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2873       v += 49;
2874     }
2875     v        = aa + 49*diag[i];
2876     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2877                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2878     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2879                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2880     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2881                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2882     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2883                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2884     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2885                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2886     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2887                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2888     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2889                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2890   }
2891 
2892   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2893   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2894   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2895   PetscFunctionReturn(0);
2896 }
2897 
2898 #undef __FUNCT__
2899 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2900 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2901 {
2902     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2903     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2904     PetscErrorCode    ierr;
2905     PetscInt          i,k,nz,idx,jdx,idt;
2906     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2907     const MatScalar   *aa=a->a,*v;
2908     PetscScalar       *x;
2909     const PetscScalar *b;
2910     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2911 
2912     PetscFunctionBegin;
2913     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2914     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2915     /* forward solve the lower triangular */
2916     idx    = 0;
2917     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2918     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2919     for (i=1; i<n; i++) {
2920        v    = aa + bs2*ai[i];
2921        vi   = aj + ai[i];
2922        nz   = ai[i+1] - ai[i];
2923       idx   = bs*i;
2924        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2925        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2926        for(k=0;k<nz;k++) {
2927           jdx   = bs*vi[k];
2928           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2929 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2930           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2931           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2932           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2933 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2934           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2935 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2936 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2937           v   +=  bs2;
2938         }
2939 
2940        x[idx]   = s1;
2941        x[1+idx] = s2;
2942        x[2+idx] = s3;
2943        x[3+idx] = s4;
2944        x[4+idx] = s5;
2945        x[5+idx] = s6;
2946        x[6+idx] = s7;
2947     }
2948 
2949    /* backward solve the upper triangular */
2950   for (i=n-1; i>=0; i--){
2951     v   = aa + bs2*(adiag[i+1]+1);
2952      vi  = aj + adiag[i+1]+1;
2953      nz  = adiag[i] - adiag[i+1]-1;
2954      idt = bs*i;
2955      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2956      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2957     for(k=0;k<nz;k++) {
2958       idx   = bs*vi[k];
2959        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2960        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2961        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2962        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2963        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2964        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2965        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2966        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2967        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2968         v   +=  bs2;
2969     }
2970     /* x = inv_diagonal*x */
2971     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2972     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2973     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2974     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2975     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2976     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2977     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2978   }
2979 
2980   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2981   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2982   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2983   PetscFunctionReturn(0);
2984 }
2985 
2986 #undef __FUNCT__
2987 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2988 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2989 {
2990   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2991   IS                iscol=a->col,isrow=a->row;
2992   PetscErrorCode    ierr;
2993   const PetscInt    *r,*c,*rout,*cout;
2994   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2995   PetscInt          i,nz,idx,idt,idc;
2996   const MatScalar   *aa=a->a,*v;
2997   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2998   const PetscScalar *b;
2999 
3000   PetscFunctionBegin;
3001   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3002   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3003   t  = a->solve_work;
3004 
3005   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3006   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3007 
3008   /* forward solve the lower triangular */
3009   idx    = 6*(*r++);
3010   t[0] = b[idx];   t[1] = b[1+idx];
3011   t[2] = b[2+idx]; t[3] = b[3+idx];
3012   t[4] = b[4+idx]; t[5] = b[5+idx];
3013   for (i=1; i<n; i++) {
3014     v     = aa + 36*ai[i];
3015     vi    = aj + ai[i];
3016     nz    = diag[i] - ai[i];
3017     idx   = 6*(*r++);
3018     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3019     s5  = b[4+idx]; s6 = b[5+idx];
3020     while (nz--) {
3021       idx   = 6*(*vi++);
3022       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3023       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3024       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3025       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3026       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3027       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3028       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3029       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3030       v += 36;
3031     }
3032     idx = 6*i;
3033     t[idx]   = s1;t[1+idx] = s2;
3034     t[2+idx] = s3;t[3+idx] = s4;
3035     t[4+idx] = s5;t[5+idx] = s6;
3036   }
3037   /* backward solve the upper triangular */
3038   for (i=n-1; i>=0; i--){
3039     v    = aa + 36*diag[i] + 36;
3040     vi   = aj + diag[i] + 1;
3041     nz   = ai[i+1] - diag[i] - 1;
3042     idt  = 6*i;
3043     s1 = t[idt];  s2 = t[1+idt];
3044     s3 = t[2+idt];s4 = t[3+idt];
3045     s5 = t[4+idt];s6 = t[5+idt];
3046     while (nz--) {
3047       idx   = 6*(*vi++);
3048       x1    = t[idx];   x2 = t[1+idx];
3049       x3    = t[2+idx]; x4 = t[3+idx];
3050       x5    = t[4+idx]; x6 = t[5+idx];
3051       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3052       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3053       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3054       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3055       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3056       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3057       v += 36;
3058     }
3059     idc = 6*(*c--);
3060     v   = aa + 36*diag[i];
3061     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3062                                  v[18]*s4+v[24]*s5+v[30]*s6;
3063     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3064                                  v[19]*s4+v[25]*s5+v[31]*s6;
3065     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3066                                  v[20]*s4+v[26]*s5+v[32]*s6;
3067     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3068                                  v[21]*s4+v[27]*s5+v[33]*s6;
3069     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3070                                  v[22]*s4+v[28]*s5+v[34]*s6;
3071     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3072                                  v[23]*s4+v[29]*s5+v[35]*s6;
3073   }
3074 
3075   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3076   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3077   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3078   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3079   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3080   PetscFunctionReturn(0);
3081 }
3082 
3083 #undef __FUNCT__
3084 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
3085 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
3086 {
3087   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3088   IS                iscol=a->col,isrow=a->row;
3089   PetscErrorCode    ierr;
3090   const PetscInt    *r,*c,*rout,*cout;
3091   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3092   PetscInt          i,nz,idx,idt,idc,m;
3093   const MatScalar   *aa=a->a,*v;
3094   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3095   const PetscScalar *b;
3096 
3097   PetscFunctionBegin;
3098   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3099   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3100   t  = a->solve_work;
3101 
3102   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3103   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3104 
3105   /* forward solve the lower triangular */
3106   idx    = 6*r[0];
3107   t[0] = b[idx];   t[1] = b[1+idx];
3108   t[2] = b[2+idx]; t[3] = b[3+idx];
3109   t[4] = b[4+idx]; t[5] = b[5+idx];
3110   for (i=1; i<n; i++) {
3111     v     = aa + 36*ai[i];
3112     vi    = aj + ai[i];
3113     nz    = ai[i+1] - ai[i];
3114     idx   = 6*r[i];
3115     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3116     s5  = b[4+idx]; s6 = b[5+idx];
3117     for(m=0;m<nz;m++){
3118       idx   = 6*vi[m];
3119       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3120       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3121       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3122       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3123       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3124       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3125       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3126       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3127       v += 36;
3128     }
3129     idx = 6*i;
3130     t[idx]   = s1;t[1+idx] = s2;
3131     t[2+idx] = s3;t[3+idx] = s4;
3132     t[4+idx] = s5;t[5+idx] = s6;
3133   }
3134   /* backward solve the upper triangular */
3135   for (i=n-1; i>=0; i--){
3136     v    = aa + 36*(adiag[i+1]+1);
3137     vi   = aj + adiag[i+1]+1;
3138     nz   = adiag[i] - adiag[i+1] - 1;
3139     idt  = 6*i;
3140     s1 = t[idt];  s2 = t[1+idt];
3141     s3 = t[2+idt];s4 = t[3+idt];
3142     s5 = t[4+idt];s6 = t[5+idt];
3143     for(m=0;m<nz;m++){
3144       idx   = 6*vi[m];
3145       x1    = t[idx];   x2 = t[1+idx];
3146       x3    = t[2+idx]; x4 = t[3+idx];
3147       x5    = t[4+idx]; x6 = t[5+idx];
3148       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3149       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3150       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3151       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3152       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3153       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3154       v += 36;
3155     }
3156     idc = 6*c[i];
3157     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3158                                  v[18]*s4+v[24]*s5+v[30]*s6;
3159     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3160                                  v[19]*s4+v[25]*s5+v[31]*s6;
3161     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3162                                  v[20]*s4+v[26]*s5+v[32]*s6;
3163     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3164                                  v[21]*s4+v[27]*s5+v[33]*s6;
3165     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3166                                  v[22]*s4+v[28]*s5+v[34]*s6;
3167     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3168                                  v[23]*s4+v[29]*s5+v[35]*s6;
3169   }
3170 
3171   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3172   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3173   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3175   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3176   PetscFunctionReturn(0);
3177 }
3178 
3179 #undef __FUNCT__
3180 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
3181 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3182 {
3183   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3184   PetscInt          i,nz,idx,idt,jdx;
3185   PetscErrorCode    ierr;
3186   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3187   const MatScalar   *aa=a->a,*v;
3188   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3189   const PetscScalar *b;
3190 
3191   PetscFunctionBegin;
3192   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3193   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3194   /* forward solve the lower triangular */
3195   idx    = 0;
3196   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3197   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3198   for (i=1; i<n; i++) {
3199     v     =  aa + 36*ai[i];
3200     vi    =  aj + ai[i];
3201     nz    =  diag[i] - ai[i];
3202     idx   =  6*i;
3203     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3204     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3205     while (nz--) {
3206       jdx   = 6*(*vi++);
3207       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3208       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3209       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3210       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3211       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3212       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3213       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3214       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3215       v += 36;
3216      }
3217     x[idx]   = s1;
3218     x[1+idx] = s2;
3219     x[2+idx] = s3;
3220     x[3+idx] = s4;
3221     x[4+idx] = s5;
3222     x[5+idx] = s6;
3223   }
3224   /* backward solve the upper triangular */
3225   for (i=n-1; i>=0; i--){
3226     v    = aa + 36*diag[i] + 36;
3227     vi   = aj + diag[i] + 1;
3228     nz   = ai[i+1] - diag[i] - 1;
3229     idt  = 6*i;
3230     s1 = x[idt];   s2 = x[1+idt];
3231     s3 = x[2+idt]; s4 = x[3+idt];
3232     s5 = x[4+idt]; s6 = x[5+idt];
3233     while (nz--) {
3234       idx   = 6*(*vi++);
3235       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3236       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3237       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3238       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3239       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3240       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3241       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3242       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3243       v += 36;
3244     }
3245     v        = aa + 36*diag[i];
3246     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3247     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3248     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3249     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3250     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3251     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3252   }
3253 
3254   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3255   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3256   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3257   PetscFunctionReturn(0);
3258 }
3259 
3260 #undef __FUNCT__
3261 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3262 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3263 {
3264     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3265     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3266     PetscErrorCode    ierr;
3267     PetscInt          i,k,nz,idx,jdx,idt;
3268     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3269     const MatScalar   *aa=a->a,*v;
3270     PetscScalar       *x;
3271     const PetscScalar *b;
3272     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3273 
3274     PetscFunctionBegin;
3275     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3276     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3277     /* forward solve the lower triangular */
3278     idx    = 0;
3279     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3280     x[4] = b[4+idx];x[5] = b[5+idx];
3281     for (i=1; i<n; i++) {
3282        v    = aa + bs2*ai[i];
3283        vi   = aj + ai[i];
3284        nz   = ai[i+1] - ai[i];
3285       idx   = bs*i;
3286        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3287        s5   = b[4+idx];s6 = b[5+idx];
3288        for(k=0;k<nz;k++){
3289           jdx   = bs*vi[k];
3290           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3291 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3292           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3293           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3294           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3295 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3296           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3297 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3298           v   +=  bs2;
3299         }
3300 
3301        x[idx]   = s1;
3302        x[1+idx] = s2;
3303        x[2+idx] = s3;
3304        x[3+idx] = s4;
3305        x[4+idx] = s5;
3306        x[5+idx] = s6;
3307     }
3308 
3309    /* backward solve the upper triangular */
3310   for (i=n-1; i>=0; i--){
3311     v   = aa + bs2*(adiag[i+1]+1);
3312      vi  = aj + adiag[i+1]+1;
3313      nz  = adiag[i] - adiag[i+1]-1;
3314      idt = bs*i;
3315      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3316      s5 = x[4+idt];s6 = x[5+idt];
3317      for(k=0;k<nz;k++){
3318       idx   = bs*vi[k];
3319        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3320        x5    = x[4+idx];x6 = x[5+idx];
3321        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3322        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3323        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3324        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3325        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3326        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3327         v   +=  bs2;
3328     }
3329     /* x = inv_diagonal*x */
3330    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3331    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3332    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3333    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3334    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3335    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3336   }
3337 
3338   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3339   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3340   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3341   PetscFunctionReturn(0);
3342 }
3343 
3344 #undef __FUNCT__
3345 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3346 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3347 {
3348   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3349   IS                iscol=a->col,isrow=a->row;
3350   PetscErrorCode    ierr;
3351   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3352   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3353   PetscInt          i,nz,idx,idt,idc;
3354   const MatScalar   *aa=a->a,*v;
3355   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3356   const PetscScalar *b;
3357 
3358   PetscFunctionBegin;
3359   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3360   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3361   t  = a->solve_work;
3362 
3363   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3364   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3365 
3366   /* forward solve the lower triangular */
3367   idx    = 5*(*r++);
3368   t[0] = b[idx];   t[1] = b[1+idx];
3369   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3370   for (i=1; i<n; i++) {
3371     v     = aa + 25*ai[i];
3372     vi    = aj + ai[i];
3373     nz    = diag[i] - ai[i];
3374     idx   = 5*(*r++);
3375     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3376     s5  = b[4+idx];
3377     while (nz--) {
3378       idx   = 5*(*vi++);
3379       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3380       x4    = t[3+idx];x5 = t[4+idx];
3381       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3382       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3383       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3384       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3385       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3386       v += 25;
3387     }
3388     idx = 5*i;
3389     t[idx]   = s1;t[1+idx] = s2;
3390     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3391   }
3392   /* backward solve the upper triangular */
3393   for (i=n-1; i>=0; i--){
3394     v    = aa + 25*diag[i] + 25;
3395     vi   = aj + diag[i] + 1;
3396     nz   = ai[i+1] - diag[i] - 1;
3397     idt  = 5*i;
3398     s1 = t[idt];  s2 = t[1+idt];
3399     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3400     while (nz--) {
3401       idx   = 5*(*vi++);
3402       x1    = t[idx];   x2 = t[1+idx];
3403       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3404       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3405       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3406       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3407       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3408       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3409       v += 25;
3410     }
3411     idc = 5*(*c--);
3412     v   = aa + 25*diag[i];
3413     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3414                                  v[15]*s4+v[20]*s5;
3415     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3416                                  v[16]*s4+v[21]*s5;
3417     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3418                                  v[17]*s4+v[22]*s5;
3419     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3420                                  v[18]*s4+v[23]*s5;
3421     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3422                                  v[19]*s4+v[24]*s5;
3423   }
3424 
3425   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3426   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3427   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3428   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3429   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3430   PetscFunctionReturn(0);
3431 }
3432 
3433 #undef __FUNCT__
3434 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3435 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3436 {
3437   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3438   IS                iscol=a->col,isrow=a->row;
3439   PetscErrorCode    ierr;
3440   const PetscInt    *r,*c,*rout,*cout;
3441   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3442   PetscInt          i,nz,idx,idt,idc,m;
3443   const MatScalar   *aa=a->a,*v;
3444   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3445   const PetscScalar *b;
3446 
3447   PetscFunctionBegin;
3448   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3449   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3450   t  = a->solve_work;
3451 
3452   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3453   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3454 
3455   /* forward solve the lower triangular */
3456   idx    = 5*r[0];
3457   t[0] = b[idx];   t[1] = b[1+idx];
3458   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3459   for (i=1; i<n; i++) {
3460     v     = aa + 25*ai[i];
3461     vi    = aj + ai[i];
3462     nz    = ai[i+1] - ai[i];
3463     idx   = 5*r[i];
3464     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3465     s5  = b[4+idx];
3466     for(m=0;m<nz;m++){
3467       idx   = 5*vi[m];
3468       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3469       x4    = t[3+idx];x5 = t[4+idx];
3470       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3471       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3472       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3473       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3474       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3475       v += 25;
3476     }
3477     idx = 5*i;
3478     t[idx]   = s1;t[1+idx] = s2;
3479     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3480   }
3481   /* backward solve the upper triangular */
3482   for (i=n-1; i>=0; i--){
3483     v    = aa + 25*(adiag[i+1]+1);
3484     vi   = aj + adiag[i+1]+1;
3485     nz   = adiag[i] - adiag[i+1] - 1;
3486     idt  = 5*i;
3487     s1 = t[idt];  s2 = t[1+idt];
3488     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3489     for(m=0;m<nz;m++){
3490       idx   = 5*vi[m];
3491       x1    = t[idx];   x2 = t[1+idx];
3492       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3493       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3494       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3495       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3496       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3497       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3498       v += 25;
3499     }
3500     idc = 5*c[i];
3501     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3502                                  v[15]*s4+v[20]*s5;
3503     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3504                                  v[16]*s4+v[21]*s5;
3505     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3506                                  v[17]*s4+v[22]*s5;
3507     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3508                                  v[18]*s4+v[23]*s5;
3509     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3510                                  v[19]*s4+v[24]*s5;
3511   }
3512 
3513   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3514   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3515   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3516   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3517   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3518   PetscFunctionReturn(0);
3519 }
3520 
3521 #undef __FUNCT__
3522 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3523 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3524 {
3525   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3526   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3527   PetscInt          i,nz,idx,idt,jdx;
3528   PetscErrorCode    ierr;
3529   const MatScalar   *aa=a->a,*v;
3530   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3531   const PetscScalar *b;
3532 
3533   PetscFunctionBegin;
3534   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3535   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3536   /* forward solve the lower triangular */
3537   idx    = 0;
3538   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3539   for (i=1; i<n; i++) {
3540     v     =  aa + 25*ai[i];
3541     vi    =  aj + ai[i];
3542     nz    =  diag[i] - ai[i];
3543     idx   =  5*i;
3544     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3545     while (nz--) {
3546       jdx   = 5*(*vi++);
3547       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3548       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3549       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3550       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3551       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3552       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3553       v    += 25;
3554     }
3555     x[idx]   = s1;
3556     x[1+idx] = s2;
3557     x[2+idx] = s3;
3558     x[3+idx] = s4;
3559     x[4+idx] = s5;
3560   }
3561   /* backward solve the upper triangular */
3562   for (i=n-1; i>=0; i--){
3563     v    = aa + 25*diag[i] + 25;
3564     vi   = aj + diag[i] + 1;
3565     nz   = ai[i+1] - diag[i] - 1;
3566     idt  = 5*i;
3567     s1 = x[idt];  s2 = x[1+idt];
3568     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3569     while (nz--) {
3570       idx   = 5*(*vi++);
3571       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3572       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3573       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3574       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3575       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3576       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3577       v    += 25;
3578     }
3579     v        = aa + 25*diag[i];
3580     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3581     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3582     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3583     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3584     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3585   }
3586 
3587   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3588   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3589   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3590   PetscFunctionReturn(0);
3591 }
3592 
3593 #undef __FUNCT__
3594 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3595 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3596 {
3597   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3598   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3599   PetscInt          i,k,nz,idx,idt,jdx;
3600   PetscErrorCode    ierr;
3601   const MatScalar   *aa=a->a,*v;
3602   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3603   const PetscScalar *b;
3604 
3605   PetscFunctionBegin;
3606   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3607   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3608   /* forward solve the lower triangular */
3609   idx    = 0;
3610   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3611   for (i=1; i<n; i++) {
3612     v   = aa + 25*ai[i];
3613     vi  = aj + ai[i];
3614     nz  = ai[i+1] - ai[i];
3615     idx = 5*i;
3616     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3617     for(k=0;k<nz;k++) {
3618       jdx   = 5*vi[k];
3619       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3620       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3621       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3622       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3623       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3624       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3625       v    += 25;
3626     }
3627     x[idx]   = s1;
3628     x[1+idx] = s2;
3629     x[2+idx] = s3;
3630     x[3+idx] = s4;
3631     x[4+idx] = s5;
3632   }
3633 
3634   /* backward solve the upper triangular */
3635   for (i=n-1; i>=0; i--){
3636     v   = aa + 25*(adiag[i+1]+1);
3637     vi  = aj + adiag[i+1]+1;
3638     nz  = adiag[i] - adiag[i+1]-1;
3639     idt = 5*i;
3640     s1 = x[idt];  s2 = x[1+idt];
3641     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3642     for(k=0;k<nz;k++){
3643       idx   = 5*vi[k];
3644       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3645       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3646       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3647       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3648       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3649       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3650       v    += 25;
3651     }
3652     /* x = inv_diagonal*x */
3653     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3654     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3655     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3656     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3657     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3658   }
3659 
3660   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3661   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3662   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3663   PetscFunctionReturn(0);
3664 }
3665 
3666 #undef __FUNCT__
3667 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3668 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3669 {
3670   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3671   IS                iscol=a->col,isrow=a->row;
3672   PetscErrorCode    ierr;
3673   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3674   PetscInt          i,nz,idx,idt,idc;
3675   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3676   const MatScalar   *aa=a->a,*v;
3677   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3678   const PetscScalar *b;
3679 
3680   PetscFunctionBegin;
3681   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3682   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3683   t  = a->solve_work;
3684 
3685   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3686   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3687 
3688   /* forward solve the lower triangular */
3689   idx    = 4*(*r++);
3690   t[0] = b[idx];   t[1] = b[1+idx];
3691   t[2] = b[2+idx]; t[3] = b[3+idx];
3692   for (i=1; i<n; i++) {
3693     v     = aa + 16*ai[i];
3694     vi    = aj + ai[i];
3695     nz    = diag[i] - ai[i];
3696     idx   = 4*(*r++);
3697     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3698     while (nz--) {
3699       idx   = 4*(*vi++);
3700       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3701       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3702       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3703       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3704       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3705       v    += 16;
3706     }
3707     idx        = 4*i;
3708     t[idx]   = s1;t[1+idx] = s2;
3709     t[2+idx] = s3;t[3+idx] = s4;
3710   }
3711   /* backward solve the upper triangular */
3712   for (i=n-1; i>=0; i--){
3713     v    = aa + 16*diag[i] + 16;
3714     vi   = aj + diag[i] + 1;
3715     nz   = ai[i+1] - diag[i] - 1;
3716     idt  = 4*i;
3717     s1 = t[idt];  s2 = t[1+idt];
3718     s3 = t[2+idt];s4 = t[3+idt];
3719     while (nz--) {
3720       idx   = 4*(*vi++);
3721       x1    = t[idx];   x2 = t[1+idx];
3722       x3    = t[2+idx]; x4 = t[3+idx];
3723       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3724       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3725       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3726       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3727       v += 16;
3728     }
3729     idc      = 4*(*c--);
3730     v        = aa + 16*diag[i];
3731     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3732     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3733     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3734     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3735   }
3736 
3737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3739   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3741   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3742   PetscFunctionReturn(0);
3743 }
3744 
3745 #undef __FUNCT__
3746 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3747 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3748 {
3749   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3750   IS                iscol=a->col,isrow=a->row;
3751   PetscErrorCode    ierr;
3752   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3753   PetscInt          i,nz,idx,idt,idc,m;
3754   const PetscInt    *r,*c,*rout,*cout;
3755   const MatScalar   *aa=a->a,*v;
3756   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3757   const PetscScalar *b;
3758 
3759   PetscFunctionBegin;
3760   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3761   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3762   t  = a->solve_work;
3763 
3764   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3765   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3766 
3767   /* forward solve the lower triangular */
3768   idx    = 4*r[0];
3769   t[0] = b[idx];   t[1] = b[1+idx];
3770   t[2] = b[2+idx]; t[3] = b[3+idx];
3771   for (i=1; i<n; i++) {
3772     v     = aa + 16*ai[i];
3773     vi    = aj + ai[i];
3774     nz    = ai[i+1] - ai[i];
3775     idx   = 4*r[i];
3776     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3777     for(m=0;m<nz;m++){
3778       idx   = 4*vi[m];
3779       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3780       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3781       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3782       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3783       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3784       v    += 16;
3785     }
3786     idx        = 4*i;
3787     t[idx]   = s1;t[1+idx] = s2;
3788     t[2+idx] = s3;t[3+idx] = s4;
3789   }
3790   /* backward solve the upper triangular */
3791   for (i=n-1; i>=0; i--){
3792     v    = aa + 16*(adiag[i+1]+1);
3793     vi   = aj + adiag[i+1]+1;
3794     nz   = adiag[i] - adiag[i+1] - 1;
3795     idt  = 4*i;
3796     s1 = t[idt];  s2 = t[1+idt];
3797     s3 = t[2+idt];s4 = t[3+idt];
3798     for(m=0;m<nz;m++){
3799       idx   = 4*vi[m];
3800       x1    = t[idx];   x2 = t[1+idx];
3801       x3    = t[2+idx]; x4 = t[3+idx];
3802       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3803       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3804       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3805       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3806       v += 16;
3807     }
3808     idc      = 4*c[i];
3809     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3810     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3811     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3812     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3813   }
3814 
3815   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3816   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3817   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3818   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3819   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3820   PetscFunctionReturn(0);
3821 }
3822 
3823 #undef __FUNCT__
3824 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3825 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3826 {
3827   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3828   IS                iscol=a->col,isrow=a->row;
3829   PetscErrorCode    ierr;
3830   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3831   PetscInt          i,nz,idx,idt,idc;
3832   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3833   const MatScalar   *aa=a->a,*v;
3834   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3835   PetscScalar       *x;
3836   const PetscScalar *b;
3837 
3838   PetscFunctionBegin;
3839   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3840   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3841   t  = (MatScalar *)a->solve_work;
3842 
3843   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3844   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3845 
3846   /* forward solve the lower triangular */
3847   idx    = 4*(*r++);
3848   t[0] = (MatScalar)b[idx];
3849   t[1] = (MatScalar)b[1+idx];
3850   t[2] = (MatScalar)b[2+idx];
3851   t[3] = (MatScalar)b[3+idx];
3852   for (i=1; i<n; i++) {
3853     v     = aa + 16*ai[i];
3854     vi    = aj + ai[i];
3855     nz    = diag[i] - ai[i];
3856     idx   = 4*(*r++);
3857     s1 = (MatScalar)b[idx];
3858     s2 = (MatScalar)b[1+idx];
3859     s3 = (MatScalar)b[2+idx];
3860     s4 = (MatScalar)b[3+idx];
3861     while (nz--) {
3862       idx   = 4*(*vi++);
3863       x1  = t[idx];
3864       x2  = t[1+idx];
3865       x3  = t[2+idx];
3866       x4  = t[3+idx];
3867       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3868       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3869       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3870       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3871       v    += 16;
3872     }
3873     idx        = 4*i;
3874     t[idx]   = s1;
3875     t[1+idx] = s2;
3876     t[2+idx] = s3;
3877     t[3+idx] = s4;
3878   }
3879   /* backward solve the upper triangular */
3880   for (i=n-1; i>=0; i--){
3881     v    = aa + 16*diag[i] + 16;
3882     vi   = aj + diag[i] + 1;
3883     nz   = ai[i+1] - diag[i] - 1;
3884     idt  = 4*i;
3885     s1 = t[idt];
3886     s2 = t[1+idt];
3887     s3 = t[2+idt];
3888     s4 = t[3+idt];
3889     while (nz--) {
3890       idx   = 4*(*vi++);
3891       x1  = t[idx];
3892       x2  = t[1+idx];
3893       x3  = t[2+idx];
3894       x4  = t[3+idx];
3895       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3896       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3897       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3898       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3899       v += 16;
3900     }
3901     idc      = 4*(*c--);
3902     v        = aa + 16*diag[i];
3903     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3904     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3905     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3906     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3907     x[idc]   = (PetscScalar)t[idt];
3908     x[1+idc] = (PetscScalar)t[1+idt];
3909     x[2+idc] = (PetscScalar)t[2+idt];
3910     x[3+idc] = (PetscScalar)t[3+idt];
3911  }
3912 
3913   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3914   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3915   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3916   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3917   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3918   PetscFunctionReturn(0);
3919 }
3920 
3921 #if defined (PETSC_HAVE_SSE)
3922 
3923 #include PETSC_HAVE_SSE
3924 
3925 #undef __FUNCT__
3926 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3927 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3928 {
3929   /*
3930      Note: This code uses demotion of double
3931      to float when performing the mixed-mode computation.
3932      This may not be numerically reasonable for all applications.
3933   */
3934   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3935   IS             iscol=a->col,isrow=a->row;
3936   PetscErrorCode ierr;
3937   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3938   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3939   MatScalar      *aa=a->a,*v;
3940   PetscScalar    *x,*b,*t;
3941 
3942   /* Make space in temp stack for 16 Byte Aligned arrays */
3943   float           ssealignedspace[11],*tmps,*tmpx;
3944   unsigned long   offset;
3945 
3946   PetscFunctionBegin;
3947   SSE_SCOPE_BEGIN;
3948 
3949     offset = (unsigned long)ssealignedspace % 16;
3950     if (offset) offset = (16 - offset)/4;
3951     tmps = &ssealignedspace[offset];
3952     tmpx = &ssealignedspace[offset+4];
3953     PREFETCH_NTA(aa+16*ai[1]);
3954 
3955     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3956     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3957     t  = a->solve_work;
3958 
3959     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3960     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3961 
3962     /* forward solve the lower triangular */
3963     idx  = 4*(*r++);
3964     t[0] = b[idx];   t[1] = b[1+idx];
3965     t[2] = b[2+idx]; t[3] = b[3+idx];
3966     v    =  aa + 16*ai[1];
3967 
3968     for (i=1; i<n;) {
3969       PREFETCH_NTA(&v[8]);
3970       vi   =  aj      + ai[i];
3971       nz   =  diag[i] - ai[i];
3972       idx  =  4*(*r++);
3973 
3974       /* Demote sum from double to float */
3975       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3976       LOAD_PS(tmps,XMM7);
3977 
3978       while (nz--) {
3979         PREFETCH_NTA(&v[16]);
3980         idx = 4*(*vi++);
3981 
3982         /* Demote solution (so far) from double to float */
3983         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3984 
3985         /* 4x4 Matrix-Vector product with negative accumulation: */
3986         SSE_INLINE_BEGIN_2(tmpx,v)
3987           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3988 
3989           /* First Column */
3990           SSE_COPY_PS(XMM0,XMM6)
3991           SSE_SHUFFLE(XMM0,XMM0,0x00)
3992           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3993           SSE_SUB_PS(XMM7,XMM0)
3994 
3995           /* Second Column */
3996           SSE_COPY_PS(XMM1,XMM6)
3997           SSE_SHUFFLE(XMM1,XMM1,0x55)
3998           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3999           SSE_SUB_PS(XMM7,XMM1)
4000 
4001           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4002 
4003           /* Third Column */
4004           SSE_COPY_PS(XMM2,XMM6)
4005           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4006           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4007           SSE_SUB_PS(XMM7,XMM2)
4008 
4009           /* Fourth Column */
4010           SSE_COPY_PS(XMM3,XMM6)
4011           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4012           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4013           SSE_SUB_PS(XMM7,XMM3)
4014         SSE_INLINE_END_2
4015 
4016         v  += 16;
4017       }
4018       idx = 4*i;
4019       v   = aa + 16*ai[++i];
4020       PREFETCH_NTA(v);
4021       STORE_PS(tmps,XMM7);
4022 
4023       /* Promote result from float to double */
4024       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
4025     }
4026     /* backward solve the upper triangular */
4027     idt  = 4*(n-1);
4028     ai16 = 16*diag[n-1];
4029     v    = aa + ai16 + 16;
4030     for (i=n-1; i>=0;){
4031       PREFETCH_NTA(&v[8]);
4032       vi = aj + diag[i] + 1;
4033       nz = ai[i+1] - diag[i] - 1;
4034 
4035       /* Demote accumulator from double to float */
4036       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
4037       LOAD_PS(tmps,XMM7);
4038 
4039       while (nz--) {
4040         PREFETCH_NTA(&v[16]);
4041         idx = 4*(*vi++);
4042 
4043         /* Demote solution (so far) from double to float */
4044         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
4045 
4046         /* 4x4 Matrix-Vector Product with negative accumulation: */
4047         SSE_INLINE_BEGIN_2(tmpx,v)
4048           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4049 
4050           /* First Column */
4051           SSE_COPY_PS(XMM0,XMM6)
4052           SSE_SHUFFLE(XMM0,XMM0,0x00)
4053           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4054           SSE_SUB_PS(XMM7,XMM0)
4055 
4056           /* Second Column */
4057           SSE_COPY_PS(XMM1,XMM6)
4058           SSE_SHUFFLE(XMM1,XMM1,0x55)
4059           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4060           SSE_SUB_PS(XMM7,XMM1)
4061 
4062           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4063 
4064           /* Third Column */
4065           SSE_COPY_PS(XMM2,XMM6)
4066           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4067           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4068           SSE_SUB_PS(XMM7,XMM2)
4069 
4070           /* Fourth Column */
4071           SSE_COPY_PS(XMM3,XMM6)
4072           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4073           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4074           SSE_SUB_PS(XMM7,XMM3)
4075         SSE_INLINE_END_2
4076         v  += 16;
4077       }
4078       v    = aa + ai16;
4079       ai16 = 16*diag[--i];
4080       PREFETCH_NTA(aa+ai16+16);
4081       /*
4082          Scale the result by the diagonal 4x4 block,
4083          which was inverted as part of the factorization
4084       */
4085       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
4086         /* First Column */
4087         SSE_COPY_PS(XMM0,XMM7)
4088         SSE_SHUFFLE(XMM0,XMM0,0x00)
4089         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4090 
4091         /* Second Column */
4092         SSE_COPY_PS(XMM1,XMM7)
4093         SSE_SHUFFLE(XMM1,XMM1,0x55)
4094         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4095         SSE_ADD_PS(XMM0,XMM1)
4096 
4097         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4098 
4099         /* Third Column */
4100         SSE_COPY_PS(XMM2,XMM7)
4101         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4102         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4103         SSE_ADD_PS(XMM0,XMM2)
4104 
4105         /* Fourth Column */
4106         SSE_COPY_PS(XMM3,XMM7)
4107         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4108         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4109         SSE_ADD_PS(XMM0,XMM3)
4110 
4111         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4112       SSE_INLINE_END_3
4113 
4114       /* Promote solution from float to double */
4115       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4116 
4117       /* Apply reordering to t and stream into x.    */
4118       /* This way, x doesn't pollute the cache.      */
4119       /* Be careful with size: 2 doubles = 4 floats! */
4120       idc  = 4*(*c--);
4121       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4122         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4123         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4124         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4125         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4126         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4127         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4128       SSE_INLINE_END_2
4129       v    = aa + ai16 + 16;
4130       idt -= 4;
4131     }
4132 
4133     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4134     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4135     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4136     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4137     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4138   SSE_SCOPE_END;
4139   PetscFunctionReturn(0);
4140 }
4141 
4142 #endif
4143 
4144 
4145 /*
4146       Special case where the matrix was ILU(0) factored in the natural
4147    ordering. This eliminates the need for the column and row permutation.
4148 */
4149 #undef __FUNCT__
4150 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
4151 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4152 {
4153   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4154   PetscInt          n=a->mbs;
4155   const PetscInt    *ai=a->i,*aj=a->j;
4156   PetscErrorCode    ierr;
4157   const PetscInt    *diag = a->diag;
4158   const MatScalar   *aa=a->a;
4159   PetscScalar       *x;
4160   const PetscScalar *b;
4161 
4162   PetscFunctionBegin;
4163   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4164   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4165 
4166 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4167   {
4168     static PetscScalar w[2000]; /* very BAD need to fix */
4169     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4170   }
4171 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4172   {
4173     static PetscScalar w[2000]; /* very BAD need to fix */
4174     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4175   }
4176 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4177   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4178 #else
4179   {
4180     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4181     const MatScalar *v;
4182     PetscInt        jdx,idt,idx,nz,i,ai16;
4183     const PetscInt  *vi;
4184 
4185   /* forward solve the lower triangular */
4186   idx    = 0;
4187   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4188   for (i=1; i<n; i++) {
4189     v     =  aa      + 16*ai[i];
4190     vi    =  aj      + ai[i];
4191     nz    =  diag[i] - ai[i];
4192     idx   +=  4;
4193     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4194     while (nz--) {
4195       jdx   = 4*(*vi++);
4196       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4197       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4198       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4199       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4200       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4201       v    += 16;
4202     }
4203     x[idx]   = s1;
4204     x[1+idx] = s2;
4205     x[2+idx] = s3;
4206     x[3+idx] = s4;
4207   }
4208   /* backward solve the upper triangular */
4209   idt = 4*(n-1);
4210   for (i=n-1; i>=0; i--){
4211     ai16 = 16*diag[i];
4212     v    = aa + ai16 + 16;
4213     vi   = aj + diag[i] + 1;
4214     nz   = ai[i+1] - diag[i] - 1;
4215     s1 = x[idt];  s2 = x[1+idt];
4216     s3 = x[2+idt];s4 = x[3+idt];
4217     while (nz--) {
4218       idx   = 4*(*vi++);
4219       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4220       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4221       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4222       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4223       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4224       v    += 16;
4225     }
4226     v        = aa + ai16;
4227     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4228     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4229     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4230     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4231     idt -= 4;
4232   }
4233   }
4234 #endif
4235 
4236   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4237   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4238   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4239   PetscFunctionReturn(0);
4240 }
4241 
4242 #undef __FUNCT__
4243 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4244 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4245 {
4246     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4247     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4248     PetscInt          i,k,nz,idx,jdx,idt;
4249     PetscErrorCode    ierr;
4250     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4251     const MatScalar   *aa=a->a,*v;
4252     PetscScalar       *x;
4253     const PetscScalar *b;
4254     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4255 
4256     PetscFunctionBegin;
4257     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4258     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4259     /* forward solve the lower triangular */
4260     idx    = 0;
4261     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4262     for (i=1; i<n; i++) {
4263        v    = aa + bs2*ai[i];
4264        vi   = aj + ai[i];
4265        nz   = ai[i+1] - ai[i];
4266       idx   = bs*i;
4267        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4268       for(k=0;k<nz;k++) {
4269           jdx   = bs*vi[k];
4270           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4271           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4272           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4273           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4274 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4275 
4276           v   +=  bs2;
4277         }
4278 
4279        x[idx]   = s1;
4280        x[1+idx] = s2;
4281        x[2+idx] = s3;
4282        x[3+idx] = s4;
4283     }
4284 
4285    /* backward solve the upper triangular */
4286   for (i=n-1; i>=0; i--){
4287     v   = aa + bs2*(adiag[i+1]+1);
4288      vi  = aj + adiag[i+1]+1;
4289      nz  = adiag[i] - adiag[i+1]-1;
4290      idt = bs*i;
4291      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4292 
4293     for(k=0;k<nz;k++){
4294       idx   = bs*vi[k];
4295        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4296        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4297        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4298        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4299        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4300 
4301         v   +=  bs2;
4302     }
4303     /* x = inv_diagonal*x */
4304    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4305    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4306    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4307    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4308 
4309   }
4310 
4311   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4312   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4313   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4314   PetscFunctionReturn(0);
4315 }
4316 
4317 #undef __FUNCT__
4318 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4319 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4320 {
4321   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4322   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4323   PetscErrorCode    ierr;
4324   const MatScalar   *aa=a->a;
4325   const PetscScalar *b;
4326   PetscScalar       *x;
4327 
4328   PetscFunctionBegin;
4329   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4330   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4331 
4332   {
4333     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4334     const MatScalar  *v;
4335     MatScalar        *t=(MatScalar *)x;
4336     PetscInt         jdx,idt,idx,nz,i,ai16;
4337     const PetscInt   *vi;
4338 
4339     /* forward solve the lower triangular */
4340     idx  = 0;
4341     t[0] = (MatScalar)b[0];
4342     t[1] = (MatScalar)b[1];
4343     t[2] = (MatScalar)b[2];
4344     t[3] = (MatScalar)b[3];
4345     for (i=1; i<n; i++) {
4346       v     =  aa      + 16*ai[i];
4347       vi    =  aj      + ai[i];
4348       nz    =  diag[i] - ai[i];
4349       idx   +=  4;
4350       s1 = (MatScalar)b[idx];
4351       s2 = (MatScalar)b[1+idx];
4352       s3 = (MatScalar)b[2+idx];
4353       s4 = (MatScalar)b[3+idx];
4354       while (nz--) {
4355         jdx = 4*(*vi++);
4356         x1  = t[jdx];
4357         x2  = t[1+jdx];
4358         x3  = t[2+jdx];
4359         x4  = t[3+jdx];
4360         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4361         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4362         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4363         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4364         v    += 16;
4365       }
4366       t[idx]   = s1;
4367       t[1+idx] = s2;
4368       t[2+idx] = s3;
4369       t[3+idx] = s4;
4370     }
4371     /* backward solve the upper triangular */
4372     idt = 4*(n-1);
4373     for (i=n-1; i>=0; i--){
4374       ai16 = 16*diag[i];
4375       v    = aa + ai16 + 16;
4376       vi   = aj + diag[i] + 1;
4377       nz   = ai[i+1] - diag[i] - 1;
4378       s1   = t[idt];
4379       s2   = t[1+idt];
4380       s3   = t[2+idt];
4381       s4   = t[3+idt];
4382       while (nz--) {
4383         idx = 4*(*vi++);
4384         x1  = (MatScalar)x[idx];
4385         x2  = (MatScalar)x[1+idx];
4386         x3  = (MatScalar)x[2+idx];
4387         x4  = (MatScalar)x[3+idx];
4388         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4389         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4390         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4391         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4392         v    += 16;
4393       }
4394       v        = aa + ai16;
4395       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4396       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4397       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4398       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4399       idt -= 4;
4400     }
4401   }
4402 
4403   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4404   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4405   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4406   PetscFunctionReturn(0);
4407 }
4408 
4409 #if defined (PETSC_HAVE_SSE)
4410 
4411 #include PETSC_HAVE_SSE
4412 #undef __FUNCT__
4413 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4414 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4415 {
4416   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4417   unsigned short *aj=(unsigned short *)a->j;
4418   PetscErrorCode ierr;
4419   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4420   MatScalar      *aa=a->a;
4421   PetscScalar    *x,*b;
4422 
4423   PetscFunctionBegin;
4424   SSE_SCOPE_BEGIN;
4425   /*
4426      Note: This code currently uses demotion of double
4427      to float when performing the mixed-mode computation.
4428      This may not be numerically reasonable for all applications.
4429   */
4430   PREFETCH_NTA(aa+16*ai[1]);
4431 
4432   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4433   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4434   {
4435     /* x will first be computed in single precision then promoted inplace to double */
4436     MatScalar      *v,*t=(MatScalar *)x;
4437     int            nz,i,idt,ai16;
4438     unsigned int   jdx,idx;
4439     unsigned short *vi;
4440     /* Forward solve the lower triangular factor. */
4441 
4442     /* First block is the identity. */
4443     idx  = 0;
4444     CONVERT_DOUBLE4_FLOAT4(t,b);
4445     v    =  aa + 16*((unsigned int)ai[1]);
4446 
4447     for (i=1; i<n;) {
4448       PREFETCH_NTA(&v[8]);
4449       vi   =  aj      + ai[i];
4450       nz   =  diag[i] - ai[i];
4451       idx +=  4;
4452 
4453       /* Demote RHS from double to float. */
4454       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4455       LOAD_PS(&t[idx],XMM7);
4456 
4457       while (nz--) {
4458         PREFETCH_NTA(&v[16]);
4459         jdx = 4*((unsigned int)(*vi++));
4460 
4461         /* 4x4 Matrix-Vector product with negative accumulation: */
4462         SSE_INLINE_BEGIN_2(&t[jdx],v)
4463           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4464 
4465           /* First Column */
4466           SSE_COPY_PS(XMM0,XMM6)
4467           SSE_SHUFFLE(XMM0,XMM0,0x00)
4468           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4469           SSE_SUB_PS(XMM7,XMM0)
4470 
4471           /* Second Column */
4472           SSE_COPY_PS(XMM1,XMM6)
4473           SSE_SHUFFLE(XMM1,XMM1,0x55)
4474           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4475           SSE_SUB_PS(XMM7,XMM1)
4476 
4477           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4478 
4479           /* Third Column */
4480           SSE_COPY_PS(XMM2,XMM6)
4481           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4482           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4483           SSE_SUB_PS(XMM7,XMM2)
4484 
4485           /* Fourth Column */
4486           SSE_COPY_PS(XMM3,XMM6)
4487           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4488           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4489           SSE_SUB_PS(XMM7,XMM3)
4490         SSE_INLINE_END_2
4491 
4492         v  += 16;
4493       }
4494       v    =  aa + 16*ai[++i];
4495       PREFETCH_NTA(v);
4496       STORE_PS(&t[idx],XMM7);
4497     }
4498 
4499     /* Backward solve the upper triangular factor.*/
4500 
4501     idt  = 4*(n-1);
4502     ai16 = 16*diag[n-1];
4503     v    = aa + ai16 + 16;
4504     for (i=n-1; i>=0;){
4505       PREFETCH_NTA(&v[8]);
4506       vi = aj + diag[i] + 1;
4507       nz = ai[i+1] - diag[i] - 1;
4508 
4509       LOAD_PS(&t[idt],XMM7);
4510 
4511       while (nz--) {
4512         PREFETCH_NTA(&v[16]);
4513         idx = 4*((unsigned int)(*vi++));
4514 
4515         /* 4x4 Matrix-Vector Product with negative accumulation: */
4516         SSE_INLINE_BEGIN_2(&t[idx],v)
4517           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4518 
4519           /* First Column */
4520           SSE_COPY_PS(XMM0,XMM6)
4521           SSE_SHUFFLE(XMM0,XMM0,0x00)
4522           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4523           SSE_SUB_PS(XMM7,XMM0)
4524 
4525           /* Second Column */
4526           SSE_COPY_PS(XMM1,XMM6)
4527           SSE_SHUFFLE(XMM1,XMM1,0x55)
4528           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4529           SSE_SUB_PS(XMM7,XMM1)
4530 
4531           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4532 
4533           /* Third Column */
4534           SSE_COPY_PS(XMM2,XMM6)
4535           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4536           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4537           SSE_SUB_PS(XMM7,XMM2)
4538 
4539           /* Fourth Column */
4540           SSE_COPY_PS(XMM3,XMM6)
4541           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4542           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4543           SSE_SUB_PS(XMM7,XMM3)
4544         SSE_INLINE_END_2
4545         v  += 16;
4546       }
4547       v    = aa + ai16;
4548       ai16 = 16*diag[--i];
4549       PREFETCH_NTA(aa+ai16+16);
4550       /*
4551          Scale the result by the diagonal 4x4 block,
4552          which was inverted as part of the factorization
4553       */
4554       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4555         /* First Column */
4556         SSE_COPY_PS(XMM0,XMM7)
4557         SSE_SHUFFLE(XMM0,XMM0,0x00)
4558         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4559 
4560         /* Second Column */
4561         SSE_COPY_PS(XMM1,XMM7)
4562         SSE_SHUFFLE(XMM1,XMM1,0x55)
4563         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4564         SSE_ADD_PS(XMM0,XMM1)
4565 
4566         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4567 
4568         /* Third Column */
4569         SSE_COPY_PS(XMM2,XMM7)
4570         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4571         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4572         SSE_ADD_PS(XMM0,XMM2)
4573 
4574         /* Fourth Column */
4575         SSE_COPY_PS(XMM3,XMM7)
4576         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4577         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4578         SSE_ADD_PS(XMM0,XMM3)
4579 
4580         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4581       SSE_INLINE_END_3
4582 
4583       v    = aa + ai16 + 16;
4584       idt -= 4;
4585     }
4586 
4587     /* Convert t from single precision back to double precision (inplace)*/
4588     idt = 4*(n-1);
4589     for (i=n-1;i>=0;i--) {
4590       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4591       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4592       PetscScalar *xtemp=&x[idt];
4593       MatScalar   *ttemp=&t[idt];
4594       xtemp[3] = (PetscScalar)ttemp[3];
4595       xtemp[2] = (PetscScalar)ttemp[2];
4596       xtemp[1] = (PetscScalar)ttemp[1];
4597       xtemp[0] = (PetscScalar)ttemp[0];
4598       idt -= 4;
4599     }
4600 
4601   } /* End of artificial scope. */
4602   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4603   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4604   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4605   SSE_SCOPE_END;
4606   PetscFunctionReturn(0);
4607 }
4608 
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - forward and backward block
   sweeps over the stored factor of a SeqBAIJ matrix with 4x4 blocks, without
   row/column permutations, using the SSE macro kernels.

   A  - factored matrix (a->i, a->j, a->diag and a->a are read)
   bb - right-hand side
   xx - solution; its storage is also used as the single precision work array

   The right-hand side is converted four values at a time into t, which
   aliases the array of xx viewed as MatScalar.  Both sweeps run on t and the
   result is widened back to PetscScalar in place at the end.
   NOTE(review): this presumably requires MatScalar to be a 4-byte float and
   PetscScalar an 8-byte double -- confirm against the build configuration.
*/
4609 #undef __FUNCT__
4610 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4611 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4612 {
4613   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4614   int            *aj=a->j;
4615   PetscErrorCode ierr;
4616   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4617   MatScalar      *aa=a->a;
4618   PetscScalar    *x,*b;
4619 
4620   PetscFunctionBegin;
4621   SSE_SCOPE_BEGIN;
4622   /*
4623      Note: This code currently uses demotion of double
4624      to float when performing the mixed-mode computation.
4625      This may not be numerically reasonable for all applications.
4626   */
  /* NOTE(review): ai[1] here and diag[n-1] further down are read without
     checking that n > 0 -- confirm callers never pass an empty matrix */
4627   PREFETCH_NTA(aa+16*ai[1]);
4628 
  /* NOTE(review): b only appears as the source argument of
     CONVERT_DOUBLE4_FLOAT4 below; the other solves in this file obtain bb with
     VecGetArrayRead -- confirm whether that would be appropriate here too */
4629   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4630   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4631   {
4632     /* x will first be computed in single precision then promoted inplace to double */
4633     MatScalar *v,*t=(MatScalar *)x;
4634     int       nz,i,idt,ai16;
4635     int       jdx,idx;
4636     int       *vi;
4637     /* Forward solve the lower triangular factor. */
4638 
4639     /* First block is the identity. */
4640     idx  = 0;
4641     CONVERT_DOUBLE4_FLOAT4(t,b);
4642     v    =  aa + 16*ai[1];
4643 
    /* The loop header has no increment: i is advanced by the ++i in the
       reload of v at the bottom of the body */
4644     for (i=1; i<n;) {
4645       PREFETCH_NTA(&v[8]);
      /* blocks ai[i] .. diag[i]-1 of block row i take part in the forward sweep */
4646       vi   =  aj      + ai[i];
4647       nz   =  diag[i] - ai[i];
4648       idx +=  4;
4649 
4650       /* Demote RHS from double to float. */
4651       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4652       LOAD_PS(&t[idx],XMM7);
4653 
4654       while (nz--) {
4655         PREFETCH_NTA(&v[16]);
4656         jdx = 4*(*vi++);
4657 /*          jdx = *vi++; */
4658 
4659         /* 4x4 Matrix-Vector product with negative accumulation: */
4660         SSE_INLINE_BEGIN_2(&t[jdx],v)
4661           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4662 
4663           /* First Column */
4664           SSE_COPY_PS(XMM0,XMM6)
4665           SSE_SHUFFLE(XMM0,XMM0,0x00)
4666           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4667           SSE_SUB_PS(XMM7,XMM0)
4668 
4669           /* Second Column */
4670           SSE_COPY_PS(XMM1,XMM6)
4671           SSE_SHUFFLE(XMM1,XMM1,0x55)
4672           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4673           SSE_SUB_PS(XMM7,XMM1)
4674 
4675           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4676 
4677           /* Third Column */
4678           SSE_COPY_PS(XMM2,XMM6)
4679           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4680           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4681           SSE_SUB_PS(XMM7,XMM2)
4682 
4683           /* Fourth Column */
4684           SSE_COPY_PS(XMM3,XMM6)
4685           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4686           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4687           SSE_SUB_PS(XMM7,XMM3)
4688         SSE_INLINE_END_2
4689 
4690         v  += 16;
4691       }
4692       v    =  aa + 16*ai[++i];
4693       PREFETCH_NTA(v);
4694       STORE_PS(&t[idx],XMM7);
4695     }
4696 
4697     /* Backward solve the upper triangular factor.*/
4698 
4699     idt  = 4*(n-1);
4700     ai16 = 16*diag[n-1];
4701     v    = aa + ai16 + 16;
    /* As in the forward sweep, i is stepped inside the body (the --i below) */
4702     for (i=n-1; i>=0;){
4703       PREFETCH_NTA(&v[8]);
      /* blocks diag[i]+1 .. ai[i+1]-1 of block row i take part in the backward sweep */
4704       vi = aj + diag[i] + 1;
4705       nz = ai[i+1] - diag[i] - 1;
4706 
4707       LOAD_PS(&t[idt],XMM7);
4708 
4709       while (nz--) {
4710         PREFETCH_NTA(&v[16]);
4711         idx = 4*(*vi++);
4712 /*          idx = *vi++; */
4713 
4714         /* 4x4 Matrix-Vector Product with negative accumulation: */
4715         SSE_INLINE_BEGIN_2(&t[idx],v)
4716           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4717 
4718           /* First Column */
4719           SSE_COPY_PS(XMM0,XMM6)
4720           SSE_SHUFFLE(XMM0,XMM0,0x00)
4721           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4722           SSE_SUB_PS(XMM7,XMM0)
4723 
4724           /* Second Column */
4725           SSE_COPY_PS(XMM1,XMM6)
4726           SSE_SHUFFLE(XMM1,XMM1,0x55)
4727           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4728           SSE_SUB_PS(XMM7,XMM1)
4729 
4730           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4731 
4732           /* Third Column */
4733           SSE_COPY_PS(XMM2,XMM6)
4734           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4735           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4736           SSE_SUB_PS(XMM7,XMM2)
4737 
4738           /* Fourth Column */
4739           SSE_COPY_PS(XMM3,XMM6)
4740           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4741           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4742           SSE_SUB_PS(XMM7,XMM3)
4743         SSE_INLINE_END_2
4744         v  += 16;
4745       }
      /* ai16 still holds 16*diag[i] for the current row, so v now addresses
         this row's diagonal block */
4746       v    = aa + ai16;
      /* NOTE(review): when i is 0 the pre-decrement makes i == -1 and diag[-1]
         is read.  The resulting ai16 only feeds the prefetch hints and the
         reload of v after the scaling block, and v is not used once the loop
         exits -- TODO confirm that diag[-1] is always readable */
4747       ai16 = 16*diag[--i];
4748       PREFETCH_NTA(aa+ai16+16);
4749       /*
4750          Scale the result by the diagonal 4x4 block,
4751          which was inverted as part of the factorization
4752       */
4753       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4754         /* First Column */
4755         SSE_COPY_PS(XMM0,XMM7)
4756         SSE_SHUFFLE(XMM0,XMM0,0x00)
4757         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4758 
4759         /* Second Column */
4760         SSE_COPY_PS(XMM1,XMM7)
4761         SSE_SHUFFLE(XMM1,XMM1,0x55)
4762         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4763         SSE_ADD_PS(XMM0,XMM1)
4764 
4765         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4766 
4767         /* Third Column */
4768         SSE_COPY_PS(XMM2,XMM7)
4769         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4770         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4771         SSE_ADD_PS(XMM0,XMM2)
4772 
4773         /* Fourth Column */
4774         SSE_COPY_PS(XMM3,XMM7)
4775         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4776         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4777         SSE_ADD_PS(XMM0,XMM3)
4778 
4779         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4780       SSE_INLINE_END_3
4781 
4782       v    = aa + ai16 + 16;
4783       idt -= 4;
4784     }
4785 
4786     /* Convert t from single precision back to double precision (inplace)*/
    /* t and x share storage, so the widening walks from the last entry to the
       first (and from element 3 down to 0 within each group of four);
       presumably this keeps each wider store from overwriting narrower values
       that have not been read yet */
4787     idt = 4*(n-1);
4788     for (i=n-1;i>=0;i--) {
4789       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4790       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4791       PetscScalar *xtemp=&x[idt];
4792       MatScalar   *ttemp=&t[idt];
4793       xtemp[3] = (PetscScalar)ttemp[3];
4794       xtemp[2] = (PetscScalar)ttemp[2];
4795       xtemp[1] = (PetscScalar)ttemp[1];
4796       xtemp[0] = (PetscScalar)ttemp[0];
4797       idt -= 4;
4798     }
4799 
4800   } /* End of artificial scope. */
4801   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4802   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4803   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4804   SSE_SCOPE_END;
4805   PetscFunctionReturn(0);
4806 }
4807 
4808 #endif
4809 
4810 #undef __FUNCT__
4811 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4812 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4813 {
4814   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4815   IS                iscol=a->col,isrow=a->row;
4816   PetscErrorCode    ierr;
4817   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4818   PetscInt          i,nz,idx,idt,idc;
4819   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4820   const MatScalar   *aa=a->a,*v;
4821   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4822   const PetscScalar *b;
4823 
4824   PetscFunctionBegin;
4825   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4826   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4827   t  = a->solve_work;
4828 
4829   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4830   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4831 
4832   /* forward solve the lower triangular */
4833   idx    = 3*(*r++);
4834   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4835   for (i=1; i<n; i++) {
4836     v     = aa + 9*ai[i];
4837     vi    = aj + ai[i];
4838     nz    = diag[i] - ai[i];
4839     idx   = 3*(*r++);
4840     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4841     while (nz--) {
4842       idx   = 3*(*vi++);
4843       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4844       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4845       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4846       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4847       v += 9;
4848     }
4849     idx = 3*i;
4850     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4851   }
4852   /* backward solve the upper triangular */
4853   for (i=n-1; i>=0; i--){
4854     v    = aa + 9*diag[i] + 9;
4855     vi   = aj + diag[i] + 1;
4856     nz   = ai[i+1] - diag[i] - 1;
4857     idt  = 3*i;
4858     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4859     while (nz--) {
4860       idx   = 3*(*vi++);
4861       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4862       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4863       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4864       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4865       v += 9;
4866     }
4867     idc = 3*(*c--);
4868     v   = aa + 9*diag[i];
4869     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4870     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4871     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4872   }
4873   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4874   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4875   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4876   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4877   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4878   PetscFunctionReturn(0);
4879 }
4880 
4881 #undef __FUNCT__
4882 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4883 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4884 {
4885   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4886   IS                iscol=a->col,isrow=a->row;
4887   PetscErrorCode    ierr;
4888   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4889   PetscInt          i,nz,idx,idt,idc,m;
4890   const PetscInt    *r,*c,*rout,*cout;
4891   const MatScalar   *aa=a->a,*v;
4892   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4893   const PetscScalar *b;
4894 
4895   PetscFunctionBegin;
4896   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4897   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4898   t  = a->solve_work;
4899 
4900   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4901   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4902 
4903   /* forward solve the lower triangular */
4904   idx    = 3*r[0];
4905   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4906   for (i=1; i<n; i++) {
4907     v     = aa + 9*ai[i];
4908     vi    = aj + ai[i];
4909     nz    = ai[i+1] - ai[i];
4910     idx   = 3*r[i];
4911     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4912     for(m=0;m<nz;m++){
4913       idx   = 3*vi[m];
4914       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4915       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4916       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4917       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4918       v += 9;
4919     }
4920     idx = 3*i;
4921     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4922   }
4923   /* backward solve the upper triangular */
4924   for (i=n-1; i>=0; i--){
4925     v    = aa + 9*(adiag[i+1]+1);
4926     vi   = aj + adiag[i+1]+1;
4927     nz   = adiag[i] - adiag[i+1] - 1;
4928     idt  = 3*i;
4929     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4930     for(m=0;m<nz;m++){
4931       idx   = 3*vi[m];
4932       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4933       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4934       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4935       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4936       v += 9;
4937     }
4938     idc = 3*c[i];
4939     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4940     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4941     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4942   }
4943   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4944   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4945   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4946   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4947   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4948   PetscFunctionReturn(0);
4949 }
4950 
4951 /*
4952       Special case where the matrix was ILU(0) factored in the natural
4953    ordering. This eliminates the need for the column and row permutation.
4954 */
4955 #undef __FUNCT__
4956 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4957 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4958 {
4959   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4960   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4961   PetscErrorCode    ierr;
4962   const PetscInt    *diag = a->diag,*vi;
4963   const MatScalar   *aa=a->a,*v;
4964   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4965   const PetscScalar *b;
4966   PetscInt          jdx,idt,idx,nz,i;
4967 
4968   PetscFunctionBegin;
4969   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4970   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4971 
4972   /* forward solve the lower triangular */
4973   idx    = 0;
4974   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4975   for (i=1; i<n; i++) {
4976     v     =  aa      + 9*ai[i];
4977     vi    =  aj      + ai[i];
4978     nz    =  diag[i] - ai[i];
4979     idx   +=  3;
4980     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4981     while (nz--) {
4982       jdx   = 3*(*vi++);
4983       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4984       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4985       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4986       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4987       v    += 9;
4988     }
4989     x[idx]   = s1;
4990     x[1+idx] = s2;
4991     x[2+idx] = s3;
4992   }
4993   /* backward solve the upper triangular */
4994   for (i=n-1; i>=0; i--){
4995     v    = aa + 9*diag[i] + 9;
4996     vi   = aj + diag[i] + 1;
4997     nz   = ai[i+1] - diag[i] - 1;
4998     idt  = 3*i;
4999     s1 = x[idt];  s2 = x[1+idt];
5000     s3 = x[2+idt];
5001     while (nz--) {
5002       idx   = 3*(*vi++);
5003       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
5004       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5005       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5006       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5007       v    += 9;
5008     }
5009     v        = aa +  9*diag[i];
5010     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5011     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5012     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5013   }
5014 
5015   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5016   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5017   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
5018   PetscFunctionReturn(0);
5019 }
5020 
5021 #undef __FUNCT__
5022 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
5023 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
5024 {
5025     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5026     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5027     PetscErrorCode    ierr;
5028     PetscInt          i,k,nz,idx,jdx,idt;
5029     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
5030     const MatScalar   *aa=a->a,*v;
5031     PetscScalar       *x;
5032     const PetscScalar *b;
5033     PetscScalar        s1,s2,s3,x1,x2,x3;
5034 
5035     PetscFunctionBegin;
5036     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5037     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5038     /* forward solve the lower triangular */
5039     idx    = 0;
5040     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5041     for (i=1; i<n; i++) {
5042        v    = aa + bs2*ai[i];
5043        vi   = aj + ai[i];
5044        nz   = ai[i+1] - ai[i];
5045       idx   = bs*i;
5046        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5047       for(k=0;k<nz;k++){
5048          jdx   = bs*vi[k];
5049           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5050           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5051           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5052           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5053 
5054           v   +=  bs2;
5055         }
5056 
5057        x[idx]   = s1;
5058        x[1+idx] = s2;
5059        x[2+idx] = s3;
5060     }
5061 
5062    /* backward solve the upper triangular */
5063   for (i=n-1; i>=0; i--){
5064     v   = aa + bs2*(adiag[i+1]+1);
5065      vi  = aj + adiag[i+1]+1;
5066      nz  = adiag[i] - adiag[i+1]-1;
5067      idt = bs*i;
5068      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5069 
5070      for(k=0;k<nz;k++){
5071        idx   = bs*vi[k];
5072        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5073        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5074        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5075        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5076 
5077         v   +=  bs2;
5078     }
5079     /* x = inv_diagonal*x */
5080    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5081    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5082    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5083 
5084   }
5085 
5086   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5087   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5088   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5089   PetscFunctionReturn(0);
5090 }
5091 
5092 #undef __FUNCT__
5093 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
5094 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5095 {
5096   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5097   IS                iscol=a->col,isrow=a->row;
5098   PetscErrorCode    ierr;
5099   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5100   PetscInt          i,nz,idx,idt,idc;
5101   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5102   const MatScalar   *aa=a->a,*v;
5103   PetscScalar       *x,s1,s2,x1,x2,*t;
5104   const PetscScalar *b;
5105 
5106   PetscFunctionBegin;
5107   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5108   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5109   t  = a->solve_work;
5110 
5111   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5112   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5113 
5114   /* forward solve the lower triangular */
5115   idx    = 2*(*r++);
5116   t[0] = b[idx]; t[1] = b[1+idx];
5117   for (i=1; i<n; i++) {
5118     v     = aa + 4*ai[i];
5119     vi    = aj + ai[i];
5120     nz    = diag[i] - ai[i];
5121     idx   = 2*(*r++);
5122     s1  = b[idx]; s2 = b[1+idx];
5123     while (nz--) {
5124       idx   = 2*(*vi++);
5125       x1    = t[idx]; x2 = t[1+idx];
5126       s1 -= v[0]*x1 + v[2]*x2;
5127       s2 -= v[1]*x1 + v[3]*x2;
5128       v += 4;
5129     }
5130     idx = 2*i;
5131     t[idx] = s1; t[1+idx] = s2;
5132   }
5133   /* backward solve the upper triangular */
5134   for (i=n-1; i>=0; i--){
5135     v    = aa + 4*diag[i] + 4;
5136     vi   = aj + diag[i] + 1;
5137     nz   = ai[i+1] - diag[i] - 1;
5138     idt  = 2*i;
5139     s1 = t[idt]; s2 = t[1+idt];
5140     while (nz--) {
5141       idx   = 2*(*vi++);
5142       x1    = t[idx]; x2 = t[1+idx];
5143       s1 -= v[0]*x1 + v[2]*x2;
5144       s2 -= v[1]*x1 + v[3]*x2;
5145       v += 4;
5146     }
5147     idc = 2*(*c--);
5148     v   = aa + 4*diag[i];
5149     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5150     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5151   }
5152   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5153   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5154   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5155   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5156   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5157   PetscFunctionReturn(0);
5158 }
5159 
5160 #undef __FUNCT__
5161 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
5162 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5163 {
5164   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5165   IS                iscol=a->col,isrow=a->row;
5166   PetscErrorCode    ierr;
5167   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5168   PetscInt          i,nz,idx,jdx,idt,idc,m;
5169   const PetscInt    *r,*c,*rout,*cout;
5170   const MatScalar   *aa=a->a,*v;
5171   PetscScalar       *x,s1,s2,x1,x2,*t;
5172   const PetscScalar *b;
5173 
5174   PetscFunctionBegin;
5175   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5176   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5177   t  = a->solve_work;
5178 
5179   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5180   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5181 
5182   /* forward solve the lower triangular */
5183   idx    = 2*r[0];
5184   t[0] = b[idx]; t[1] = b[1+idx];
5185   for (i=1; i<n; i++) {
5186     v     = aa + 4*ai[i];
5187     vi    = aj + ai[i];
5188     nz    = ai[i+1] - ai[i];
5189     idx   = 2*r[i];
5190     s1  = b[idx]; s2 = b[1+idx];
5191     for(m=0;m<nz;m++){
5192       jdx   = 2*vi[m];
5193       x1    = t[jdx]; x2 = t[1+jdx];
5194       s1 -= v[0]*x1 + v[2]*x2;
5195       s2 -= v[1]*x1 + v[3]*x2;
5196       v += 4;
5197     }
5198     idx = 2*i;
5199     t[idx] = s1; t[1+idx] = s2;
5200   }
5201   /* backward solve the upper triangular */
5202   for (i=n-1; i>=0; i--){
5203     v    = aa + 4*(adiag[i+1]+1);
5204     vi   = aj + adiag[i+1]+1;
5205     nz   = adiag[i] - adiag[i+1] - 1;
5206     idt  = 2*i;
5207     s1 = t[idt]; s2 = t[1+idt];
5208     for(m=0;m<nz;m++){
5209       idx   = 2*vi[m];
5210       x1    = t[idx]; x2 = t[1+idx];
5211       s1 -= v[0]*x1 + v[2]*x2;
5212       s2 -= v[1]*x1 + v[3]*x2;
5213       v += 4;
5214     }
5215     idc = 2*c[i];
5216     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5217     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5218   }
5219   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5220   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5221   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5222   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5223   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5224   PetscFunctionReturn(0);
5225 }
5226 
5227 /*
5228       Special case where the matrix was ILU(0) factored in the natural
5229    ordering. This eliminates the need for the column and row permutation.
5230 */
5231 #undef __FUNCT__
5232 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
5233 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5234 {
5235   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5236   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5237   PetscErrorCode    ierr;
5238   const MatScalar   *aa=a->a,*v;
5239   PetscScalar       *x,s1,s2,x1,x2;
5240   const PetscScalar *b;
5241   PetscInt          jdx,idt,idx,nz,i;
5242 
5243   PetscFunctionBegin;
5244   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5245   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5246 
5247   /* forward solve the lower triangular */
5248   idx    = 0;
5249   x[0]   = b[0]; x[1] = b[1];
5250   for (i=1; i<n; i++) {
5251     v     =  aa      + 4*ai[i];
5252     vi    =  aj      + ai[i];
5253     nz    =  diag[i] - ai[i];
5254     idx   +=  2;
5255     s1  =  b[idx];s2 = b[1+idx];
5256     while (nz--) {
5257       jdx   = 2*(*vi++);
5258       x1    = x[jdx];x2 = x[1+jdx];
5259       s1 -= v[0]*x1 + v[2]*x2;
5260       s2 -= v[1]*x1 + v[3]*x2;
5261       v    += 4;
5262     }
5263     x[idx]   = s1;
5264     x[1+idx] = s2;
5265   }
5266   /* backward solve the upper triangular */
5267   for (i=n-1; i>=0; i--){
5268     v    = aa + 4*diag[i] + 4;
5269     vi   = aj + diag[i] + 1;
5270     nz   = ai[i+1] - diag[i] - 1;
5271     idt  = 2*i;
5272     s1 = x[idt];  s2 = x[1+idt];
5273     while (nz--) {
5274       idx   = 2*(*vi++);
5275       x1    = x[idx];   x2 = x[1+idx];
5276       s1 -= v[0]*x1 + v[2]*x2;
5277       s2 -= v[1]*x1 + v[3]*x2;
5278       v    += 4;
5279     }
5280     v        = aa +  4*diag[i];
5281     x[idt]   = v[0]*s1 + v[2]*s2;
5282     x[1+idt] = v[1]*s1 + v[3]*s2;
5283   }
5284 
5285   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5286   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5287   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5288   PetscFunctionReturn(0);
5289 }
5290 
5291 #undef __FUNCT__
5292 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5293 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5294 {
5295     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5296     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5297     PetscInt          i,k,nz,idx,idt,jdx;
5298     PetscErrorCode    ierr;
5299     const MatScalar   *aa=a->a,*v;
5300     PetscScalar       *x,s1,s2,x1,x2;
5301     const PetscScalar *b;
5302 
5303     PetscFunctionBegin;
5304     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5305     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5306     /* forward solve the lower triangular */
5307     idx    = 0;
5308     x[0] = b[idx]; x[1] = b[1+idx];
5309     for (i=1; i<n; i++) {
5310         v   = aa + 4*ai[i];
5311        vi   = aj + ai[i];
5312        nz   = ai[i+1] - ai[i];
5313        idx  = 2*i;
5314        s1   = b[idx];s2 = b[1+idx];
5315       for(k=0;k<nz;k++){
5316          jdx   = 2*vi[k];
5317           x1    = x[jdx];x2 = x[1+jdx];
5318           s1   -= v[0]*x1 + v[2]*x2;
5319           s2   -= v[1]*x1 + v[3]*x2;
5320            v   +=  4;
5321         }
5322        x[idx]   = s1;
5323        x[1+idx] = s2;
5324     }
5325 
5326    /* backward solve the upper triangular */
5327   for (i=n-1; i>=0; i--){
5328      v   = aa + 4*(adiag[i+1]+1);
5329      vi  = aj + adiag[i+1]+1;
5330      nz  = adiag[i] - adiag[i+1]-1;
5331      idt = 2*i;
5332      s1 = x[idt];  s2 = x[1+idt];
5333      for(k=0;k<nz;k++){
5334       idx   = 2*vi[k];
5335        x1    = x[idx];   x2 = x[1+idx];
5336        s1 -= v[0]*x1 + v[2]*x2;
5337        s2 -= v[1]*x1 + v[3]*x2;
5338          v    += 4;
5339     }
5340     /* x = inv_diagonal*x */
5341    x[idt]   = v[0]*s1 + v[2]*s2;
5342    x[1+idt] = v[1]*s1 + v[3]*s2;
5343   }
5344 
5345   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5346   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5347   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5348   PetscFunctionReturn(0);
5349 }
5350 
5351 #undef __FUNCT__
5352 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5353 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5354 {
5355   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5356   IS                iscol=a->col,isrow=a->row;
5357   PetscErrorCode    ierr;
5358   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5359   PetscInt          i,nz;
5360   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5361   const MatScalar   *aa=a->a,*v;
5362   PetscScalar       *x,s1,*t;
5363   const PetscScalar *b;
5364 
5365   PetscFunctionBegin;
5366   if (!n) PetscFunctionReturn(0);
5367 
5368   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5369   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5370   t  = a->solve_work;
5371 
5372   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5373   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5374 
5375   /* forward solve the lower triangular */
5376   t[0] = b[*r++];
5377   for (i=1; i<n; i++) {
5378     v     = aa + ai[i];
5379     vi    = aj + ai[i];
5380     nz    = diag[i] - ai[i];
5381     s1  = b[*r++];
5382     while (nz--) {
5383       s1 -= (*v++)*t[*vi++];
5384     }
5385     t[i] = s1;
5386   }
5387   /* backward solve the upper triangular */
5388   for (i=n-1; i>=0; i--){
5389     v    = aa + diag[i] + 1;
5390     vi   = aj + diag[i] + 1;
5391     nz   = ai[i+1] - diag[i] - 1;
5392     s1 = t[i];
5393     while (nz--) {
5394       s1 -= (*v++)*t[*vi++];
5395     }
5396     x[*c--] = t[i] = aa[diag[i]]*s1;
5397   }
5398 
5399   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5400   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5401   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5402   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5403   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5404   PetscFunctionReturn(0);
5405 }
5406 
5407 #undef __FUNCT__
5408 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5409 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5410 {
5411   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5412   IS                iscol = a->col,isrow = a->row;
5413   PetscErrorCode    ierr;
5414   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5415   const PetscInt    *rout,*cout,*r,*c;
5416   PetscScalar       *x,*tmp,sum;
5417   const PetscScalar *b;
5418   const MatScalar   *aa = a->a,*v;
5419 
5420   PetscFunctionBegin;
5421   if (!n) PetscFunctionReturn(0);
5422 
5423   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5424   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5425   tmp  = a->solve_work;
5426 
5427   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5428   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5429 
5430   /* forward solve the lower triangular */
5431   tmp[0] = b[r[0]];
5432   v      = aa;
5433   vi     = aj;
5434   for (i=1; i<n; i++) {
5435     nz  = ai[i+1] - ai[i];
5436     sum = b[r[i]];
5437     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5438     tmp[i] = sum;
5439     v += nz; vi += nz;
5440   }
5441 
5442   /* backward solve the upper triangular */
5443   for (i=n-1; i>=0; i--){
5444     v   = aa + adiag[i+1]+1;
5445     vi  = aj + adiag[i+1]+1;
5446     nz  = adiag[i]-adiag[i+1]-1;
5447     sum = tmp[i];
5448     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5449     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5450   }
5451 
5452   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5453   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5454   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5455   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5456   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5457   PetscFunctionReturn(0);
5458 }
5459 
5460 /*
5461       Special case where the matrix was ILU(0) factored in the natural
5462    ordering. This eliminates the need for the column and row permutation.
5463 */
5464 #undef __FUNCT__
5465 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5466 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5467 {
5468   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5469   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5470   PetscErrorCode    ierr;
5471   const MatScalar   *aa=a->a,*v;
5472   PetscScalar       *x;
5473   const PetscScalar *b;
5474   PetscScalar       s1,x1;
5475   PetscInt          jdx,idt,idx,nz,i;
5476 
5477   PetscFunctionBegin;
5478   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5480 
5481   /* forward solve the lower triangular */
5482   idx    = 0;
5483   x[0]   = b[0];
5484   for (i=1; i<n; i++) {
5485     v     =  aa      + ai[i];
5486     vi    =  aj      + ai[i];
5487     nz    =  diag[i] - ai[i];
5488     idx   +=  1;
5489     s1  =  b[idx];
5490     while (nz--) {
5491       jdx   = *vi++;
5492       x1    = x[jdx];
5493       s1 -= v[0]*x1;
5494       v    += 1;
5495     }
5496     x[idx]   = s1;
5497   }
5498   /* backward solve the upper triangular */
5499   for (i=n-1; i>=0; i--){
5500     v    = aa + diag[i] + 1;
5501     vi   = aj + diag[i] + 1;
5502     nz   = ai[i+1] - diag[i] - 1;
5503     idt  = i;
5504     s1 = x[idt];
5505     while (nz--) {
5506       idx   = *vi++;
5507       x1    = x[idx];
5508       s1 -= v[0]*x1;
5509       v    += 1;
5510     }
5511     v        = aa +  diag[i];
5512     x[idt]   = v[0]*s1;
5513   }
5514   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5516   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5517   PetscFunctionReturn(0);
5518 }
5519 
5520 
5521 #undef __FUNCT__
5522 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5523 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5524 {
5525   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5526   PetscErrorCode    ierr;
5527   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5528   PetscScalar       *x,sum;
5529   const PetscScalar *b;
5530   const MatScalar   *aa = a->a,*v;
5531   PetscInt          i,nz;
5532 
5533   PetscFunctionBegin;
5534   if (!n) PetscFunctionReturn(0);
5535 
5536   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5537   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5538 
5539   /* forward solve the lower triangular */
5540   x[0] = b[0];
5541   v    = aa;
5542   vi   = aj;
5543   for (i=1; i<n; i++) {
5544     nz  = ai[i+1] - ai[i];
5545     sum = b[i];
5546     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5547     v  += nz;
5548     vi += nz;
5549     x[i] = sum;
5550   }
5551 
5552   /* backward solve the upper triangular */
5553   for (i=n-1; i>=0; i--){
5554     v   = aa + adiag[i+1] + 1;
5555     vi  = aj + adiag[i+1] + 1;
5556     nz = adiag[i] - adiag[i+1]-1;
5557     sum = x[i];
5558     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5559     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5560   }
5561 
5562   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
5563   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5564   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5565   PetscFunctionReturn(0);
5566 }
5567 
5568 /* ----------------------------------------------------------------*/
5569 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool );
5570 
5571 #undef __FUNCT__
5572 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5573 /*
5574    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5575 */
5576 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5577 {
5578   Mat             C=B;
5579   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5580   PetscErrorCode  ierr;
5581   PetscInt        i,j,k,ipvt[15];
5582   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5583   PetscInt        nz,nzL,row;
5584   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5585   const MatScalar *v,*aa=a->a;
5586   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5587   PetscInt        sol_ver;
5588 
5589   PetscFunctionBegin;
5590 
5591   ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
5592 
5593   /* generate work space needed by the factorization */
5594   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5595   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5596 
5597   for (i=0; i<n; i++){
5598     /* zero rtmp */
5599     /* L part */
5600     nz    = bi[i+1] - bi[i];
5601     bjtmp = bj + bi[i];
5602     for  (j=0; j<nz; j++){
5603       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5604     }
5605 
5606     /* U part */
5607     nz = bdiag[i] - bdiag[i+1];
5608     bjtmp = bj + bdiag[i+1]+1;
5609     for  (j=0; j<nz; j++){
5610       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5611     }
5612 
5613     /* load in initial (unfactored row) */
5614     nz    = ai[i+1] - ai[i];
5615     ajtmp = aj + ai[i];
5616     v     = aa + bs2*ai[i];
5617     for (j=0; j<nz; j++) {
5618       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5619     }
5620 
5621     /* elimination */
5622     bjtmp = bj + bi[i];
5623     nzL   = bi[i+1] - bi[i];
5624     for(k=0;k < nzL;k++) {
5625       row = bjtmp[k];
5626       pc = rtmp + bs2*row;
5627       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5628       if (flg) {
5629         pv = b->a + bs2*bdiag[row];
5630 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5631 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5632 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5633         pv = b->a + bs2*(bdiag[row+1]+1);
5634         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5635         for (j=0; j<nz; j++) {
5636           vv   = rtmp + bs2*pj[j];
5637           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5638 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5639 	  pv  += bs2;
5640         }
5641         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5642       }
5643     }
5644 
5645     /* finished row so stick it into b->a */
5646     /* L part */
5647     pv   = b->a + bs2*bi[i] ;
5648     pj   = b->j + bi[i] ;
5649     nz   = bi[i+1] - bi[i];
5650     for (j=0; j<nz; j++) {
5651       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5652     }
5653 
5654     /* Mark diagonal and invert diagonal for simplier triangular solves */
5655     pv   = b->a + bs2*bdiag[i];
5656     pj   = b->j + bdiag[i];
5657     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5658     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5659     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
5660 
5661     /* U part */
5662     pv = b->a + bs2*(bdiag[i+1]+1);
5663     pj = b->j + bdiag[i+1]+1;
5664     nz = bdiag[i] - bdiag[i+1] - 1;
5665     for (j=0; j<nz; j++){
5666       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5667     }
5668   }
5669 
5670   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5671   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5672   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5673   C->assembled = PETSC_TRUE;
5674   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5675   PetscFunctionReturn(0);
5676 }
5677 
5678 #undef __FUNCT__
5679 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5680 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5681 {
5682   Mat            C=B;
5683   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5684   IS             isrow = b->row,isicol = b->icol;
5685   PetscErrorCode ierr;
5686   const PetscInt *r,*ic;
5687   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5688   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5689   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5690   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5691   MatScalar      *v_work;
5692   PetscBool      col_identity,row_identity,both_identity;
5693 
5694   PetscFunctionBegin;
5695   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5696   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5697 
5698   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5699   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5700 
5701   /* generate work space needed by dense LU factorization */
5702   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5703 
5704   for (i=0; i<n; i++){
5705     /* zero rtmp */
5706     /* L part */
5707     nz    = bi[i+1] - bi[i];
5708     bjtmp = bj + bi[i];
5709     for  (j=0; j<nz; j++){
5710       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5711     }
5712 
5713     /* U part */
5714     nz = bdiag[i] - bdiag[i+1];
5715     bjtmp = bj + bdiag[i+1]+1;
5716     for  (j=0; j<nz; j++){
5717       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5718     }
5719 
5720     /* load in initial (unfactored row) */
5721     nz    = ai[r[i]+1] - ai[r[i]];
5722     ajtmp = aj + ai[r[i]];
5723     v     = aa + bs2*ai[r[i]];
5724     for (j=0; j<nz; j++) {
5725       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5726     }
5727 
5728     /* elimination */
5729     bjtmp = bj + bi[i];
5730     nzL   = bi[i+1] - bi[i];
5731     for(k=0;k < nzL;k++) {
5732       row = bjtmp[k];
5733       pc = rtmp + bs2*row;
5734       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5735       if (flg) {
5736         pv         = b->a + bs2*bdiag[row];
5737         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5738         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5739         pv         = b->a + bs2*(bdiag[row+1]+1);
5740         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5741         for (j=0; j<nz; j++) {
5742           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5743         }
5744         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5745       }
5746     }
5747 
5748     /* finished row so stick it into b->a */
5749     /* L part */
5750     pv   = b->a + bs2*bi[i] ;
5751     pj   = b->j + bi[i] ;
5752     nz   = bi[i+1] - bi[i];
5753     for (j=0; j<nz; j++) {
5754       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5755     }
5756 
5757     /* Mark diagonal and invert diagonal for simplier triangular solves */
5758     pv  = b->a + bs2*bdiag[i];
5759     pj  = b->j + bdiag[i];
5760     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5761     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5762     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5763 
5764     /* U part */
5765     pv = b->a + bs2*(bdiag[i+1]+1);
5766     pj = b->j + bdiag[i+1]+1;
5767     nz = bdiag[i] - bdiag[i+1] - 1;
5768     for (j=0; j<nz; j++){
5769       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5770     }
5771   }
5772 
5773   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5774   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5775   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5776   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5777 
5778   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5779   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5780   both_identity = (PetscBool) (row_identity && col_identity);
5781   if (both_identity){
5782     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5783   } else {
5784     C->ops->solve = MatSolve_SeqBAIJ_N;
5785   }
5786   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5787 
5788   C->assembled = PETSC_TRUE;
5789   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5790   PetscFunctionReturn(0);
5791 }
5792 
5793 /*
5794    ilu(0) with natural ordering under new data structure.
5795    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5796    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5797 */
5798 
5799 #undef __FUNCT__
5800 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5801 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5802 {
5803 
5804   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5805   PetscErrorCode     ierr;
5806   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5807   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5808 
5809   PetscFunctionBegin;
5810   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5811   b    = (Mat_SeqBAIJ*)(fact)->data;
5812 
5813   /* allocate matrix arrays for new data structure */
5814   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5815   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5816   b->singlemalloc = PETSC_TRUE;
5817   if (!b->diag){
5818     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5819     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5820   }
5821   bdiag = b->diag;
5822 
5823   if (n > 0) {
5824     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5825   }
5826 
5827   /* set bi and bj with new data structure */
5828   bi = b->i;
5829   bj = b->j;
5830 
5831   /* L part */
5832   bi[0] = 0;
5833   for (i=0; i<n; i++){
5834     nz = adiag[i] - ai[i];
5835     bi[i+1] = bi[i] + nz;
5836     aj = a->j + ai[i];
5837     for (j=0; j<nz; j++){
5838       *bj = aj[j]; bj++;
5839     }
5840   }
5841 
5842   /* U part */
5843   bi_temp = bi[n];
5844   bdiag[n] = bi[n]-1;
5845   for (i=n-1; i>=0; i--){
5846     nz = ai[i+1] - adiag[i] - 1;
5847     bi_temp = bi_temp + nz + 1;
5848     aj = a->j + adiag[i] + 1;
5849     for (j=0; j<nz; j++){
5850       *bj = aj[j]; bj++;
5851     }
5852     /* diag[i] */
5853     *bj = i; bj++;
5854     bdiag[i] = bi_temp - 1;
5855   }
5856   PetscFunctionReturn(0);
5857 }
5858 
5859 #undef __FUNCT__
5860 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5861 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5862 {
5863   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5864   IS                 isicol;
5865   PetscErrorCode     ierr;
5866   const PetscInt     *r,*ic;
5867   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5868   PetscInt           *bi,*cols,nnz,*cols_lvl;
5869   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5870   PetscInt           i,levels,diagonal_fill;
5871   PetscBool          col_identity,row_identity,both_identity;
5872   PetscReal          f;
5873   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5874   PetscBT            lnkbt;
5875   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5876   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5877   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5878   PetscBool          missing;
5879   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5880 
5881   PetscFunctionBegin;
5882   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5883   if (bs>1){  /* check shifttype */
5884     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5885       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5886   }
5887 
5888   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5889   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5890 
5891   f             = info->fill;
5892   levels        = (PetscInt)info->levels;
5893   diagonal_fill = (PetscInt)info->diagonal_fill;
5894   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5895 
5896   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5897   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5898   both_identity = (PetscBool) (row_identity && col_identity);
5899 
5900   if (!levels && both_identity) {
5901     /* special case: ilu(0) with natural ordering */
5902     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5903     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5904 
5905     fact->factortype               = MAT_FACTOR_ILU;
5906     (fact)->info.factor_mallocs    = 0;
5907     (fact)->info.fill_ratio_given  = info->fill;
5908     (fact)->info.fill_ratio_needed = 1.0;
5909     b                = (Mat_SeqBAIJ*)(fact)->data;
5910     b->row           = isrow;
5911     b->col           = iscol;
5912     b->icol          = isicol;
5913     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5914     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5915     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5916     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5917     PetscFunctionReturn(0);
5918   }
5919 
5920   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5921   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5922 
5923   /* get new row pointers */
5924   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5925   bi[0] = 0;
5926   /* bdiag is location of diagonal in factor */
5927   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5928   bdiag[0]  = 0;
5929 
5930   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5931 
5932   /* create a linked list for storing column indices of the active row */
5933   nlnk = n + 1;
5934   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5935 
5936   /* initial FreeSpace size is f*(ai[n]+1) */
5937   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5938   current_space = free_space;
5939   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5940   current_space_lvl = free_space_lvl;
5941 
5942   for (i=0; i<n; i++) {
5943     nzi = 0;
5944     /* copy current row into linked list */
5945     nnz  = ai[r[i]+1] - ai[r[i]];
5946     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5947     cols = aj + ai[r[i]];
5948     lnk[i] = -1; /* marker to indicate if diagonal exists */
5949     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5950     nzi += nlnk;
5951 
5952     /* make sure diagonal entry is included */
5953     if (diagonal_fill && lnk[i] == -1) {
5954       fm = n;
5955       while (lnk[fm] < i) fm = lnk[fm];
5956       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5957       lnk[fm]    = i;
5958       lnk_lvl[i] = 0;
5959       nzi++; dcount++;
5960     }
5961 
5962     /* add pivot rows into the active row */
5963     nzbd = 0;
5964     prow = lnk[n];
5965     while (prow < i) {
5966       nnz      = bdiag[prow];
5967       cols     = bj_ptr[prow] + nnz + 1;
5968       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5969       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5970       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5971       nzi += nlnk;
5972       prow = lnk[prow];
5973       nzbd++;
5974     }
5975     bdiag[i] = nzbd;
5976     bi[i+1]  = bi[i] + nzi;
5977 
5978     /* if free space is not available, make more free space */
5979     if (current_space->local_remaining<nzi) {
5980       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5981       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5982       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5983       reallocs++;
5984     }
5985 
5986     /* copy data into free_space and free_space_lvl, then initialize lnk */
5987     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5988     bj_ptr[i]    = current_space->array;
5989     bjlvl_ptr[i] = current_space_lvl->array;
5990 
5991     /* make sure the active row i has diagonal entry */
5992     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5993 
5994     current_space->array           += nzi;
5995     current_space->local_used      += nzi;
5996     current_space->local_remaining -= nzi;
5997     current_space_lvl->array           += nzi;
5998     current_space_lvl->local_used      += nzi;
5999     current_space_lvl->local_remaining -= nzi;
6000   }
6001 
6002   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6003   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6004 
6005   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
6006   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
6007   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
6008 
6009   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
6010   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
6011   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
6012 
6013 #if defined(PETSC_USE_INFO)
6014   {
6015     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6016     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
6017     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6018     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
6019     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6020     if (diagonal_fill) {
6021       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
6022     }
6023   }
6024 #endif
6025 
6026   /* put together the new matrix */
6027   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6028   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6029   b = (Mat_SeqBAIJ*)(fact)->data;
6030   b->free_a       = PETSC_TRUE;
6031   b->free_ij      = PETSC_TRUE;
6032   b->singlemalloc = PETSC_FALSE;
6033   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6034   b->j          = bj;
6035   b->i          = bi;
6036   b->diag       = bdiag;
6037   b->free_diag  = PETSC_TRUE;
6038   b->ilen       = 0;
6039   b->imax       = 0;
6040   b->row        = isrow;
6041   b->col        = iscol;
6042   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6043   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6044   b->icol       = isicol;
6045   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6046   /* In b structure:  Free imax, ilen, old a, old j.
6047      Allocate bdiag, solve_work, new a, new j */
6048   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
6049   b->maxnz = b->nz = bdiag[0]+1;
6050   fact->info.factor_mallocs    = reallocs;
6051   fact->info.fill_ratio_given  = f;
6052   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
6053   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
6054   PetscFunctionReturn(0);
6055 }
6056 
6057 /*
6058      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6059    except that the data structure of Mat_SeqAIJ is slightly different.
6060    Not a good example of code reuse.
6061 */
6062 #undef __FUNCT__
6063 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
6064 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
6065 {
6066   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
6067   IS             isicol;
6068   PetscErrorCode ierr;
6069   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
6070   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6071   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6072   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6073   PetscBool      col_identity,row_identity,both_identity,flg;
6074   PetscReal      f;
6075 
6076   PetscFunctionBegin;
6077   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6078   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
6079 
6080   f             = info->fill;
6081   levels        = (PetscInt)info->levels;
6082   diagonal_fill = (PetscInt)info->diagonal_fill;
6083   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
6084 
6085   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6086   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6087   both_identity = (PetscBool) (row_identity && col_identity);
6088 
6089   if (!levels && both_identity) {  /* special case copy the nonzero structure */
6090     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
6091     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6092 
6093     fact->factortype = MAT_FACTOR_ILU;
6094     b            = (Mat_SeqBAIJ*)fact->data;
6095     b->row       = isrow;
6096     b->col       = iscol;
6097     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6098     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6099     b->icol      = isicol;
6100     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6101     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6102     PetscFunctionReturn(0);
6103   }
6104 
6105   /* general case perform the symbolic factorization */
6106     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
6107     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
6108 
6109     /* get new row pointers */
6110     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
6111     ainew[0] = 0;
6112     /* don't know how many column pointers are needed so estimate */
6113     jmax = (PetscInt)(f*ai[n] + 1);
6114     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
6115     /* ajfill is level of fill for each fill entry */
6116     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
6117     /* fill is a linked list of nonzeros in active row */
6118     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
6119     /* im is level for each filled value */
6120     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
6121     /* dloc is location of diagonal in factor */
6122     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
6123     dloc[0]  = 0;
6124     for (prow=0; prow<n; prow++) {
6125 
6126       /* copy prow into linked list */
6127       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6128       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
6129       xi         = aj + ai[r[prow]];
6130       fill[n]    = n;
6131       fill[prow] = -1; /* marker for diagonal entry */
6132       while (nz--) {
6133 	fm  = n;
6134 	idx = ic[*xi++];
6135 	do {
6136 	  m  = fm;
6137 	  fm = fill[m];
6138 	} while (fm < idx);
6139 	fill[m]   = idx;
6140 	fill[idx] = fm;
6141 	im[idx]   = 0;
6142       }
6143 
6144       /* make sure diagonal entry is included */
6145       if (diagonal_fill && fill[prow] == -1) {
6146 	fm = n;
6147 	while (fill[fm] < prow) fm = fill[fm];
6148 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
6149 	fill[fm]   = prow;
6150 	im[prow]   = 0;
6151 	nzf++;
6152 	dcount++;
6153       }
6154 
6155       nzi = 0;
6156       row = fill[n];
6157       while (row < prow) {
6158 	incrlev = im[row] + 1;
6159 	nz      = dloc[row];
6160 	xi      = ajnew  + ainew[row] + nz + 1;
6161 	flev    = ajfill + ainew[row] + nz + 1;
6162 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
6163 	fm      = row;
6164 	while (nnz-- > 0) {
6165 	  idx = *xi++;
6166 	  if (*flev + incrlev > levels) {
6167 	    flev++;
6168 	    continue;
6169 	  }
6170 	  do {
6171 	    m  = fm;
6172 	    fm = fill[m];
6173 	  } while (fm < idx);
6174 	  if (fm != idx) {
6175 	    im[idx]   = *flev + incrlev;
6176 	    fill[m]   = idx;
6177 	    fill[idx] = fm;
6178 	    fm        = idx;
6179 	    nzf++;
6180 	  } else {
6181 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
6182 	  }
6183 	  flev++;
6184 	}
6185 	row = fill[row];
6186 	nzi++;
6187       }
6188       /* copy new filled row into permanent storage */
6189       ainew[prow+1] = ainew[prow] + nzf;
6190       if (ainew[prow+1] > jmax) {
6191 
6192 	/* estimate how much additional space we will need */
6193 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6194 	/* just double the memory each time */
6195 	PetscInt maxadd = jmax;
6196 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
6197 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6198 	jmax += maxadd;
6199 
6200 	/* allocate a longer ajnew and ajfill */
6201 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6202 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6203 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
6204 	ajnew = xitmp;
6205 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6206 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6207 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
6208 	ajfill = xitmp;
6209 	reallocate++; /* count how many reallocations are needed */
6210       }
6211       xitmp       = ajnew + ainew[prow];
6212       flev        = ajfill + ainew[prow];
6213       dloc[prow]  = nzi;
6214       fm          = fill[n];
6215       while (nzf--) {
6216 	*xitmp++ = fm;
6217 	*flev++ = im[fm];
6218 	fm      = fill[fm];
6219       }
6220       /* make sure row has diagonal entry */
6221       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6222 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6223     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6224       }
6225     }
6226     ierr = PetscFree(ajfill);CHKERRQ(ierr);
6227     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6228     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6229     ierr = PetscFree(fill);CHKERRQ(ierr);
6230     ierr = PetscFree(im);CHKERRQ(ierr);
6231 
6232 #if defined(PETSC_USE_INFO)
6233     {
6234       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6235       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6236       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6237       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6238       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6239       if (diagonal_fill) {
6240 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6241       }
6242     }
6243 #endif
6244 
6245     /* put together the new matrix */
6246     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6247     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6248     b    = (Mat_SeqBAIJ*)fact->data;
6249     b->free_a       = PETSC_TRUE;
6250     b->free_ij      = PETSC_TRUE;
6251     b->singlemalloc = PETSC_FALSE;
6252     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6253     b->j          = ajnew;
6254     b->i          = ainew;
6255     for (i=0; i<n; i++) dloc[i] += ainew[i];
6256     b->diag       = dloc;
6257     b->free_diag  = PETSC_TRUE;
6258     b->ilen       = 0;
6259     b->imax       = 0;
6260     b->row        = isrow;
6261     b->col        = iscol;
6262     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6263     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6264     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6265     b->icol       = isicol;
6266     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6267     /* In b structure:  Free imax, ilen, old a, old j.
6268        Allocate dloc, solve_work, new a, new j */
6269     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
6270     b->maxnz          = b->nz = ainew[n];
6271 
6272     fact->info.factor_mallocs    = reallocate;
6273     fact->info.fill_ratio_given  = f;
6274     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6275 
6276   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6277   PetscFunctionReturn(0);
6278 }
6279 
6280 #undef __FUNCT__
6281 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6282 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6283 {
6284   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6285   /* int i,*AJ=a->j,nz=a->nz; */
6286   PetscFunctionBegin;
6287   /* Undo Column scaling */
6288 /*    while (nz--) { */
6289 /*      AJ[i] = AJ[i]/4; */
6290 /*    } */
6291   /* This should really invoke a push/pop logic, but we don't have that yet. */
6292   A->ops->setunfactored = PETSC_NULL;
6293   PetscFunctionReturn(0);
6294 }
6295 
6296 #undef __FUNCT__
6297 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6298 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6299 {
6300   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6301   PetscInt       *AJ=a->j,nz=a->nz;
6302   unsigned short *aj=(unsigned short *)AJ;
6303   PetscFunctionBegin;
6304   /* Is this really necessary? */
6305   while (nz--) {
6306     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6307   }
6308   A->ops->setunfactored = PETSC_NULL;
6309   PetscFunctionReturn(0);
6310 }
6311 
6312 
6313