xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision bebe2cf65d55febe21a5af8db2bd2e168caaa2e7)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 #undef __FUNCT__
5 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
6 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
7 {
8   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
9   IS                iscol=a->col,isrow=a->row;
10   PetscErrorCode    ierr;
11   const PetscInt    *r,*c,*rout,*cout;
12   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
13   PetscInt          i,nz;
14   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
15   const MatScalar   *aa=a->a,*v;
16   PetscScalar       *x,*s,*t,*ls;
17   const PetscScalar *b;
18 
19   PetscFunctionBegin;
20   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22   t    = a->solve_work;
23 
24   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
26 
27   /* forward solve the lower triangular */
28   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
29   for (i=1; i<n; i++) {
30     v    = aa + bs2*ai[i];
31     vi   = aj + ai[i];
32     nz   = a->diag[i] - ai[i];
33     s    = t + bs*i;
34     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
35     while (nz--) {
36       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
37       v += bs2;
38     }
39   }
40   /* backward solve the upper triangular */
41   ls = a->solve_work + A->cmap->n;
42   for (i=n-1; i>=0; i--) {
43     v    = aa + bs2*(a->diag[i] + 1);
44     vi   = aj + a->diag[i] + 1;
45     nz   = ai[i+1] - a->diag[i] - 1;
46     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
47     while (nz--) {
48       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
49       v += bs2;
50     }
51     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
52     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
53   }
54 
55   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
56   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
57   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
58   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
59   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
60   PetscFunctionReturn(0);
61 }
62 
63 #undef __FUNCT__
64 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
65 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
66 {
67   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
68   IS                iscol=a->col,isrow=a->row;
69   PetscErrorCode    ierr;
70   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
71   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
72   PetscInt          i,nz,idx,idt,idc;
73   const MatScalar   *aa=a->a,*v;
74   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
75   const PetscScalar *b;
76 
77   PetscFunctionBegin;
78   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
79   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
80   t    = a->solve_work;
81 
82   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
83   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
84 
85   /* forward solve the lower triangular */
86   idx  = 7*(*r++);
87   t[0] = b[idx];   t[1] = b[1+idx];
88   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
89   t[5] = b[5+idx]; t[6] = b[6+idx];
90 
91   for (i=1; i<n; i++) {
92     v   = aa + 49*ai[i];
93     vi  = aj + ai[i];
94     nz  = diag[i] - ai[i];
95     idx = 7*(*r++);
96     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
97     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
98     while (nz--) {
99       idx = 7*(*vi++);
100       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
101       x4  = t[3+idx];x5 = t[4+idx];
102       x6  = t[5+idx];x7 = t[6+idx];
103       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
104       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
105       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
106       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
107       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
108       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
109       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
110       v  += 49;
111     }
112     idx      = 7*i;
113     t[idx]   = s1;t[1+idx] = s2;
114     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
115     t[5+idx] = s6;t[6+idx] = s7;
116   }
117   /* backward solve the upper triangular */
118   for (i=n-1; i>=0; i--) {
119     v   = aa + 49*diag[i] + 49;
120     vi  = aj + diag[i] + 1;
121     nz  = ai[i+1] - diag[i] - 1;
122     idt = 7*i;
123     s1  = t[idt];  s2 = t[1+idt];
124     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
125     s6  = t[5+idt];s7 = t[6+idt];
126     while (nz--) {
127       idx = 7*(*vi++);
128       x1  = t[idx];   x2 = t[1+idx];
129       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
130       x6  = t[5+idx]; x7 = t[6+idx];
131       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
132       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
133       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
134       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
135       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
136       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
137       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
138       v  += 49;
139     }
140     idc    = 7*(*c--);
141     v      = aa + 49*diag[i];
142     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
143                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
144     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
145                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
146     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
147                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
148     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
149                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
150     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
151                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
152     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
153                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
154     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
155                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
156   }
157 
158   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
159   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
160   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
162   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
163   PetscFunctionReturn(0);
164 }
165 
166 #undef __FUNCT__
167 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
168 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
169 {
170   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
171   IS                iscol=a->col,isrow=a->row;
172   PetscErrorCode    ierr;
173   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
174   const PetscInt    n=a->mbs,*rout,*cout,*vi;
175   PetscInt          i,nz,idx,idt,idc,m;
176   const MatScalar   *aa=a->a,*v;
177   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
178   const PetscScalar *b;
179 
180   PetscFunctionBegin;
181   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
182   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
183   t    = a->solve_work;
184 
185   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
186   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
187 
188   /* forward solve the lower triangular */
189   idx  = 7*r[0];
190   t[0] = b[idx];   t[1] = b[1+idx];
191   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
192   t[5] = b[5+idx]; t[6] = b[6+idx];
193 
194   for (i=1; i<n; i++) {
195     v   = aa + 49*ai[i];
196     vi  = aj + ai[i];
197     nz  = ai[i+1] - ai[i];
198     idx = 7*r[i];
199     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
200     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
201     for (m=0; m<nz; m++) {
202       idx = 7*vi[m];
203       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
204       x4  = t[3+idx];x5 = t[4+idx];
205       x6  = t[5+idx];x7 = t[6+idx];
206       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
207       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
208       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
209       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
210       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
211       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
212       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
213       v  += 49;
214     }
215     idx      = 7*i;
216     t[idx]   = s1;t[1+idx] = s2;
217     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
218     t[5+idx] = s6;t[6+idx] = s7;
219   }
220   /* backward solve the upper triangular */
221   for (i=n-1; i>=0; i--) {
222     v   = aa + 49*(adiag[i+1]+1);
223     vi  = aj + adiag[i+1]+1;
224     nz  = adiag[i] - adiag[i+1] - 1;
225     idt = 7*i;
226     s1  = t[idt];  s2 = t[1+idt];
227     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
228     s6  = t[5+idt];s7 = t[6+idt];
229     for (m=0; m<nz; m++) {
230       idx = 7*vi[m];
231       x1  = t[idx];   x2 = t[1+idx];
232       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
233       x6  = t[5+idx]; x7 = t[6+idx];
234       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
235       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
236       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
237       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
238       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
239       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
240       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
241       v  += 49;
242     }
243     idc    = 7*c[i];
244     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
245                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
246     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
247                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
248     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
249                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
250     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
251                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
252     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
253                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
254     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
255                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
256     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
257                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
258   }
259 
260   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
261   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
262   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
263   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
264   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
265   PetscFunctionReturn(0);
266 }
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
270 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
271 {
272   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
273   IS                iscol=a->col,isrow=a->row;
274   PetscErrorCode    ierr;
275   const PetscInt    *r,*c,*rout,*cout;
276   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
277   PetscInt          i,nz,idx,idt,idc;
278   const MatScalar   *aa=a->a,*v;
279   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
280   const PetscScalar *b;
281 
282   PetscFunctionBegin;
283   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
284   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
285   t    = a->solve_work;
286 
287   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
288   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
289 
290   /* forward solve the lower triangular */
291   idx  = 6*(*r++);
292   t[0] = b[idx];   t[1] = b[1+idx];
293   t[2] = b[2+idx]; t[3] = b[3+idx];
294   t[4] = b[4+idx]; t[5] = b[5+idx];
295   for (i=1; i<n; i++) {
296     v   = aa + 36*ai[i];
297     vi  = aj + ai[i];
298     nz  = diag[i] - ai[i];
299     idx = 6*(*r++);
300     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
301     s5  = b[4+idx]; s6 = b[5+idx];
302     while (nz--) {
303       idx = 6*(*vi++);
304       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
305       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
306       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
307       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
308       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
309       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
310       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
311       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
312       v  += 36;
313     }
314     idx      = 6*i;
315     t[idx]   = s1;t[1+idx] = s2;
316     t[2+idx] = s3;t[3+idx] = s4;
317     t[4+idx] = s5;t[5+idx] = s6;
318   }
319   /* backward solve the upper triangular */
320   for (i=n-1; i>=0; i--) {
321     v   = aa + 36*diag[i] + 36;
322     vi  = aj + diag[i] + 1;
323     nz  = ai[i+1] - diag[i] - 1;
324     idt = 6*i;
325     s1  = t[idt];  s2 = t[1+idt];
326     s3  = t[2+idt];s4 = t[3+idt];
327     s5  = t[4+idt];s6 = t[5+idt];
328     while (nz--) {
329       idx = 6*(*vi++);
330       x1  = t[idx];   x2 = t[1+idx];
331       x3  = t[2+idx]; x4 = t[3+idx];
332       x5  = t[4+idx]; x6 = t[5+idx];
333       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
334       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
335       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
336       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
337       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
338       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
339       v  += 36;
340     }
341     idc    = 6*(*c--);
342     v      = aa + 36*diag[i];
343     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
344                         v[18]*s4+v[24]*s5+v[30]*s6;
345     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
346                           v[19]*s4+v[25]*s5+v[31]*s6;
347     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
348                           v[20]*s4+v[26]*s5+v[32]*s6;
349     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
350                           v[21]*s4+v[27]*s5+v[33]*s6;
351     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
352                           v[22]*s4+v[28]*s5+v[34]*s6;
353     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
354                           v[23]*s4+v[29]*s5+v[35]*s6;
355   }
356 
357   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
358   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
359   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNCT__
366 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
367 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
368 {
369   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
370   IS                iscol=a->col,isrow=a->row;
371   PetscErrorCode    ierr;
372   const PetscInt    *r,*c,*rout,*cout;
373   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
374   PetscInt          i,nz,idx,idt,idc,m;
375   const MatScalar   *aa=a->a,*v;
376   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
377   const PetscScalar *b;
378 
379   PetscFunctionBegin;
380   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382   t    = a->solve_work;
383 
384   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
385   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
386 
387   /* forward solve the lower triangular */
388   idx  = 6*r[0];
389   t[0] = b[idx];   t[1] = b[1+idx];
390   t[2] = b[2+idx]; t[3] = b[3+idx];
391   t[4] = b[4+idx]; t[5] = b[5+idx];
392   for (i=1; i<n; i++) {
393     v   = aa + 36*ai[i];
394     vi  = aj + ai[i];
395     nz  = ai[i+1] - ai[i];
396     idx = 6*r[i];
397     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
398     s5  = b[4+idx]; s6 = b[5+idx];
399     for (m=0; m<nz; m++) {
400       idx = 6*vi[m];
401       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
402       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
403       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
404       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
405       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
406       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
407       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
408       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
409       v  += 36;
410     }
411     idx      = 6*i;
412     t[idx]   = s1;t[1+idx] = s2;
413     t[2+idx] = s3;t[3+idx] = s4;
414     t[4+idx] = s5;t[5+idx] = s6;
415   }
416   /* backward solve the upper triangular */
417   for (i=n-1; i>=0; i--) {
418     v   = aa + 36*(adiag[i+1]+1);
419     vi  = aj + adiag[i+1]+1;
420     nz  = adiag[i] - adiag[i+1] - 1;
421     idt = 6*i;
422     s1  = t[idt];  s2 = t[1+idt];
423     s3  = t[2+idt];s4 = t[3+idt];
424     s5  = t[4+idt];s6 = t[5+idt];
425     for (m=0; m<nz; m++) {
426       idx = 6*vi[m];
427       x1  = t[idx];   x2 = t[1+idx];
428       x3  = t[2+idx]; x4 = t[3+idx];
429       x5  = t[4+idx]; x6 = t[5+idx];
430       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
431       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
432       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
433       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
434       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
435       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
436       v  += 36;
437     }
438     idc    = 6*c[i];
439     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
440                         v[18]*s4+v[24]*s5+v[30]*s6;
441     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
442                           v[19]*s4+v[25]*s5+v[31]*s6;
443     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
444                           v[20]*s4+v[26]*s5+v[32]*s6;
445     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
446                           v[21]*s4+v[27]*s5+v[33]*s6;
447     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
448                           v[22]*s4+v[28]*s5+v[34]*s6;
449     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
450                           v[23]*s4+v[29]*s5+v[35]*s6;
451   }
452 
453   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
454   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
455   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
458   PetscFunctionReturn(0);
459 }
460 
461 #undef __FUNCT__
462 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
463 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
464 {
465   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
466   IS                iscol=a->col,isrow=a->row;
467   PetscErrorCode    ierr;
468   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
469   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
470   PetscInt          i,nz,idx,idt,idc;
471   const MatScalar   *aa=a->a,*v;
472   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
473   const PetscScalar *b;
474 
475   PetscFunctionBegin;
476   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
477   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
478   t    = a->solve_work;
479 
480   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
481   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
482 
483   /* forward solve the lower triangular */
484   idx  = 5*(*r++);
485   t[0] = b[idx];   t[1] = b[1+idx];
486   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
487   for (i=1; i<n; i++) {
488     v   = aa + 25*ai[i];
489     vi  = aj + ai[i];
490     nz  = diag[i] - ai[i];
491     idx = 5*(*r++);
492     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
493     s5  = b[4+idx];
494     while (nz--) {
495       idx = 5*(*vi++);
496       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
497       x4  = t[3+idx];x5 = t[4+idx];
498       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
499       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
500       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
501       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
502       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
503       v  += 25;
504     }
505     idx      = 5*i;
506     t[idx]   = s1;t[1+idx] = s2;
507     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
508   }
509   /* backward solve the upper triangular */
510   for (i=n-1; i>=0; i--) {
511     v   = aa + 25*diag[i] + 25;
512     vi  = aj + diag[i] + 1;
513     nz  = ai[i+1] - diag[i] - 1;
514     idt = 5*i;
515     s1  = t[idt];  s2 = t[1+idt];
516     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
517     while (nz--) {
518       idx = 5*(*vi++);
519       x1  = t[idx];   x2 = t[1+idx];
520       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
521       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
522       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
523       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
524       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
525       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
526       v  += 25;
527     }
528     idc    = 5*(*c--);
529     v      = aa + 25*diag[i];
530     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
531                         v[15]*s4+v[20]*s5;
532     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
533                           v[16]*s4+v[21]*s5;
534     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
535                           v[17]*s4+v[22]*s5;
536     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
537                           v[18]*s4+v[23]*s5;
538     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
539                           v[19]*s4+v[24]*s5;
540   }
541 
542   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
543   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
544   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
545   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
546   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
547   PetscFunctionReturn(0);
548 }
549 
550 #undef __FUNCT__
551 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
552 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
553 {
554   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
555   IS                iscol=a->col,isrow=a->row;
556   PetscErrorCode    ierr;
557   const PetscInt    *r,*c,*rout,*cout;
558   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
559   PetscInt          i,nz,idx,idt,idc,m;
560   const MatScalar   *aa=a->a,*v;
561   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
562   const PetscScalar *b;
563 
564   PetscFunctionBegin;
565   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
566   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
567   t    = a->solve_work;
568 
569   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
570   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
571 
572   /* forward solve the lower triangular */
573   idx  = 5*r[0];
574   t[0] = b[idx];   t[1] = b[1+idx];
575   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
576   for (i=1; i<n; i++) {
577     v   = aa + 25*ai[i];
578     vi  = aj + ai[i];
579     nz  = ai[i+1] - ai[i];
580     idx = 5*r[i];
581     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
582     s5  = b[4+idx];
583     for (m=0; m<nz; m++) {
584       idx = 5*vi[m];
585       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
586       x4  = t[3+idx];x5 = t[4+idx];
587       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
588       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
589       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
590       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
591       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
592       v  += 25;
593     }
594     idx      = 5*i;
595     t[idx]   = s1;t[1+idx] = s2;
596     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
597   }
598   /* backward solve the upper triangular */
599   for (i=n-1; i>=0; i--) {
600     v   = aa + 25*(adiag[i+1]+1);
601     vi  = aj + adiag[i+1]+1;
602     nz  = adiag[i] - adiag[i+1] - 1;
603     idt = 5*i;
604     s1  = t[idt];  s2 = t[1+idt];
605     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
606     for (m=0; m<nz; m++) {
607       idx = 5*vi[m];
608       x1  = t[idx];   x2 = t[1+idx];
609       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
610       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
611       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
612       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
613       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
614       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
615       v  += 25;
616     }
617     idc    = 5*c[i];
618     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
619                         v[15]*s4+v[20]*s5;
620     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
621                           v[16]*s4+v[21]*s5;
622     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
623                           v[17]*s4+v[22]*s5;
624     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
625                           v[18]*s4+v[23]*s5;
626     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
627                           v[19]*s4+v[24]*s5;
628   }
629 
630   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
631   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
632   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
633   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
634   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
635   PetscFunctionReturn(0);
636 }
637 
638 #undef __FUNCT__
639 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
640 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
641 {
642   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
643   IS                iscol=a->col,isrow=a->row;
644   PetscErrorCode    ierr;
645   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
646   PetscInt          i,nz,idx,idt,idc;
647   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
648   const MatScalar   *aa=a->a,*v;
649   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
650   const PetscScalar *b;
651 
652   PetscFunctionBegin;
653   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
654   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
655   t    = a->solve_work;
656 
657   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
658   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
659 
660   /* forward solve the lower triangular */
661   idx  = 4*(*r++);
662   t[0] = b[idx];   t[1] = b[1+idx];
663   t[2] = b[2+idx]; t[3] = b[3+idx];
664   for (i=1; i<n; i++) {
665     v   = aa + 16*ai[i];
666     vi  = aj + ai[i];
667     nz  = diag[i] - ai[i];
668     idx = 4*(*r++);
669     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
670     while (nz--) {
671       idx = 4*(*vi++);
672       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
673       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
674       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
675       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
676       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
677       v  += 16;
678     }
679     idx      = 4*i;
680     t[idx]   = s1;t[1+idx] = s2;
681     t[2+idx] = s3;t[3+idx] = s4;
682   }
683   /* backward solve the upper triangular */
684   for (i=n-1; i>=0; i--) {
685     v   = aa + 16*diag[i] + 16;
686     vi  = aj + diag[i] + 1;
687     nz  = ai[i+1] - diag[i] - 1;
688     idt = 4*i;
689     s1  = t[idt];  s2 = t[1+idt];
690     s3  = t[2+idt];s4 = t[3+idt];
691     while (nz--) {
692       idx = 4*(*vi++);
693       x1  = t[idx];   x2 = t[1+idx];
694       x3  = t[2+idx]; x4 = t[3+idx];
695       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
696       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
697       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
698       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
699       v  += 16;
700     }
701     idc      = 4*(*c--);
702     v        = aa + 16*diag[i];
703     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
704     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
705     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
706     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
707   }
708 
709   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
710   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
711   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
712   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
713   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
714   PetscFunctionReturn(0);
715 }
716 
717 #undef __FUNCT__
718 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
719 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
720 {
721   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
722   IS                iscol=a->col,isrow=a->row;
723   PetscErrorCode    ierr;
724   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
725   PetscInt          i,nz,idx,idt,idc,m;
726   const PetscInt    *r,*c,*rout,*cout;
727   const MatScalar   *aa=a->a,*v;
728   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
729   const PetscScalar *b;
730 
731   PetscFunctionBegin;
732   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
733   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
734   t    = a->solve_work;
735 
736   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
737   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
738 
739   /* forward solve the lower triangular */
740   idx  = 4*r[0];
741   t[0] = b[idx];   t[1] = b[1+idx];
742   t[2] = b[2+idx]; t[3] = b[3+idx];
743   for (i=1; i<n; i++) {
744     v   = aa + 16*ai[i];
745     vi  = aj + ai[i];
746     nz  = ai[i+1] - ai[i];
747     idx = 4*r[i];
748     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
749     for (m=0; m<nz; m++) {
750       idx = 4*vi[m];
751       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
752       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
753       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
754       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
755       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
756       v  += 16;
757     }
758     idx      = 4*i;
759     t[idx]   = s1;t[1+idx] = s2;
760     t[2+idx] = s3;t[3+idx] = s4;
761   }
762   /* backward solve the upper triangular */
763   for (i=n-1; i>=0; i--) {
764     v   = aa + 16*(adiag[i+1]+1);
765     vi  = aj + adiag[i+1]+1;
766     nz  = adiag[i] - adiag[i+1] - 1;
767     idt = 4*i;
768     s1  = t[idt];  s2 = t[1+idt];
769     s3  = t[2+idt];s4 = t[3+idt];
770     for (m=0; m<nz; m++) {
771       idx = 4*vi[m];
772       x1  = t[idx];   x2 = t[1+idx];
773       x3  = t[2+idx]; x4 = t[3+idx];
774       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
775       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
776       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
777       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
778       v  += 16;
779     }
780     idc      = 4*c[i];
781     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
782     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
783     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
784     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
785   }
786 
787   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
788   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
789   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
790   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
791   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
792   PetscFunctionReturn(0);
793 }
794 
795 #undef __FUNCT__
796 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
797 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
798 {
799   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
800   IS                iscol=a->col,isrow=a->row;
801   PetscErrorCode    ierr;
802   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
803   PetscInt          i,nz,idx,idt,idc;
804   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
805   const MatScalar   *aa=a->a,*v;
806   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
807   PetscScalar       *x;
808   const PetscScalar *b;
809 
810   PetscFunctionBegin;
811   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
812   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
813   t    = (MatScalar*)a->solve_work;
814 
815   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
816   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
817 
818   /* forward solve the lower triangular */
819   idx  = 4*(*r++);
820   t[0] = (MatScalar)b[idx];
821   t[1] = (MatScalar)b[1+idx];
822   t[2] = (MatScalar)b[2+idx];
823   t[3] = (MatScalar)b[3+idx];
824   for (i=1; i<n; i++) {
825     v   = aa + 16*ai[i];
826     vi  = aj + ai[i];
827     nz  = diag[i] - ai[i];
828     idx = 4*(*r++);
829     s1  = (MatScalar)b[idx];
830     s2  = (MatScalar)b[1+idx];
831     s3  = (MatScalar)b[2+idx];
832     s4  = (MatScalar)b[3+idx];
833     while (nz--) {
834       idx = 4*(*vi++);
835       x1  = t[idx];
836       x2  = t[1+idx];
837       x3  = t[2+idx];
838       x4  = t[3+idx];
839       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
840       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
841       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
842       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
843       v  += 16;
844     }
845     idx      = 4*i;
846     t[idx]   = s1;
847     t[1+idx] = s2;
848     t[2+idx] = s3;
849     t[3+idx] = s4;
850   }
851   /* backward solve the upper triangular */
852   for (i=n-1; i>=0; i--) {
853     v   = aa + 16*diag[i] + 16;
854     vi  = aj + diag[i] + 1;
855     nz  = ai[i+1] - diag[i] - 1;
856     idt = 4*i;
857     s1  = t[idt];
858     s2  = t[1+idt];
859     s3  = t[2+idt];
860     s4  = t[3+idt];
861     while (nz--) {
862       idx = 4*(*vi++);
863       x1  = t[idx];
864       x2  = t[1+idx];
865       x3  = t[2+idx];
866       x4  = t[3+idx];
867       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
868       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
869       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
870       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
871       v  += 16;
872     }
873     idc      = 4*(*c--);
874     v        = aa + 16*diag[i];
875     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
876     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
877     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
878     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
879     x[idc]   = (PetscScalar)t[idt];
880     x[1+idc] = (PetscScalar)t[1+idt];
881     x[2+idc] = (PetscScalar)t[2+idt];
882     x[3+idc] = (PetscScalar)t[3+idt];
883   }
884 
885   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
886   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
887   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
888   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
889   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
890   PetscFunctionReturn(0);
891 }
892 
893 #if defined(PETSC_HAVE_SSE)
894 
895 #include PETSC_HAVE_SSE
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
899 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
900 {
901   /*
902      Note: This code uses demotion of double
903      to float when performing the mixed-mode computation.
904      This may not be numerically reasonable for all applications.
905   */
906   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
907   IS             iscol=a->col,isrow=a->row;
908   PetscErrorCode ierr;
909   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
910   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
911   MatScalar      *aa=a->a,*v;
912   PetscScalar    *x,*b,*t;
913 
914   /* Make space in temp stack for 16 Byte Aligned arrays */
915   float         ssealignedspace[11],*tmps,*tmpx;
916   unsigned long offset;
917 
918   PetscFunctionBegin;
919   SSE_SCOPE_BEGIN;
920 
921   offset = (unsigned long)ssealignedspace % 16;
922   if (offset) offset = (16 - offset)/4;
923   tmps = &ssealignedspace[offset];
924   tmpx = &ssealignedspace[offset+4];
925   PREFETCH_NTA(aa+16*ai[1]);
926 
927   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
928   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
929   t    = a->solve_work;
930 
931   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
932   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
933 
934   /* forward solve the lower triangular */
935   idx  = 4*(*r++);
936   t[0] = b[idx];   t[1] = b[1+idx];
937   t[2] = b[2+idx]; t[3] = b[3+idx];
938   v    =  aa + 16*ai[1];
939 
940   for (i=1; i<n; ) {
941     PREFETCH_NTA(&v[8]);
942     vi  =  aj      + ai[i];
943     nz  =  diag[i] - ai[i];
944     idx =  4*(*r++);
945 
946     /* Demote sum from double to float */
947     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
948     LOAD_PS(tmps,XMM7);
949 
950     while (nz--) {
951       PREFETCH_NTA(&v[16]);
952       idx = 4*(*vi++);
953 
954       /* Demote solution (so far) from double to float */
955       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
956 
957       /* 4x4 Matrix-Vector product with negative accumulation: */
958       SSE_INLINE_BEGIN_2(tmpx,v)
959       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
960 
961       /* First Column */
962       SSE_COPY_PS(XMM0,XMM6)
963       SSE_SHUFFLE(XMM0,XMM0,0x00)
964       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
965       SSE_SUB_PS(XMM7,XMM0)
966 
967       /* Second Column */
968       SSE_COPY_PS(XMM1,XMM6)
969       SSE_SHUFFLE(XMM1,XMM1,0x55)
970       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
971       SSE_SUB_PS(XMM7,XMM1)
972 
973       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
974 
975       /* Third Column */
976       SSE_COPY_PS(XMM2,XMM6)
977       SSE_SHUFFLE(XMM2,XMM2,0xAA)
978       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
979       SSE_SUB_PS(XMM7,XMM2)
980 
981       /* Fourth Column */
982       SSE_COPY_PS(XMM3,XMM6)
983       SSE_SHUFFLE(XMM3,XMM3,0xFF)
984       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
985       SSE_SUB_PS(XMM7,XMM3)
986       SSE_INLINE_END_2
987 
988       v += 16;
989     }
990     idx = 4*i;
991     v   = aa + 16*ai[++i];
992     PREFETCH_NTA(v);
993     STORE_PS(tmps,XMM7);
994 
995     /* Promote result from float to double */
996     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
997   }
998   /* backward solve the upper triangular */
999   idt  = 4*(n-1);
1000   ai16 = 16*diag[n-1];
1001   v    = aa + ai16 + 16;
1002   for (i=n-1; i>=0; ) {
1003     PREFETCH_NTA(&v[8]);
1004     vi = aj + diag[i] + 1;
1005     nz = ai[i+1] - diag[i] - 1;
1006 
1007     /* Demote accumulator from double to float */
1008     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1009     LOAD_PS(tmps,XMM7);
1010 
1011     while (nz--) {
1012       PREFETCH_NTA(&v[16]);
1013       idx = 4*(*vi++);
1014 
1015       /* Demote solution (so far) from double to float */
1016       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1017 
1018       /* 4x4 Matrix-Vector Product with negative accumulation: */
1019       SSE_INLINE_BEGIN_2(tmpx,v)
1020       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1021 
1022       /* First Column */
1023       SSE_COPY_PS(XMM0,XMM6)
1024       SSE_SHUFFLE(XMM0,XMM0,0x00)
1025       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1026       SSE_SUB_PS(XMM7,XMM0)
1027 
1028       /* Second Column */
1029       SSE_COPY_PS(XMM1,XMM6)
1030       SSE_SHUFFLE(XMM1,XMM1,0x55)
1031       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1032       SSE_SUB_PS(XMM7,XMM1)
1033 
1034       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1035 
1036       /* Third Column */
1037       SSE_COPY_PS(XMM2,XMM6)
1038       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1039       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1040       SSE_SUB_PS(XMM7,XMM2)
1041 
1042       /* Fourth Column */
1043       SSE_COPY_PS(XMM3,XMM6)
1044       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1045       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1046       SSE_SUB_PS(XMM7,XMM3)
1047       SSE_INLINE_END_2
1048       v += 16;
1049     }
1050     v    = aa + ai16;
1051     ai16 = 16*diag[--i];
1052     PREFETCH_NTA(aa+ai16+16);
1053     /*
1054        Scale the result by the diagonal 4x4 block,
1055        which was inverted as part of the factorization
1056     */
1057     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1058     /* First Column */
1059     SSE_COPY_PS(XMM0,XMM7)
1060     SSE_SHUFFLE(XMM0,XMM0,0x00)
1061     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1062 
1063     /* Second Column */
1064     SSE_COPY_PS(XMM1,XMM7)
1065     SSE_SHUFFLE(XMM1,XMM1,0x55)
1066     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1067     SSE_ADD_PS(XMM0,XMM1)
1068 
1069     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1070 
1071     /* Third Column */
1072     SSE_COPY_PS(XMM2,XMM7)
1073     SSE_SHUFFLE(XMM2,XMM2,0xAA)
1074     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1075     SSE_ADD_PS(XMM0,XMM2)
1076 
1077     /* Fourth Column */
1078     SSE_COPY_PS(XMM3,XMM7)
1079     SSE_SHUFFLE(XMM3,XMM3,0xFF)
1080     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1081     SSE_ADD_PS(XMM0,XMM3)
1082 
1083     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1084     SSE_INLINE_END_3
1085 
1086     /* Promote solution from float to double */
1087     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1088 
1089     /* Apply reordering to t and stream into x.    */
1090     /* This way, x doesn't pollute the cache.      */
1091     /* Be careful with size: 2 doubles = 4 floats! */
1092     idc = 4*(*c--);
1093     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1094     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1095     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1096     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1097     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1098     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1099     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1100     SSE_INLINE_END_2
1101     v    = aa + ai16 + 16;
1102     idt -= 4;
1103   }
1104 
1105   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1106   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1107   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1109   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1110   SSE_SCOPE_END;
1111   PetscFunctionReturn(0);
1112 }
1113 
1114 #endif
1115 
1116 #undef __FUNCT__
1117 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
1118 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1119 {
1120   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1121   IS                iscol=a->col,isrow=a->row;
1122   PetscErrorCode    ierr;
1123   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1124   PetscInt          i,nz,idx,idt,idc;
1125   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1126   const MatScalar   *aa=a->a,*v;
1127   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1128   const PetscScalar *b;
1129 
1130   PetscFunctionBegin;
1131   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133   t    = a->solve_work;
1134 
1135   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1136   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1137 
1138   /* forward solve the lower triangular */
1139   idx  = 3*(*r++);
1140   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1141   for (i=1; i<n; i++) {
1142     v   = aa + 9*ai[i];
1143     vi  = aj + ai[i];
1144     nz  = diag[i] - ai[i];
1145     idx = 3*(*r++);
1146     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1147     while (nz--) {
1148       idx = 3*(*vi++);
1149       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1150       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1151       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1152       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1153       v  += 9;
1154     }
1155     idx    = 3*i;
1156     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1157   }
1158   /* backward solve the upper triangular */
1159   for (i=n-1; i>=0; i--) {
1160     v   = aa + 9*diag[i] + 9;
1161     vi  = aj + diag[i] + 1;
1162     nz  = ai[i+1] - diag[i] - 1;
1163     idt = 3*i;
1164     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1165     while (nz--) {
1166       idx = 3*(*vi++);
1167       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1168       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1169       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1170       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1171       v  += 9;
1172     }
1173     idc      = 3*(*c--);
1174     v        = aa + 9*diag[i];
1175     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1176     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1177     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1178   }
1179   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1180   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1181   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1182   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1183   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1184   PetscFunctionReturn(0);
1185 }
1186 
1187 #undef __FUNCT__
1188 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
1189 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1190 {
1191   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1192   IS                iscol=a->col,isrow=a->row;
1193   PetscErrorCode    ierr;
1194   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1195   PetscInt          i,nz,idx,idt,idc,m;
1196   const PetscInt    *r,*c,*rout,*cout;
1197   const MatScalar   *aa=a->a,*v;
1198   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1199   const PetscScalar *b;
1200 
1201   PetscFunctionBegin;
1202   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1203   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1204   t    = a->solve_work;
1205 
1206   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1207   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1208 
1209   /* forward solve the lower triangular */
1210   idx  = 3*r[0];
1211   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1212   for (i=1; i<n; i++) {
1213     v   = aa + 9*ai[i];
1214     vi  = aj + ai[i];
1215     nz  = ai[i+1] - ai[i];
1216     idx = 3*r[i];
1217     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1218     for (m=0; m<nz; m++) {
1219       idx = 3*vi[m];
1220       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1221       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1222       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1223       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1224       v  += 9;
1225     }
1226     idx    = 3*i;
1227     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1228   }
1229   /* backward solve the upper triangular */
1230   for (i=n-1; i>=0; i--) {
1231     v   = aa + 9*(adiag[i+1]+1);
1232     vi  = aj + adiag[i+1]+1;
1233     nz  = adiag[i] - adiag[i+1] - 1;
1234     idt = 3*i;
1235     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1236     for (m=0; m<nz; m++) {
1237       idx = 3*vi[m];
1238       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1239       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1240       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1241       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1242       v  += 9;
1243     }
1244     idc      = 3*c[i];
1245     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1246     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1247     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1248   }
1249   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1250   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1251   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1252   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1253   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1254   PetscFunctionReturn(0);
1255 }
1256 
1257 #undef __FUNCT__
1258 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
1259 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1260 {
1261   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1262   IS                iscol=a->col,isrow=a->row;
1263   PetscErrorCode    ierr;
1264   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1265   PetscInt          i,nz,idx,idt,idc;
1266   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1267   const MatScalar   *aa=a->a,*v;
1268   PetscScalar       *x,s1,s2,x1,x2,*t;
1269   const PetscScalar *b;
1270 
1271   PetscFunctionBegin;
1272   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1273   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1274   t    = a->solve_work;
1275 
1276   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1277   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1278 
1279   /* forward solve the lower triangular */
1280   idx  = 2*(*r++);
1281   t[0] = b[idx]; t[1] = b[1+idx];
1282   for (i=1; i<n; i++) {
1283     v   = aa + 4*ai[i];
1284     vi  = aj + ai[i];
1285     nz  = diag[i] - ai[i];
1286     idx = 2*(*r++);
1287     s1  = b[idx]; s2 = b[1+idx];
1288     while (nz--) {
1289       idx = 2*(*vi++);
1290       x1  = t[idx]; x2 = t[1+idx];
1291       s1 -= v[0]*x1 + v[2]*x2;
1292       s2 -= v[1]*x1 + v[3]*x2;
1293       v  += 4;
1294     }
1295     idx    = 2*i;
1296     t[idx] = s1; t[1+idx] = s2;
1297   }
1298   /* backward solve the upper triangular */
1299   for (i=n-1; i>=0; i--) {
1300     v   = aa + 4*diag[i] + 4;
1301     vi  = aj + diag[i] + 1;
1302     nz  = ai[i+1] - diag[i] - 1;
1303     idt = 2*i;
1304     s1  = t[idt]; s2 = t[1+idt];
1305     while (nz--) {
1306       idx = 2*(*vi++);
1307       x1  = t[idx]; x2 = t[1+idx];
1308       s1 -= v[0]*x1 + v[2]*x2;
1309       s2 -= v[1]*x1 + v[3]*x2;
1310       v  += 4;
1311     }
1312     idc      = 2*(*c--);
1313     v        = aa + 4*diag[i];
1314     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1315     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1316   }
1317   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1318   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1319   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1320   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1321   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1322   PetscFunctionReturn(0);
1323 }
1324 
1325 #undef __FUNCT__
1326 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
1327 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1328 {
1329   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1330   IS                iscol=a->col,isrow=a->row;
1331   PetscErrorCode    ierr;
1332   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1333   PetscInt          i,nz,idx,jdx,idt,idc,m;
1334   const PetscInt    *r,*c,*rout,*cout;
1335   const MatScalar   *aa=a->a,*v;
1336   PetscScalar       *x,s1,s2,x1,x2,*t;
1337   const PetscScalar *b;
1338 
1339   PetscFunctionBegin;
1340   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1341   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1342   t    = a->solve_work;
1343 
1344   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1345   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1346 
1347   /* forward solve the lower triangular */
1348   idx  = 2*r[0];
1349   t[0] = b[idx]; t[1] = b[1+idx];
1350   for (i=1; i<n; i++) {
1351     v   = aa + 4*ai[i];
1352     vi  = aj + ai[i];
1353     nz  = ai[i+1] - ai[i];
1354     idx = 2*r[i];
1355     s1  = b[idx]; s2 = b[1+idx];
1356     for (m=0; m<nz; m++) {
1357       jdx = 2*vi[m];
1358       x1  = t[jdx]; x2 = t[1+jdx];
1359       s1 -= v[0]*x1 + v[2]*x2;
1360       s2 -= v[1]*x1 + v[3]*x2;
1361       v  += 4;
1362     }
1363     idx    = 2*i;
1364     t[idx] = s1; t[1+idx] = s2;
1365   }
1366   /* backward solve the upper triangular */
1367   for (i=n-1; i>=0; i--) {
1368     v   = aa + 4*(adiag[i+1]+1);
1369     vi  = aj + adiag[i+1]+1;
1370     nz  = adiag[i] - adiag[i+1] - 1;
1371     idt = 2*i;
1372     s1  = t[idt]; s2 = t[1+idt];
1373     for (m=0; m<nz; m++) {
1374       idx = 2*vi[m];
1375       x1  = t[idx]; x2 = t[1+idx];
1376       s1 -= v[0]*x1 + v[2]*x2;
1377       s2 -= v[1]*x1 + v[3]*x2;
1378       v  += 4;
1379     }
1380     idc      = 2*c[i];
1381     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1382     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1383   }
1384   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1385   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1386   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1387   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1388   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1389   PetscFunctionReturn(0);
1390 }
1391 
1392 #undef __FUNCT__
1393 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
1394 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1395 {
1396   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1397   IS                iscol=a->col,isrow=a->row;
1398   PetscErrorCode    ierr;
1399   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1400   PetscInt          i,nz;
1401   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1402   const MatScalar   *aa=a->a,*v;
1403   PetscScalar       *x,s1,*t;
1404   const PetscScalar *b;
1405 
1406   PetscFunctionBegin;
1407   if (!n) PetscFunctionReturn(0);
1408 
1409   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1410   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1411   t    = a->solve_work;
1412 
1413   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1414   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1415 
1416   /* forward solve the lower triangular */
1417   t[0] = b[*r++];
1418   for (i=1; i<n; i++) {
1419     v  = aa + ai[i];
1420     vi = aj + ai[i];
1421     nz = diag[i] - ai[i];
1422     s1 = b[*r++];
1423     while (nz--) {
1424       s1 -= (*v++)*t[*vi++];
1425     }
1426     t[i] = s1;
1427   }
1428   /* backward solve the upper triangular */
1429   for (i=n-1; i>=0; i--) {
1430     v  = aa + diag[i] + 1;
1431     vi = aj + diag[i] + 1;
1432     nz = ai[i+1] - diag[i] - 1;
1433     s1 = t[i];
1434     while (nz--) {
1435       s1 -= (*v++)*t[*vi++];
1436     }
1437     x[*c--] = t[i] = aa[diag[i]]*s1;
1438   }
1439 
1440   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1441   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1442   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1444   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 #undef __FUNCT__
1449 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
1450 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1451 {
1452   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
1453   IS                iscol = a->col,isrow = a->row;
1454   PetscErrorCode    ierr;
1455   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1456   const PetscInt    *rout,*cout,*r,*c;
1457   PetscScalar       *x,*tmp,sum;
1458   const PetscScalar *b;
1459   const MatScalar   *aa = a->a,*v;
1460 
1461   PetscFunctionBegin;
1462   if (!n) PetscFunctionReturn(0);
1463 
1464   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1465   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1466   tmp  = a->solve_work;
1467 
1468   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1469   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1470 
1471   /* forward solve the lower triangular */
1472   tmp[0] = b[r[0]];
1473   v      = aa;
1474   vi     = aj;
1475   for (i=1; i<n; i++) {
1476     nz  = ai[i+1] - ai[i];
1477     sum = b[r[i]];
1478     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1479     tmp[i] = sum;
1480     v     += nz; vi += nz;
1481   }
1482 
1483   /* backward solve the upper triangular */
1484   for (i=n-1; i>=0; i--) {
1485     v   = aa + adiag[i+1]+1;
1486     vi  = aj + adiag[i+1]+1;
1487     nz  = adiag[i]-adiag[i+1]-1;
1488     sum = tmp[i];
1489     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1490     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1491   }
1492 
1493   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1494   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1495   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1496   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1497   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
1498   PetscFunctionReturn(0);
1499 }
1500 
1501