xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision 5b6bfdb9644f185dbf5e5a09b808ec241507e1e7)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
5 {
6   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
7   IS                iscol=a->col,isrow=a->row;
8   PetscErrorCode    ierr;
9   const PetscInt    *r,*c,*rout,*cout;
10   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
11   PetscInt          i,nz;
12   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
13   const MatScalar   *aa=a->a,*v;
14   PetscScalar       *x,*s,*t,*ls;
15   const PetscScalar *b;
16 
17   PetscFunctionBegin;
18   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
19   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
20   t    = a->solve_work;
21 
22   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
23   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
24 
25   /* forward solve the lower triangular */
26   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
27   for (i=1; i<n; i++) {
28     v    = aa + bs2*ai[i];
29     vi   = aj + ai[i];
30     nz   = a->diag[i] - ai[i];
31     s    = t + bs*i;
32     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
33     while (nz--) {
34       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
35       v += bs2;
36     }
37   }
38   /* backward solve the upper triangular */
39   ls = a->solve_work + A->cmap->n;
40   for (i=n-1; i>=0; i--) {
41     v    = aa + bs2*(a->diag[i] + 1);
42     vi   = aj + a->diag[i] + 1;
43     nz   = ai[i+1] - a->diag[i] - 1;
44     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
45     while (nz--) {
46       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
47       v += bs2;
48     }
49     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
50     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
51   }
52 
53   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
54   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
55   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
56   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
57   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
58   PetscFunctionReturn(0);
59 }
60 
61 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
62 {
63   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
64   IS                iscol=a->col,isrow=a->row;
65   PetscErrorCode    ierr;
66   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
67   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
68   PetscInt          i,nz,idx,idt,idc;
69   const MatScalar   *aa=a->a,*v;
70   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
71   const PetscScalar *b;
72 
73   PetscFunctionBegin;
74   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
75   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
76   t    = a->solve_work;
77 
78   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
79   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
80 
81   /* forward solve the lower triangular */
82   idx  = 7*(*r++);
83   t[0] = b[idx];   t[1] = b[1+idx];
84   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
85   t[5] = b[5+idx]; t[6] = b[6+idx];
86 
87   for (i=1; i<n; i++) {
88     v   = aa + 49*ai[i];
89     vi  = aj + ai[i];
90     nz  = diag[i] - ai[i];
91     idx = 7*(*r++);
92     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
93     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
94     while (nz--) {
95       idx = 7*(*vi++);
96       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
97       x4  = t[3+idx];x5 = t[4+idx];
98       x6  = t[5+idx];x7 = t[6+idx];
99       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
100       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
101       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
102       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
103       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
104       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
105       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
106       v  += 49;
107     }
108     idx      = 7*i;
109     t[idx]   = s1;t[1+idx] = s2;
110     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
111     t[5+idx] = s6;t[6+idx] = s7;
112   }
113   /* backward solve the upper triangular */
114   for (i=n-1; i>=0; i--) {
115     v   = aa + 49*diag[i] + 49;
116     vi  = aj + diag[i] + 1;
117     nz  = ai[i+1] - diag[i] - 1;
118     idt = 7*i;
119     s1  = t[idt];  s2 = t[1+idt];
120     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
121     s6  = t[5+idt];s7 = t[6+idt];
122     while (nz--) {
123       idx = 7*(*vi++);
124       x1  = t[idx];   x2 = t[1+idx];
125       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
126       x6  = t[5+idx]; x7 = t[6+idx];
127       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
128       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
129       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
130       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
131       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
132       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
133       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
134       v  += 49;
135     }
136     idc    = 7*(*c--);
137     v      = aa + 49*diag[i];
138     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
139                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
140     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
141                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
142     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
143                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
144     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
145                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
146     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
147                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
148     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
149                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
150     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
151                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
152   }
153 
154   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
155   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
156   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
157   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
158   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
159   PetscFunctionReturn(0);
160 }
161 
162 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
163 {
164   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
165   IS                iscol=a->col,isrow=a->row;
166   PetscErrorCode    ierr;
167   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
168   const PetscInt    n=a->mbs,*rout,*cout,*vi;
169   PetscInt          i,nz,idx,idt,idc,m;
170   const MatScalar   *aa=a->a,*v;
171   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
172   const PetscScalar *b;
173 
174   PetscFunctionBegin;
175   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
176   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
177   t    = a->solve_work;
178 
179   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
180   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
181 
182   /* forward solve the lower triangular */
183   idx  = 7*r[0];
184   t[0] = b[idx];   t[1] = b[1+idx];
185   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
186   t[5] = b[5+idx]; t[6] = b[6+idx];
187 
188   for (i=1; i<n; i++) {
189     v   = aa + 49*ai[i];
190     vi  = aj + ai[i];
191     nz  = ai[i+1] - ai[i];
192     idx = 7*r[i];
193     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
194     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
195     for (m=0; m<nz; m++) {
196       idx = 7*vi[m];
197       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
198       x4  = t[3+idx];x5 = t[4+idx];
199       x6  = t[5+idx];x7 = t[6+idx];
200       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
201       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
202       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
203       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
204       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
205       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
206       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
207       v  += 49;
208     }
209     idx      = 7*i;
210     t[idx]   = s1;t[1+idx] = s2;
211     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
212     t[5+idx] = s6;t[6+idx] = s7;
213   }
214   /* backward solve the upper triangular */
215   for (i=n-1; i>=0; i--) {
216     v   = aa + 49*(adiag[i+1]+1);
217     vi  = aj + adiag[i+1]+1;
218     nz  = adiag[i] - adiag[i+1] - 1;
219     idt = 7*i;
220     s1  = t[idt];  s2 = t[1+idt];
221     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
222     s6  = t[5+idt];s7 = t[6+idt];
223     for (m=0; m<nz; m++) {
224       idx = 7*vi[m];
225       x1  = t[idx];   x2 = t[1+idx];
226       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
227       x6  = t[5+idx]; x7 = t[6+idx];
228       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
229       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
230       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
231       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
232       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
233       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
234       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
235       v  += 49;
236     }
237     idc    = 7*c[i];
238     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
239                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
240     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
241                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
242     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
243                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
244     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
245                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
246     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
247                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
248     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
249                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
250     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
251                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
252   }
253 
254   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
255   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
256   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
257   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
258   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
259   PetscFunctionReturn(0);
260 }
261 
262 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
263 {
264   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
265   IS                iscol=a->col,isrow=a->row;
266   PetscErrorCode    ierr;
267   const PetscInt    *r,*c,*rout,*cout;
268   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
269   PetscInt          i,nz,idx,idt,idc;
270   const MatScalar   *aa=a->a,*v;
271   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
272   const PetscScalar *b;
273 
274   PetscFunctionBegin;
275   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
276   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
277   t    = a->solve_work;
278 
279   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
280   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
281 
282   /* forward solve the lower triangular */
283   idx  = 6*(*r++);
284   t[0] = b[idx];   t[1] = b[1+idx];
285   t[2] = b[2+idx]; t[3] = b[3+idx];
286   t[4] = b[4+idx]; t[5] = b[5+idx];
287   for (i=1; i<n; i++) {
288     v   = aa + 36*ai[i];
289     vi  = aj + ai[i];
290     nz  = diag[i] - ai[i];
291     idx = 6*(*r++);
292     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
293     s5  = b[4+idx]; s6 = b[5+idx];
294     while (nz--) {
295       idx = 6*(*vi++);
296       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
297       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
298       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
299       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
300       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
301       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
302       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
303       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
304       v  += 36;
305     }
306     idx      = 6*i;
307     t[idx]   = s1;t[1+idx] = s2;
308     t[2+idx] = s3;t[3+idx] = s4;
309     t[4+idx] = s5;t[5+idx] = s6;
310   }
311   /* backward solve the upper triangular */
312   for (i=n-1; i>=0; i--) {
313     v   = aa + 36*diag[i] + 36;
314     vi  = aj + diag[i] + 1;
315     nz  = ai[i+1] - diag[i] - 1;
316     idt = 6*i;
317     s1  = t[idt];  s2 = t[1+idt];
318     s3  = t[2+idt];s4 = t[3+idt];
319     s5  = t[4+idt];s6 = t[5+idt];
320     while (nz--) {
321       idx = 6*(*vi++);
322       x1  = t[idx];   x2 = t[1+idx];
323       x3  = t[2+idx]; x4 = t[3+idx];
324       x5  = t[4+idx]; x6 = t[5+idx];
325       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
326       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
327       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
328       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
329       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
330       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
331       v  += 36;
332     }
333     idc    = 6*(*c--);
334     v      = aa + 36*diag[i];
335     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
336                         v[18]*s4+v[24]*s5+v[30]*s6;
337     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
338                           v[19]*s4+v[25]*s5+v[31]*s6;
339     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
340                           v[20]*s4+v[26]*s5+v[32]*s6;
341     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
342                           v[21]*s4+v[27]*s5+v[33]*s6;
343     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
344                           v[22]*s4+v[28]*s5+v[34]*s6;
345     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
346                           v[23]*s4+v[29]*s5+v[35]*s6;
347   }
348 
349   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
350   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
351   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
352   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
353   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
354   PetscFunctionReturn(0);
355 }
356 
357 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
358 {
359   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
360   IS                iscol=a->col,isrow=a->row;
361   PetscErrorCode    ierr;
362   const PetscInt    *r,*c,*rout,*cout;
363   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
364   PetscInt          i,nz,idx,idt,idc,m;
365   const MatScalar   *aa=a->a,*v;
366   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
367   const PetscScalar *b;
368 
369   PetscFunctionBegin;
370   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
371   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
372   t    = a->solve_work;
373 
374   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
375   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
376 
377   /* forward solve the lower triangular */
378   idx  = 6*r[0];
379   t[0] = b[idx];   t[1] = b[1+idx];
380   t[2] = b[2+idx]; t[3] = b[3+idx];
381   t[4] = b[4+idx]; t[5] = b[5+idx];
382   for (i=1; i<n; i++) {
383     v   = aa + 36*ai[i];
384     vi  = aj + ai[i];
385     nz  = ai[i+1] - ai[i];
386     idx = 6*r[i];
387     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
388     s5  = b[4+idx]; s6 = b[5+idx];
389     for (m=0; m<nz; m++) {
390       idx = 6*vi[m];
391       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
392       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
393       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
394       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
395       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
396       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
397       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
398       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
399       v  += 36;
400     }
401     idx      = 6*i;
402     t[idx]   = s1;t[1+idx] = s2;
403     t[2+idx] = s3;t[3+idx] = s4;
404     t[4+idx] = s5;t[5+idx] = s6;
405   }
406   /* backward solve the upper triangular */
407   for (i=n-1; i>=0; i--) {
408     v   = aa + 36*(adiag[i+1]+1);
409     vi  = aj + adiag[i+1]+1;
410     nz  = adiag[i] - adiag[i+1] - 1;
411     idt = 6*i;
412     s1  = t[idt];  s2 = t[1+idt];
413     s3  = t[2+idt];s4 = t[3+idt];
414     s5  = t[4+idt];s6 = t[5+idt];
415     for (m=0; m<nz; m++) {
416       idx = 6*vi[m];
417       x1  = t[idx];   x2 = t[1+idx];
418       x3  = t[2+idx]; x4 = t[3+idx];
419       x5  = t[4+idx]; x6 = t[5+idx];
420       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
421       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
422       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
423       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
424       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
425       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
426       v  += 36;
427     }
428     idc    = 6*c[i];
429     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
430                         v[18]*s4+v[24]*s5+v[30]*s6;
431     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
432                           v[19]*s4+v[25]*s5+v[31]*s6;
433     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
434                           v[20]*s4+v[26]*s5+v[32]*s6;
435     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
436                           v[21]*s4+v[27]*s5+v[33]*s6;
437     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
438                           v[22]*s4+v[28]*s5+v[34]*s6;
439     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
440                           v[23]*s4+v[29]*s5+v[35]*s6;
441   }
442 
443   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
444   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
445   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
446   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
447   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
448   PetscFunctionReturn(0);
449 }
450 
451 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
452 {
453   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
454   IS                iscol=a->col,isrow=a->row;
455   PetscErrorCode    ierr;
456   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
457   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
458   PetscInt          i,nz,idx,idt,idc;
459   const MatScalar   *aa=a->a,*v;
460   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
461   const PetscScalar *b;
462 
463   PetscFunctionBegin;
464   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
465   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
466   t    = a->solve_work;
467 
468   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
469   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
470 
471   /* forward solve the lower triangular */
472   idx  = 5*(*r++);
473   t[0] = b[idx];   t[1] = b[1+idx];
474   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
475   for (i=1; i<n; i++) {
476     v   = aa + 25*ai[i];
477     vi  = aj + ai[i];
478     nz  = diag[i] - ai[i];
479     idx = 5*(*r++);
480     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
481     s5  = b[4+idx];
482     while (nz--) {
483       idx = 5*(*vi++);
484       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
485       x4  = t[3+idx];x5 = t[4+idx];
486       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
487       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
488       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
489       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
490       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
491       v  += 25;
492     }
493     idx      = 5*i;
494     t[idx]   = s1;t[1+idx] = s2;
495     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
496   }
497   /* backward solve the upper triangular */
498   for (i=n-1; i>=0; i--) {
499     v   = aa + 25*diag[i] + 25;
500     vi  = aj + diag[i] + 1;
501     nz  = ai[i+1] - diag[i] - 1;
502     idt = 5*i;
503     s1  = t[idt];  s2 = t[1+idt];
504     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
505     while (nz--) {
506       idx = 5*(*vi++);
507       x1  = t[idx];   x2 = t[1+idx];
508       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
509       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
510       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
511       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
512       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
513       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
514       v  += 25;
515     }
516     idc    = 5*(*c--);
517     v      = aa + 25*diag[i];
518     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
519                         v[15]*s4+v[20]*s5;
520     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
521                           v[16]*s4+v[21]*s5;
522     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
523                           v[17]*s4+v[22]*s5;
524     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
525                           v[18]*s4+v[23]*s5;
526     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
527                           v[19]*s4+v[24]*s5;
528   }
529 
530   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
531   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
532   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
533   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
534   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
535   PetscFunctionReturn(0);
536 }
537 
538 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
539 {
540   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
541   IS                iscol=a->col,isrow=a->row;
542   PetscErrorCode    ierr;
543   const PetscInt    *r,*c,*rout,*cout;
544   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
545   PetscInt          i,nz,idx,idt,idc,m;
546   const MatScalar   *aa=a->a,*v;
547   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
548   const PetscScalar *b;
549 
550   PetscFunctionBegin;
551   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
552   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
553   t    = a->solve_work;
554 
555   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
556   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
557 
558   /* forward solve the lower triangular */
559   idx  = 5*r[0];
560   t[0] = b[idx];   t[1] = b[1+idx];
561   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
562   for (i=1; i<n; i++) {
563     v   = aa + 25*ai[i];
564     vi  = aj + ai[i];
565     nz  = ai[i+1] - ai[i];
566     idx = 5*r[i];
567     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
568     s5  = b[4+idx];
569     for (m=0; m<nz; m++) {
570       idx = 5*vi[m];
571       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
572       x4  = t[3+idx];x5 = t[4+idx];
573       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
574       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
575       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
576       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
577       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
578       v  += 25;
579     }
580     idx      = 5*i;
581     t[idx]   = s1;t[1+idx] = s2;
582     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
583   }
584   /* backward solve the upper triangular */
585   for (i=n-1; i>=0; i--) {
586     v   = aa + 25*(adiag[i+1]+1);
587     vi  = aj + adiag[i+1]+1;
588     nz  = adiag[i] - adiag[i+1] - 1;
589     idt = 5*i;
590     s1  = t[idt];  s2 = t[1+idt];
591     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
592     for (m=0; m<nz; m++) {
593       idx = 5*vi[m];
594       x1  = t[idx];   x2 = t[1+idx];
595       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
596       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
597       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
598       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
599       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
600       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
601       v  += 25;
602     }
603     idc    = 5*c[i];
604     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
605                         v[15]*s4+v[20]*s5;
606     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
607                           v[16]*s4+v[21]*s5;
608     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
609                           v[17]*s4+v[22]*s5;
610     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
611                           v[18]*s4+v[23]*s5;
612     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
613                           v[19]*s4+v[24]*s5;
614   }
615 
616   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
617   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
618   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
619   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
620   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
621   PetscFunctionReturn(0);
622 }
623 
624 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
625 {
626   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
627   IS                iscol=a->col,isrow=a->row;
628   PetscErrorCode    ierr;
629   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
630   PetscInt          i,nz,idx,idt,idc;
631   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
632   const MatScalar   *aa=a->a,*v;
633   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
634   const PetscScalar *b;
635 
636   PetscFunctionBegin;
637   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
638   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
639   t    = a->solve_work;
640 
641   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
642   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
643 
644   /* forward solve the lower triangular */
645   idx  = 4*(*r++);
646   t[0] = b[idx];   t[1] = b[1+idx];
647   t[2] = b[2+idx]; t[3] = b[3+idx];
648   for (i=1; i<n; i++) {
649     v   = aa + 16*ai[i];
650     vi  = aj + ai[i];
651     nz  = diag[i] - ai[i];
652     idx = 4*(*r++);
653     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
654     while (nz--) {
655       idx = 4*(*vi++);
656       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
657       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
658       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
659       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
660       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
661       v  += 16;
662     }
663     idx      = 4*i;
664     t[idx]   = s1;t[1+idx] = s2;
665     t[2+idx] = s3;t[3+idx] = s4;
666   }
667   /* backward solve the upper triangular */
668   for (i=n-1; i>=0; i--) {
669     v   = aa + 16*diag[i] + 16;
670     vi  = aj + diag[i] + 1;
671     nz  = ai[i+1] - diag[i] - 1;
672     idt = 4*i;
673     s1  = t[idt];  s2 = t[1+idt];
674     s3  = t[2+idt];s4 = t[3+idt];
675     while (nz--) {
676       idx = 4*(*vi++);
677       x1  = t[idx];   x2 = t[1+idx];
678       x3  = t[2+idx]; x4 = t[3+idx];
679       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
680       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
681       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
682       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
683       v  += 16;
684     }
685     idc      = 4*(*c--);
686     v        = aa + 16*diag[i];
687     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
688     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
689     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
690     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
691   }
692 
693   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
694   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
695   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
696   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
697   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
698   PetscFunctionReturn(0);
699 }
700 
701 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
702 {
703   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
704   IS                iscol=a->col,isrow=a->row;
705   PetscErrorCode    ierr;
706   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
707   PetscInt          i,nz,idx,idt,idc,m;
708   const PetscInt    *r,*c,*rout,*cout;
709   const MatScalar   *aa=a->a,*v;
710   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
711   const PetscScalar *b;
712 
713   PetscFunctionBegin;
714   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
715   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
716   t    = a->solve_work;
717 
718   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
719   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
720 
721   /* forward solve the lower triangular */
722   idx  = 4*r[0];
723   t[0] = b[idx];   t[1] = b[1+idx];
724   t[2] = b[2+idx]; t[3] = b[3+idx];
725   for (i=1; i<n; i++) {
726     v   = aa + 16*ai[i];
727     vi  = aj + ai[i];
728     nz  = ai[i+1] - ai[i];
729     idx = 4*r[i];
730     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
731     for (m=0; m<nz; m++) {
732       idx = 4*vi[m];
733       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
734       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
735       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
736       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
737       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
738       v  += 16;
739     }
740     idx      = 4*i;
741     t[idx]   = s1;t[1+idx] = s2;
742     t[2+idx] = s3;t[3+idx] = s4;
743   }
744   /* backward solve the upper triangular */
745   for (i=n-1; i>=0; i--) {
746     v   = aa + 16*(adiag[i+1]+1);
747     vi  = aj + adiag[i+1]+1;
748     nz  = adiag[i] - adiag[i+1] - 1;
749     idt = 4*i;
750     s1  = t[idt];  s2 = t[1+idt];
751     s3  = t[2+idt];s4 = t[3+idt];
752     for (m=0; m<nz; m++) {
753       idx = 4*vi[m];
754       x1  = t[idx];   x2 = t[1+idx];
755       x3  = t[2+idx]; x4 = t[3+idx];
756       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
757       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
758       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
759       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
760       v  += 16;
761     }
762     idc      = 4*c[i];
763     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
764     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
765     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
766     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
767   }
768 
769   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
770   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
771   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
772   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
773   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
774   PetscFunctionReturn(0);
775 }
776 
777 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
778 {
779   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
780   IS                iscol=a->col,isrow=a->row;
781   PetscErrorCode    ierr;
782   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
783   PetscInt          i,nz,idx,idt,idc;
784   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
785   const MatScalar   *aa=a->a,*v;
786   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
787   PetscScalar       *x;
788   const PetscScalar *b;
789 
790   PetscFunctionBegin;
791   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   t    = (MatScalar*)a->solve_work;
794 
795   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
796   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
797 
798   /* forward solve the lower triangular */
799   idx  = 4*(*r++);
800   t[0] = (MatScalar)b[idx];
801   t[1] = (MatScalar)b[1+idx];
802   t[2] = (MatScalar)b[2+idx];
803   t[3] = (MatScalar)b[3+idx];
804   for (i=1; i<n; i++) {
805     v   = aa + 16*ai[i];
806     vi  = aj + ai[i];
807     nz  = diag[i] - ai[i];
808     idx = 4*(*r++);
809     s1  = (MatScalar)b[idx];
810     s2  = (MatScalar)b[1+idx];
811     s3  = (MatScalar)b[2+idx];
812     s4  = (MatScalar)b[3+idx];
813     while (nz--) {
814       idx = 4*(*vi++);
815       x1  = t[idx];
816       x2  = t[1+idx];
817       x3  = t[2+idx];
818       x4  = t[3+idx];
819       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
820       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
821       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
822       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
823       v  += 16;
824     }
825     idx      = 4*i;
826     t[idx]   = s1;
827     t[1+idx] = s2;
828     t[2+idx] = s3;
829     t[3+idx] = s4;
830   }
831   /* backward solve the upper triangular */
832   for (i=n-1; i>=0; i--) {
833     v   = aa + 16*diag[i] + 16;
834     vi  = aj + diag[i] + 1;
835     nz  = ai[i+1] - diag[i] - 1;
836     idt = 4*i;
837     s1  = t[idt];
838     s2  = t[1+idt];
839     s3  = t[2+idt];
840     s4  = t[3+idt];
841     while (nz--) {
842       idx = 4*(*vi++);
843       x1  = t[idx];
844       x2  = t[1+idx];
845       x3  = t[2+idx];
846       x4  = t[3+idx];
847       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
848       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
849       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
850       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
851       v  += 16;
852     }
853     idc      = 4*(*c--);
854     v        = aa + 16*diag[i];
855     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
856     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
857     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
858     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
859     x[idc]   = (PetscScalar)t[idt];
860     x[1+idc] = (PetscScalar)t[1+idt];
861     x[2+idc] = (PetscScalar)t[2+idt];
862     x[3+idc] = (PetscScalar)t[3+idt];
863   }
864 
865   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
866   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
867   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
868   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
869   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
870   PetscFunctionReturn(0);
871 }
872 
873 #if defined(PETSC_HAVE_SSE)
874 
875 #include PETSC_HAVE_SSE
876 
877 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
878 {
879   /*
880      Note: This code uses demotion of double
881      to float when performing the mixed-mode computation.
882      This may not be numerically reasonable for all applications.
883   */
884   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
885   IS             iscol=a->col,isrow=a->row;
886   PetscErrorCode ierr;
887   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
888   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
889   MatScalar      *aa=a->a,*v;
890   PetscScalar    *x,*b,*t;
891 
892   /* Make space in temp stack for 16 Byte Aligned arrays */
893   float         ssealignedspace[11],*tmps,*tmpx;
894   unsigned long offset;
895 
896   PetscFunctionBegin;
897   SSE_SCOPE_BEGIN;
898 
899   offset = (unsigned long)ssealignedspace % 16;
900   if (offset) offset = (16 - offset)/4;
901   tmps = &ssealignedspace[offset];
902   tmpx = &ssealignedspace[offset+4];
903   PREFETCH_NTA(aa+16*ai[1]);
904 
905   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
906   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
907   t    = a->solve_work;
908 
909   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
910   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
911 
912   /* forward solve the lower triangular */
913   idx  = 4*(*r++);
914   t[0] = b[idx];   t[1] = b[1+idx];
915   t[2] = b[2+idx]; t[3] = b[3+idx];
916   v    =  aa + 16*ai[1];
917 
918   for (i=1; i<n; ) {
919     PREFETCH_NTA(&v[8]);
920     vi  =  aj      + ai[i];
921     nz  =  diag[i] - ai[i];
922     idx =  4*(*r++);
923 
924     /* Demote sum from double to float */
925     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
926     LOAD_PS(tmps,XMM7);
927 
928     while (nz--) {
929       PREFETCH_NTA(&v[16]);
930       idx = 4*(*vi++);
931 
932       /* Demote solution (so far) from double to float */
933       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
934 
935       /* 4x4 Matrix-Vector product with negative accumulation: */
936       SSE_INLINE_BEGIN_2(tmpx,v)
937       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
938 
939       /* First Column */
940       SSE_COPY_PS(XMM0,XMM6)
941       SSE_SHUFFLE(XMM0,XMM0,0x00)
942       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
943       SSE_SUB_PS(XMM7,XMM0)
944 
945       /* Second Column */
946       SSE_COPY_PS(XMM1,XMM6)
947       SSE_SHUFFLE(XMM1,XMM1,0x55)
948       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
949       SSE_SUB_PS(XMM7,XMM1)
950 
951       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
952 
953       /* Third Column */
954       SSE_COPY_PS(XMM2,XMM6)
955       SSE_SHUFFLE(XMM2,XMM2,0xAA)
956       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
957       SSE_SUB_PS(XMM7,XMM2)
958 
959       /* Fourth Column */
960       SSE_COPY_PS(XMM3,XMM6)
961       SSE_SHUFFLE(XMM3,XMM3,0xFF)
962       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
963       SSE_SUB_PS(XMM7,XMM3)
964       SSE_INLINE_END_2
965 
966       v += 16;
967     }
968     idx = 4*i;
969     v   = aa + 16*ai[++i];
970     PREFETCH_NTA(v);
971     STORE_PS(tmps,XMM7);
972 
973     /* Promote result from float to double */
974     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
975   }
976   /* backward solve the upper triangular */
977   idt  = 4*(n-1);
978   ai16 = 16*diag[n-1];
979   v    = aa + ai16 + 16;
980   for (i=n-1; i>=0; ) {
981     PREFETCH_NTA(&v[8]);
982     vi = aj + diag[i] + 1;
983     nz = ai[i+1] - diag[i] - 1;
984 
985     /* Demote accumulator from double to float */
986     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
987     LOAD_PS(tmps,XMM7);
988 
989     while (nz--) {
990       PREFETCH_NTA(&v[16]);
991       idx = 4*(*vi++);
992 
993       /* Demote solution (so far) from double to float */
994       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
995 
996       /* 4x4 Matrix-Vector Product with negative accumulation: */
997       SSE_INLINE_BEGIN_2(tmpx,v)
998       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
999 
1000       /* First Column */
1001       SSE_COPY_PS(XMM0,XMM6)
1002       SSE_SHUFFLE(XMM0,XMM0,0x00)
1003       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1004       SSE_SUB_PS(XMM7,XMM0)
1005 
1006       /* Second Column */
1007       SSE_COPY_PS(XMM1,XMM6)
1008       SSE_SHUFFLE(XMM1,XMM1,0x55)
1009       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1010       SSE_SUB_PS(XMM7,XMM1)
1011 
1012       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1013 
1014       /* Third Column */
1015       SSE_COPY_PS(XMM2,XMM6)
1016       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1017       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1018       SSE_SUB_PS(XMM7,XMM2)
1019 
1020       /* Fourth Column */
1021       SSE_COPY_PS(XMM3,XMM6)
1022       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1023       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1024       SSE_SUB_PS(XMM7,XMM3)
1025       SSE_INLINE_END_2
1026       v += 16;
1027     }
1028     v    = aa + ai16;
1029     ai16 = 16*diag[--i];
1030     PREFETCH_NTA(aa+ai16+16);
1031     /*
1032        Scale the result by the diagonal 4x4 block,
1033        which was inverted as part of the factorization
1034     */
1035     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1036     /* First Column */
1037     SSE_COPY_PS(XMM0,XMM7)
1038     SSE_SHUFFLE(XMM0,XMM0,0x00)
1039     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1040 
1041     /* Second Column */
1042     SSE_COPY_PS(XMM1,XMM7)
1043     SSE_SHUFFLE(XMM1,XMM1,0x55)
1044     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1045     SSE_ADD_PS(XMM0,XMM1)
1046 
1047     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1048 
1049     /* Third Column */
1050     SSE_COPY_PS(XMM2,XMM7)
1051     SSE_SHUFFLE(XMM2,XMM2,0xAA)
1052     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1053     SSE_ADD_PS(XMM0,XMM2)
1054 
1055     /* Fourth Column */
1056     SSE_COPY_PS(XMM3,XMM7)
1057     SSE_SHUFFLE(XMM3,XMM3,0xFF)
1058     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1059     SSE_ADD_PS(XMM0,XMM3)
1060 
1061     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1062     SSE_INLINE_END_3
1063 
1064     /* Promote solution from float to double */
1065     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1066 
1067     /* Apply reordering to t and stream into x.    */
1068     /* This way, x doesn't pollute the cache.      */
1069     /* Be careful with size: 2 doubles = 4 floats! */
1070     idc = 4*(*c--);
1071     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1072     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1073     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1074     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1075     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1076     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1077     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1078     SSE_INLINE_END_2
1079     v    = aa + ai16 + 16;
1080     idt -= 4;
1081   }
1082 
1083   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1084   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1085   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1086   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1087   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1088   SSE_SCOPE_END;
1089   PetscFunctionReturn(0);
1090 }
1091 
1092 #endif
1093 
1094 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1095 {
1096   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1097   IS                iscol=a->col,isrow=a->row;
1098   PetscErrorCode    ierr;
1099   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1100   PetscInt          i,nz,idx,idt,idc;
1101   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1102   const MatScalar   *aa=a->a,*v;
1103   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1104   const PetscScalar *b;
1105 
1106   PetscFunctionBegin;
1107   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1108   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1109   t    = a->solve_work;
1110 
1111   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1112   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1113 
1114   /* forward solve the lower triangular */
1115   idx  = 3*(*r++);
1116   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1117   for (i=1; i<n; i++) {
1118     v   = aa + 9*ai[i];
1119     vi  = aj + ai[i];
1120     nz  = diag[i] - ai[i];
1121     idx = 3*(*r++);
1122     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1123     while (nz--) {
1124       idx = 3*(*vi++);
1125       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1126       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1127       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1128       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1129       v  += 9;
1130     }
1131     idx    = 3*i;
1132     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1133   }
1134   /* backward solve the upper triangular */
1135   for (i=n-1; i>=0; i--) {
1136     v   = aa + 9*diag[i] + 9;
1137     vi  = aj + diag[i] + 1;
1138     nz  = ai[i+1] - diag[i] - 1;
1139     idt = 3*i;
1140     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1141     while (nz--) {
1142       idx = 3*(*vi++);
1143       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1144       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1145       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1146       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1147       v  += 9;
1148     }
1149     idc      = 3*(*c--);
1150     v        = aa + 9*diag[i];
1151     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1152     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1153     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1154   }
1155   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1156   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1157   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1158   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1159   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1160   PetscFunctionReturn(0);
1161 }
1162 
1163 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1164 {
1165   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1166   IS                iscol=a->col,isrow=a->row;
1167   PetscErrorCode    ierr;
1168   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1169   PetscInt          i,nz,idx,idt,idc,m;
1170   const PetscInt    *r,*c,*rout,*cout;
1171   const MatScalar   *aa=a->a,*v;
1172   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1173   const PetscScalar *b;
1174 
1175   PetscFunctionBegin;
1176   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1177   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1178   t    = a->solve_work;
1179 
1180   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1181   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1182 
1183   /* forward solve the lower triangular */
1184   idx  = 3*r[0];
1185   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1186   for (i=1; i<n; i++) {
1187     v   = aa + 9*ai[i];
1188     vi  = aj + ai[i];
1189     nz  = ai[i+1] - ai[i];
1190     idx = 3*r[i];
1191     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1192     for (m=0; m<nz; m++) {
1193       idx = 3*vi[m];
1194       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1195       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1196       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1197       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1198       v  += 9;
1199     }
1200     idx    = 3*i;
1201     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1202   }
1203   /* backward solve the upper triangular */
1204   for (i=n-1; i>=0; i--) {
1205     v   = aa + 9*(adiag[i+1]+1);
1206     vi  = aj + adiag[i+1]+1;
1207     nz  = adiag[i] - adiag[i+1] - 1;
1208     idt = 3*i;
1209     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1210     for (m=0; m<nz; m++) {
1211       idx = 3*vi[m];
1212       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1213       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1214       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1215       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1216       v  += 9;
1217     }
1218     idc      = 3*c[i];
1219     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1220     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1221     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1222   }
1223   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1224   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1225   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1226   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1227   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1228   PetscFunctionReturn(0);
1229 }
1230 
1231 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1232 {
1233   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1234   IS                iscol=a->col,isrow=a->row;
1235   PetscErrorCode    ierr;
1236   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1237   PetscInt          i,nz,idx,idt,idc;
1238   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1239   const MatScalar   *aa=a->a,*v;
1240   PetscScalar       *x,s1,s2,x1,x2,*t;
1241   const PetscScalar *b;
1242 
1243   PetscFunctionBegin;
1244   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1245   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1246   t    = a->solve_work;
1247 
1248   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1249   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1250 
1251   /* forward solve the lower triangular */
1252   idx  = 2*(*r++);
1253   t[0] = b[idx]; t[1] = b[1+idx];
1254   for (i=1; i<n; i++) {
1255     v   = aa + 4*ai[i];
1256     vi  = aj + ai[i];
1257     nz  = diag[i] - ai[i];
1258     idx = 2*(*r++);
1259     s1  = b[idx]; s2 = b[1+idx];
1260     while (nz--) {
1261       idx = 2*(*vi++);
1262       x1  = t[idx]; x2 = t[1+idx];
1263       s1 -= v[0]*x1 + v[2]*x2;
1264       s2 -= v[1]*x1 + v[3]*x2;
1265       v  += 4;
1266     }
1267     idx    = 2*i;
1268     t[idx] = s1; t[1+idx] = s2;
1269   }
1270   /* backward solve the upper triangular */
1271   for (i=n-1; i>=0; i--) {
1272     v   = aa + 4*diag[i] + 4;
1273     vi  = aj + diag[i] + 1;
1274     nz  = ai[i+1] - diag[i] - 1;
1275     idt = 2*i;
1276     s1  = t[idt]; s2 = t[1+idt];
1277     while (nz--) {
1278       idx = 2*(*vi++);
1279       x1  = t[idx]; x2 = t[1+idx];
1280       s1 -= v[0]*x1 + v[2]*x2;
1281       s2 -= v[1]*x1 + v[3]*x2;
1282       v  += 4;
1283     }
1284     idc      = 2*(*c--);
1285     v        = aa + 4*diag[i];
1286     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1287     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1288   }
1289   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1290   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1291   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1292   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1293   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1294   PetscFunctionReturn(0);
1295 }
1296 
1297 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1298 {
1299   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1300   IS                iscol=a->col,isrow=a->row;
1301   PetscErrorCode    ierr;
1302   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1303   PetscInt          i,nz,idx,jdx,idt,idc,m;
1304   const PetscInt    *r,*c,*rout,*cout;
1305   const MatScalar   *aa=a->a,*v;
1306   PetscScalar       *x,s1,s2,x1,x2,*t;
1307   const PetscScalar *b;
1308 
1309   PetscFunctionBegin;
1310   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1311   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1312   t    = a->solve_work;
1313 
1314   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1315   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1316 
1317   /* forward solve the lower triangular */
1318   idx  = 2*r[0];
1319   t[0] = b[idx]; t[1] = b[1+idx];
1320   for (i=1; i<n; i++) {
1321     v   = aa + 4*ai[i];
1322     vi  = aj + ai[i];
1323     nz  = ai[i+1] - ai[i];
1324     idx = 2*r[i];
1325     s1  = b[idx]; s2 = b[1+idx];
1326     for (m=0; m<nz; m++) {
1327       jdx = 2*vi[m];
1328       x1  = t[jdx]; x2 = t[1+jdx];
1329       s1 -= v[0]*x1 + v[2]*x2;
1330       s2 -= v[1]*x1 + v[3]*x2;
1331       v  += 4;
1332     }
1333     idx    = 2*i;
1334     t[idx] = s1; t[1+idx] = s2;
1335   }
1336   /* backward solve the upper triangular */
1337   for (i=n-1; i>=0; i--) {
1338     v   = aa + 4*(adiag[i+1]+1);
1339     vi  = aj + adiag[i+1]+1;
1340     nz  = adiag[i] - adiag[i+1] - 1;
1341     idt = 2*i;
1342     s1  = t[idt]; s2 = t[1+idt];
1343     for (m=0; m<nz; m++) {
1344       idx = 2*vi[m];
1345       x1  = t[idx]; x2 = t[1+idx];
1346       s1 -= v[0]*x1 + v[2]*x2;
1347       s2 -= v[1]*x1 + v[3]*x2;
1348       v  += 4;
1349     }
1350     idc      = 2*c[i];
1351     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1352     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1353   }
1354   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1355   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1356   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1357   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1358   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1363 {
1364   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1365   IS                iscol=a->col,isrow=a->row;
1366   PetscErrorCode    ierr;
1367   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1368   PetscInt          i,nz;
1369   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1370   const MatScalar   *aa=a->a,*v;
1371   PetscScalar       *x,s1,*t;
1372   const PetscScalar *b;
1373 
1374   PetscFunctionBegin;
1375   if (!n) PetscFunctionReturn(0);
1376 
1377   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1378   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1379   t    = a->solve_work;
1380 
1381   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1382   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1383 
1384   /* forward solve the lower triangular */
1385   t[0] = b[*r++];
1386   for (i=1; i<n; i++) {
1387     v  = aa + ai[i];
1388     vi = aj + ai[i];
1389     nz = diag[i] - ai[i];
1390     s1 = b[*r++];
1391     while (nz--) {
1392       s1 -= (*v++)*t[*vi++];
1393     }
1394     t[i] = s1;
1395   }
1396   /* backward solve the upper triangular */
1397   for (i=n-1; i>=0; i--) {
1398     v  = aa + diag[i] + 1;
1399     vi = aj + diag[i] + 1;
1400     nz = ai[i+1] - diag[i] - 1;
1401     s1 = t[i];
1402     while (nz--) {
1403       s1 -= (*v++)*t[*vi++];
1404     }
1405     x[*c--] = t[i] = aa[diag[i]]*s1;
1406   }
1407 
1408   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1409   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1410   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1411   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1412   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
1413   PetscFunctionReturn(0);
1414 }
1415 
1416 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1417 {
1418   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
1419   IS                iscol = a->col,isrow = a->row;
1420   PetscErrorCode    ierr;
1421   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1422   const PetscInt    *rout,*cout,*r,*c;
1423   PetscScalar       *x,*tmp,sum;
1424   const PetscScalar *b;
1425   const MatScalar   *aa = a->a,*v;
1426 
1427   PetscFunctionBegin;
1428   if (!n) PetscFunctionReturn(0);
1429 
1430   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1431   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1432   tmp  = a->solve_work;
1433 
1434   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1435   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1436 
1437   /* forward solve the lower triangular */
1438   tmp[0] = b[r[0]];
1439   v      = aa;
1440   vi     = aj;
1441   for (i=1; i<n; i++) {
1442     nz  = ai[i+1] - ai[i];
1443     sum = b[r[i]];
1444     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1445     tmp[i] = sum;
1446     v     += nz; vi += nz;
1447   }
1448 
1449   /* backward solve the upper triangular */
1450   for (i=n-1; i>=0; i--) {
1451     v   = aa + adiag[i+1]+1;
1452     vi  = aj + adiag[i+1]+1;
1453     nz  = adiag[i]-adiag[i+1]-1;
1454     sum = tmp[i];
1455     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1456     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1457   }
1458 
1459   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1460   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1461   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1462   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1463   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
1464   PetscFunctionReturn(0);
1465 }
1466 
1467