1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 5 { 6 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 7 IS iscol=a->col,isrow=a->row; 8 PetscErrorCode ierr; 9 const PetscInt *r,*c,*rout,*cout; 10 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 11 PetscInt i,nz; 12 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 13 const MatScalar *aa=a->a,*v; 14 PetscScalar *x,*s,*t,*ls; 15 const PetscScalar *b; 16 17 PetscFunctionBegin; 18 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 19 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 20 t = a->solve_work; 21 22 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 23 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 24 25 /* forward solve the lower triangular */ 26 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 27 for (i=1; i<n; i++) { 28 v = aa + bs2*ai[i]; 29 vi = aj + ai[i]; 30 nz = a->diag[i] - ai[i]; 31 s = t + bs*i; 32 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 33 while (nz--) { 34 PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 35 v += bs2; 36 } 37 } 38 /* backward solve the upper triangular */ 39 ls = a->solve_work + A->cmap->n; 40 for (i=n-1; i>=0; i--) { 41 v = aa + bs2*(a->diag[i] + 1); 42 vi = aj + a->diag[i] + 1; 43 nz = ai[i+1] - a->diag[i] - 1; 44 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 45 while (nz--) { 46 PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 47 v += bs2; 48 } 49 PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 50 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 51 } 52 53 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 54 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 55 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 56 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 57 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 58 PetscFunctionReturn(0); 59 } 60 61 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 62 { 63 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 64 IS iscol=a->col,isrow=a->row; 65 PetscErrorCode ierr; 66 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 67 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 68 PetscInt i,nz,idx,idt,idc; 69 const MatScalar *aa=a->a,*v; 70 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 71 const PetscScalar *b; 72 73 PetscFunctionBegin; 74 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 75 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 76 t = a->solve_work; 77 78 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 79 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 80 81 /* forward solve the lower triangular */ 82 idx = 7*(*r++); 83 t[0] = b[idx]; t[1] = b[1+idx]; 84 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 85 t[5] = b[5+idx]; t[6] = b[6+idx]; 86 87 for (i=1; i<n; i++) { 88 v = aa + 49*ai[i]; 89 vi = aj + ai[i]; 90 nz = diag[i] - ai[i]; 91 idx = 7*(*r++); 92 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 93 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 94 while (nz--) { 95 idx = 7*(*vi++); 96 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 97 x4 = t[3+idx];x5 = t[4+idx]; 98 x6 = t[5+idx];x7 = t[6+idx]; 99 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 100 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 101 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 102 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 103 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 104 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 105 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 106 v += 49; 107 } 108 idx = 7*i; 109 t[idx] = s1;t[1+idx] = s2; 110 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 111 t[5+idx] = s6;t[6+idx] = s7; 112 } 113 /* backward solve the upper triangular */ 114 for (i=n-1; i>=0; i--) { 115 v = aa + 49*diag[i] + 49; 116 vi = aj + diag[i] + 1; 117 nz = ai[i+1] - diag[i] - 1; 118 idt = 7*i; 119 s1 = t[idt]; s2 = t[1+idt]; 120 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 121 s6 = t[5+idt];s7 = t[6+idt]; 122 while (nz--) { 123 idx = 7*(*vi++); 124 x1 = t[idx]; x2 = t[1+idx]; 125 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 126 x6 = t[5+idx]; x7 = t[6+idx]; 127 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 128 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 129 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 130 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 131 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 132 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 133 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 134 v += 49; 135 } 136 idc = 7*(*c--); 137 v = aa + 49*diag[i]; 138 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 139 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 140 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 141 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 142 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 143 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 144 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 145 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 146 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 147 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 148 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 149 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 150 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 151 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 152 } 153 154 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 155 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 156 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 157 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 158 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 159 PetscFunctionReturn(0); 160 } 161 162 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 163 { 164 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 165 IS iscol=a->col,isrow=a->row; 166 PetscErrorCode ierr; 167 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 168 const PetscInt n=a->mbs,*rout,*cout,*vi; 169 PetscInt i,nz,idx,idt,idc,m; 170 const MatScalar *aa=a->a,*v; 171 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 172 const PetscScalar *b; 173 174 PetscFunctionBegin; 175 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 176 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 177 t = a->solve_work; 178 179 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 180 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 181 182 /* forward solve the lower triangular */ 183 idx = 7*r[0]; 184 t[0] = b[idx]; t[1] = b[1+idx]; 185 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 186 t[5] = b[5+idx]; t[6] = b[6+idx]; 187 188 for (i=1; i<n; i++) { 189 v = aa + 49*ai[i]; 190 vi = aj + ai[i]; 191 nz = ai[i+1] - ai[i]; 192 idx = 7*r[i]; 193 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 194 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 195 for (m=0; m<nz; m++) { 196 idx = 7*vi[m]; 197 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 198 x4 = t[3+idx];x5 = t[4+idx]; 199 x6 = t[5+idx];x7 = t[6+idx]; 200 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 201 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 202 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 203 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 204 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 205 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 206 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 207 v += 49; 208 } 209 idx = 7*i; 210 t[idx] = s1;t[1+idx] = s2; 211 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 212 t[5+idx] = s6;t[6+idx] = s7; 213 } 214 /* backward solve the upper triangular */ 215 for (i=n-1; i>=0; i--) { 216 v = aa + 49*(adiag[i+1]+1); 217 vi = aj + adiag[i+1]+1; 218 nz = adiag[i] - adiag[i+1] - 1; 219 idt = 7*i; 220 s1 = t[idt]; s2 = t[1+idt]; 221 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 222 s6 = t[5+idt];s7 = t[6+idt]; 223 for (m=0; m<nz; m++) { 224 idx = 7*vi[m]; 225 x1 = t[idx]; x2 = t[1+idx]; 226 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 227 x6 = t[5+idx]; x7 = t[6+idx]; 228 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 229 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 230 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 231 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 232 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 233 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 234 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 235 v += 49; 236 } 237 idc = 7*c[i]; 238 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 239 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 240 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 241 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 242 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 243 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 244 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 245 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 246 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 247 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 248 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 249 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 250 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 251 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 252 } 253 254 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 255 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 256 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 257 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 258 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 259 PetscFunctionReturn(0); 260 } 261 262 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 263 { 264 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 265 IS iscol=a->col,isrow=a->row; 266 PetscErrorCode ierr; 267 const PetscInt *r,*c,*rout,*cout; 268 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 269 PetscInt i,nz,idx,idt,idc; 270 const MatScalar *aa=a->a,*v; 271 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 272 const PetscScalar *b; 273 274 PetscFunctionBegin; 275 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 276 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 277 t = a->solve_work; 278 279 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 280 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 281 282 /* forward solve the lower triangular */ 283 idx = 6*(*r++); 284 t[0] = b[idx]; t[1] = b[1+idx]; 285 t[2] = b[2+idx]; t[3] = b[3+idx]; 286 t[4] = b[4+idx]; t[5] = b[5+idx]; 287 for (i=1; i<n; i++) { 288 v = aa + 36*ai[i]; 289 vi = aj + ai[i]; 290 nz = diag[i] - ai[i]; 291 idx = 6*(*r++); 292 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 293 s5 = b[4+idx]; s6 = b[5+idx]; 294 while (nz--) { 295 idx = 6*(*vi++); 296 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 297 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 298 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 299 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 300 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 301 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 302 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 303 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 304 v += 36; 305 } 306 idx = 6*i; 307 t[idx] = s1;t[1+idx] = s2; 308 t[2+idx] = s3;t[3+idx] = s4; 309 t[4+idx] = s5;t[5+idx] = s6; 310 } 311 /* backward solve the upper triangular */ 312 for (i=n-1; i>=0; i--) { 313 v = aa + 36*diag[i] + 36; 314 vi = aj + diag[i] + 1; 315 nz = ai[i+1] - diag[i] - 1; 316 idt = 6*i; 317 s1 = t[idt]; s2 = t[1+idt]; 318 s3 = t[2+idt];s4 = t[3+idt]; 319 s5 = t[4+idt];s6 = t[5+idt]; 320 while (nz--) { 321 idx = 6*(*vi++); 322 x1 = t[idx]; x2 = t[1+idx]; 323 x3 = t[2+idx]; x4 = t[3+idx]; 324 x5 = t[4+idx]; x6 = t[5+idx]; 325 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 326 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 327 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 328 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 329 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 330 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 331 v += 36; 332 } 333 idc = 6*(*c--); 334 v = aa + 36*diag[i]; 335 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 336 v[18]*s4+v[24]*s5+v[30]*s6; 337 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 338 v[19]*s4+v[25]*s5+v[31]*s6; 339 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 340 v[20]*s4+v[26]*s5+v[32]*s6; 341 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 342 v[21]*s4+v[27]*s5+v[33]*s6; 343 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 344 v[22]*s4+v[28]*s5+v[34]*s6; 345 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 346 v[23]*s4+v[29]*s5+v[35]*s6; 347 } 348 349 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 350 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 351 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 352 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 353 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 354 PetscFunctionReturn(0); 355 } 356 357 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 358 { 359 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 360 IS iscol=a->col,isrow=a->row; 361 PetscErrorCode ierr; 362 const PetscInt *r,*c,*rout,*cout; 363 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 364 PetscInt i,nz,idx,idt,idc,m; 365 const MatScalar *aa=a->a,*v; 366 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 367 const PetscScalar *b; 368 369 PetscFunctionBegin; 370 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 371 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 372 t = a->solve_work; 373 374 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 375 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 376 377 /* forward solve the lower triangular */ 378 idx = 6*r[0]; 379 t[0] = b[idx]; t[1] = b[1+idx]; 380 t[2] = b[2+idx]; t[3] = b[3+idx]; 381 t[4] = b[4+idx]; t[5] = b[5+idx]; 382 for (i=1; i<n; i++) { 383 v = aa + 36*ai[i]; 384 vi = aj + ai[i]; 385 nz = ai[i+1] - ai[i]; 386 idx = 6*r[i]; 387 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 388 s5 = b[4+idx]; s6 = b[5+idx]; 389 for (m=0; m<nz; m++) { 390 idx = 6*vi[m]; 391 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 392 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 393 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 394 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 395 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 396 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 397 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 398 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 399 v += 36; 400 } 401 idx = 6*i; 402 t[idx] = s1;t[1+idx] = s2; 403 t[2+idx] = s3;t[3+idx] = s4; 404 t[4+idx] = s5;t[5+idx] = s6; 405 } 406 /* backward solve the upper triangular */ 407 for (i=n-1; i>=0; i--) { 408 v = aa + 36*(adiag[i+1]+1); 409 vi = aj + adiag[i+1]+1; 410 nz = adiag[i] - adiag[i+1] - 1; 411 idt = 6*i; 412 s1 = t[idt]; s2 = t[1+idt]; 413 s3 = t[2+idt];s4 = t[3+idt]; 414 s5 = t[4+idt];s6 = t[5+idt]; 415 for (m=0; m<nz; m++) { 416 idx = 6*vi[m]; 417 x1 = t[idx]; x2 = t[1+idx]; 418 x3 = t[2+idx]; x4 = t[3+idx]; 419 x5 = t[4+idx]; x6 = t[5+idx]; 420 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 421 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 422 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 423 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 424 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 425 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 426 v += 36; 427 } 428 idc = 6*c[i]; 429 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 430 v[18]*s4+v[24]*s5+v[30]*s6; 431 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 432 v[19]*s4+v[25]*s5+v[31]*s6; 433 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 434 v[20]*s4+v[26]*s5+v[32]*s6; 435 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 436 v[21]*s4+v[27]*s5+v[33]*s6; 437 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 438 v[22]*s4+v[28]*s5+v[34]*s6; 439 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 440 v[23]*s4+v[29]*s5+v[35]*s6; 441 } 442 443 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 444 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 445 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 446 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 447 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 448 PetscFunctionReturn(0); 449 } 450 451 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 452 { 453 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 454 IS iscol=a->col,isrow=a->row; 455 PetscErrorCode ierr; 456 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 457 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 458 PetscInt i,nz,idx,idt,idc; 459 const MatScalar *aa=a->a,*v; 460 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 461 const PetscScalar *b; 462 463 PetscFunctionBegin; 464 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 465 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 466 t = a->solve_work; 467 468 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 469 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 470 471 /* forward solve the lower triangular */ 472 idx = 5*(*r++); 473 t[0] = b[idx]; t[1] = b[1+idx]; 474 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 475 for (i=1; i<n; i++) { 476 v = aa + 25*ai[i]; 477 vi = aj + ai[i]; 478 nz = diag[i] - ai[i]; 479 idx = 5*(*r++); 480 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 481 s5 = b[4+idx]; 482 while (nz--) { 483 idx = 5*(*vi++); 484 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 485 x4 = t[3+idx];x5 = t[4+idx]; 486 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 487 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 488 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 489 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 490 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 491 v += 25; 492 } 493 idx = 5*i; 494 t[idx] = s1;t[1+idx] = s2; 495 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 496 } 497 /* backward solve the upper triangular */ 498 for (i=n-1; i>=0; i--) { 499 v = aa + 25*diag[i] + 25; 500 vi = aj + diag[i] + 1; 501 nz = ai[i+1] - diag[i] - 1; 502 idt = 5*i; 503 s1 = t[idt]; s2 = t[1+idt]; 504 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 505 while (nz--) { 506 idx = 5*(*vi++); 507 x1 = t[idx]; x2 = t[1+idx]; 508 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 509 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 510 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 511 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 512 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 513 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 514 v += 25; 515 } 516 idc = 5*(*c--); 517 v = aa + 25*diag[i]; 518 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 519 v[15]*s4+v[20]*s5; 520 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 521 v[16]*s4+v[21]*s5; 522 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 523 v[17]*s4+v[22]*s5; 524 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 525 v[18]*s4+v[23]*s5; 526 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 527 v[19]*s4+v[24]*s5; 528 } 529 530 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 531 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 532 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 533 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 534 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 535 PetscFunctionReturn(0); 536 } 537 538 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 539 { 540 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 541 IS iscol=a->col,isrow=a->row; 542 PetscErrorCode ierr; 543 const PetscInt *r,*c,*rout,*cout; 544 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 545 PetscInt i,nz,idx,idt,idc,m; 546 const MatScalar *aa=a->a,*v; 547 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 548 const PetscScalar *b; 549 550 PetscFunctionBegin; 551 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 552 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 553 t = a->solve_work; 554 555 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 556 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 557 558 /* forward solve the lower triangular */ 559 idx = 5*r[0]; 560 t[0] = b[idx]; t[1] = b[1+idx]; 561 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 562 for (i=1; i<n; i++) { 563 v = aa + 25*ai[i]; 564 vi = aj + ai[i]; 565 nz = ai[i+1] - ai[i]; 566 idx = 5*r[i]; 567 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 568 s5 = b[4+idx]; 569 for (m=0; m<nz; m++) { 570 idx = 5*vi[m]; 571 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 572 x4 = t[3+idx];x5 = t[4+idx]; 573 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 574 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 575 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 576 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 577 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 578 v += 25; 579 } 580 idx = 5*i; 581 t[idx] = s1;t[1+idx] = s2; 582 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 583 } 584 /* backward solve the upper triangular */ 585 for (i=n-1; i>=0; i--) { 586 v = aa + 25*(adiag[i+1]+1); 587 vi = aj + adiag[i+1]+1; 588 nz = adiag[i] - adiag[i+1] - 1; 589 idt = 5*i; 590 s1 = t[idt]; s2 = t[1+idt]; 591 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 592 for (m=0; m<nz; m++) { 593 idx = 5*vi[m]; 594 x1 = t[idx]; x2 = t[1+idx]; 595 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 596 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 597 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 598 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 599 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 600 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 601 v += 25; 602 } 603 idc = 5*c[i]; 604 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 605 v[15]*s4+v[20]*s5; 606 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 607 v[16]*s4+v[21]*s5; 608 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 609 v[17]*s4+v[22]*s5; 610 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 611 v[18]*s4+v[23]*s5; 612 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 613 v[19]*s4+v[24]*s5; 614 } 615 616 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 617 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 618 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 619 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 620 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 621 PetscFunctionReturn(0); 622 } 623 624 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 625 { 626 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 627 IS iscol=a->col,isrow=a->row; 628 PetscErrorCode ierr; 629 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 630 PetscInt i,nz,idx,idt,idc; 631 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 632 const MatScalar *aa=a->a,*v; 633 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 634 const PetscScalar *b; 635 636 PetscFunctionBegin; 637 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 638 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 639 t = a->solve_work; 640 641 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 642 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 643 644 /* forward solve the lower triangular */ 645 idx = 4*(*r++); 646 t[0] = b[idx]; t[1] = b[1+idx]; 647 t[2] = b[2+idx]; t[3] = b[3+idx]; 648 for (i=1; i<n; i++) { 649 v = aa + 16*ai[i]; 650 vi = aj + ai[i]; 651 nz = diag[i] - ai[i]; 652 idx = 4*(*r++); 653 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 654 while (nz--) { 655 idx = 4*(*vi++); 656 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 657 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 658 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 659 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 660 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 661 v += 16; 662 } 663 idx = 4*i; 664 t[idx] = s1;t[1+idx] = s2; 665 t[2+idx] = s3;t[3+idx] = s4; 666 } 667 /* backward solve the upper triangular */ 668 for (i=n-1; i>=0; i--) { 669 v = aa + 16*diag[i] + 16; 670 vi = aj + diag[i] + 1; 671 nz = ai[i+1] - diag[i] - 1; 672 idt = 4*i; 673 s1 = t[idt]; s2 = t[1+idt]; 674 s3 = t[2+idt];s4 = t[3+idt]; 675 while (nz--) { 676 idx = 4*(*vi++); 677 x1 = t[idx]; x2 = t[1+idx]; 678 x3 = t[2+idx]; x4 = t[3+idx]; 679 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 680 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 681 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 682 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 683 v += 16; 684 } 685 idc = 4*(*c--); 686 v = aa + 16*diag[i]; 687 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 688 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 689 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 690 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 691 } 692 693 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 694 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 695 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 696 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 697 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 698 PetscFunctionReturn(0); 699 } 700 701 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 702 { 703 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 704 IS iscol=a->col,isrow=a->row; 705 PetscErrorCode ierr; 706 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 707 PetscInt i,nz,idx,idt,idc,m; 708 const PetscInt *r,*c,*rout,*cout; 709 const MatScalar *aa=a->a,*v; 710 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 711 const PetscScalar *b; 712 713 PetscFunctionBegin; 714 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 715 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 716 t = a->solve_work; 717 718 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 719 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 720 721 /* forward solve the lower triangular */ 722 idx = 4*r[0]; 723 t[0] = b[idx]; t[1] = b[1+idx]; 724 t[2] = b[2+idx]; t[3] = b[3+idx]; 725 for (i=1; i<n; i++) { 726 v = aa + 16*ai[i]; 727 vi = aj + ai[i]; 728 nz = ai[i+1] - ai[i]; 729 idx = 4*r[i]; 730 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 731 for (m=0; m<nz; m++) { 732 idx = 4*vi[m]; 733 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 734 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 735 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 736 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 737 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 738 v += 16; 739 } 740 idx = 4*i; 741 t[idx] = s1;t[1+idx] = s2; 742 t[2+idx] = s3;t[3+idx] = s4; 743 } 744 /* backward solve the upper triangular */ 745 for (i=n-1; i>=0; i--) { 746 v = aa + 16*(adiag[i+1]+1); 747 vi = aj + adiag[i+1]+1; 748 nz = adiag[i] - adiag[i+1] - 1; 749 idt = 4*i; 750 s1 = t[idt]; s2 = t[1+idt]; 751 s3 = t[2+idt];s4 = t[3+idt]; 752 for (m=0; m<nz; m++) { 753 idx = 4*vi[m]; 754 x1 = t[idx]; x2 = t[1+idx]; 755 x3 = t[2+idx]; x4 = t[3+idx]; 756 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 757 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 758 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 759 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 760 v += 16; 761 } 762 idc = 4*c[i]; 763 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 764 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 765 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 766 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 767 } 768 769 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 770 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 771 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 772 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 773 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 774 PetscFunctionReturn(0); 775 } 776 777 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 778 { 779 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 780 IS iscol=a->col,isrow=a->row; 781 PetscErrorCode ierr; 782 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 783 PetscInt i,nz,idx,idt,idc; 784 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 785 const MatScalar *aa=a->a,*v; 786 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 787 PetscScalar *x; 788 const PetscScalar *b; 789 790 PetscFunctionBegin; 791 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 792 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 793 t = (MatScalar*)a->solve_work; 794 795 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 796 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 797 798 /* forward solve the lower triangular */ 799 idx = 4*(*r++); 800 t[0] = (MatScalar)b[idx]; 801 t[1] = (MatScalar)b[1+idx]; 802 t[2] = (MatScalar)b[2+idx]; 803 t[3] = (MatScalar)b[3+idx]; 804 for (i=1; i<n; i++) { 805 v = aa + 16*ai[i]; 806 vi = aj + ai[i]; 807 nz = diag[i] - ai[i]; 808 idx = 4*(*r++); 809 s1 = (MatScalar)b[idx]; 810 s2 = (MatScalar)b[1+idx]; 811 s3 = (MatScalar)b[2+idx]; 812 s4 = (MatScalar)b[3+idx]; 813 while (nz--) { 814 idx = 4*(*vi++); 815 x1 = t[idx]; 816 x2 = t[1+idx]; 817 x3 = t[2+idx]; 818 x4 = t[3+idx]; 819 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 820 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 821 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 822 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 823 v += 16; 824 } 825 idx = 4*i; 826 t[idx] = s1; 827 t[1+idx] = s2; 828 t[2+idx] = s3; 829 t[3+idx] = s4; 830 } 831 /* backward solve the upper triangular */ 832 for (i=n-1; i>=0; i--) { 833 v = aa + 16*diag[i] + 16; 834 vi = aj + diag[i] + 1; 835 nz = ai[i+1] - diag[i] - 1; 836 idt = 4*i; 837 s1 = t[idt]; 838 s2 = t[1+idt]; 839 s3 = t[2+idt]; 840 s4 = t[3+idt]; 841 while (nz--) { 842 idx = 4*(*vi++); 843 x1 = t[idx]; 844 x2 = t[1+idx]; 845 x3 = t[2+idx]; 846 x4 = t[3+idx]; 847 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 848 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 849 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 850 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 851 v += 16; 852 } 853 idc = 4*(*c--); 854 v = aa + 16*diag[i]; 855 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 856 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 857 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 858 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 859 x[idc] = (PetscScalar)t[idt]; 860 x[1+idc] = (PetscScalar)t[1+idt]; 861 x[2+idc] = (PetscScalar)t[2+idt]; 862 x[3+idc] = (PetscScalar)t[3+idt]; 863 } 864 865 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 866 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 867 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 868 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 869 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 870 PetscFunctionReturn(0); 871 } 872 873 #if defined(PETSC_HAVE_SSE) 874 875 #include PETSC_HAVE_SSE 876 877 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 878 { 879 /* 880 Note: This code uses demotion of double 881 to float when performing the mixed-mode computation. 882 This may not be numerically reasonable for all applications. 883 */ 884 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 885 IS iscol=a->col,isrow=a->row; 886 PetscErrorCode ierr; 887 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 888 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 889 MatScalar *aa=a->a,*v; 890 PetscScalar *x,*b,*t; 891 892 /* Make space in temp stack for 16 Byte Aligned arrays */ 893 float ssealignedspace[11],*tmps,*tmpx; 894 unsigned long offset; 895 896 PetscFunctionBegin; 897 SSE_SCOPE_BEGIN; 898 899 offset = (unsigned long)ssealignedspace % 16; 900 if (offset) offset = (16 - offset)/4; 901 tmps = &ssealignedspace[offset]; 902 tmpx = &ssealignedspace[offset+4]; 903 PREFETCH_NTA(aa+16*ai[1]); 904 905 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 906 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 907 t = a->solve_work; 908 909 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 910 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 911 912 /* forward solve the lower triangular */ 913 idx = 4*(*r++); 914 t[0] = b[idx]; t[1] = b[1+idx]; 915 t[2] = b[2+idx]; t[3] = b[3+idx]; 916 v = aa + 16*ai[1]; 917 918 for (i=1; i<n; ) { 919 PREFETCH_NTA(&v[8]); 920 vi = aj + ai[i]; 921 nz = diag[i] - ai[i]; 922 idx = 4*(*r++); 923 924 /* Demote sum from double to float */ 925 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 926 LOAD_PS(tmps,XMM7); 927 928 while (nz--) { 929 PREFETCH_NTA(&v[16]); 930 idx = 4*(*vi++); 931 932 /* Demote solution (so far) from double to float */ 933 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 934 935 /* 4x4 Matrix-Vector product with negative accumulation: */ 936 SSE_INLINE_BEGIN_2(tmpx,v) 937 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 938 939 /* First Column */ 940 SSE_COPY_PS(XMM0,XMM6) 941 SSE_SHUFFLE(XMM0,XMM0,0x00) 942 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 943 SSE_SUB_PS(XMM7,XMM0) 944 945 /* Second Column */ 946 SSE_COPY_PS(XMM1,XMM6) 947 SSE_SHUFFLE(XMM1,XMM1,0x55) 948 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 949 SSE_SUB_PS(XMM7,XMM1) 950 951 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 952 953 /* Third Column */ 954 SSE_COPY_PS(XMM2,XMM6) 955 SSE_SHUFFLE(XMM2,XMM2,0xAA) 956 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 957 SSE_SUB_PS(XMM7,XMM2) 958 959 /* Fourth Column */ 960 SSE_COPY_PS(XMM3,XMM6) 961 SSE_SHUFFLE(XMM3,XMM3,0xFF) 962 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 963 SSE_SUB_PS(XMM7,XMM3) 964 SSE_INLINE_END_2 965 966 v += 16; 967 } 968 idx = 4*i; 969 v = aa + 16*ai[++i]; 970 PREFETCH_NTA(v); 971 STORE_PS(tmps,XMM7); 972 973 /* Promote result from float to double */ 974 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 975 } 976 /* backward solve the upper triangular */ 977 idt = 4*(n-1); 978 ai16 = 16*diag[n-1]; 979 v = aa + ai16 + 16; 980 for (i=n-1; i>=0; ) { 981 PREFETCH_NTA(&v[8]); 982 vi = aj + diag[i] + 1; 983 nz = ai[i+1] - diag[i] - 1; 984 985 /* Demote accumulator from double to float */ 986 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 987 LOAD_PS(tmps,XMM7); 988 989 while (nz--) { 990 PREFETCH_NTA(&v[16]); 991 idx = 4*(*vi++); 992 993 /* Demote solution (so far) from double to float */ 994 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 995 996 /* 4x4 Matrix-Vector Product with negative accumulation: */ 997 SSE_INLINE_BEGIN_2(tmpx,v) 998 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 999 1000 /* First Column */ 1001 SSE_COPY_PS(XMM0,XMM6) 1002 SSE_SHUFFLE(XMM0,XMM0,0x00) 1003 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1004 SSE_SUB_PS(XMM7,XMM0) 1005 1006 /* Second Column */ 1007 SSE_COPY_PS(XMM1,XMM6) 1008 SSE_SHUFFLE(XMM1,XMM1,0x55) 1009 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1010 SSE_SUB_PS(XMM7,XMM1) 1011 1012 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1013 1014 /* Third Column */ 1015 SSE_COPY_PS(XMM2,XMM6) 1016 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1017 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1018 SSE_SUB_PS(XMM7,XMM2) 1019 1020 /* Fourth Column */ 1021 SSE_COPY_PS(XMM3,XMM6) 1022 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1023 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1024 SSE_SUB_PS(XMM7,XMM3) 1025 SSE_INLINE_END_2 1026 v += 16; 1027 } 1028 v = aa + ai16; 1029 ai16 = 16*diag[--i]; 1030 PREFETCH_NTA(aa+ai16+16); 1031 /* 1032 Scale the result by the diagonal 4x4 block, 1033 which was inverted as part of the factorization 1034 */ 1035 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 1036 /* First Column */ 1037 SSE_COPY_PS(XMM0,XMM7) 1038 SSE_SHUFFLE(XMM0,XMM0,0x00) 1039 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 1040 1041 /* Second Column */ 1042 SSE_COPY_PS(XMM1,XMM7) 1043 SSE_SHUFFLE(XMM1,XMM1,0x55) 1044 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 1045 SSE_ADD_PS(XMM0,XMM1) 1046 1047 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 1048 1049 /* Third Column */ 1050 SSE_COPY_PS(XMM2,XMM7) 1051 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1052 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 1053 SSE_ADD_PS(XMM0,XMM2) 1054 1055 /* Fourth Column */ 1056 SSE_COPY_PS(XMM3,XMM7) 1057 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1058 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 1059 SSE_ADD_PS(XMM0,XMM3) 1060 1061 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 1062 SSE_INLINE_END_3 1063 1064 /* Promote solution from float to double */ 1065 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 1066 1067 /* Apply reordering to t and stream into x. */ 1068 /* This way, x doesn't pollute the cache. */ 1069 /* Be careful with size: 2 doubles = 4 floats! */ 1070 idc = 4*(*c--); 1071 SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc]) 1072 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1073 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 1074 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 1075 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1076 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 1077 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 1078 SSE_INLINE_END_2 1079 v = aa + ai16 + 16; 1080 idt -= 4; 1081 } 1082 1083 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1084 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1085 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1086 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1087 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1088 SSE_SCOPE_END; 1089 PetscFunctionReturn(0); 1090 } 1091 1092 #endif 1093 1094 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1095 { 1096 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1097 IS iscol=a->col,isrow=a->row; 1098 PetscErrorCode ierr; 1099 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1100 PetscInt i,nz,idx,idt,idc; 1101 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1102 const MatScalar *aa=a->a,*v; 1103 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1104 const PetscScalar *b; 1105 1106 PetscFunctionBegin; 1107 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1108 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1109 t = a->solve_work; 1110 1111 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1112 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1113 1114 /* forward solve the lower triangular */ 1115 idx = 3*(*r++); 1116 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1117 for (i=1; i<n; i++) { 1118 v = aa + 9*ai[i]; 1119 vi = aj + ai[i]; 1120 nz = diag[i] - ai[i]; 1121 idx = 3*(*r++); 1122 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1123 while (nz--) { 1124 idx = 3*(*vi++); 1125 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1126 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1127 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1128 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1129 v += 9; 1130 } 1131 idx = 3*i; 1132 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1133 } 1134 /* backward solve the upper triangular */ 1135 for (i=n-1; i>=0; i--) { 1136 v = aa + 9*diag[i] + 9; 1137 vi = aj + diag[i] + 1; 1138 nz = ai[i+1] - diag[i] - 1; 1139 idt = 3*i; 1140 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1141 while (nz--) { 1142 idx = 3*(*vi++); 1143 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1144 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1145 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1146 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1147 v += 9; 1148 } 1149 idc = 3*(*c--); 1150 v = aa + 9*diag[i]; 1151 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1152 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1153 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1154 } 1155 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1156 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1157 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1158 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1159 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1160 PetscFunctionReturn(0); 1161 } 1162 1163 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1164 { 1165 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1166 IS iscol=a->col,isrow=a->row; 1167 PetscErrorCode ierr; 1168 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1169 PetscInt i,nz,idx,idt,idc,m; 1170 const PetscInt *r,*c,*rout,*cout; 1171 const MatScalar *aa=a->a,*v; 1172 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1173 const PetscScalar *b; 1174 1175 PetscFunctionBegin; 1176 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1177 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1178 t = a->solve_work; 1179 1180 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1181 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1182 1183 /* forward solve the lower triangular */ 1184 idx = 3*r[0]; 1185 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1186 for (i=1; i<n; i++) { 1187 v = aa + 9*ai[i]; 1188 vi = aj + ai[i]; 1189 nz = ai[i+1] - ai[i]; 1190 idx = 3*r[i]; 1191 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1192 for (m=0; m<nz; m++) { 1193 idx = 3*vi[m]; 1194 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1195 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1196 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1197 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1198 v += 9; 1199 } 1200 idx = 3*i; 1201 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1202 } 1203 /* backward solve the upper triangular */ 1204 for (i=n-1; i>=0; i--) { 1205 v = aa + 9*(adiag[i+1]+1); 1206 vi = aj + adiag[i+1]+1; 1207 nz = adiag[i] - adiag[i+1] - 1; 1208 idt = 3*i; 1209 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1210 for (m=0; m<nz; m++) { 1211 idx = 3*vi[m]; 1212 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1213 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1214 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1215 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1216 v += 9; 1217 } 1218 idc = 3*c[i]; 1219 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1220 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1221 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1222 } 1223 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1224 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1225 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1226 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1227 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1228 PetscFunctionReturn(0); 1229 } 1230 1231 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1232 { 1233 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1234 IS iscol=a->col,isrow=a->row; 1235 PetscErrorCode ierr; 1236 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1237 PetscInt i,nz,idx,idt,idc; 1238 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1239 const MatScalar *aa=a->a,*v; 1240 PetscScalar *x,s1,s2,x1,x2,*t; 1241 const PetscScalar *b; 1242 1243 PetscFunctionBegin; 1244 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1245 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1246 t = a->solve_work; 1247 1248 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1249 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1250 1251 /* forward solve the lower triangular */ 1252 idx = 2*(*r++); 1253 t[0] = b[idx]; t[1] = b[1+idx]; 1254 for (i=1; i<n; i++) { 1255 v = aa + 4*ai[i]; 1256 vi = aj + ai[i]; 1257 nz = diag[i] - ai[i]; 1258 idx = 2*(*r++); 1259 s1 = b[idx]; s2 = b[1+idx]; 1260 while (nz--) { 1261 idx = 2*(*vi++); 1262 x1 = t[idx]; x2 = t[1+idx]; 1263 s1 -= v[0]*x1 + v[2]*x2; 1264 s2 -= v[1]*x1 + v[3]*x2; 1265 v += 4; 1266 } 1267 idx = 2*i; 1268 t[idx] = s1; t[1+idx] = s2; 1269 } 1270 /* backward solve the upper triangular */ 1271 for (i=n-1; i>=0; i--) { 1272 v = aa + 4*diag[i] + 4; 1273 vi = aj + diag[i] + 1; 1274 nz = ai[i+1] - diag[i] - 1; 1275 idt = 2*i; 1276 s1 = t[idt]; s2 = t[1+idt]; 1277 while (nz--) { 1278 idx = 2*(*vi++); 1279 x1 = t[idx]; x2 = t[1+idx]; 1280 s1 -= v[0]*x1 + v[2]*x2; 1281 s2 -= v[1]*x1 + v[3]*x2; 1282 v += 4; 1283 } 1284 idc = 2*(*c--); 1285 v = aa + 4*diag[i]; 1286 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1287 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1288 } 1289 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1290 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1291 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1292 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1293 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1294 PetscFunctionReturn(0); 1295 } 1296 1297 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1298 { 1299 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1300 IS iscol=a->col,isrow=a->row; 1301 PetscErrorCode ierr; 1302 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1303 PetscInt i,nz,idx,jdx,idt,idc,m; 1304 const PetscInt *r,*c,*rout,*cout; 1305 const MatScalar *aa=a->a,*v; 1306 PetscScalar *x,s1,s2,x1,x2,*t; 1307 const PetscScalar *b; 1308 1309 PetscFunctionBegin; 1310 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1311 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1312 t = a->solve_work; 1313 1314 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1315 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1316 1317 /* forward solve the lower triangular */ 1318 idx = 2*r[0]; 1319 t[0] = b[idx]; t[1] = b[1+idx]; 1320 for (i=1; i<n; i++) { 1321 v = aa + 4*ai[i]; 1322 vi = aj + ai[i]; 1323 nz = ai[i+1] - ai[i]; 1324 idx = 2*r[i]; 1325 s1 = b[idx]; s2 = b[1+idx]; 1326 for (m=0; m<nz; m++) { 1327 jdx = 2*vi[m]; 1328 x1 = t[jdx]; x2 = t[1+jdx]; 1329 s1 -= v[0]*x1 + v[2]*x2; 1330 s2 -= v[1]*x1 + v[3]*x2; 1331 v += 4; 1332 } 1333 idx = 2*i; 1334 t[idx] = s1; t[1+idx] = s2; 1335 } 1336 /* backward solve the upper triangular */ 1337 for (i=n-1; i>=0; i--) { 1338 v = aa + 4*(adiag[i+1]+1); 1339 vi = aj + adiag[i+1]+1; 1340 nz = adiag[i] - adiag[i+1] - 1; 1341 idt = 2*i; 1342 s1 = t[idt]; s2 = t[1+idt]; 1343 for (m=0; m<nz; m++) { 1344 idx = 2*vi[m]; 1345 x1 = t[idx]; x2 = t[1+idx]; 1346 s1 -= v[0]*x1 + v[2]*x2; 1347 s2 -= v[1]*x1 + v[3]*x2; 1348 v += 4; 1349 } 1350 idc = 2*c[i]; 1351 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1352 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1353 } 1354 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1355 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1356 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1357 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1358 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1359 PetscFunctionReturn(0); 1360 } 1361 1362 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1363 { 1364 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1365 IS iscol=a->col,isrow=a->row; 1366 PetscErrorCode ierr; 1367 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1368 PetscInt i,nz; 1369 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1370 const MatScalar *aa=a->a,*v; 1371 PetscScalar *x,s1,*t; 1372 const PetscScalar *b; 1373 1374 PetscFunctionBegin; 1375 if (!n) PetscFunctionReturn(0); 1376 1377 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1378 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1379 t = a->solve_work; 1380 1381 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1382 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1383 1384 /* forward solve the lower triangular */ 1385 t[0] = b[*r++]; 1386 for (i=1; i<n; i++) { 1387 v = aa + ai[i]; 1388 vi = aj + ai[i]; 1389 nz = diag[i] - ai[i]; 1390 s1 = b[*r++]; 1391 while (nz--) { 1392 s1 -= (*v++)*t[*vi++]; 1393 } 1394 t[i] = s1; 1395 } 1396 /* backward solve the upper triangular */ 1397 for (i=n-1; i>=0; i--) { 1398 v = aa + diag[i] + 1; 1399 vi = aj + diag[i] + 1; 1400 nz = ai[i+1] - diag[i] - 1; 1401 s1 = t[i]; 1402 while (nz--) { 1403 s1 -= (*v++)*t[*vi++]; 1404 } 1405 x[*c--] = t[i] = aa[diag[i]]*s1; 1406 } 1407 1408 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1409 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1410 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1411 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1412 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 1413 PetscFunctionReturn(0); 1414 } 1415 1416 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 1417 { 1418 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1419 IS iscol = a->col,isrow = a->row; 1420 PetscErrorCode ierr; 1421 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 1422 const PetscInt *rout,*cout,*r,*c; 1423 PetscScalar *x,*tmp,sum; 1424 const PetscScalar *b; 1425 const MatScalar *aa = a->a,*v; 1426 1427 PetscFunctionBegin; 1428 if (!n) PetscFunctionReturn(0); 1429 1430 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1431 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1432 tmp = a->solve_work; 1433 1434 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1435 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1436 1437 /* forward solve the lower triangular */ 1438 tmp[0] = b[r[0]]; 1439 v = aa; 1440 vi = aj; 1441 for (i=1; i<n; i++) { 1442 nz = ai[i+1] - ai[i]; 1443 sum = b[r[i]]; 1444 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1445 tmp[i] = sum; 1446 v += nz; vi += nz; 1447 } 1448 1449 /* backward solve the upper triangular */ 1450 for (i=n-1; i>=0; i--) { 1451 v = aa + adiag[i+1]+1; 1452 vi = aj + adiag[i+1]+1; 1453 nz = adiag[i]-adiag[i+1]-1; 1454 sum = tmp[i]; 1455 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1456 x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 1457 } 1458 1459 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1460 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1461 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1462 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1463 ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 1464 PetscFunctionReturn(0); 1465 } 1466 1467