1 /*$Id: baijfact2.c,v 1.70 2001/08/07 03:02:55 balay Exp bsmith $*/ 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include "src/mat/impls/baij/seq/baij.h" 7 #include "src/vec/vecimpl.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 17 int *diag = a->diag; 18 MatScalar *aa=a->a,*v; 19 PetscScalar s1,*x,*b; 20 21 PetscFunctionBegin; 22 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 23 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25 26 /* forward solve the U^T */ 27 for (i=0; i<n; i++) { 28 29 v = aa + diag[i]; 30 /* multiply by the inverse of the block diagonal */ 31 s1 = (*v++)*x[i]; 32 vi = aj + diag[i] + 1; 33 nz = ai[i+1] - diag[i] - 1; 34 while (nz--) { 35 x[*vi++] -= (*v++)*s1; 36 } 37 x[i] = s1; 38 } 39 /* backward solve the L^T */ 40 for (i=n-1; i>=0; i--){ 41 v = aa + diag[i] - 1; 42 vi = aj + diag[i] - 1; 43 nz = diag[i] - ai[i]; 44 s1 = x[i]; 45 while (nz--) { 46 x[*vi--] -= (*v--)*s1; 47 } 48 } 49 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51 PetscLogFlops(2*(a->nz) - A->n); 52 PetscFunctionReturn(0); 53 } 54 55 #undef __FUNCT__ 56 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 57 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 58 { 59 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 60 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61 int *diag = a->diag,oidx; 62 MatScalar *aa=a->a,*v; 63 PetscScalar s1,s2,x1,x2; 64 PetscScalar *x,*b; 65 66 PetscFunctionBegin; 67 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 68 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 69 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 70 71 /* forward solve the U^T */ 72 idx = 0; 73 for (i=0; i<n; i++) { 74 75 v = aa + 4*diag[i]; 76 /* multiply by the inverse of the block diagonal */ 77 x1 = x[idx]; x2 = x[1+idx]; 78 s1 = v[0]*x1 + v[1]*x2; 79 s2 = v[2]*x1 + v[3]*x2; 80 v += 4; 81 82 vi = aj + diag[i] + 1; 83 nz = ai[i+1] - diag[i] - 1; 84 while (nz--) { 85 oidx = 2*(*vi++); 86 x[oidx] -= v[0]*s1 + v[1]*s2; 87 x[oidx+1] -= v[2]*s1 + v[3]*s2; 88 v += 4; 89 } 90 x[idx] = s1;x[1+idx] = s2; 91 idx += 2; 92 } 93 /* backward solve the L^T */ 94 for (i=n-1; i>=0; i--){ 95 v = aa + 4*diag[i] - 4; 96 vi = aj + diag[i] - 1; 97 nz = diag[i] - ai[i]; 98 idt = 2*i; 99 s1 = x[idt]; s2 = x[1+idt]; 100 while (nz--) { 101 idx = 2*(*vi--); 102 x[idx] -= v[0]*s1 + v[1]*s2; 103 x[idx+1] -= v[2]*s1 + v[3]*s2; 104 v -= 4; 105 } 106 } 107 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109 PetscLogFlops(2*4*(a->nz) - 2*A->n); 110 PetscFunctionReturn(0); 111 } 112 113 #undef __FUNCT__ 114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 115 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 116 { 117 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 118 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 119 int *diag = a->diag,oidx; 120 MatScalar *aa=a->a,*v; 121 PetscScalar s1,s2,s3,x1,x2,x3; 122 PetscScalar *x,*b; 123 124 PetscFunctionBegin; 125 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 126 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 127 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 128 129 /* forward solve the U^T */ 130 idx = 0; 131 for (i=0; i<n; i++) { 132 133 v = aa + 9*diag[i]; 134 /* multiply by the inverse of the block diagonal */ 135 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 136 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 137 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 138 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 139 v += 9; 140 141 vi = aj + diag[i] + 1; 142 nz = ai[i+1] - diag[i] - 1; 143 while (nz--) { 144 oidx = 3*(*vi++); 145 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 146 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 147 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 148 v += 9; 149 } 150 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 151 idx += 3; 152 } 153 /* backward solve the L^T */ 154 for (i=n-1; i>=0; i--){ 155 v = aa + 9*diag[i] - 9; 156 vi = aj + diag[i] - 1; 157 nz = diag[i] - ai[i]; 158 idt = 3*i; 159 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 160 while (nz--) { 161 idx = 3*(*vi--); 162 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 163 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 164 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 165 v -= 9; 166 } 167 } 168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 170 PetscLogFlops(2*9*(a->nz) - 3*A->n); 171 PetscFunctionReturn(0); 172 } 173 174 #undef __FUNCT__ 175 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 176 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 177 { 178 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 179 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 180 int *diag = a->diag,oidx; 181 MatScalar *aa=a->a,*v; 182 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 183 PetscScalar *x,*b; 184 185 PetscFunctionBegin; 186 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 187 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 188 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 189 190 /* forward solve the U^T */ 191 idx = 0; 192 for (i=0; i<n; i++) { 193 194 v = aa + 16*diag[i]; 195 /* multiply by the inverse of the block diagonal */ 196 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 197 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 198 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 199 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 200 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 201 v += 16; 202 203 vi = aj + diag[i] + 1; 204 nz = ai[i+1] - diag[i] - 1; 205 while (nz--) { 206 oidx = 4*(*vi++); 207 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 208 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 209 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 210 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 211 v += 16; 212 } 213 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 214 idx += 4; 215 } 216 /* backward solve the L^T */ 217 for (i=n-1; i>=0; i--){ 218 v = aa + 16*diag[i] - 16; 219 vi = aj + diag[i] - 1; 220 nz = diag[i] - ai[i]; 221 idt = 4*i; 222 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 223 while (nz--) { 224 idx = 4*(*vi--); 225 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 226 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 227 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 228 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 229 v -= 16; 230 } 231 } 232 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234 PetscLogFlops(2*16*(a->nz) - 4*A->n); 235 PetscFunctionReturn(0); 236 } 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 240 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 241 { 242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 244 int *diag = a->diag,oidx; 245 MatScalar *aa=a->a,*v; 246 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 247 PetscScalar *x,*b; 248 249 PetscFunctionBegin; 250 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 251 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 252 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 253 254 /* forward solve the U^T */ 255 idx = 0; 256 for (i=0; i<n; i++) { 257 258 v = aa + 25*diag[i]; 259 /* multiply by the inverse of the block diagonal */ 260 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 261 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 262 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 263 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 264 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 265 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 266 v += 25; 267 268 vi = aj + diag[i] + 1; 269 nz = ai[i+1] - diag[i] - 1; 270 while (nz--) { 271 oidx = 5*(*vi++); 272 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 273 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 274 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 275 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 276 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 277 v += 25; 278 } 279 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 280 idx += 5; 281 } 282 /* backward solve the L^T */ 283 for (i=n-1; i>=0; i--){ 284 v = aa + 25*diag[i] - 25; 285 vi = aj + diag[i] - 1; 286 nz = diag[i] - ai[i]; 287 idt = 5*i; 288 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 289 while (nz--) { 290 idx = 5*(*vi--); 291 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 292 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 293 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 294 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 295 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 296 v -= 25; 297 } 298 } 299 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 300 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 301 PetscLogFlops(2*25*(a->nz) - 5*A->n); 302 PetscFunctionReturn(0); 303 } 304 305 #undef __FUNCT__ 306 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 307 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 308 { 309 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 310 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 311 int *diag = a->diag,oidx; 312 MatScalar *aa=a->a,*v; 313 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 314 PetscScalar *x,*b; 315 316 PetscFunctionBegin; 317 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 318 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 319 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 320 321 /* forward solve the U^T */ 322 idx = 0; 323 for (i=0; i<n; i++) { 324 325 v = aa + 36*diag[i]; 326 /* multiply by the inverse of the block diagonal */ 327 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 328 x6 = x[5+idx]; 329 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 330 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 331 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 332 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 333 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 334 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 335 v += 36; 336 337 vi = aj + diag[i] + 1; 338 nz = ai[i+1] - diag[i] - 1; 339 while (nz--) { 340 oidx = 6*(*vi++); 341 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 342 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 343 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 344 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 345 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 346 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 347 v += 36; 348 } 349 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 350 x[5+idx] = s6; 351 idx += 6; 352 } 353 /* backward solve the L^T */ 354 for (i=n-1; i>=0; i--){ 355 v = aa + 36*diag[i] - 36; 356 vi = aj + diag[i] - 1; 357 nz = diag[i] - ai[i]; 358 idt = 6*i; 359 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 360 s6 = x[5+idt]; 361 while (nz--) { 362 idx = 6*(*vi--); 363 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 364 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 365 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 366 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 367 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 368 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 369 v -= 36; 370 } 371 } 372 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 373 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 374 PetscLogFlops(2*36*(a->nz) - 6*A->n); 375 PetscFunctionReturn(0); 376 } 377 378 #undef __FUNCT__ 379 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 380 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 381 { 382 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 383 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 384 int *diag = a->diag,oidx; 385 MatScalar *aa=a->a,*v; 386 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 387 PetscScalar *x,*b; 388 389 PetscFunctionBegin; 390 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 391 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 392 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 393 394 /* forward solve the U^T */ 395 idx = 0; 396 for (i=0; i<n; i++) { 397 398 v = aa + 49*diag[i]; 399 /* multiply by the inverse of the block diagonal */ 400 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 401 x6 = x[5+idx]; x7 = x[6+idx]; 402 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 403 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 404 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 405 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 406 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 407 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 408 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 409 v += 49; 410 411 vi = aj + diag[i] + 1; 412 nz = ai[i+1] - diag[i] - 1; 413 while (nz--) { 414 oidx = 7*(*vi++); 415 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 416 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 417 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 418 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 419 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 420 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 421 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 422 v += 49; 423 } 424 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 425 x[5+idx] = s6;x[6+idx] = s7; 426 idx += 7; 427 } 428 /* backward solve the L^T */ 429 for (i=n-1; i>=0; i--){ 430 v = aa + 49*diag[i] - 49; 431 vi = aj + diag[i] - 1; 432 nz = diag[i] - ai[i]; 433 idt = 7*i; 434 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 435 s6 = x[5+idt];s7 = x[6+idt]; 436 while (nz--) { 437 idx = 7*(*vi--); 438 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 439 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 440 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 441 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 442 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 443 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 444 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 445 v -= 49; 446 } 447 } 448 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 449 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 450 PetscLogFlops(2*49*(a->nz) - 7*A->n); 451 PetscFunctionReturn(0); 452 } 453 454 /*---------------------------------------------------------------------------------------------*/ 455 #undef __FUNCT__ 456 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 457 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 458 { 459 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 460 IS iscol=a->col,isrow=a->row; 461 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 462 int *diag = a->diag; 463 MatScalar *aa=a->a,*v; 464 PetscScalar s1,*x,*b,*t; 465 466 PetscFunctionBegin; 467 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 468 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 469 t = a->solve_work; 470 471 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 472 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 473 474 /* copy the b into temp work space according to permutation */ 475 for (i=0; i<n; i++) { 476 t[i] = b[c[i]]; 477 } 478 479 /* forward solve the U^T */ 480 for (i=0; i<n; i++) { 481 482 v = aa + diag[i]; 483 /* multiply by the inverse of the block diagonal */ 484 s1 = (*v++)*t[i]; 485 vi = aj + diag[i] + 1; 486 nz = ai[i+1] - diag[i] - 1; 487 while (nz--) { 488 t[*vi++] -= (*v++)*s1; 489 } 490 t[i] = s1; 491 } 492 /* backward solve the L^T */ 493 for (i=n-1; i>=0; i--){ 494 v = aa + diag[i] - 1; 495 vi = aj + diag[i] - 1; 496 nz = diag[i] - ai[i]; 497 s1 = t[i]; 498 while (nz--) { 499 t[*vi--] -= (*v--)*s1; 500 } 501 } 502 503 /* copy t into x according to permutation */ 504 for (i=0; i<n; i++) { 505 x[r[i]] = t[i]; 506 } 507 508 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 509 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 510 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 511 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 512 PetscLogFlops(2*(a->nz) - A->n); 513 PetscFunctionReturn(0); 514 } 515 516 #undef __FUNCT__ 517 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 518 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 519 { 520 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 521 IS iscol=a->col,isrow=a->row; 522 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 523 int *diag = a->diag,ii,ic,ir,oidx; 524 MatScalar *aa=a->a,*v; 525 PetscScalar s1,s2,x1,x2; 526 PetscScalar *x,*b,*t; 527 528 PetscFunctionBegin; 529 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 530 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 531 t = a->solve_work; 532 533 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 534 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 535 536 /* copy the b into temp work space according to permutation */ 537 ii = 0; 538 for (i=0; i<n; i++) { 539 ic = 2*c[i]; 540 t[ii] = b[ic]; 541 t[ii+1] = b[ic+1]; 542 ii += 2; 543 } 544 545 /* forward solve the U^T */ 546 idx = 0; 547 for (i=0; i<n; i++) { 548 549 v = aa + 4*diag[i]; 550 /* multiply by the inverse of the block diagonal */ 551 x1 = t[idx]; x2 = t[1+idx]; 552 s1 = v[0]*x1 + v[1]*x2; 553 s2 = v[2]*x1 + v[3]*x2; 554 v += 4; 555 556 vi = aj + diag[i] + 1; 557 nz = ai[i+1] - diag[i] - 1; 558 while (nz--) { 559 oidx = 2*(*vi++); 560 t[oidx] -= v[0]*s1 + v[1]*s2; 561 t[oidx+1] -= v[2]*s1 + v[3]*s2; 562 v += 4; 563 } 564 t[idx] = s1;t[1+idx] = s2; 565 idx += 2; 566 } 567 /* backward solve the L^T */ 568 for (i=n-1; i>=0; i--){ 569 v = aa + 4*diag[i] - 4; 570 vi = aj + diag[i] - 1; 571 nz = diag[i] - ai[i]; 572 idt = 2*i; 573 s1 = t[idt]; s2 = t[1+idt]; 574 while (nz--) { 575 idx = 2*(*vi--); 576 t[idx] -= v[0]*s1 + v[1]*s2; 577 t[idx+1] -= v[2]*s1 + v[3]*s2; 578 v -= 4; 579 } 580 } 581 582 /* copy t into x according to permutation */ 583 ii = 0; 584 for (i=0; i<n; i++) { 585 ir = 2*r[i]; 586 x[ir] = t[ii]; 587 x[ir+1] = t[ii+1]; 588 ii += 2; 589 } 590 591 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 592 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 593 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 594 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 595 PetscLogFlops(2*4*(a->nz) - 2*A->n); 596 PetscFunctionReturn(0); 597 } 598 599 #undef __FUNCT__ 600 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 601 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 602 { 603 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 604 IS iscol=a->col,isrow=a->row; 605 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 606 int *diag = a->diag,ii,ic,ir,oidx; 607 MatScalar *aa=a->a,*v; 608 PetscScalar s1,s2,s3,x1,x2,x3; 609 PetscScalar *x,*b,*t; 610 611 PetscFunctionBegin; 612 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 613 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 614 t = a->solve_work; 615 616 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 617 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 618 619 /* copy the b into temp work space according to permutation */ 620 ii = 0; 621 for (i=0; i<n; i++) { 622 ic = 3*c[i]; 623 t[ii] = b[ic]; 624 t[ii+1] = b[ic+1]; 625 t[ii+2] = b[ic+2]; 626 ii += 3; 627 } 628 629 /* forward solve the U^T */ 630 idx = 0; 631 for (i=0; i<n; i++) { 632 633 v = aa + 9*diag[i]; 634 /* multiply by the inverse of the block diagonal */ 635 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 636 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 637 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 638 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 639 v += 9; 640 641 vi = aj + diag[i] + 1; 642 nz = ai[i+1] - diag[i] - 1; 643 while (nz--) { 644 oidx = 3*(*vi++); 645 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 646 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 647 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 648 v += 9; 649 } 650 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 651 idx += 3; 652 } 653 /* backward solve the L^T */ 654 for (i=n-1; i>=0; i--){ 655 v = aa + 9*diag[i] - 9; 656 vi = aj + diag[i] - 1; 657 nz = diag[i] - ai[i]; 658 idt = 3*i; 659 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 660 while (nz--) { 661 idx = 3*(*vi--); 662 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 663 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 664 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 665 v -= 9; 666 } 667 } 668 669 /* copy t into x according to permutation */ 670 ii = 0; 671 for (i=0; i<n; i++) { 672 ir = 3*r[i]; 673 x[ir] = t[ii]; 674 x[ir+1] = t[ii+1]; 675 x[ir+2] = t[ii+2]; 676 ii += 3; 677 } 678 679 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 680 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 681 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 682 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 683 PetscLogFlops(2*9*(a->nz) - 3*A->n); 684 PetscFunctionReturn(0); 685 } 686 687 #undef __FUNCT__ 688 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 689 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 690 { 691 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 692 IS iscol=a->col,isrow=a->row; 693 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 694 int *diag = a->diag,ii,ic,ir,oidx; 695 MatScalar *aa=a->a,*v; 696 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 697 PetscScalar *x,*b,*t; 698 699 PetscFunctionBegin; 700 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 701 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 702 t = a->solve_work; 703 704 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 705 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 706 707 /* copy the b into temp work space according to permutation */ 708 ii = 0; 709 for (i=0; i<n; i++) { 710 ic = 4*c[i]; 711 t[ii] = b[ic]; 712 t[ii+1] = b[ic+1]; 713 t[ii+2] = b[ic+2]; 714 t[ii+3] = b[ic+3]; 715 ii += 4; 716 } 717 718 /* forward solve the U^T */ 719 idx = 0; 720 for (i=0; i<n; i++) { 721 722 v = aa + 16*diag[i]; 723 /* multiply by the inverse of the block diagonal */ 724 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 725 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 726 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 727 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 728 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 729 v += 16; 730 731 vi = aj + diag[i] + 1; 732 nz = ai[i+1] - diag[i] - 1; 733 while (nz--) { 734 oidx = 4*(*vi++); 735 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 736 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 737 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 738 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 739 v += 16; 740 } 741 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 742 idx += 4; 743 } 744 /* backward solve the L^T */ 745 for (i=n-1; i>=0; i--){ 746 v = aa + 16*diag[i] - 16; 747 vi = aj + diag[i] - 1; 748 nz = diag[i] - ai[i]; 749 idt = 4*i; 750 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 751 while (nz--) { 752 idx = 4*(*vi--); 753 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 754 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 755 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 756 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 757 v -= 16; 758 } 759 } 760 761 /* copy t into x according to permutation */ 762 ii = 0; 763 for (i=0; i<n; i++) { 764 ir = 4*r[i]; 765 x[ir] = t[ii]; 766 x[ir+1] = t[ii+1]; 767 x[ir+2] = t[ii+2]; 768 x[ir+3] = t[ii+3]; 769 ii += 4; 770 } 771 772 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 773 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 774 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 775 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 776 PetscLogFlops(2*16*(a->nz) - 4*A->n); 777 PetscFunctionReturn(0); 778 } 779 780 #undef __FUNCT__ 781 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 782 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 783 { 784 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 785 IS iscol=a->col,isrow=a->row; 786 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 787 int *diag = a->diag,ii,ic,ir,oidx; 788 MatScalar *aa=a->a,*v; 789 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 790 PetscScalar *x,*b,*t; 791 792 PetscFunctionBegin; 793 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 794 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 795 t = a->solve_work; 796 797 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 798 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 799 800 /* copy the b into temp work space according to permutation */ 801 ii = 0; 802 for (i=0; i<n; i++) { 803 ic = 5*c[i]; 804 t[ii] = b[ic]; 805 t[ii+1] = b[ic+1]; 806 t[ii+2] = b[ic+2]; 807 t[ii+3] = b[ic+3]; 808 t[ii+4] = b[ic+4]; 809 ii += 5; 810 } 811 812 /* forward solve the U^T */ 813 idx = 0; 814 for (i=0; i<n; i++) { 815 816 v = aa + 25*diag[i]; 817 /* multiply by the inverse of the block diagonal */ 818 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 819 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 820 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 821 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 822 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 823 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 824 v += 25; 825 826 vi = aj + diag[i] + 1; 827 nz = ai[i+1] - diag[i] - 1; 828 while (nz--) { 829 oidx = 5*(*vi++); 830 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 831 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 832 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 833 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 834 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 835 v += 25; 836 } 837 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 838 idx += 5; 839 } 840 /* backward solve the L^T */ 841 for (i=n-1; i>=0; i--){ 842 v = aa + 25*diag[i] - 25; 843 vi = aj + diag[i] - 1; 844 nz = diag[i] - ai[i]; 845 idt = 5*i; 846 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 847 while (nz--) { 848 idx = 5*(*vi--); 849 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v -= 25; 855 } 856 } 857 858 /* copy t into x according to permutation */ 859 ii = 0; 860 for (i=0; i<n; i++) { 861 ir = 5*r[i]; 862 x[ir] = t[ii]; 863 x[ir+1] = t[ii+1]; 864 x[ir+2] = t[ii+2]; 865 x[ir+3] = t[ii+3]; 866 x[ir+4] = t[ii+4]; 867 ii += 5; 868 } 869 870 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 871 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 872 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 873 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 874 PetscLogFlops(2*25*(a->nz) - 5*A->n); 875 PetscFunctionReturn(0); 876 } 877 878 #undef __FUNCT__ 879 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 880 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 881 { 882 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 883 IS iscol=a->col,isrow=a->row; 884 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 885 int *diag = a->diag,ii,ic,ir,oidx; 886 MatScalar *aa=a->a,*v; 887 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 888 PetscScalar *x,*b,*t; 889 890 PetscFunctionBegin; 891 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 893 t = a->solve_work; 894 895 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 896 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 897 898 /* copy the b into temp work space according to permutation */ 899 ii = 0; 900 for (i=0; i<n; i++) { 901 ic = 6*c[i]; 902 t[ii] = b[ic]; 903 t[ii+1] = b[ic+1]; 904 t[ii+2] = b[ic+2]; 905 t[ii+3] = b[ic+3]; 906 t[ii+4] = b[ic+4]; 907 t[ii+5] = b[ic+5]; 908 ii += 6; 909 } 910 911 /* forward solve the U^T */ 912 idx = 0; 913 for (i=0; i<n; i++) { 914 915 v = aa + 36*diag[i]; 916 /* multiply by the inverse of the block diagonal */ 917 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 918 x6 = t[5+idx]; 919 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 920 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 921 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 922 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 923 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 924 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 925 v += 36; 926 927 vi = aj + diag[i] + 1; 928 nz = ai[i+1] - diag[i] - 1; 929 while (nz--) { 930 oidx = 6*(*vi++); 931 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 932 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 933 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 934 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 935 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 936 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 937 v += 36; 938 } 939 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 940 t[5+idx] = s6; 941 idx += 6; 942 } 943 /* backward solve the L^T */ 944 for (i=n-1; i>=0; i--){ 945 v = aa + 36*diag[i] - 36; 946 vi = aj + diag[i] - 1; 947 nz = diag[i] - ai[i]; 948 idt = 6*i; 949 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 950 s6 = t[5+idt]; 951 while (nz--) { 952 idx = 6*(*vi--); 953 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 954 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 955 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 956 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 957 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 958 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 959 v -= 36; 960 } 961 } 962 963 /* copy t into x according to permutation */ 964 ii = 0; 965 for (i=0; i<n; i++) { 966 ir = 6*r[i]; 967 x[ir] = t[ii]; 968 x[ir+1] = t[ii+1]; 969 x[ir+2] = t[ii+2]; 970 x[ir+3] = t[ii+3]; 971 x[ir+4] = t[ii+4]; 972 x[ir+5] = t[ii+5]; 973 ii += 6; 974 } 975 976 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 977 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 978 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 979 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 980 PetscLogFlops(2*36*(a->nz) - 6*A->n); 981 PetscFunctionReturn(0); 982 } 983 984 #undef __FUNCT__ 985 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 986 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 987 { 988 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 989 IS iscol=a->col,isrow=a->row; 990 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 991 int *diag = a->diag,ii,ic,ir,oidx; 992 MatScalar *aa=a->a,*v; 993 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 994 PetscScalar *x,*b,*t; 995 996 PetscFunctionBegin; 997 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 998 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 999 t = a->solve_work; 1000 1001 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1002 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1003 1004 /* copy the b into temp work space according to permutation */ 1005 ii = 0; 1006 for (i=0; i<n; i++) { 1007 ic = 7*c[i]; 1008 t[ii] = b[ic]; 1009 t[ii+1] = b[ic+1]; 1010 t[ii+2] = b[ic+2]; 1011 t[ii+3] = b[ic+3]; 1012 t[ii+4] = b[ic+4]; 1013 t[ii+5] = b[ic+5]; 1014 t[ii+6] = b[ic+6]; 1015 ii += 7; 1016 } 1017 1018 /* forward solve the U^T */ 1019 idx = 0; 1020 for (i=0; i<n; i++) { 1021 1022 v = aa + 49*diag[i]; 1023 /* multiply by the inverse of the block diagonal */ 1024 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1025 x6 = t[5+idx]; x7 = t[6+idx]; 1026 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1027 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1028 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1029 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1030 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1031 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1032 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1033 v += 49; 1034 1035 vi = aj + diag[i] + 1; 1036 nz = ai[i+1] - diag[i] - 1; 1037 while (nz--) { 1038 oidx = 7*(*vi++); 1039 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1040 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1041 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1042 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1043 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1044 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1045 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1046 v += 49; 1047 } 1048 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1049 t[5+idx] = s6;t[6+idx] = s7; 1050 idx += 7; 1051 } 1052 /* backward solve the L^T */ 1053 for (i=n-1; i>=0; i--){ 1054 v = aa + 49*diag[i] - 49; 1055 vi = aj + diag[i] - 1; 1056 nz = diag[i] - ai[i]; 1057 idt = 7*i; 1058 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1059 s6 = t[5+idt];s7 = t[6+idt]; 1060 while (nz--) { 1061 idx = 7*(*vi--); 1062 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v -= 49; 1070 } 1071 } 1072 1073 /* copy t into x according to permutation */ 1074 ii = 0; 1075 for (i=0; i<n; i++) { 1076 ir = 7*r[i]; 1077 x[ir] = t[ii]; 1078 x[ir+1] = t[ii+1]; 1079 x[ir+2] = t[ii+2]; 1080 x[ir+3] = t[ii+3]; 1081 x[ir+4] = t[ii+4]; 1082 x[ir+5] = t[ii+5]; 1083 x[ir+6] = t[ii+6]; 1084 ii += 7; 1085 } 1086 1087 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1088 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1089 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1090 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1091 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1092 PetscFunctionReturn(0); 1093 } 1094 1095 /* ----------------------------------------------------------- */ 1096 #undef __FUNCT__ 1097 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1098 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1099 { 1100 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1101 IS iscol=a->col,isrow=a->row; 1102 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1103 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1104 MatScalar *aa=a->a,*v; 1105 PetscScalar *x,*b,*s,*t,*ls; 1106 1107 PetscFunctionBegin; 1108 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1109 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1110 t = a->solve_work; 1111 1112 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1113 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1114 1115 /* forward solve the lower triangular */ 1116 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1117 for (i=1; i<n; i++) { 1118 v = aa + bs2*ai[i]; 1119 vi = aj + ai[i]; 1120 nz = a->diag[i] - ai[i]; 1121 s = t + bs*i; 1122 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1123 while (nz--) { 1124 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1125 v += bs2; 1126 } 1127 } 1128 /* backward solve the upper triangular */ 1129 ls = a->solve_work + A->n; 1130 for (i=n-1; i>=0; i--){ 1131 v = aa + bs2*(a->diag[i] + 1); 1132 vi = aj + a->diag[i] + 1; 1133 nz = ai[i+1] - a->diag[i] - 1; 1134 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1135 while (nz--) { 1136 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1137 v += bs2; 1138 } 1139 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1140 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1141 } 1142 1143 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1144 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1145 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1146 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1147 PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 1148 PetscFunctionReturn(0); 1149 } 1150 1151 #undef __FUNCT__ 1152 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1153 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1154 { 1155 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1156 IS iscol=a->col,isrow=a->row; 1157 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1158 int *diag = a->diag; 1159 MatScalar *aa=a->a,*v; 1160 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1161 PetscScalar *x,*b,*t; 1162 1163 PetscFunctionBegin; 1164 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1165 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1166 t = a->solve_work; 1167 1168 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1169 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1170 1171 /* forward solve the lower triangular */ 1172 idx = 7*(*r++); 1173 t[0] = b[idx]; t[1] = b[1+idx]; 1174 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1175 t[5] = b[5+idx]; t[6] = b[6+idx]; 1176 1177 for (i=1; i<n; i++) { 1178 v = aa + 49*ai[i]; 1179 vi = aj + ai[i]; 1180 nz = diag[i] - ai[i]; 1181 idx = 7*(*r++); 1182 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1183 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1184 while (nz--) { 1185 idx = 7*(*vi++); 1186 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1187 x4 = t[3+idx];x5 = t[4+idx]; 1188 x6 = t[5+idx];x7 = t[6+idx]; 1189 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1190 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1191 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1192 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1193 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1194 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1195 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1196 v += 49; 1197 } 1198 idx = 7*i; 1199 t[idx] = s1;t[1+idx] = s2; 1200 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1201 t[5+idx] = s6;t[6+idx] = s7; 1202 } 1203 /* backward solve the upper triangular */ 1204 for (i=n-1; i>=0; i--){ 1205 v = aa + 49*diag[i] + 49; 1206 vi = aj + diag[i] + 1; 1207 nz = ai[i+1] - diag[i] - 1; 1208 idt = 7*i; 1209 s1 = t[idt]; s2 = t[1+idt]; 1210 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1211 s6 = t[5+idt];s7 = t[6+idt]; 1212 while (nz--) { 1213 idx = 7*(*vi++); 1214 x1 = t[idx]; x2 = t[1+idx]; 1215 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1216 x6 = t[5+idx]; x7 = t[6+idx]; 1217 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1218 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1219 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1220 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1221 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1222 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1223 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1224 v += 49; 1225 } 1226 idc = 7*(*c--); 1227 v = aa + 49*diag[i]; 1228 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1229 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1230 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1231 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1232 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1233 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1234 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1235 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1236 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1237 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1238 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1239 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1240 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1241 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1242 } 1243 1244 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1245 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1246 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1247 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1248 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1249 PetscFunctionReturn(0); 1250 } 1251 1252 #undef __FUNCT__ 1253 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1254 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1255 { 1256 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1257 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1258 int ierr,*diag = a->diag,jdx; 1259 MatScalar *aa=a->a,*v; 1260 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1261 1262 PetscFunctionBegin; 1263 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1264 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1265 /* forward solve the lower triangular */ 1266 idx = 0; 1267 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1268 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1269 x[6] = b[6+idx]; 1270 for (i=1; i<n; i++) { 1271 v = aa + 49*ai[i]; 1272 vi = aj + ai[i]; 1273 nz = diag[i] - ai[i]; 1274 idx = 7*i; 1275 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1276 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1277 s7 = b[6+idx]; 1278 while (nz--) { 1279 jdx = 7*(*vi++); 1280 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1281 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1282 x7 = x[6+jdx]; 1283 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1284 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1285 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1286 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1287 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1288 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1289 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1290 v += 49; 1291 } 1292 x[idx] = s1; 1293 x[1+idx] = s2; 1294 x[2+idx] = s3; 1295 x[3+idx] = s4; 1296 x[4+idx] = s5; 1297 x[5+idx] = s6; 1298 x[6+idx] = s7; 1299 } 1300 /* backward solve the upper triangular */ 1301 for (i=n-1; i>=0; i--){ 1302 v = aa + 49*diag[i] + 49; 1303 vi = aj + diag[i] + 1; 1304 nz = ai[i+1] - diag[i] - 1; 1305 idt = 7*i; 1306 s1 = x[idt]; s2 = x[1+idt]; 1307 s3 = x[2+idt]; s4 = x[3+idt]; 1308 s5 = x[4+idt]; s6 = x[5+idt]; 1309 s7 = x[6+idt]; 1310 while (nz--) { 1311 idx = 7*(*vi++); 1312 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1313 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1314 x7 = x[6+idx]; 1315 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1316 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1317 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1318 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1319 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1320 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1321 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1322 v += 49; 1323 } 1324 v = aa + 49*diag[i]; 1325 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1326 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1327 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1328 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1329 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1330 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1331 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1332 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1333 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1334 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1335 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1336 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1337 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1338 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1339 } 1340 1341 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1342 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1343 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1344 PetscFunctionReturn(0); 1345 } 1346 1347 #undef __FUNCT__ 1348 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1349 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1350 { 1351 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1352 IS iscol=a->col,isrow=a->row; 1353 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1354 int *diag = a->diag; 1355 MatScalar *aa=a->a,*v; 1356 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1357 1358 PetscFunctionBegin; 1359 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1360 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1361 t = a->solve_work; 1362 1363 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1364 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1365 1366 /* forward solve the lower triangular */ 1367 idx = 6*(*r++); 1368 t[0] = b[idx]; t[1] = b[1+idx]; 1369 t[2] = b[2+idx]; t[3] = b[3+idx]; 1370 t[4] = b[4+idx]; t[5] = b[5+idx]; 1371 for (i=1; i<n; i++) { 1372 v = aa + 36*ai[i]; 1373 vi = aj + ai[i]; 1374 nz = diag[i] - ai[i]; 1375 idx = 6*(*r++); 1376 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1377 s5 = b[4+idx]; s6 = b[5+idx]; 1378 while (nz--) { 1379 idx = 6*(*vi++); 1380 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1381 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1382 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1383 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1384 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1385 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1386 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1387 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1388 v += 36; 1389 } 1390 idx = 6*i; 1391 t[idx] = s1;t[1+idx] = s2; 1392 t[2+idx] = s3;t[3+idx] = s4; 1393 t[4+idx] = s5;t[5+idx] = s6; 1394 } 1395 /* backward solve the upper triangular */ 1396 for (i=n-1; i>=0; i--){ 1397 v = aa + 36*diag[i] + 36; 1398 vi = aj + diag[i] + 1; 1399 nz = ai[i+1] - diag[i] - 1; 1400 idt = 6*i; 1401 s1 = t[idt]; s2 = t[1+idt]; 1402 s3 = t[2+idt];s4 = t[3+idt]; 1403 s5 = t[4+idt];s6 = t[5+idt]; 1404 while (nz--) { 1405 idx = 6*(*vi++); 1406 x1 = t[idx]; x2 = t[1+idx]; 1407 x3 = t[2+idx]; x4 = t[3+idx]; 1408 x5 = t[4+idx]; x6 = t[5+idx]; 1409 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1410 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1411 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1412 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1413 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1414 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1415 v += 36; 1416 } 1417 idc = 6*(*c--); 1418 v = aa + 36*diag[i]; 1419 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1420 v[18]*s4+v[24]*s5+v[30]*s6; 1421 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1422 v[19]*s4+v[25]*s5+v[31]*s6; 1423 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1424 v[20]*s4+v[26]*s5+v[32]*s6; 1425 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1426 v[21]*s4+v[27]*s5+v[33]*s6; 1427 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1428 v[22]*s4+v[28]*s5+v[34]*s6; 1429 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1430 v[23]*s4+v[29]*s5+v[35]*s6; 1431 } 1432 1433 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1434 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1435 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1436 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1437 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1438 PetscFunctionReturn(0); 1439 } 1440 1441 #undef __FUNCT__ 1442 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1443 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1444 { 1445 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1446 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1447 int ierr,*diag = a->diag,jdx; 1448 MatScalar *aa=a->a,*v; 1449 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1450 1451 PetscFunctionBegin; 1452 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1453 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1454 /* forward solve the lower triangular */ 1455 idx = 0; 1456 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1457 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1458 for (i=1; i<n; i++) { 1459 v = aa + 36*ai[i]; 1460 vi = aj + ai[i]; 1461 nz = diag[i] - ai[i]; 1462 idx = 6*i; 1463 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1464 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1465 while (nz--) { 1466 jdx = 6*(*vi++); 1467 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1468 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1469 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1470 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1471 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1472 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1473 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1474 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1475 v += 36; 1476 } 1477 x[idx] = s1; 1478 x[1+idx] = s2; 1479 x[2+idx] = s3; 1480 x[3+idx] = s4; 1481 x[4+idx] = s5; 1482 x[5+idx] = s6; 1483 } 1484 /* backward solve the upper triangular */ 1485 for (i=n-1; i>=0; i--){ 1486 v = aa + 36*diag[i] + 36; 1487 vi = aj + diag[i] + 1; 1488 nz = ai[i+1] - diag[i] - 1; 1489 idt = 6*i; 1490 s1 = x[idt]; s2 = x[1+idt]; 1491 s3 = x[2+idt]; s4 = x[3+idt]; 1492 s5 = x[4+idt]; s6 = x[5+idt]; 1493 while (nz--) { 1494 idx = 6*(*vi++); 1495 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1496 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1497 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1498 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1499 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1500 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1501 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1502 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1503 v += 36; 1504 } 1505 v = aa + 36*diag[i]; 1506 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1507 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1508 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1509 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1510 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1511 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1512 } 1513 1514 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1515 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1516 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1517 PetscFunctionReturn(0); 1518 } 1519 1520 #undef __FUNCT__ 1521 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1522 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1523 { 1524 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1525 IS iscol=a->col,isrow=a->row; 1526 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1527 int *diag = a->diag; 1528 MatScalar *aa=a->a,*v; 1529 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1530 1531 PetscFunctionBegin; 1532 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1533 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1534 t = a->solve_work; 1535 1536 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1537 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1538 1539 /* forward solve the lower triangular */ 1540 idx = 5*(*r++); 1541 t[0] = b[idx]; t[1] = b[1+idx]; 1542 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1543 for (i=1; i<n; i++) { 1544 v = aa + 25*ai[i]; 1545 vi = aj + ai[i]; 1546 nz = diag[i] - ai[i]; 1547 idx = 5*(*r++); 1548 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1549 s5 = b[4+idx]; 1550 while (nz--) { 1551 idx = 5*(*vi++); 1552 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1553 x4 = t[3+idx];x5 = t[4+idx]; 1554 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1555 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1556 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1557 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1558 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1559 v += 25; 1560 } 1561 idx = 5*i; 1562 t[idx] = s1;t[1+idx] = s2; 1563 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1564 } 1565 /* backward solve the upper triangular */ 1566 for (i=n-1; i>=0; i--){ 1567 v = aa + 25*diag[i] + 25; 1568 vi = aj + diag[i] + 1; 1569 nz = ai[i+1] - diag[i] - 1; 1570 idt = 5*i; 1571 s1 = t[idt]; s2 = t[1+idt]; 1572 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1573 while (nz--) { 1574 idx = 5*(*vi++); 1575 x1 = t[idx]; x2 = t[1+idx]; 1576 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1577 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1578 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1579 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1580 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1581 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1582 v += 25; 1583 } 1584 idc = 5*(*c--); 1585 v = aa + 25*diag[i]; 1586 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1587 v[15]*s4+v[20]*s5; 1588 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1589 v[16]*s4+v[21]*s5; 1590 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1591 v[17]*s4+v[22]*s5; 1592 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1593 v[18]*s4+v[23]*s5; 1594 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1595 v[19]*s4+v[24]*s5; 1596 } 1597 1598 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1599 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1600 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1601 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1602 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1603 PetscFunctionReturn(0); 1604 } 1605 1606 #undef __FUNCT__ 1607 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1608 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1609 { 1610 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1611 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1612 int ierr,*diag = a->diag,jdx; 1613 MatScalar *aa=a->a,*v; 1614 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1615 1616 PetscFunctionBegin; 1617 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1618 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1619 /* forward solve the lower triangular */ 1620 idx = 0; 1621 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1622 for (i=1; i<n; i++) { 1623 v = aa + 25*ai[i]; 1624 vi = aj + ai[i]; 1625 nz = diag[i] - ai[i]; 1626 idx = 5*i; 1627 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1628 while (nz--) { 1629 jdx = 5*(*vi++); 1630 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1631 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1632 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1633 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1634 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1635 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1636 v += 25; 1637 } 1638 x[idx] = s1; 1639 x[1+idx] = s2; 1640 x[2+idx] = s3; 1641 x[3+idx] = s4; 1642 x[4+idx] = s5; 1643 } 1644 /* backward solve the upper triangular */ 1645 for (i=n-1; i>=0; i--){ 1646 v = aa + 25*diag[i] + 25; 1647 vi = aj + diag[i] + 1; 1648 nz = ai[i+1] - diag[i] - 1; 1649 idt = 5*i; 1650 s1 = x[idt]; s2 = x[1+idt]; 1651 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1652 while (nz--) { 1653 idx = 5*(*vi++); 1654 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1655 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1656 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1657 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1658 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1659 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1660 v += 25; 1661 } 1662 v = aa + 25*diag[i]; 1663 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1664 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1665 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1666 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1667 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1668 } 1669 1670 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1671 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1672 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1673 PetscFunctionReturn(0); 1674 } 1675 1676 #undef __FUNCT__ 1677 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1678 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1679 { 1680 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1681 IS iscol=a->col,isrow=a->row; 1682 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1683 int *diag = a->diag; 1684 MatScalar *aa=a->a,*v; 1685 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1686 1687 PetscFunctionBegin; 1688 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1689 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1690 t = a->solve_work; 1691 1692 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1693 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1694 1695 /* forward solve the lower triangular */ 1696 idx = 4*(*r++); 1697 t[0] = b[idx]; t[1] = b[1+idx]; 1698 t[2] = b[2+idx]; t[3] = b[3+idx]; 1699 for (i=1; i<n; i++) { 1700 v = aa + 16*ai[i]; 1701 vi = aj + ai[i]; 1702 nz = diag[i] - ai[i]; 1703 idx = 4*(*r++); 1704 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1705 while (nz--) { 1706 idx = 4*(*vi++); 1707 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1708 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1709 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1710 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1711 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1712 v += 16; 1713 } 1714 idx = 4*i; 1715 t[idx] = s1;t[1+idx] = s2; 1716 t[2+idx] = s3;t[3+idx] = s4; 1717 } 1718 /* backward solve the upper triangular */ 1719 for (i=n-1; i>=0; i--){ 1720 v = aa + 16*diag[i] + 16; 1721 vi = aj + diag[i] + 1; 1722 nz = ai[i+1] - diag[i] - 1; 1723 idt = 4*i; 1724 s1 = t[idt]; s2 = t[1+idt]; 1725 s3 = t[2+idt];s4 = t[3+idt]; 1726 while (nz--) { 1727 idx = 4*(*vi++); 1728 x1 = t[idx]; x2 = t[1+idx]; 1729 x3 = t[2+idx]; x4 = t[3+idx]; 1730 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1731 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1732 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1733 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1734 v += 16; 1735 } 1736 idc = 4*(*c--); 1737 v = aa + 16*diag[i]; 1738 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1739 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1740 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1741 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1742 } 1743 1744 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1745 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1746 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1747 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1748 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1749 PetscFunctionReturn(0); 1750 } 1751 #if defined (PETSC_HAVE_SSE) 1752 1753 #include PETSC_HAVE_SSE 1754 1755 #undef __FUNCT__ 1756 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1757 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1758 { 1759 /* 1760 Note: This code uses demotion of double 1761 to float when performing the mixed-mode computation. 1762 This may not be numerically reasonable for all applications. 1763 */ 1764 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1765 IS iscol=a->col,isrow=a->row; 1766 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1767 int *diag = a->diag,ai16; 1768 MatScalar *aa=a->a,*v; 1769 PetscScalar *x,*b,*t; 1770 1771 /* Make space in temp stack for 16 Byte Aligned arrays */ 1772 float ssealignedspace[11],*tmps,*tmpx; 1773 unsigned long offset; 1774 1775 PetscFunctionBegin; 1776 SSE_SCOPE_BEGIN; 1777 1778 offset = (unsigned long)ssealignedspace % 16; 1779 if (offset) offset = (16 - offset)/4; 1780 tmps = &ssealignedspace[offset]; 1781 tmpx = &ssealignedspace[offset+4]; 1782 PREFETCH_NTA(aa+16*ai[1]); 1783 1784 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1785 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1786 t = a->solve_work; 1787 1788 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1789 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1790 1791 /* forward solve the lower triangular */ 1792 idx = 4*(*r++); 1793 t[0] = b[idx]; t[1] = b[1+idx]; 1794 t[2] = b[2+idx]; t[3] = b[3+idx]; 1795 v = aa + 16*ai[1]; 1796 1797 for (i=1; i<n;) { 1798 PREFETCH_NTA(&v[8]); 1799 vi = aj + ai[i]; 1800 nz = diag[i] - ai[i]; 1801 idx = 4*(*r++); 1802 1803 /* Demote sum from double to float */ 1804 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1805 LOAD_PS(tmps,XMM7); 1806 1807 while (nz--) { 1808 PREFETCH_NTA(&v[16]); 1809 idx = 4*(*vi++); 1810 1811 /* Demote solution (so far) from double to float */ 1812 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1813 1814 /* 4x4 Matrix-Vector product with negative accumulation: */ 1815 SSE_INLINE_BEGIN_2(tmpx,v) 1816 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1817 1818 /* First Column */ 1819 SSE_COPY_PS(XMM0,XMM6) 1820 SSE_SHUFFLE(XMM0,XMM0,0x00) 1821 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1822 SSE_SUB_PS(XMM7,XMM0) 1823 1824 /* Second Column */ 1825 SSE_COPY_PS(XMM1,XMM6) 1826 SSE_SHUFFLE(XMM1,XMM1,0x55) 1827 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1828 SSE_SUB_PS(XMM7,XMM1) 1829 1830 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1831 1832 /* Third Column */ 1833 SSE_COPY_PS(XMM2,XMM6) 1834 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1835 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1836 SSE_SUB_PS(XMM7,XMM2) 1837 1838 /* Fourth Column */ 1839 SSE_COPY_PS(XMM3,XMM6) 1840 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1841 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1842 SSE_SUB_PS(XMM7,XMM3) 1843 SSE_INLINE_END_2 1844 1845 v += 16; 1846 } 1847 idx = 4*i; 1848 v = aa + 16*ai[++i]; 1849 PREFETCH_NTA(v); 1850 STORE_PS(tmps,XMM7); 1851 1852 /* Promote result from float to double */ 1853 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1854 } 1855 /* backward solve the upper triangular */ 1856 idt = 4*(n-1); 1857 ai16 = 16*diag[n-1]; 1858 v = aa + ai16 + 16; 1859 for (i=n-1; i>=0;){ 1860 PREFETCH_NTA(&v[8]); 1861 vi = aj + diag[i] + 1; 1862 nz = ai[i+1] - diag[i] - 1; 1863 1864 /* Demote accumulator from double to float */ 1865 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1866 LOAD_PS(tmps,XMM7); 1867 1868 while (nz--) { 1869 PREFETCH_NTA(&v[16]); 1870 idx = 4*(*vi++); 1871 1872 /* Demote solution (so far) from double to float */ 1873 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1874 1875 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1876 SSE_INLINE_BEGIN_2(tmpx,v) 1877 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1878 1879 /* First Column */ 1880 SSE_COPY_PS(XMM0,XMM6) 1881 SSE_SHUFFLE(XMM0,XMM0,0x00) 1882 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1883 SSE_SUB_PS(XMM7,XMM0) 1884 1885 /* Second Column */ 1886 SSE_COPY_PS(XMM1,XMM6) 1887 SSE_SHUFFLE(XMM1,XMM1,0x55) 1888 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1889 SSE_SUB_PS(XMM7,XMM1) 1890 1891 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1892 1893 /* Third Column */ 1894 SSE_COPY_PS(XMM2,XMM6) 1895 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1896 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1897 SSE_SUB_PS(XMM7,XMM2) 1898 1899 /* Fourth Column */ 1900 SSE_COPY_PS(XMM3,XMM6) 1901 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1902 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1903 SSE_SUB_PS(XMM7,XMM3) 1904 SSE_INLINE_END_2 1905 v += 16; 1906 } 1907 v = aa + ai16; 1908 ai16 = 16*diag[--i]; 1909 PREFETCH_NTA(aa+ai16+16); 1910 /* 1911 Scale the result by the diagonal 4x4 block, 1912 which was inverted as part of the factorization 1913 */ 1914 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 1915 /* First Column */ 1916 SSE_COPY_PS(XMM0,XMM7) 1917 SSE_SHUFFLE(XMM0,XMM0,0x00) 1918 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 1919 1920 /* Second Column */ 1921 SSE_COPY_PS(XMM1,XMM7) 1922 SSE_SHUFFLE(XMM1,XMM1,0x55) 1923 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 1924 SSE_ADD_PS(XMM0,XMM1) 1925 1926 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 1927 1928 /* Third Column */ 1929 SSE_COPY_PS(XMM2,XMM7) 1930 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1931 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 1932 SSE_ADD_PS(XMM0,XMM2) 1933 1934 /* Fourth Column */ 1935 SSE_COPY_PS(XMM3,XMM7) 1936 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1937 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 1938 SSE_ADD_PS(XMM0,XMM3) 1939 1940 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 1941 SSE_INLINE_END_3 1942 1943 /* Promote solution from float to double */ 1944 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 1945 1946 /* Apply reordering to t and stream into x. */ 1947 /* This way, x doesn't pollute the cache. */ 1948 /* Be careful with size: 2 doubles = 4 floats! */ 1949 idc = 4*(*c--); 1950 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 1951 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1952 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 1953 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 1954 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1955 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 1956 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 1957 SSE_INLINE_END_2 1958 v = aa + ai16 + 16; 1959 idt -= 4; 1960 } 1961 1962 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1963 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1964 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1965 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1966 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1967 SSE_SCOPE_END; 1968 PetscFunctionReturn(0); 1969 } 1970 1971 #endif 1972 1973 1974 /* 1975 Special case where the matrix was ILU(0) factored in the natural 1976 ordering. This eliminates the need for the column and row permutation. 1977 */ 1978 #undef __FUNCT__ 1979 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 1980 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 1981 { 1982 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1983 int n=a->mbs,*ai=a->i,*aj=a->j; 1984 int ierr,*diag = a->diag; 1985 MatScalar *aa=a->a; 1986 PetscScalar *x,*b; 1987 1988 PetscFunctionBegin; 1989 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1990 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1991 1992 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 1993 { 1994 static PetscScalar w[2000]; /* very BAD need to fix */ 1995 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 1996 } 1997 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 1998 { 1999 static PetscScalar w[2000]; /* very BAD need to fix */ 2000 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2001 } 2002 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2003 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2004 #else 2005 { 2006 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2007 MatScalar *v; 2008 int jdx,idt,idx,nz,*vi,i,ai16; 2009 2010 /* forward solve the lower triangular */ 2011 idx = 0; 2012 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2013 for (i=1; i<n; i++) { 2014 v = aa + 16*ai[i]; 2015 vi = aj + ai[i]; 2016 nz = diag[i] - ai[i]; 2017 idx += 4; 2018 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2019 while (nz--) { 2020 jdx = 4*(*vi++); 2021 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2022 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2023 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2024 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2025 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2026 v += 16; 2027 } 2028 x[idx] = s1; 2029 x[1+idx] = s2; 2030 x[2+idx] = s3; 2031 x[3+idx] = s4; 2032 } 2033 /* backward solve the upper triangular */ 2034 idt = 4*(n-1); 2035 for (i=n-1; i>=0; i--){ 2036 ai16 = 16*diag[i]; 2037 v = aa + ai16 + 16; 2038 vi = aj + diag[i] + 1; 2039 nz = ai[i+1] - diag[i] - 1; 2040 s1 = x[idt]; s2 = x[1+idt]; 2041 s3 = x[2+idt];s4 = x[3+idt]; 2042 while (nz--) { 2043 idx = 4*(*vi++); 2044 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2045 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2046 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2047 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2048 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2049 v += 16; 2050 } 2051 v = aa + ai16; 2052 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2053 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2054 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2055 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2056 idt -= 4; 2057 } 2058 } 2059 #endif 2060 2061 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2062 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2063 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2064 PetscFunctionReturn(0); 2065 } 2066 2067 #if defined (PETSC_HAVE_SSE) 2068 2069 #include PETSC_HAVE_SSE 2070 2071 #undef __FUNCT__ 2072 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2073 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2074 { 2075 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2076 int n=a->mbs,*ai=a->i,*aj=a->j; 2077 int ierr,*diag = a->diag; 2078 MatScalar *aa=a->a; 2079 PetscScalar *x,*b; 2080 2081 PetscFunctionBegin; 2082 SSE_SCOPE_BEGIN; 2083 /* 2084 Note: This code currently uses demotion of double 2085 to float when performing the mixed-mode computation. 2086 This may not be numerically reasonable for all applications. 2087 */ 2088 PREFETCH_NTA(aa+16*ai[1]); 2089 2090 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2091 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2092 { 2093 MatScalar *v; 2094 int jdx,idt,idx,nz,*vi,i,ai16; 2095 2096 /* Make space in temp stack for 16 Byte Aligned arrays */ 2097 float ssealignedspace[11],*tmps,*tmpx; 2098 unsigned long offset = (unsigned long)ssealignedspace % 16; 2099 if (offset) offset = (16 - offset)/4; 2100 tmps = &ssealignedspace[offset]; 2101 tmpx = &ssealignedspace[offset+4]; 2102 2103 /* forward solve the lower triangular */ 2104 idx = 0; 2105 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2106 v = aa + 16*ai[1]; 2107 2108 for (i=1; i<n;) { 2109 PREFETCH_NTA(&v[8]); 2110 vi = aj + ai[i]; 2111 nz = diag[i] - ai[i]; 2112 idx += 4; 2113 2114 /* Demote sum from double to float */ 2115 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2116 LOAD_PS(tmps,XMM7); 2117 2118 while (nz--) { 2119 PREFETCH_NTA(&v[16]); 2120 jdx = 4*(*vi++); 2121 2122 /* Demote solution (so far) from double to float */ 2123 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]); 2124 2125 /* 4x4 Matrix-Vector product with negative accumulation: */ 2126 SSE_INLINE_BEGIN_2(tmpx,v) 2127 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2128 2129 /* First Column */ 2130 SSE_COPY_PS(XMM0,XMM6) 2131 SSE_SHUFFLE(XMM0,XMM0,0x00) 2132 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2133 SSE_SUB_PS(XMM7,XMM0) 2134 2135 /* Second Column */ 2136 SSE_COPY_PS(XMM1,XMM6) 2137 SSE_SHUFFLE(XMM1,XMM1,0x55) 2138 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2139 SSE_SUB_PS(XMM7,XMM1) 2140 2141 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2142 2143 /* Third Column */ 2144 SSE_COPY_PS(XMM2,XMM6) 2145 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2146 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2147 SSE_SUB_PS(XMM7,XMM2) 2148 2149 /* Fourth Column */ 2150 SSE_COPY_PS(XMM3,XMM6) 2151 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2152 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2153 SSE_SUB_PS(XMM7,XMM3) 2154 SSE_INLINE_END_2 2155 2156 v += 16; 2157 } 2158 v = aa + 16*ai[++i]; 2159 PREFETCH_NTA(v); 2160 STORE_PS(tmps,XMM7); 2161 2162 /* Promote result from float to double */ 2163 CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps); 2164 } 2165 /* backward solve the upper triangular */ 2166 idt = 4*(n-1); 2167 ai16 = 16*diag[n-1]; 2168 v = aa + ai16 + 16; 2169 for (i=n-1; i>=0;){ 2170 PREFETCH_NTA(&v[8]); 2171 vi = aj + diag[i] + 1; 2172 nz = ai[i+1] - diag[i] - 1; 2173 2174 /* Demote accumulator from double to float */ 2175 CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]); 2176 LOAD_PS(tmps,XMM7); 2177 2178 while (nz--) { 2179 PREFETCH_NTA(&v[16]); 2180 idx = 4*(*vi++); 2181 2182 /* Demote solution (so far) from double to float */ 2183 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2184 2185 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2186 SSE_INLINE_BEGIN_2(tmpx,v) 2187 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2188 2189 /* First Column */ 2190 SSE_COPY_PS(XMM0,XMM6) 2191 SSE_SHUFFLE(XMM0,XMM0,0x00) 2192 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2193 SSE_SUB_PS(XMM7,XMM0) 2194 2195 /* Second Column */ 2196 SSE_COPY_PS(XMM1,XMM6) 2197 SSE_SHUFFLE(XMM1,XMM1,0x55) 2198 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2199 SSE_SUB_PS(XMM7,XMM1) 2200 2201 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2202 2203 /* Third Column */ 2204 SSE_COPY_PS(XMM2,XMM6) 2205 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2206 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2207 SSE_SUB_PS(XMM7,XMM2) 2208 2209 /* Fourth Column */ 2210 SSE_COPY_PS(XMM3,XMM6) 2211 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2212 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2213 SSE_SUB_PS(XMM7,XMM3) 2214 SSE_INLINE_END_2 2215 v += 16; 2216 } 2217 v = aa + ai16; 2218 ai16 = 16*diag[--i]; 2219 PREFETCH_NTA(aa+ai16+16); 2220 /* 2221 Scale the result by the diagonal 4x4 block, 2222 which was inverted as part of the factorization 2223 */ 2224 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2225 /* First Column */ 2226 SSE_COPY_PS(XMM0,XMM7) 2227 SSE_SHUFFLE(XMM0,XMM0,0x00) 2228 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2229 2230 /* Second Column */ 2231 SSE_COPY_PS(XMM1,XMM7) 2232 SSE_SHUFFLE(XMM1,XMM1,0x55) 2233 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2234 SSE_ADD_PS(XMM0,XMM1) 2235 2236 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2237 2238 /* Third Column */ 2239 SSE_COPY_PS(XMM2,XMM7) 2240 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2241 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2242 SSE_ADD_PS(XMM0,XMM2) 2243 2244 /* Fourth Column */ 2245 SSE_COPY_PS(XMM3,XMM7) 2246 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2247 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2248 SSE_ADD_PS(XMM0,XMM3) 2249 2250 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2251 SSE_INLINE_END_3 2252 2253 /* Promote solution from float to double */ 2254 CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps); 2255 2256 v = aa + ai16 + 16; 2257 idt -= 4; 2258 } 2259 } 2260 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2261 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2262 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2263 SSE_SCOPE_END; 2264 PetscFunctionReturn(0); 2265 } 2266 2267 #endif 2268 #undef __FUNCT__ 2269 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2270 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2271 { 2272 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2273 IS iscol=a->col,isrow=a->row; 2274 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2275 int *diag = a->diag; 2276 MatScalar *aa=a->a,*v; 2277 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2278 2279 PetscFunctionBegin; 2280 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2281 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2282 t = a->solve_work; 2283 2284 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2285 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2286 2287 /* forward solve the lower triangular */ 2288 idx = 3*(*r++); 2289 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2290 for (i=1; i<n; i++) { 2291 v = aa + 9*ai[i]; 2292 vi = aj + ai[i]; 2293 nz = diag[i] - ai[i]; 2294 idx = 3*(*r++); 2295 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2296 while (nz--) { 2297 idx = 3*(*vi++); 2298 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2299 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2300 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2301 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2302 v += 9; 2303 } 2304 idx = 3*i; 2305 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2306 } 2307 /* backward solve the upper triangular */ 2308 for (i=n-1; i>=0; i--){ 2309 v = aa + 9*diag[i] + 9; 2310 vi = aj + diag[i] + 1; 2311 nz = ai[i+1] - diag[i] - 1; 2312 idt = 3*i; 2313 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2314 while (nz--) { 2315 idx = 3*(*vi++); 2316 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2317 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2318 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2319 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2320 v += 9; 2321 } 2322 idc = 3*(*c--); 2323 v = aa + 9*diag[i]; 2324 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2325 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2326 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2327 } 2328 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2329 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2330 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2331 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2332 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2333 PetscFunctionReturn(0); 2334 } 2335 2336 /* 2337 Special case where the matrix was ILU(0) factored in the natural 2338 ordering. This eliminates the need for the column and row permutation. 2339 */ 2340 #undef __FUNCT__ 2341 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2342 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2343 { 2344 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2345 int n=a->mbs,*ai=a->i,*aj=a->j; 2346 int ierr,*diag = a->diag; 2347 MatScalar *aa=a->a,*v; 2348 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2349 int jdx,idt,idx,nz,*vi,i; 2350 2351 PetscFunctionBegin; 2352 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2353 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2354 2355 2356 /* forward solve the lower triangular */ 2357 idx = 0; 2358 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2359 for (i=1; i<n; i++) { 2360 v = aa + 9*ai[i]; 2361 vi = aj + ai[i]; 2362 nz = diag[i] - ai[i]; 2363 idx += 3; 2364 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2365 while (nz--) { 2366 jdx = 3*(*vi++); 2367 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2368 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2369 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2370 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2371 v += 9; 2372 } 2373 x[idx] = s1; 2374 x[1+idx] = s2; 2375 x[2+idx] = s3; 2376 } 2377 /* backward solve the upper triangular */ 2378 for (i=n-1; i>=0; i--){ 2379 v = aa + 9*diag[i] + 9; 2380 vi = aj + diag[i] + 1; 2381 nz = ai[i+1] - diag[i] - 1; 2382 idt = 3*i; 2383 s1 = x[idt]; s2 = x[1+idt]; 2384 s3 = x[2+idt]; 2385 while (nz--) { 2386 idx = 3*(*vi++); 2387 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2388 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2389 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2390 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2391 v += 9; 2392 } 2393 v = aa + 9*diag[i]; 2394 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2395 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2396 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2397 } 2398 2399 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2400 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2401 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2402 PetscFunctionReturn(0); 2403 } 2404 2405 #undef __FUNCT__ 2406 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2407 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2408 { 2409 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2410 IS iscol=a->col,isrow=a->row; 2411 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2412 int *diag = a->diag; 2413 MatScalar *aa=a->a,*v; 2414 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2415 2416 PetscFunctionBegin; 2417 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2418 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2419 t = a->solve_work; 2420 2421 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2422 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2423 2424 /* forward solve the lower triangular */ 2425 idx = 2*(*r++); 2426 t[0] = b[idx]; t[1] = b[1+idx]; 2427 for (i=1; i<n; i++) { 2428 v = aa + 4*ai[i]; 2429 vi = aj + ai[i]; 2430 nz = diag[i] - ai[i]; 2431 idx = 2*(*r++); 2432 s1 = b[idx]; s2 = b[1+idx]; 2433 while (nz--) { 2434 idx = 2*(*vi++); 2435 x1 = t[idx]; x2 = t[1+idx]; 2436 s1 -= v[0]*x1 + v[2]*x2; 2437 s2 -= v[1]*x1 + v[3]*x2; 2438 v += 4; 2439 } 2440 idx = 2*i; 2441 t[idx] = s1; t[1+idx] = s2; 2442 } 2443 /* backward solve the upper triangular */ 2444 for (i=n-1; i>=0; i--){ 2445 v = aa + 4*diag[i] + 4; 2446 vi = aj + diag[i] + 1; 2447 nz = ai[i+1] - diag[i] - 1; 2448 idt = 2*i; 2449 s1 = t[idt]; s2 = t[1+idt]; 2450 while (nz--) { 2451 idx = 2*(*vi++); 2452 x1 = t[idx]; x2 = t[1+idx]; 2453 s1 -= v[0]*x1 + v[2]*x2; 2454 s2 -= v[1]*x1 + v[3]*x2; 2455 v += 4; 2456 } 2457 idc = 2*(*c--); 2458 v = aa + 4*diag[i]; 2459 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2460 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2461 } 2462 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2463 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2464 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2465 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2466 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2467 PetscFunctionReturn(0); 2468 } 2469 2470 /* 2471 Special case where the matrix was ILU(0) factored in the natural 2472 ordering. This eliminates the need for the column and row permutation. 2473 */ 2474 #undef __FUNCT__ 2475 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2476 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2477 { 2478 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2479 int n=a->mbs,*ai=a->i,*aj=a->j; 2480 int ierr,*diag = a->diag; 2481 MatScalar *aa=a->a,*v; 2482 PetscScalar *x,*b,s1,s2,x1,x2; 2483 int jdx,idt,idx,nz,*vi,i; 2484 2485 PetscFunctionBegin; 2486 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2487 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2488 2489 /* forward solve the lower triangular */ 2490 idx = 0; 2491 x[0] = b[0]; x[1] = b[1]; 2492 for (i=1; i<n; i++) { 2493 v = aa + 4*ai[i]; 2494 vi = aj + ai[i]; 2495 nz = diag[i] - ai[i]; 2496 idx += 2; 2497 s1 = b[idx];s2 = b[1+idx]; 2498 while (nz--) { 2499 jdx = 2*(*vi++); 2500 x1 = x[jdx];x2 = x[1+jdx]; 2501 s1 -= v[0]*x1 + v[2]*x2; 2502 s2 -= v[1]*x1 + v[3]*x2; 2503 v += 4; 2504 } 2505 x[idx] = s1; 2506 x[1+idx] = s2; 2507 } 2508 /* backward solve the upper triangular */ 2509 for (i=n-1; i>=0; i--){ 2510 v = aa + 4*diag[i] + 4; 2511 vi = aj + diag[i] + 1; 2512 nz = ai[i+1] - diag[i] - 1; 2513 idt = 2*i; 2514 s1 = x[idt]; s2 = x[1+idt]; 2515 while (nz--) { 2516 idx = 2*(*vi++); 2517 x1 = x[idx]; x2 = x[1+idx]; 2518 s1 -= v[0]*x1 + v[2]*x2; 2519 s2 -= v[1]*x1 + v[3]*x2; 2520 v += 4; 2521 } 2522 v = aa + 4*diag[i]; 2523 x[idt] = v[0]*s1 + v[2]*s2; 2524 x[1+idt] = v[1]*s1 + v[3]*s2; 2525 } 2526 2527 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2528 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2529 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2530 PetscFunctionReturn(0); 2531 } 2532 2533 #undef __FUNCT__ 2534 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2535 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2536 { 2537 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2538 IS iscol=a->col,isrow=a->row; 2539 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2540 int *diag = a->diag; 2541 MatScalar *aa=a->a,*v; 2542 PetscScalar *x,*b,s1,*t; 2543 2544 PetscFunctionBegin; 2545 if (!n) PetscFunctionReturn(0); 2546 2547 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2548 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2549 t = a->solve_work; 2550 2551 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2552 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2553 2554 /* forward solve the lower triangular */ 2555 t[0] = b[*r++]; 2556 for (i=1; i<n; i++) { 2557 v = aa + ai[i]; 2558 vi = aj + ai[i]; 2559 nz = diag[i] - ai[i]; 2560 s1 = b[*r++]; 2561 while (nz--) { 2562 s1 -= (*v++)*t[*vi++]; 2563 } 2564 t[i] = s1; 2565 } 2566 /* backward solve the upper triangular */ 2567 for (i=n-1; i>=0; i--){ 2568 v = aa + diag[i] + 1; 2569 vi = aj + diag[i] + 1; 2570 nz = ai[i+1] - diag[i] - 1; 2571 s1 = t[i]; 2572 while (nz--) { 2573 s1 -= (*v++)*t[*vi++]; 2574 } 2575 x[*c--] = t[i] = aa[diag[i]]*s1; 2576 } 2577 2578 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2579 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2580 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2581 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2582 PetscLogFlops(2*1*(a->nz) - A->n); 2583 PetscFunctionReturn(0); 2584 } 2585 /* 2586 Special case where the matrix was ILU(0) factored in the natural 2587 ordering. This eliminates the need for the column and row permutation. 2588 */ 2589 #undef __FUNCT__ 2590 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 2591 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2592 { 2593 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2594 int n=a->mbs,*ai=a->i,*aj=a->j; 2595 int ierr,*diag = a->diag; 2596 MatScalar *aa=a->a; 2597 PetscScalar *x,*b; 2598 PetscScalar s1,x1; 2599 MatScalar *v; 2600 int jdx,idt,idx,nz,*vi,i; 2601 2602 PetscFunctionBegin; 2603 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2604 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2605 2606 /* forward solve the lower triangular */ 2607 idx = 0; 2608 x[0] = b[0]; 2609 for (i=1; i<n; i++) { 2610 v = aa + ai[i]; 2611 vi = aj + ai[i]; 2612 nz = diag[i] - ai[i]; 2613 idx += 1; 2614 s1 = b[idx]; 2615 while (nz--) { 2616 jdx = *vi++; 2617 x1 = x[jdx]; 2618 s1 -= v[0]*x1; 2619 v += 1; 2620 } 2621 x[idx] = s1; 2622 } 2623 /* backward solve the upper triangular */ 2624 for (i=n-1; i>=0; i--){ 2625 v = aa + diag[i] + 1; 2626 vi = aj + diag[i] + 1; 2627 nz = ai[i+1] - diag[i] - 1; 2628 idt = i; 2629 s1 = x[idt]; 2630 while (nz--) { 2631 idx = *vi++; 2632 x1 = x[idx]; 2633 s1 -= v[0]*x1; 2634 v += 1; 2635 } 2636 v = aa + diag[i]; 2637 x[idt] = v[0]*s1; 2638 } 2639 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2640 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2641 PetscLogFlops(2*(a->nz) - A->n); 2642 PetscFunctionReturn(0); 2643 } 2644 2645 /* ----------------------------------------------------------------*/ 2646 /* 2647 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 2648 except that the data structure of Mat_SeqAIJ is slightly different. 2649 Not a good example of code reuse. 2650 */ 2651 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 2652 2653 #undef __FUNCT__ 2654 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 2655 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact) 2656 { 2657 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 2658 IS isicol; 2659 int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 2660 int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 2661 int *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0; 2662 int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 2663 PetscTruth col_identity,row_identity; 2664 PetscReal f; 2665 2666 PetscFunctionBegin; 2667 if (info) { 2668 f = info->fill; 2669 levels = (int)info->levels; 2670 diagonal_fill = (int)info->diagonal_fill; 2671 } else { 2672 f = 1.0; 2673 levels = 0; 2674 diagonal_fill = 0; 2675 } 2676 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 2677 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 2678 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 2679 2680 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 2681 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 2682 (*fact)->factor = FACTOR_LU; 2683 b = (Mat_SeqBAIJ*)(*fact)->data; 2684 if (!b->diag) { 2685 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2686 } 2687 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2688 b->row = isrow; 2689 b->col = iscol; 2690 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2691 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2692 b->icol = isicol; 2693 ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2694 } else { /* general case perform the symbolic factorization */ 2695 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 2696 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 2697 2698 /* get new row pointers */ 2699 ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 2700 ainew[0] = 0; 2701 /* don't know how many column pointers are needed so estimate */ 2702 jmax = (int)(f*ai[n] + 1); 2703 ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 2704 /* ajfill is level of fill for each fill entry */ 2705 ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 2706 /* fill is a linked list of nonzeros in active row */ 2707 ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 2708 /* im is level for each filled value */ 2709 ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 2710 /* dloc is location of diagonal in factor */ 2711 ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 2712 dloc[0] = 0; 2713 for (prow=0; prow<n; prow++) { 2714 2715 /* copy prow into linked list */ 2716 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 2717 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 2718 xi = aj + ai[r[prow]]; 2719 fill[n] = n; 2720 fill[prow] = -1; /* marker for diagonal entry */ 2721 while (nz--) { 2722 fm = n; 2723 idx = ic[*xi++]; 2724 do { 2725 m = fm; 2726 fm = fill[m]; 2727 } while (fm < idx); 2728 fill[m] = idx; 2729 fill[idx] = fm; 2730 im[idx] = 0; 2731 } 2732 2733 /* make sure diagonal entry is included */ 2734 if (diagonal_fill && fill[prow] == -1) { 2735 fm = n; 2736 while (fill[fm] < prow) fm = fill[fm]; 2737 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 2738 fill[fm] = prow; 2739 im[prow] = 0; 2740 nzf++; 2741 dcount++; 2742 } 2743 2744 nzi = 0; 2745 row = fill[n]; 2746 while (row < prow) { 2747 incrlev = im[row] + 1; 2748 nz = dloc[row]; 2749 xi = ajnew + ainew[row] + nz + 1; 2750 flev = ajfill + ainew[row] + nz + 1; 2751 nnz = ainew[row+1] - ainew[row] - nz - 1; 2752 fm = row; 2753 while (nnz-- > 0) { 2754 idx = *xi++; 2755 if (*flev + incrlev > levels) { 2756 flev++; 2757 continue; 2758 } 2759 do { 2760 m = fm; 2761 fm = fill[m]; 2762 } while (fm < idx); 2763 if (fm != idx) { 2764 im[idx] = *flev + incrlev; 2765 fill[m] = idx; 2766 fill[idx] = fm; 2767 fm = idx; 2768 nzf++; 2769 } else { 2770 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 2771 } 2772 flev++; 2773 } 2774 row = fill[row]; 2775 nzi++; 2776 } 2777 /* copy new filled row into permanent storage */ 2778 ainew[prow+1] = ainew[prow] + nzf; 2779 if (ainew[prow+1] > jmax) { 2780 2781 /* estimate how much additional space we will need */ 2782 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2783 /* just double the memory each time */ 2784 int maxadd = jmax; 2785 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 2786 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 2787 jmax += maxadd; 2788 2789 /* allocate a longer ajnew and ajfill */ 2790 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2791 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2792 ierr = PetscFree(ajnew);CHKERRQ(ierr); 2793 ajnew = xi; 2794 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2795 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2796 ierr = PetscFree(ajfill);CHKERRQ(ierr); 2797 ajfill = xi; 2798 realloc++; /* count how many reallocations are needed */ 2799 } 2800 xi = ajnew + ainew[prow]; 2801 flev = ajfill + ainew[prow]; 2802 dloc[prow] = nzi; 2803 fm = fill[n]; 2804 while (nzf--) { 2805 *xi++ = fm; 2806 *flev++ = im[fm]; 2807 fm = fill[fm]; 2808 } 2809 /* make sure row has diagonal entry */ 2810 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 2811 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 2812 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 2813 } 2814 } 2815 ierr = PetscFree(ajfill);CHKERRQ(ierr); 2816 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2817 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2818 ierr = PetscFree(fill);CHKERRQ(ierr); 2819 ierr = PetscFree(im);CHKERRQ(ierr); 2820 2821 { 2822 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 2823 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 2824 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 2825 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 2826 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 2827 if (diagonal_fill) { 2828 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount); 2829 } 2830 } 2831 2832 /* put together the new matrix */ 2833 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 2834 PetscLogObjectParent(*fact,isicol); 2835 b = (Mat_SeqBAIJ*)(*fact)->data; 2836 ierr = PetscFree(b->imax);CHKERRQ(ierr); 2837 b->singlemalloc = PETSC_FALSE; 2838 len = bs2*ainew[n]*sizeof(MatScalar); 2839 /* the next line frees the default space generated by the Create() */ 2840 ierr = PetscFree(b->a);CHKERRQ(ierr); 2841 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 2842 ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 2843 b->j = ajnew; 2844 b->i = ainew; 2845 for (i=0; i<n; i++) dloc[i] += ainew[i]; 2846 b->diag = dloc; 2847 b->ilen = 0; 2848 b->imax = 0; 2849 b->row = isrow; 2850 b->col = iscol; 2851 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2852 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2853 b->icol = isicol; 2854 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2855 /* In b structure: Free imax, ilen, old a, old j. 2856 Allocate dloc, solve_work, new a, new j */ 2857 PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 2858 b->maxnz = b->nz = ainew[n]; 2859 (*fact)->factor = FACTOR_LU; 2860 2861 (*fact)->info.factor_mallocs = realloc; 2862 (*fact)->info.fill_ratio_given = f; 2863 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 2864 } 2865 2866 if (row_identity && col_identity) { 2867 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 2868 } 2869 PetscFunctionReturn(0); 2870 } 2871 2872 #undef __FUNCT__ 2873 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 2874 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 2875 { 2876 /* 2877 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 2878 with natural ordering 2879 */ 2880 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 2881 PetscTruth sse_enabled_local,sse_enabled_global; 2882 int ierr; 2883 2884 PetscFunctionBegin; 2885 inA->ops->solve = MatSolve_SeqBAIJ_Update; 2886 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 2887 switch (a->bs) { 2888 case 1: 2889 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 2890 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 2891 break; 2892 case 2: 2893 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 2894 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 2895 break; 2896 case 3: 2897 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 2898 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 2899 break; 2900 case 4: 2901 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr); 2902 if (sse_enabled_local) { 2903 #if defined(PETSC_HAVE_SSE) 2904 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 2905 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n"); 2906 #else 2907 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 2908 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 2909 #endif 2910 } else { 2911 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 2912 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 2913 } 2914 break; 2915 case 5: 2916 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 2917 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 2918 break; 2919 case 6: 2920 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 2921 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 2922 break; 2923 case 7: 2924 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 2925 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 2926 break; 2927 } 2928 PetscFunctionReturn(0); 2929 } 2930 2931 #undef __FUNCT__ 2932 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 2933 int MatSeqBAIJ_UpdateSolvers(Mat A) 2934 { 2935 /* 2936 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 2937 with natural ordering 2938 */ 2939 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2940 IS row = a->row, col = a->col; 2941 PetscTruth row_identity, col_identity; 2942 PetscTruth sse_enabled_local, sse_enabled_global; 2943 PetscTruth use_single, use_natural; 2944 int ierr; 2945 2946 PetscFunctionBegin; 2947 2948 use_single = PETSC_FALSE; 2949 use_natural = PETSC_FALSE; 2950 2951 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 2952 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 2953 2954 if (row_identity && col_identity) { 2955 use_natural = PETSC_TRUE; 2956 } else { 2957 use_natural = PETSC_FALSE; 2958 } 2959 switch (a->bs) { 2960 case 1: 2961 if (use_natural) { 2962 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 2963 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 2964 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 2965 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 2966 } else { 2967 A->ops->solve = MatSolve_SeqBAIJ_1; 2968 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 2969 } 2970 break; 2971 case 2: 2972 if (use_natural) { 2973 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 2974 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 2975 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 2976 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 2977 } else { 2978 A->ops->solve = MatSolve_SeqBAIJ_2; 2979 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 2980 } 2981 break; 2982 case 3: 2983 if (use_natural) { 2984 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 2985 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 2986 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 2987 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 2988 } else { 2989 A->ops->solve = MatSolve_SeqBAIJ_3; 2990 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 2991 } 2992 break; 2993 case 4: 2994 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr); 2995 if (sse_enabled_local) { 2996 PetscTruth single_prec,flg; 2997 single_prec = flg = PETSC_FALSE; 2998 ierr = PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);CHKERRQ(ierr); 2999 if (flg) { 3000 a->single_precision_solves = single_prec; 3001 } 3002 if (a->single_precision_solves) { 3003 use_single = PETSC_TRUE; 3004 } 3005 } else { 3006 use_single = PETSC_FALSE; 3007 } 3008 if (use_natural) { 3009 if (use_single) { 3010 #if defined(PETSC_HAVE_SSE) 3011 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3012 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4\n"); 3013 #else 3014 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3015 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3016 #endif 3017 } else { 3018 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3019 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3020 } 3021 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3022 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3023 } else { 3024 if (use_single) { 3025 #if defined(PETSC_HAVE_SSE) 3026 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3027 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3028 #else 3029 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3030 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3031 #endif 3032 } else { 3033 A->ops->solve = MatSolve_SeqBAIJ_4; 3034 } 3035 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3036 } 3037 break; 3038 case 5: 3039 if (use_natural) { 3040 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3041 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3042 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3043 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3044 } else { 3045 A->ops->solve = MatSolve_SeqBAIJ_5; 3046 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3047 } 3048 break; 3049 case 6: 3050 if (use_natural) { 3051 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3052 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3053 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3054 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3055 } else { 3056 A->ops->solve = MatSolve_SeqBAIJ_6; 3057 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3058 } 3059 break; 3060 case 7: 3061 if (use_natural) { 3062 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3063 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3064 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3065 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3066 } else { 3067 A->ops->solve = MatSolve_SeqBAIJ_7; 3068 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3069 } 3070 break; 3071 default: 3072 A->ops->solve = MatSolve_SeqBAIJ_N; 3073 break; 3074 } 3075 PetscFunctionReturn(0); 3076 } 3077 3078 #undef __FUNCT__ 3079 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3080 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3081 int ierr; 3082 3083 PetscFunctionBegin; 3084 ierr = MatSeqBAIJ_UpdateSolvers(A); 3085 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3086 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3087 } else { 3088 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3089 } 3090 PetscFunctionReturn(0); 3091 } 3092 3093 #undef __FUNCT__ 3094 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3095 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3096 int ierr; 3097 3098 PetscFunctionBegin; 3099 ierr = MatSeqBAIJ_UpdateSolvers(A); 3100 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3101 PetscFunctionReturn(0); 3102 } 3103 3104 3105