1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include <../src/mat/impls/baij/seq/baij.h> 7 #include <../src/mat/blockinvert.h> 8 #include <petscbt.h> 9 #include <../src/mat/utils/freespace.h> 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 16 PetscErrorCode ierr; 17 const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 18 PetscInt i,n = a->mbs,j; 19 PetscInt nz; 20 PetscScalar *x,*tmp,s1; 21 const MatScalar *aa = a->a,*v; 22 const PetscScalar *b; 23 24 PetscFunctionBegin; 25 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 26 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27 tmp = a->solve_work; 28 29 30 /* copy the b into temp work space according to permutation */ 31 for (i=0; i<n; i++) tmp[i] = b[i]; 32 33 /* forward solve the U^T */ 34 for (i=0; i<n; i++) { 35 v = aa + adiag[i+1] + 1; 36 vi = aj + adiag[i+1] + 1; 37 nz = adiag[i] - adiag[i+1] - 1; 38 s1 = tmp[i]; 39 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 40 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 41 tmp[i] = s1; 42 } 43 44 /* backward solve the L^T */ 45 for (i=n-1; i>=0; i--){ 46 v = aa + ai[i]; 47 vi = aj + ai[i]; 48 nz = ai[i+1] - ai[i]; 49 s1 = tmp[i]; 50 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 51 } 52 53 /* copy tmp into x according to permutation */ 54 for (i=0; i<n; i++) x[i] = tmp[i]; 55 56 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 57 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 58 59 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 60 PetscFunctionReturn(0); 61 } 62 63 #undef __FUNCT__ 64 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 65 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 66 { 67 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 68 PetscErrorCode ierr; 69 PetscInt i,nz; 70 const PetscInt *diag = 
a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 71 const MatScalar *aa=a->a,*v; 72 PetscScalar s1,*x; 73 74 PetscFunctionBegin; 75 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 76 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 77 78 /* forward solve the U^T */ 79 for (i=0; i<n; i++) { 80 81 v = aa + diag[i]; 82 /* multiply by the inverse of the block diagonal */ 83 s1 = (*v++)*x[i]; 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 x[*vi++] -= (*v++)*s1; 88 } 89 x[i] = s1; 90 } 91 /* backward solve the L^T */ 92 for (i=n-1; i>=0; i--){ 93 v = aa + diag[i] - 1; 94 vi = aj + diag[i] - 1; 95 nz = diag[i] - ai[i]; 96 s1 = x[i]; 97 while (nz--) { 98 x[*vi--] -= (*v--)*s1; 99 } 100 } 101 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 102 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 103 PetscFunctionReturn(0); 104 } 105 106 #undef __FUNCT__ 107 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 108 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 109 { 110 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 111 PetscErrorCode ierr; 112 PetscInt i,nz,idx,idt,oidx; 113 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 114 const MatScalar *aa=a->a,*v; 115 PetscScalar s1,s2,x1,x2,*x; 116 117 PetscFunctionBegin; 118 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 119 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 120 121 /* forward solve the U^T */ 122 idx = 0; 123 for (i=0; i<n; i++) { 124 125 v = aa + 4*diag[i]; 126 /* multiply by the inverse of the block diagonal */ 127 x1 = x[idx]; x2 = x[1+idx]; 128 s1 = v[0]*x1 + v[1]*x2; 129 s2 = v[2]*x1 + v[3]*x2; 130 v += 4; 131 132 vi = aj + diag[i] + 1; 133 nz = ai[i+1] - diag[i] - 1; 134 while (nz--) { 135 oidx = 2*(*vi++); 136 x[oidx] -= v[0]*s1 + v[1]*s2; 137 x[oidx+1] -= v[2]*s1 + v[3]*s2; 138 v += 4; 139 } 140 x[idx] = s1;x[1+idx] = s2; 141 idx += 2; 142 } 143 /* backward solve the L^T */ 144 for (i=n-1; i>=0; i--){ 145 v = aa + 4*diag[i] - 4; 146 vi = aj + diag[i] 
- 1; 147 nz = diag[i] - ai[i]; 148 idt = 2*i; 149 s1 = x[idt]; s2 = x[1+idt]; 150 while (nz--) { 151 idx = 2*(*vi--); 152 x[idx] -= v[0]*s1 + v[1]*s2; 153 x[idx+1] -= v[2]*s1 + v[3]*s2; 154 v -= 4; 155 } 156 } 157 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 158 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 159 PetscFunctionReturn(0); 160 } 161 162 #undef __FUNCT__ 163 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 164 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 165 { 166 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 167 PetscErrorCode ierr; 168 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 169 PetscInt nz,idx,idt,j,i,oidx; 170 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 171 const MatScalar *aa=a->a,*v; 172 PetscScalar s1,s2,x1,x2,*x; 173 174 PetscFunctionBegin; 175 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 176 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 177 178 /* forward solve the U^T */ 179 idx = 0; 180 for (i=0; i<n; i++) { 181 v = aa + bs2*diag[i]; 182 /* multiply by the inverse of the block diagonal */ 183 x1 = x[idx]; x2 = x[1+idx]; 184 s1 = v[0]*x1 + v[1]*x2; 185 s2 = v[2]*x1 + v[3]*x2; 186 v -= bs2; 187 188 vi = aj + diag[i] - 1; 189 nz = diag[i] - diag[i+1] - 1; 190 for(j=0;j>-nz;j--){ 191 oidx = bs*vi[j]; 192 x[oidx] -= v[0]*s1 + v[1]*s2; 193 x[oidx+1] -= v[2]*s1 + v[3]*s2; 194 v -= bs2; 195 } 196 x[idx] = s1;x[1+idx] = s2; 197 idx += bs; 198 } 199 /* backward solve the L^T */ 200 for (i=n-1; i>=0; i--){ 201 v = aa + bs2*ai[i]; 202 vi = aj + ai[i]; 203 nz = ai[i+1] - ai[i]; 204 idt = bs*i; 205 s1 = x[idt]; s2 = x[1+idt]; 206 for(j=0;j<nz;j++){ 207 idx = bs*vi[j]; 208 x[idx] -= v[0]*s1 + v[1]*s2; 209 x[idx+1] -= v[2]*s1 + v[3]*s2; 210 v += bs2; 211 } 212 } 213 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 214 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 215 PetscFunctionReturn(0); 216 } 217 218 #undef __FUNCT__ 219 #define __FUNCT__ 
"MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 220 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 221 { 222 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 223 PetscErrorCode ierr; 224 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 225 PetscInt i,nz,idx,idt,oidx; 226 const MatScalar *aa=a->a,*v; 227 PetscScalar s1,s2,s3,x1,x2,x3,*x; 228 229 PetscFunctionBegin; 230 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 231 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 232 233 /* forward solve the U^T */ 234 idx = 0; 235 for (i=0; i<n; i++) { 236 237 v = aa + 9*diag[i]; 238 /* multiply by the inverse of the block diagonal */ 239 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 240 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 241 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 242 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 243 v += 9; 244 245 vi = aj + diag[i] + 1; 246 nz = ai[i+1] - diag[i] - 1; 247 while (nz--) { 248 oidx = 3*(*vi++); 249 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 250 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 251 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 252 v += 9; 253 } 254 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 255 idx += 3; 256 } 257 /* backward solve the L^T */ 258 for (i=n-1; i>=0; i--){ 259 v = aa + 9*diag[i] - 9; 260 vi = aj + diag[i] - 1; 261 nz = diag[i] - ai[i]; 262 idt = 3*i; 263 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 264 while (nz--) { 265 idx = 3*(*vi--); 266 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 267 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 268 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 269 v -= 9; 270 } 271 } 272 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 273 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 279 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 280 { 281 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 282 PetscErrorCode ierr; 283 const PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 284 PetscInt nz,idx,idt,j,i,oidx; 285 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 286 const MatScalar *aa=a->a,*v; 287 PetscScalar s1,s2,s3,x1,x2,x3,*x; 288 289 PetscFunctionBegin; 290 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 291 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 292 293 /* forward solve the U^T */ 294 idx = 0; 295 for (i=0; i<n; i++) { 296 v = aa + bs2*diag[i]; 297 /* multiply by the inverse of the block diagonal */ 298 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 299 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 300 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 301 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 302 v -= bs2; 303 304 vi = aj + diag[i] - 1; 305 nz = diag[i] - diag[i+1] - 1; 306 for(j=0;j>-nz;j--){ 307 oidx = bs*vi[j]; 308 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 309 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 310 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 311 v -= bs2; 312 } 313 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 314 idx += bs; 315 } 316 /* backward solve the L^T */ 317 for (i=n-1; i>=0; i--){ 318 v = aa + bs2*ai[i]; 319 vi = aj + ai[i]; 320 nz = ai[i+1] - ai[i]; 321 idt = bs*i; 322 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 323 for(j=0;j<nz;j++){ 324 idx = bs*vi[j]; 325 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 326 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 327 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 328 v += bs2; 329 } 330 } 331 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 332 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 333 PetscFunctionReturn(0); 334 } 335 336 #undef __FUNCT__ 337 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 338 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 339 { 340 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 341 PetscErrorCode ierr; 342 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 343 PetscInt i,nz,idx,idt,oidx; 344 const MatScalar *aa=a->a,*v; 345 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 346 347 PetscFunctionBegin; 
348 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 349 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 350 351 /* forward solve the U^T */ 352 idx = 0; 353 for (i=0; i<n; i++) { 354 355 v = aa + 16*diag[i]; 356 /* multiply by the inverse of the block diagonal */ 357 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 358 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 359 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 360 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 361 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 362 v += 16; 363 364 vi = aj + diag[i] + 1; 365 nz = ai[i+1] - diag[i] - 1; 366 while (nz--) { 367 oidx = 4*(*vi++); 368 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 369 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 370 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 371 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 372 v += 16; 373 } 374 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 375 idx += 4; 376 } 377 /* backward solve the L^T */ 378 for (i=n-1; i>=0; i--){ 379 v = aa + 16*diag[i] - 16; 380 vi = aj + diag[i] - 1; 381 nz = diag[i] - ai[i]; 382 idt = 4*i; 383 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 384 while (nz--) { 385 idx = 4*(*vi--); 386 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 387 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 388 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 389 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 390 v -= 16; 391 } 392 } 393 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 394 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 395 PetscFunctionReturn(0); 396 } 397 398 #undef __FUNCT__ 399 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 400 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 401 { 402 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 403 PetscErrorCode ierr; 404 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 405 PetscInt nz,idx,idt,j,i,oidx; 406 const PetscInt 
bs=A->rmap->bs,bs2=a->bs2; 407 const MatScalar *aa=a->a,*v; 408 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 409 410 PetscFunctionBegin; 411 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 412 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 413 414 /* forward solve the U^T */ 415 idx = 0; 416 for (i=0; i<n; i++) { 417 v = aa + bs2*diag[i]; 418 /* multiply by the inverse of the block diagonal */ 419 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 420 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 421 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 422 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 423 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 424 v -= bs2; 425 426 vi = aj + diag[i] - 1; 427 nz = diag[i] - diag[i+1] - 1; 428 for(j=0;j>-nz;j--){ 429 oidx = bs*vi[j]; 430 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 431 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 432 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 433 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 434 v -= bs2; 435 } 436 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 437 idx += bs; 438 } 439 /* backward solve the L^T */ 440 for (i=n-1; i>=0; i--){ 441 v = aa + bs2*ai[i]; 442 vi = aj + ai[i]; 443 nz = ai[i+1] - ai[i]; 444 idt = bs*i; 445 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 446 for(j=0;j<nz;j++){ 447 idx = bs*vi[j]; 448 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 449 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 450 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 451 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 452 v += bs2; 453 } 454 } 455 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 456 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 457 PetscFunctionReturn(0); 458 } 459 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 462 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 463 { 464 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 465 
PetscErrorCode ierr; 466 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 467 PetscInt i,nz,idx,idt,oidx; 468 const MatScalar *aa=a->a,*v; 469 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 470 471 PetscFunctionBegin; 472 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 473 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 474 475 /* forward solve the U^T */ 476 idx = 0; 477 for (i=0; i<n; i++) { 478 479 v = aa + 25*diag[i]; 480 /* multiply by the inverse of the block diagonal */ 481 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 482 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 483 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 484 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 485 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 486 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 487 v += 25; 488 489 vi = aj + diag[i] + 1; 490 nz = ai[i+1] - diag[i] - 1; 491 while (nz--) { 492 oidx = 5*(*vi++); 493 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 494 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 495 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 496 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 497 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 498 v += 25; 499 } 500 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 501 idx += 5; 502 } 503 /* backward solve the L^T */ 504 for (i=n-1; i>=0; i--){ 505 v = aa + 25*diag[i] - 25; 506 vi = aj + diag[i] - 1; 507 nz = diag[i] - ai[i]; 508 idt = 5*i; 509 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 510 while (nz--) { 511 idx = 5*(*vi--); 512 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 513 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 514 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 515 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 516 x[idx+4] -= v[20]*s1 
+ v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 517 v -= 25; 518 } 519 } 520 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 521 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 522 PetscFunctionReturn(0); 523 } 524 525 #undef __FUNCT__ 526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 527 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 528 { 529 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 530 PetscErrorCode ierr; 531 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 532 PetscInt nz,idx,idt,j,i,oidx; 533 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 534 const MatScalar *aa=a->a,*v; 535 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 536 537 PetscFunctionBegin; 538 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 539 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 540 541 /* forward solve the U^T */ 542 idx = 0; 543 for (i=0; i<n; i++) { 544 v = aa + bs2*diag[i]; 545 /* multiply by the inverse of the block diagonal */ 546 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 547 x5 = x[4+idx]; 548 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 549 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 550 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 551 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 552 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 553 v -= bs2; 554 555 vi = aj + diag[i] - 1; 556 nz = diag[i] - diag[i+1] - 1; 557 for(j=0;j>-nz;j--){ 558 oidx = bs*vi[j]; 559 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 560 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 561 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 562 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 563 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 564 v -= bs2; 565 } 566 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 567 idx += bs; 568 } 569 /* backward solve the L^T */ 570 for (i=n-1; 
i>=0; i--){ 571 v = aa + bs2*ai[i]; 572 vi = aj + ai[i]; 573 nz = ai[i+1] - ai[i]; 574 idt = bs*i; 575 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 576 for(j=0;j<nz;j++){ 577 idx = bs*vi[j]; 578 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 579 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 580 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 581 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 582 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 583 v += bs2; 584 } 585 } 586 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 587 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 588 PetscFunctionReturn(0); 589 } 590 591 #undef __FUNCT__ 592 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 593 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 594 { 595 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 596 PetscErrorCode ierr; 597 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 598 PetscInt i,nz,idx,idt,oidx; 599 const MatScalar *aa=a->a,*v; 600 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 601 602 PetscFunctionBegin; 603 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 604 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 605 606 /* forward solve the U^T */ 607 idx = 0; 608 for (i=0; i<n; i++) { 609 610 v = aa + 36*diag[i]; 611 /* multiply by the inverse of the block diagonal */ 612 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 613 x6 = x[5+idx]; 614 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 615 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 616 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 617 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 618 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 619 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 620 
v += 36; 621 622 vi = aj + diag[i] + 1; 623 nz = ai[i+1] - diag[i] - 1; 624 while (nz--) { 625 oidx = 6*(*vi++); 626 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632 v += 36; 633 } 634 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 635 x[5+idx] = s6; 636 idx += 6; 637 } 638 /* backward solve the L^T */ 639 for (i=n-1; i>=0; i--){ 640 v = aa + 36*diag[i] - 36; 641 vi = aj + diag[i] - 1; 642 nz = diag[i] - ai[i]; 643 idt = 6*i; 644 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 645 s6 = x[5+idt]; 646 while (nz--) { 647 idx = 6*(*vi--); 648 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 649 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 650 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 651 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 652 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 653 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 654 v -= 36; 655 } 656 } 657 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 658 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 659 PetscFunctionReturn(0); 660 } 661 662 #undef __FUNCT__ 663 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 664 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 665 { 666 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 667 PetscErrorCode ierr; 668 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 669 
PetscInt nz,idx,idt,j,i,oidx; 670 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 671 const MatScalar *aa=a->a,*v; 672 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 673 674 PetscFunctionBegin; 675 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 676 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 677 678 /* forward solve the U^T */ 679 idx = 0; 680 for (i=0; i<n; i++) { 681 v = aa + bs2*diag[i]; 682 /* multiply by the inverse of the block diagonal */ 683 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 684 x5 = x[4+idx]; x6 = x[5+idx]; 685 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 686 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 687 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 688 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 689 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 690 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 691 v -= bs2; 692 693 vi = aj + diag[i] - 1; 694 nz = diag[i] - diag[i+1] - 1; 695 for(j=0;j>-nz;j--){ 696 oidx = bs*vi[j]; 697 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 698 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 699 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 700 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 701 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 702 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 703 v -= bs2; 704 } 705 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 706 x[5+idx] = s6; 707 idx += bs; 708 } 709 /* backward solve the L^T */ 710 for (i=n-1; i>=0; i--){ 711 v = aa + bs2*ai[i]; 712 vi = aj + ai[i]; 713 nz = ai[i+1] - ai[i]; 714 idt = bs*i; 715 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 716 s6 = x[5+idt]; 717 for(j=0;j<nz;j++){ 718 idx = bs*vi[j]; 719 
x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 720 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 721 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 722 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 723 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 724 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 725 v += bs2; 726 } 727 } 728 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 729 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 730 PetscFunctionReturn(0); 731 } 732 733 #undef __FUNCT__ 734 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 735 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 736 { 737 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 738 PetscErrorCode ierr; 739 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 740 PetscInt i,nz,idx,idt,oidx; 741 const MatScalar *aa=a->a,*v; 742 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 743 744 PetscFunctionBegin; 745 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 746 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 747 748 /* forward solve the U^T */ 749 idx = 0; 750 for (i=0; i<n; i++) { 751 752 v = aa + 49*diag[i]; 753 /* multiply by the inverse of the block diagonal */ 754 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 755 x6 = x[5+idx]; x7 = x[6+idx]; 756 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 757 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 758 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 759 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 760 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 761 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 762 
s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 763 v += 49; 764 765 vi = aj + diag[i] + 1; 766 nz = ai[i+1] - diag[i] - 1; 767 while (nz--) { 768 oidx = 7*(*vi++); 769 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 770 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 771 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 772 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 773 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 774 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 775 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 776 v += 49; 777 } 778 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 779 x[5+idx] = s6;x[6+idx] = s7; 780 idx += 7; 781 } 782 /* backward solve the L^T */ 783 for (i=n-1; i>=0; i--){ 784 v = aa + 49*diag[i] - 49; 785 vi = aj + diag[i] - 1; 786 nz = diag[i] - ai[i]; 787 idt = 7*i; 788 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 789 s6 = x[5+idt];s7 = x[6+idt]; 790 while (nz--) { 791 idx = 7*(*vi--); 792 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 793 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 794 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 795 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 796 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 797 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 798 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 799 v -= 49; 800 } 801 } 802 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 
803 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 804 PetscFunctionReturn(0); 805 } 806 #undef __FUNCT__ 807 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 808 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 809 { 810 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 811 PetscErrorCode ierr; 812 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 813 PetscInt nz,idx,idt,j,i,oidx; 814 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 815 const MatScalar *aa=a->a,*v; 816 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 817 818 PetscFunctionBegin; 819 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 820 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 821 822 /* forward solve the U^T */ 823 idx = 0; 824 for (i=0; i<n; i++) { 825 v = aa + bs2*diag[i]; 826 /* multiply by the inverse of the block diagonal */ 827 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 828 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 829 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 830 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 831 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 832 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 833 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 834 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 835 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 836 v -= bs2; 837 vi = aj + diag[i] - 1; 838 nz = diag[i] - diag[i+1] - 1; 839 for(j=0;j>-nz;j--){ 840 oidx = bs*vi[j]; 841 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 842 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 843 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 844 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + 
v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 845 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 846 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 847 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 848 v -= bs2; 849 } 850 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 851 x[5+idx] = s6; x[6+idx] = s7; 852 idx += bs; 853 } 854 /* backward solve the L^T */ 855 for (i=n-1; i>=0; i--){ 856 v = aa + bs2*ai[i]; 857 vi = aj + ai[i]; 858 nz = ai[i+1] - ai[i]; 859 idt = bs*i; 860 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 861 s6 = x[5+idt]; s7 = x[6+idt]; 862 for(j=0;j<nz;j++){ 863 idx = bs*vi[j]; 864 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 865 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 866 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 867 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 868 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 869 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 870 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 871 v += bs2; 872 } 873 } 874 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 875 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 876 PetscFunctionReturn(0); 877 } 878 879 /*---------------------------------------------------------------------------------------------*/ 880 #undef __FUNCT__ 881 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 882 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 883 { 884 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 885 IS iscol = a->col,isrow = a->row; 886 PetscErrorCode ierr; 887 const PetscInt *rout,*cout,*r,*c,*adiag = 
a->diag,*ai = a->i,*aj = a->j,*vi; 888 PetscInt i,n = a->mbs,j; 889 PetscInt nz; 890 PetscScalar *x,*tmp,s1; 891 const MatScalar *aa = a->a,*v; 892 const PetscScalar *b; 893 894 PetscFunctionBegin; 895 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 896 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 897 tmp = a->solve_work; 898 899 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 900 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 901 902 /* copy the b into temp work space according to permutation */ 903 for (i=0; i<n; i++) tmp[i] = b[c[i]]; 904 905 /* forward solve the U^T */ 906 for (i=0; i<n; i++) { 907 v = aa + adiag[i+1] + 1; 908 vi = aj + adiag[i+1] + 1; 909 nz = adiag[i] - adiag[i+1] - 1; 910 s1 = tmp[i]; 911 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 912 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 913 tmp[i] = s1; 914 } 915 916 /* backward solve the L^T */ 917 for (i=n-1; i>=0; i--){ 918 v = aa + ai[i]; 919 vi = aj + ai[i]; 920 nz = ai[i+1] - ai[i]; 921 s1 = tmp[i]; 922 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 923 } 924 925 /* copy tmp into x according to permutation */ 926 for (i=0; i<n; i++) x[r[i]] = tmp[i]; 927 928 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 929 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 930 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 931 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 932 933 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 934 PetscFunctionReturn(0); 935 } 936 937 #undef __FUNCT__ 938 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 939 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 940 { 941 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 942 IS iscol=a->col,isrow=a->row; 943 PetscErrorCode ierr; 944 const PetscInt *r,*c,*rout,*cout; 945 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 946 PetscInt i,nz; 947 const MatScalar *aa=a->a,*v; 948 PetscScalar s1,*x,*t; 949 const PetscScalar *b; 950 951 PetscFunctionBegin; 952 ierr = 
VecGetArrayRead(bb,&b);CHKERRQ(ierr);
953 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
954 t = a->solve_work;
955
956 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
957 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
958
959 /* copy the b into temp work space according to permutation */
960 for (i=0; i<n; i++) {
961 t[i] = b[c[i]];
962 }
963
964 /* forward solve the U^T */
965 for (i=0; i<n; i++) {
966
967 v = aa + diag[i];
968 /* multiply by the inverse of the block diagonal */
969 s1 = (*v++)*t[i];
/* after the post-increment v already points at the entry following diag[i] */
970 vi = aj + diag[i] + 1;
971 nz = ai[i+1] - diag[i] - 1;
972 while (nz--) {
973 t[*vi++] -= (*v++)*s1;
974 }
975 t[i] = s1;
976 }
977 /* backward solve the L^T */
978 for (i=n-1; i>=0; i--){
/* start just before diag[i] and walk downwards through diag[i]-ai[i] entries */
979 v = aa + diag[i] - 1;
980 vi = aj + diag[i] - 1;
981 nz = diag[i] - ai[i];
982 s1 = t[i];
983 while (nz--) {
984 t[*vi--] -= (*v--)*s1;
985 }
986 }
987
988 /* copy t into x according to permutation */
989 for (i=0; i<n; i++) {
990 x[r[i]] = t[i];
991 }
992
993 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
994 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
995 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
996 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
997 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
998 PetscFunctionReturn(0);
999 }
1000
/*
   MatSolveTranspose_SeqBAIJ_2_inplace - transpose solve with 2x2 blocks (4 scalars per block)
   and row/column permutations, reading each block row as ai[i] .. diag[i] .. ai[i+1]-1.

   Gathers bb through a->col into a->solve_work two scalars at a time; the U^T sweep applies the
   block at diag[i] and then the blocks after it up to ai[i+1]-1; the L^T sweep walks the blocks
   before diag[i] downwards to ai[i]; the work array is scattered into xx through a->row.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1001 #undef __FUNCT__
1002 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
1003 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1004 {
1005 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1006 IS iscol=a->col,isrow=a->row;
1007 PetscErrorCode ierr;
1008 const PetscInt *r,*c,*rout,*cout;
1009 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1010 PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1011 const MatScalar *aa=a->a,*v;
1012 PetscScalar s1,s2,x1,x2,*x,*t;
1013 const PetscScalar *b;
1014
1015 PetscFunctionBegin;
1016 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1017 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1018 t = a->solve_work;
1019
1020 ierr =
ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1021 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1022
1023 /* copy the b into temp work space according to permutation */
1024 ii = 0;
1025 for (i=0; i<n; i++) {
1026 ic = 2*c[i];
1027 t[ii] = b[ic];
1028 t[ii+1] = b[ic+1];
1029 ii += 2;
1030 }
1031
1032 /* forward solve the U^T */
1033 idx = 0;
1034 for (i=0; i<n; i++) {
1035
1036 v = aa + 4*diag[i];
1037 /* multiply by the inverse of the block diagonal */
1038 x1 = t[idx]; x2 = t[1+idx];
1039 s1 = v[0]*x1 + v[1]*x2;
1040 s2 = v[2]*x1 + v[3]*x2;
1041 v += 4;
1042
1043 vi = aj + diag[i] + 1;
1044 nz = ai[i+1] - diag[i] - 1;
1045 while (nz--) {
/* oidx is the scalar offset of the block column being updated */
1046 oidx = 2*(*vi++);
1047 t[oidx] -= v[0]*s1 + v[1]*s2;
1048 t[oidx+1] -= v[2]*s1 + v[3]*s2;
1049 v += 4;
1050 }
1051 t[idx] = s1;t[1+idx] = s2;
1052 idx += 2;
1053 }
1054 /* backward solve the L^T */
1055 for (i=n-1; i>=0; i--){
1056 v = aa + 4*diag[i] - 4;
1057 vi = aj + diag[i] - 1;
1058 nz = diag[i] - ai[i];
1059 idt = 2*i;
1060 s1 = t[idt]; s2 = t[1+idt];
1061 while (nz--) {
1062 idx = 2*(*vi--);
1063 t[idx] -= v[0]*s1 + v[1]*s2;
1064 t[idx+1] -= v[2]*s1 + v[3]*s2;
1065 v -= 4;
1066 }
1067 }
1068
1069 /* copy t into x according to permutation */
1070 ii = 0;
1071 for (i=0; i<n; i++) {
1072 ir = 2*r[i];
1073 x[ir] = t[ii];
1074 x[ir+1] = t[ii+1];
1075 ii += 2;
1076 }
1077
1078 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1079 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1080 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1081 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1082 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1083 PetscFunctionReturn(0);
1084 }
1085
/*
   MatSolveTranspose_SeqBAIJ_2 - transpose solve with 2x2 blocks and row/column permutations,
   reading the U part backwards from the diagonal.

   Gathers bb through a->col into a->solve_work; the U^T sweep applies the block at diag[i] and
   then the blocks at diag[i]-1 down to diag[i+1]+1; the L^T sweep uses blocks ai[i] .. ai[i+1]-1
   in increasing order; the work array is scattered into xx through a->row.
   bs and bs2 are read from A->rmap->bs and a->bs2; the unrolled arithmetic below is written
   for bs = 2 and bs2 = 4 (not checked here).

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1086 #undef __FUNCT__
1087 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1088 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1089 {
1090 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1091 PetscErrorCode ierr;
1092 IS iscol=a->col,isrow=a->row;
1093 const PetscInt
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1094 const PetscInt *r,*c,*rout,*cout;
1095 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1096 const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1097 const MatScalar *aa=a->a,*v;
1098 PetscScalar s1,s2,x1,x2,*x,*t;
1099 const PetscScalar *b;
1100
1101 PetscFunctionBegin;
1102 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1103 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1104 t = a->solve_work;
1105
1106 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1107 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1108
1109 /* copy b into temp work space according to permutation */
1110 for(i=0;i<n;i++){
1111 ii = bs*i; ic = bs*c[i];
1112 t[ii] = b[ic]; t[ii+1] = b[ic+1];
1113 }
1114
1115 /* forward solve the U^T */
1116 idx = 0;
1117 for (i=0; i<n; i++) {
1118 v = aa + bs2*diag[i];
1119 /* multiply by the inverse of the block diagonal */
1120 x1 = t[idx]; x2 = t[1+idx];
1121 s1 = v[0]*x1 + v[1]*x2;
1122 s2 = v[2]*x1 + v[3]*x2;
1123 v -= bs2;
1124
1125 vi = aj + diag[i] - 1;
1126 nz = diag[i] - diag[i+1] - 1;
/* j runs 0, -1, ..., -(nz-1): column indices and blocks are both read at decreasing positions */
1127 for(j=0;j>-nz;j--){
1128 oidx = bs*vi[j];
1129 t[oidx] -= v[0]*s1 + v[1]*s2;
1130 t[oidx+1] -= v[2]*s1 + v[3]*s2;
1131 v -= bs2;
1132 }
1133 t[idx] = s1;t[1+idx] = s2;
1134 idx += bs;
1135 }
1136 /* backward solve the L^T */
1137 for (i=n-1; i>=0; i--){
1138 v = aa + bs2*ai[i];
1139 vi = aj + ai[i];
1140 nz = ai[i+1] - ai[i];
1141 idt = bs*i;
1142 s1 = t[idt]; s2 = t[1+idt];
1143 for(j=0;j<nz;j++){
1144 idx = bs*vi[j];
1145 t[idx] -= v[0]*s1 + v[1]*s2;
1146 t[idx+1] -= v[2]*s1 + v[3]*s2;
1147 v += bs2;
1148 }
1149 }
1150
1151 /* copy t into x according to permutation */
1152 for(i=0;i<n;i++){
1153 ii = bs*i; ir = bs*r[i];
1154 x[ir] = t[ii]; x[ir+1] = t[ii+1];
1155 }
1156
1157 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1158 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1159 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1160 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1161 ierr =
PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1162 PetscFunctionReturn(0);
1163 }
1164
/*
   MatSolveTranspose_SeqBAIJ_3_inplace - transpose solve with 3x3 blocks (9 scalars per block)
   and row/column permutations, reading each block row as ai[i] .. diag[i] .. ai[i+1]-1.

   Gathers bb through a->col into a->solve_work three scalars at a time; the U^T sweep applies
   the block at diag[i] and then the blocks after it up to ai[i+1]-1; the L^T sweep walks the
   blocks before diag[i] downwards to ai[i]; the work array is scattered into xx through a->row.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1165 #undef __FUNCT__
1166 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1167 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1168 {
1169 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1170 IS iscol=a->col,isrow=a->row;
1171 PetscErrorCode ierr;
1172 const PetscInt *r,*c,*rout,*cout;
1173 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1174 PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1175 const MatScalar *aa=a->a,*v;
1176 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
1177 const PetscScalar *b;
1178
1179 PetscFunctionBegin;
1180 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1181 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182 t = a->solve_work;
1183
1184 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1185 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1186
1187 /* copy the b into temp work space according to permutation */
1188 ii = 0;
1189 for (i=0; i<n; i++) {
1190 ic = 3*c[i];
1191 t[ii] = b[ic];
1192 t[ii+1] = b[ic+1];
1193 t[ii+2] = b[ic+2];
1194 ii += 3;
1195 }
1196
1197 /* forward solve the U^T */
1198 idx = 0;
1199 for (i=0; i<n; i++) {
1200
1201 v = aa + 9*diag[i];
1202 /* multiply by the inverse of the block diagonal */
1203 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1204 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
1205 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
1206 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
1207 v += 9;
1208
1209 vi = aj + diag[i] + 1;
1210 nz = ai[i+1] - diag[i] - 1;
1211 while (nz--) {
/* oidx is the scalar offset of the block column being updated */
1212 oidx = 3*(*vi++);
1213 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1214 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1215 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1216 v += 9;
1217 }
1218 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1219 idx += 3;
1220 }
1221 /* backward solve the L^T */
1222 for (i=n-1; i>=0; i--){
1223 v = aa + 9*diag[i] - 9;
1224 vi = aj + diag[i] - 1;
1225 nz = diag[i] - ai[i];
1226 idt = 3*i;
1227 s1 = t[idt]; s2 =
t[1+idt]; s3 = t[2+idt];
1228 while (nz--) {
/* vi and v both step backwards: one 9-scalar block per column index */
1229 idx = 3*(*vi--);
1230 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1231 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1232 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233 v -= 9;
1234 }
1235 }
1236
1237 /* copy t into x according to permutation */
1238 ii = 0;
1239 for (i=0; i<n; i++) {
1240 ir = 3*r[i];
1241 x[ir] = t[ii];
1242 x[ir+1] = t[ii+1];
1243 x[ir+2] = t[ii+2];
1244 ii += 3;
1245 }
1246
1247 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1248 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1249 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1250 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1251 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1252 PetscFunctionReturn(0);
1253 }
1254
/*
   MatSolveTranspose_SeqBAIJ_3 - transpose solve with 3x3 blocks and row/column permutations,
   reading the U part backwards from the diagonal.

   Gathers bb through a->col into a->solve_work; the U^T sweep applies the block at diag[i] and
   then the blocks at diag[i]-1 down to diag[i+1]+1; the L^T sweep uses blocks ai[i] .. ai[i+1]-1
   in increasing order; the work array is scattered into xx through a->row.
   bs and bs2 are read from A->rmap->bs and a->bs2; the unrolled arithmetic below is written
   for bs = 3 and bs2 = 9 (not checked here).

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1255 #undef __FUNCT__
1256 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1257 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1258 {
1259 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1260 PetscErrorCode ierr;
1261 IS iscol=a->col,isrow=a->row;
1262 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1263 const PetscInt *r,*c,*rout,*cout;
1264 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1265 const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1266 const MatScalar *aa=a->a,*v;
1267 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
1268 const PetscScalar *b;
1269
1270 PetscFunctionBegin;
1271 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1272 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1273 t = a->solve_work;
1274
1275 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1276 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1277
1278 /* copy b into temp work space according to permutation */
1279 for(i=0;i<n;i++){
1280 ii = bs*i; ic = bs*c[i];
1281 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1282 }
1283
1284 /* forward solve the U^T */
1285 idx = 0;
1286 for (i=0; i<n; i++) {
1287 v = aa + bs2*diag[i];
1288 /* multiply by the inverse of the block diagonal */
1289 x1 = t[idx]; x2 = t[1+idx]; x3 =
t[2+idx];
1290 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
1291 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
1292 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
1293 v -= bs2;
1294
1295 vi = aj + diag[i] - 1;
1296 nz = diag[i] - diag[i+1] - 1;
/* j runs 0, -1, ..., -(nz-1): column indices and blocks are both read at decreasing positions */
1297 for(j=0;j>-nz;j--){
1298 oidx = bs*vi[j];
1299 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1300 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1301 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1302 v -= bs2;
1303 }
1304 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1305 idx += bs;
1306 }
1307 /* backward solve the L^T */
1308 for (i=n-1; i>=0; i--){
1309 v = aa + bs2*ai[i];
1310 vi = aj + ai[i];
1311 nz = ai[i+1] - ai[i];
1312 idt = bs*i;
1313 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1314 for(j=0;j<nz;j++){
1315 idx = bs*vi[j];
1316 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1317 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1318 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1319 v += bs2;
1320 }
1321 }
1322
1323 /* copy t into x according to permutation */
1324 for(i=0;i<n;i++){
1325 ii = bs*i; ir = bs*r[i];
1326 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1327 }
1328
1329 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1330 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1331 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1332 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1333 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1334 PetscFunctionReturn(0);
1335 }
1336
/*
   MatSolveTranspose_SeqBAIJ_4_inplace - transpose solve with 4x4 blocks (16 scalars per block)
   and row/column permutations, reading each block row as ai[i] .. diag[i] .. ai[i+1]-1.

   Gathers bb through a->col into a->solve_work four scalars at a time; the U^T sweep applies
   the block at diag[i] and then the blocks after it up to ai[i+1]-1; the L^T sweep walks the
   blocks before diag[i] downwards to ai[i]; the work array is scattered into xx through a->row.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1337 #undef __FUNCT__
1338 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1339 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1340 {
1341 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1342 IS iscol=a->col,isrow=a->row;
1343 PetscErrorCode ierr;
1344 const PetscInt *r,*c,*rout,*cout;
1345 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1346 PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1347 const MatScalar *aa=a->a,*v;
1348 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1349 const PetscScalar *b;
1350
1351 PetscFunctionBegin;
1352 ierr =
VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1353 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354 t = a->solve_work;
1355
1356 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1358
1359 /* copy the b into temp work space according to permutation */
1360 ii = 0;
1361 for (i=0; i<n; i++) {
1362 ic = 4*c[i];
1363 t[ii] = b[ic];
1364 t[ii+1] = b[ic+1];
1365 t[ii+2] = b[ic+2];
1366 t[ii+3] = b[ic+3];
1367 ii += 4;
1368 }
1369
1370 /* forward solve the U^T */
1371 idx = 0;
1372 for (i=0; i<n; i++) {
1373
1374 v = aa + 16*diag[i];
1375 /* multiply by the inverse of the block diagonal */
1376 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
1377 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1378 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1379 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1380 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1381 v += 16;
1382
1383 vi = aj + diag[i] + 1;
1384 nz = ai[i+1] - diag[i] - 1;
1385 while (nz--) {
/* oidx is the scalar offset of the block column being updated */
1386 oidx = 4*(*vi++);
1387 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1388 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1389 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1390 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1391 v += 16;
1392 }
1393 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1394 idx += 4;
1395 }
1396 /* backward solve the L^T */
1397 for (i=n-1; i>=0; i--){
1398 v = aa + 16*diag[i] - 16;
1399 vi = aj + diag[i] - 1;
1400 nz = diag[i] - ai[i];
1401 idt = 4*i;
1402 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1403 while (nz--) {
1404 idx = 4*(*vi--);
1405 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1406 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1407 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1408 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1409 v -= 16;
1410 }
1411 }
1412
1413 /* copy t into x according to permutation */
1414 ii = 0;
1415 for (i=0; i<n; i++) {
1416
ir = 4*r[i];
1417 x[ir] = t[ii];
1418 x[ir+1] = t[ii+1];
1419 x[ir+2] = t[ii+2];
1420 x[ir+3] = t[ii+3];
1421 ii += 4;
1422 }
1423
1424 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1425 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1426 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1427 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1428 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1429 PetscFunctionReturn(0);
1430 }
1431
/*
   MatSolveTranspose_SeqBAIJ_4 - transpose solve with 4x4 blocks and row/column permutations,
   reading the U part backwards from the diagonal.

   Gathers bb through a->col into a->solve_work; the U^T sweep applies the block at diag[i] and
   then the blocks at diag[i]-1 down to diag[i+1]+1; the L^T sweep uses blocks ai[i] .. ai[i+1]-1
   in increasing order; the work array is scattered into xx through a->row.
   bs and bs2 are read from A->rmap->bs and a->bs2; the unrolled arithmetic below is written
   for bs = 4 and bs2 = 16 (not checked here).

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1432 #undef __FUNCT__
1433 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1434 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1435 {
1436 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1437 PetscErrorCode ierr;
1438 IS iscol=a->col,isrow=a->row;
1439 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1440 const PetscInt *r,*c,*rout,*cout;
1441 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1442 const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1443 const MatScalar *aa=a->a,*v;
1444 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1445 const PetscScalar *b;
1446
1447 PetscFunctionBegin;
1448 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1449 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1450 t = a->solve_work;
1451
1452 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1453 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1454
1455 /* copy b into temp work space according to permutation */
1456 for(i=0;i<n;i++){
1457 ii = bs*i; ic = bs*c[i];
1458 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1459 }
1460
1461 /* forward solve the U^T */
1462 idx = 0;
1463 for (i=0; i<n; i++) {
1464 v = aa + bs2*diag[i];
1465 /* multiply by the inverse of the block diagonal */
1466 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
1467 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1468 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1469 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1470 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1471 v -= bs2;
1472
1473 vi = aj + diag[i] - 1;
1474 nz =
diag[i] - diag[i+1] - 1;
/* j runs 0, -1, ..., -(nz-1): column indices and blocks are both read at decreasing positions */
1475 for(j=0;j>-nz;j--){
1476 oidx = bs*vi[j];
1477 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1478 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1479 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1480 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1481 v -= bs2;
1482 }
1483 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4;
1484 idx += bs;
1485 }
1486 /* backward solve the L^T */
1487 for (i=n-1; i>=0; i--){
1488 v = aa + bs2*ai[i];
1489 vi = aj + ai[i];
1490 nz = ai[i+1] - ai[i];
1491 idt = bs*i;
1492 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt];
1493 for(j=0;j<nz;j++){
1494 idx = bs*vi[j];
1495 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1496 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1497 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1498 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1499 v += bs2;
1500 }
1501 }
1502
1503 /* copy t into x according to permutation */
1504 for(i=0;i<n;i++){
1505 ii = bs*i; ir = bs*r[i];
1506 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1507 }
1508
1509 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1510 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1511 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1512 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1513 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1514 PetscFunctionReturn(0);
1515 }
1516
/*
   MatSolveTranspose_SeqBAIJ_5_inplace - transpose solve with 5x5 blocks (25 scalars per block)
   and row/column permutations, reading each block row as ai[i] .. diag[i] .. ai[i+1]-1.

   Gathers bb through a->col into a->solve_work five scalars at a time; the U^T sweep applies
   the block at diag[i] and then the blocks after it up to ai[i+1]-1; the L^T sweep walks the
   blocks before diag[i] downwards to ai[i]; the work array is scattered into xx through a->row.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1517 #undef __FUNCT__
1518 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1519 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1520 {
1521 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1522 IS iscol=a->col,isrow=a->row;
1523 PetscErrorCode ierr;
1524 const PetscInt *r,*c,*rout,*cout;
1525 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1526 PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1527 const MatScalar *aa=a->a,*v;
1528 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1529 const PetscScalar *b;
1530
1531 PetscFunctionBegin;
1532 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1533 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534 t = a->solve_work;
1535
1536 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1537 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1538
1539 /* copy the b into temp work space according to permutation */
1540 ii = 0;
1541 for (i=0; i<n; i++) {
1542 ic = 5*c[i];
1543 t[ii] = b[ic];
1544 t[ii+1] = b[ic+1];
1545 t[ii+2] = b[ic+2];
1546 t[ii+3] = b[ic+3];
1547 t[ii+4] = b[ic+4];
1548 ii += 5;
1549 }
1550
1551 /* forward solve the U^T */
1552 idx = 0;
1553 for (i=0; i<n; i++) {
1554
1555 v = aa + 25*diag[i];
1556 /* multiply by the inverse of the block diagonal */
1557 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1558 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1559 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1560 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1561 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1562 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1563 v += 25;
1564
1565 vi = aj + diag[i] + 1;
1566 nz = ai[i+1] - diag[i] - 1;
1567 while (nz--) {
/* oidx is the scalar offset of the block column being updated */
1568 oidx = 5*(*vi++);
1569 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1570 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1571 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1572 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1573 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1574 v += 25;
1575 }
1576 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1577 idx += 5;
1578 }
1579 /* backward solve the L^T */
1580 for (i=n-1; i>=0; i--){
1581 v = aa + 25*diag[i] - 25;
1582 vi = aj + diag[i] - 1;
1583 nz = diag[i] - ai[i];
1584 idt = 5*i;
1585 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1586 while (nz--) {
1587 idx = 5*(*vi--);
1588 t[idx] -= v[0]*s1 +
v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1589 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1590 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1591 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1592 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1593 v -= 25;
1594 }
1595 }
1596
1597 /* copy t into x according to permutation */
1598 ii = 0;
1599 for (i=0; i<n; i++) {
1600 ir = 5*r[i];
1601 x[ir] = t[ii];
1602 x[ir+1] = t[ii+1];
1603 x[ir+2] = t[ii+2];
1604 x[ir+3] = t[ii+3];
1605 x[ir+4] = t[ii+4];
1606 ii += 5;
1607 }
1608
1609 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1610 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1611 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1612 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1613 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1614 PetscFunctionReturn(0);
1615 }
1616
/*
   MatSolveTranspose_SeqBAIJ_5 - transpose solve with 5x5 blocks and row/column permutations,
   reading the U part backwards from the diagonal.

   Gathers bb through a->col into a->solve_work; the U^T sweep applies the block at diag[i] and
   then the blocks at diag[i]-1 down to diag[i+1]+1; the L^T sweep uses blocks ai[i] .. ai[i+1]-1
   in increasing order; the work array is scattered into xx through a->row.
   bs and bs2 are read from A->rmap->bs and a->bs2; the unrolled arithmetic below is written
   for bs = 5 and bs2 = 25 (not checked here).

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1617 #undef __FUNCT__
1618 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1619 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1620 {
1621 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1622 PetscErrorCode ierr;
1623 IS iscol=a->col,isrow=a->row;
1624 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1625 const PetscInt *r,*c,*rout,*cout;
1626 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1627 const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1628 const MatScalar *aa=a->a,*v;
1629 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1630 const PetscScalar *b;
1631
1632 PetscFunctionBegin;
1633 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1634 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1635 t = a->solve_work;
1636
1637 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1638 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1639
1640 /* copy b into temp work space according to permutation */
1641 for(i=0;i<n;i++){
1642 ii = bs*i; ic = bs*c[i];
1643 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1644 t[ii+4] =
b[ic+4];
1645 }
1646
1647 /* forward solve the U^T */
1648 idx = 0;
1649 for (i=0; i<n; i++) {
1650 v = aa + bs2*diag[i];
1651 /* multiply by the inverse of the block diagonal */
1652 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1653 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1654 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1655 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1656 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1657 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1658 v -= bs2;
1659
1660 vi = aj + diag[i] - 1;
1661 nz = diag[i] - diag[i+1] - 1;
/* j runs 0, -1, ..., -(nz-1): column indices and blocks are both read at decreasing positions */
1662 for(j=0;j>-nz;j--){
1663 oidx = bs*vi[j];
1664 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1665 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1666 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1667 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1668 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1669 v -= bs2;
1670 }
1671 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1672 idx += bs;
1673 }
1674 /* backward solve the L^T */
1675 for (i=n-1; i>=0; i--){
1676 v = aa + bs2*ai[i];
1677 vi = aj + ai[i];
1678 nz = ai[i+1] - ai[i];
1679 idt = bs*i;
1680 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1681 for(j=0;j<nz;j++){
1682 idx = bs*vi[j];
1683 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1684 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1685 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1686 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1687 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1688 v += bs2;
1689 }
1690 }
1691
1692 /* copy t into x according to permutation */
1693 for(i=0;i<n;i++){
1694 ii = bs*i; ir = bs*r[i];
1695 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1696 x[ir+4] = t[ii+4];
1697 }
1698
1699 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1700 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1701 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1703 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1704 PetscFunctionReturn(0);
1705 }
1706
/*
   MatSolveTranspose_SeqBAIJ_6_inplace - transpose solve with 6x6 blocks (36 scalars per block)
   and row/column permutations, reading each block row as ai[i] .. diag[i] .. ai[i+1]-1.

   Gathers bb through a->col into a->solve_work six scalars at a time; the U^T sweep applies
   the block at diag[i] and then the blocks after it up to ai[i+1]-1; the L^T sweep walks the
   blocks before diag[i] downwards to ai[i]; the work array is scattered into xx through a->row.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1707 #undef __FUNCT__
1708 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1709 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1710 {
1711 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1712 IS iscol=a->col,isrow=a->row;
1713 PetscErrorCode ierr;
1714 const PetscInt *r,*c,*rout,*cout;
1715 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1716 PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1717 const MatScalar *aa=a->a,*v;
1718 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1719 const PetscScalar *b;
1720
1721 PetscFunctionBegin;
1722 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1723 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1724 t = a->solve_work;
1725
1726 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1727 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1728
1729 /* copy the b into temp work space according to permutation */
1730 ii = 0;
1731 for (i=0; i<n; i++) {
1732 ic = 6*c[i];
1733 t[ii] = b[ic];
1734 t[ii+1] = b[ic+1];
1735 t[ii+2] = b[ic+2];
1736 t[ii+3] = b[ic+3];
1737 t[ii+4] = b[ic+4];
1738 t[ii+5] = b[ic+5];
1739 ii += 6;
1740 }
1741
1742 /* forward solve the U^T */
1743 idx = 0;
1744 for (i=0; i<n; i++) {
1745
1746 v = aa + 36*diag[i];
1747 /* multiply by the inverse of the block diagonal */
1748 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1749 x6 = t[5+idx];
1750 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1751 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1752 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1753 s4 = v[18]*x1 + v[19]*x2 +
v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1754 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1755 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1756 v += 36;
1757
1758 vi = aj + diag[i] + 1;
1759 nz = ai[i+1] - diag[i] - 1;
1760 while (nz--) {
/* oidx is the scalar offset of the block column being updated */
1761 oidx = 6*(*vi++);
1762 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1763 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1764 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1765 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1766 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1767 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1768 v += 36;
1769 }
1770 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1771 t[5+idx] = s6;
1772 idx += 6;
1773 }
1774 /* backward solve the L^T */
1775 for (i=n-1; i>=0; i--){
1776 v = aa + 36*diag[i] - 36;
1777 vi = aj + diag[i] - 1;
1778 nz = diag[i] - ai[i];
1779 idt = 6*i;
1780 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1781 s6 = t[5+idt];
1782 while (nz--) {
1783 idx = 6*(*vi--);
1784 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1785 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1786 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1787 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1788 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1789 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1790 v -= 36;
1791 }
1792 }
1793
1794 /* copy t into x according to permutation */
1795 ii = 0;
1796 for (i=0; i<n; i++) {
1797 ir = 6*r[i];
1798 x[ir] = t[ii];
1799 x[ir+1] = t[ii+1];
1800 x[ir+2] = t[ii+2];
1801 x[ir+3] = t[ii+3];
1802 x[ir+4] = t[ii+4];
1803 x[ir+5] =
t[ii+5];
1804 ii += 6;
1805 }
1806
1807 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1808 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1809 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1810 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1811 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1812 PetscFunctionReturn(0);
1813 }
1814
/*
   MatSolveTranspose_SeqBAIJ_6 - transpose solve with 6x6 blocks and row/column permutations,
   reading the U part backwards from the diagonal.

   Gathers bb through a->col into a->solve_work; the U^T sweep applies the block at diag[i] and
   then the blocks at diag[i]-1 down to diag[i+1]+1; the L^T sweep uses blocks ai[i] .. ai[i+1]-1
   in increasing order; the work array is scattered into xx through a->row.
   bs and bs2 are read from A->rmap->bs and a->bs2; the unrolled arithmetic below is written
   for bs = 6 and bs2 = 36 (not checked here).

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1815 #undef __FUNCT__
1816 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1817 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1818 {
1819 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1820 PetscErrorCode ierr;
1821 IS iscol=a->col,isrow=a->row;
1822 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1823 const PetscInt *r,*c,*rout,*cout;
1824 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1825 const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1826 const MatScalar *aa=a->a,*v;
1827 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1828 const PetscScalar *b;
1829
1830 PetscFunctionBegin;
1831 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1832 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1833 t = a->solve_work;
1834
1835 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1836 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1837
1838 /* copy b into temp work space according to permutation */
1839 for(i=0;i<n;i++){
1840 ii = bs*i; ic = bs*c[i];
1841 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1842 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5];
1843 }
1844
1845 /* forward solve the U^T */
1846 idx = 0;
1847 for (i=0; i<n; i++) {
1848 v = aa + bs2*diag[i];
1849 /* multiply by the inverse of the block diagonal */
1850 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1851 x6 = t[5+idx];
1852 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1853 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1854 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1855 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4
+ v[22]*x5 + v[23]*x6;
1856 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1857 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1858 v -= bs2;
1859
1860 vi = aj + diag[i] - 1;
1861 nz = diag[i] - diag[i+1] - 1;
/* j runs 0, -1, ..., -(nz-1): column indices and blocks are both read at decreasing positions */
1862 for(j=0;j>-nz;j--){
1863 oidx = bs*vi[j];
1864 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1865 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1866 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1867 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1868 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1869 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1870 v -= bs2;
1871 }
1872 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1873 t[5+idx] = s6;
1874 idx += bs;
1875 }
1876 /* backward solve the L^T */
1877 for (i=n-1; i>=0; i--){
1878 v = aa + bs2*ai[i];
1879 vi = aj + ai[i];
1880 nz = ai[i+1] - ai[i];
1881 idt = bs*i;
1882 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1883 s6 = t[5+idt];
1884 for(j=0;j<nz;j++){
1885 idx = bs*vi[j];
1886 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1887 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1888 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1889 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1890 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1891 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1892 v += bs2;
1893 }
1894 }
1895
1896 /* copy t into x according to permutation */
1897 for(i=0;i<n;i++){
1898 ii = bs*i; ir = bs*r[i];
1899 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1900 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5];
1901 }
1902
1903 ierr =
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1904 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1905 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1906 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1907 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1908 PetscFunctionReturn(0);
1909 }
1910
/*
   MatSolveTranspose_SeqBAIJ_7_inplace - transpose solve with 7x7 blocks (49 scalars per block)
   and row/column permutations, reading each block row as ai[i] .. diag[i] .. ai[i+1]-1.

   Gathers bb through a->col into a->solve_work seven scalars at a time; the U^T sweep applies
   the block at diag[i] and then the blocks after it up to ai[i+1]-1; the L^T sweep walks the
   blocks before diag[i] downwards to ai[i]; the work array is scattered into xx through a->row.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (presumably already factored -- not checked here)
   bb - right-hand side, accessed read-only
   xx - receives the solution
   Returns 0 through PetscFunctionReturn; every PETSc call result is passed to CHKERRQ.
*/
1911 #undef __FUNCT__
1912 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1913 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1914 {
1915 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1916 IS iscol=a->col,isrow=a->row;
1917 PetscErrorCode ierr;
1918 const PetscInt *r,*c,*rout,*cout;
1919 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1920 PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1921 const MatScalar *aa=a->a,*v;
1922 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1923 const PetscScalar *b;
1924
1925 PetscFunctionBegin;
1926 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1927 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1928 t = a->solve_work;
1929
1930 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1931 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1932
1933 /* copy the b into temp work space according to permutation */
1934 ii = 0;
1935 for (i=0; i<n; i++) {
1936 ic = 7*c[i];
1937 t[ii] = b[ic];
1938 t[ii+1] = b[ic+1];
1939 t[ii+2] = b[ic+2];
1940 t[ii+3] = b[ic+3];
1941 t[ii+4] = b[ic+4];
1942 t[ii+5] = b[ic+5];
1943 t[ii+6] = b[ic+6];
1944 ii += 7;
1945 }
1946
1947 /* forward solve the U^T */
1948 idx = 0;
1949 for (i=0; i<n; i++) {
1950
1951 v = aa + 49*diag[i];
1952 /* multiply by the inverse of the block diagonal */
1953 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1954 x6 = t[5+idx]; x7 = t[6+idx];
1955 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1956 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1957 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1958 s4
= v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1959 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1960 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1961 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1962 v += 49;
1963
1964 vi = aj + diag[i] + 1;
1965 nz = ai[i+1] - diag[i] - 1;
1966 while (nz--) {
/* oidx is the scalar offset of the block column being updated */
1967 oidx = 7*(*vi++);
1968 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1969 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1970 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1971 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1972 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1973 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1974 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1975 v += 49;
1976 }
1977 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1978 t[5+idx] = s6;t[6+idx] = s7;
1979 idx += 7;
1980 }
1981 /* backward solve the L^T */
1982 for (i=n-1; i>=0; i--){
1983 v = aa + 49*diag[i] - 49;
1984 vi = aj + diag[i] - 1;
1985 nz = diag[i] - ai[i];
1986 idt = 7*i;
1987 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1988 s6 = t[5+idt];s7 = t[6+idt];
1989 while (nz--) {
1990 idx = 7*(*vi--);
1991 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1992 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1993 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1994 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1995 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 +
v[32]*s5 + v[33]*s6 + v[34]*s7;
1996 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1997 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1998 v -= 49;
1999 }
2000 }
2001
2002 /* copy t into x according to permutation */
2003 ii = 0;
2004 for (i=0; i<n; i++) {
2005 ir = 7*r[i];
2006 x[ir] = t[ii];
2007 x[ir+1] = t[ii+1];
2008 x[ir+2] = t[ii+2];
2009 x[ir+3] = t[ii+3];
2010 x[ir+4] = t[ii+4];
2011 x[ir+5] = t[ii+5];
2012 x[ir+6] = t[ii+6];
2013 ii += 7;
2014 }
2015
2016 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2017 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2018 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2019 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2020 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2021 PetscFunctionReturn(0);
2022 }
2023 #undef __FUNCT__
2024 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
2025 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2026 {
2027 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2028 PetscErrorCode ierr;
2029 IS iscol=a->col,isrow=a->row;
2030 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2031 const PetscInt *r,*c,*rout,*cout;
2032 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
2033 const PetscInt bs=A->rmap->bs,bs2=a->bs2;
2034 const MatScalar *aa=a->a,*v;
2035 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2036 const PetscScalar *b;
2037
2038 PetscFunctionBegin;
2039 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2040 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2041 t = a->solve_work;
2042
2043 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2044 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2045
2046 /* copy b into temp work space according to permutation */
2047 for(i=0;i<n;i++){
2048 ii = bs*i; ic = bs*c[i];
2049 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
2050 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6];
2051 } 2052 2053 /* forward solve the U^T */ 2054 idx = 0; 2055 for (i=0; i<n; i++) { 2056 v = aa + bs2*diag[i]; 2057 /* multiply by the inverse of the block diagonal */ 2058 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2059 x6 = t[5+idx]; x7 = t[6+idx]; 2060 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 2061 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 2062 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 2063 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 2064 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 2065 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 2066 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 2067 v -= bs2; 2068 2069 vi = aj + diag[i] - 1; 2070 nz = diag[i] - diag[i+1] - 1; 2071 for(j=0;j>-nz;j--){ 2072 oidx = bs*vi[j]; 2073 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2074 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2075 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2076 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2077 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2078 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2079 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2080 v -= bs2; 2081 } 2082 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 2083 t[5+idx] = s6; t[6+idx] = s7; 2084 idx += bs; 2085 } 2086 /* backward solve the L^T */ 2087 for (i=n-1; i>=0; i--){ 2088 v = aa + bs2*ai[i]; 2089 vi = aj + ai[i]; 2090 nz = ai[i+1] - ai[i]; 2091 idt = bs*i; 2092 s1 = t[idt]; s2 = t[1+idt]; s3 = 
t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2093 s6 = t[5+idt]; s7 = t[6+idt]; 2094 for(j=0;j<nz;j++){ 2095 idx = bs*vi[j]; 2096 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2097 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2098 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2099 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2100 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2101 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2102 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2103 v += bs2; 2104 } 2105 } 2106 2107 /* copy t into x according to permutation */ 2108 for(i=0;i<n;i++){ 2109 ii = bs*i; ir = bs*r[i]; 2110 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 2111 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 2112 } 2113 2114 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2115 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2116 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2117 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2118 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2119 PetscFunctionReturn(0); 2120 } 2121 2122 /* ----------------------------------------------------------- */ 2123 #undef __FUNCT__ 2124 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 2125 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2126 { 2127 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2128 IS iscol=a->col,isrow=a->row; 2129 PetscErrorCode ierr; 2130 const PetscInt *r,*c,*rout,*cout; 2131 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2132 PetscInt i,nz; 2133 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2134 const MatScalar *aa=a->a,*v; 2135 PetscScalar *x,*s,*t,*ls; 2136 const PetscScalar *b; 2137 2138 PetscFunctionBegin; 2139 ierr = 
VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2140 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2141 t = a->solve_work; 2142 2143 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2144 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2145 2146 /* forward solve the lower triangular */ 2147 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2148 for (i=1; i<n; i++) { 2149 v = aa + bs2*ai[i]; 2150 vi = aj + ai[i]; 2151 nz = a->diag[i] - ai[i]; 2152 s = t + bs*i; 2153 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2154 while (nz--) { 2155 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 2156 v += bs2; 2157 } 2158 } 2159 /* backward solve the upper triangular */ 2160 ls = a->solve_work + A->cmap->n; 2161 for (i=n-1; i>=0; i--){ 2162 v = aa + bs2*(a->diag[i] + 1); 2163 vi = aj + a->diag[i] + 1; 2164 nz = ai[i+1] - a->diag[i] - 1; 2165 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2166 while (nz--) { 2167 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 2168 v += bs2; 2169 } 2170 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2171 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2172 } 2173 2174 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2175 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2176 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2177 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2178 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2179 PetscFunctionReturn(0); 2180 } 2181 2182 /* ----------------------------------------------------------- */ 2183 #undef __FUNCT__ 2184 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 2185 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2186 { 2187 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2188 IS iscol=a->col,isrow=a->row; 2189 PetscErrorCode ierr; 2190 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2191 PetscInt 
i,nz,j; 2192 const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2; 2193 const MatScalar *aa=a->a,*v; 2194 PetscScalar *x,*t,*ls; 2195 const PetscScalar *b; 2196 PetscFunctionBegin; 2197 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2198 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2199 t = a->solve_work; 2200 2201 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2202 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2203 2204 /* copy the b into temp work space according to permutation */ 2205 for (i=0; i<n; i++) { 2206 for (j=0; j<bs; j++) { 2207 t[i*bs+j] = b[c[i]*bs+j]; 2208 } 2209 } 2210 2211 2212 /* forward solve the upper triangular transpose */ 2213 ls = a->solve_work + A->cmap->n; 2214 for (i=0; i<n; i++){ 2215 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2216 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2217 v = aa + bs2*(a->diag[i] + 1); 2218 vi = aj + a->diag[i] + 1; 2219 nz = ai[i+1] - a->diag[i] - 1; 2220 while (nz--) { 2221 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2222 v += bs2; 2223 } 2224 } 2225 2226 /* backward solve the lower triangular transpose */ 2227 for (i=n-1; i>=0; i--) { 2228 v = aa + bs2*ai[i]; 2229 vi = aj + ai[i]; 2230 nz = a->diag[i] - ai[i]; 2231 while (nz--) { 2232 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2233 v += bs2; 2234 } 2235 } 2236 2237 /* copy t into x according to permutation */ 2238 for (i=0; i<n; i++) { 2239 for (j=0; j<bs; j++) { 2240 x[bs*r[i]+j] = t[bs*i+j]; 2241 } 2242 } 2243 2244 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2245 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2246 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2247 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2248 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2249 PetscFunctionReturn(0); 2250 } 2251 2252 #undef __FUNCT__ 2253 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 2254 PetscErrorCode 
MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2255 { 2256 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2257 IS iscol=a->col,isrow=a->row; 2258 PetscErrorCode ierr; 2259 const PetscInt *r,*c,*rout,*cout; 2260 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2261 PetscInt i,j,nz; 2262 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2263 const MatScalar *aa=a->a,*v; 2264 PetscScalar *x,*t,*ls; 2265 const PetscScalar *b; 2266 2267 PetscFunctionBegin; 2268 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2269 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2270 t = a->solve_work; 2271 2272 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2273 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2274 2275 /* copy the b into temp work space according to permutation */ 2276 for (i=0; i<n; i++) { 2277 for (j=0; j<bs; j++) { 2278 t[i*bs+j] = b[c[i]*bs+j]; 2279 } 2280 } 2281 2282 2283 /* forward solve the upper triangular transpose */ 2284 ls = a->solve_work + A->cmap->n; 2285 for (i=0; i<n; i++){ 2286 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2287 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 2288 v = aa + bs2*(diag[i] - 1); 2289 vi = aj + diag[i] - 1; 2290 nz = diag[i] - diag[i+1] - 1; 2291 for(j=0;j>-nz;j--){ 2292 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2293 v -= bs2; 2294 } 2295 } 2296 2297 /* backward solve the lower triangular transpose */ 2298 for (i=n-1; i>=0; i--) { 2299 v = aa + bs2*ai[i]; 2300 vi = aj + ai[i]; 2301 nz = ai[i+1] - ai[i]; 2302 for(j=0;j<nz;j++){ 2303 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2304 v += bs2; 2305 } 2306 } 2307 2308 /* copy t into x according to permutation */ 2309 for (i=0; i<n; i++) { 2310 for (j=0; j<bs; j++) { 2311 x[bs*r[i]+j] = t[bs*i+j]; 2312 } 2313 } 2314 2315 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2316 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2317 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2318 ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 2319 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2320 PetscFunctionReturn(0); 2321 } 2322 2323 /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */ 2324 2325 #undef __FUNCT__ 2326 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2327 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 2328 { 2329 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2330 PetscErrorCode ierr; 2331 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2332 PetscInt i,nz,idx,idt,m; 2333 const MatScalar *aa=a->a,*v; 2334 PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 2335 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 2336 PetscScalar *x; 2337 const PetscScalar *b; 2338 2339 PetscFunctionBegin; 2340 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2341 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2342 2343 /* forward solve the lower triangular */ 2344 idx = 0; 2345 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 2346 x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx]; 2347 x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 2348 2349 for (i=1; i<n; i++) { 2350 v = aa + bs2*ai[i]; 2351 vi = aj + ai[i]; 2352 nz = ai[i+1] - ai[i]; 2353 idt = bs*i; 2354 s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 2355 s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 2356 s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 2357 for(m=0;m<nz;m++){ 2358 idx = bs*vi[m]; 2359 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2360 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2361 x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = 
x[14+idx]; 2362 2363 2364 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2365 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2366 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2367 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2368 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2369 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2370 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2371 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2372 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2373 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2374 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + 
v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2375 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2376 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2377 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2378 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2379 2380 v += bs2; 2381 } 2382 x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 2383 x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 2384 x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 2385 2386 } 2387 /* backward solve the upper triangular */ 2388 for (i=n-1; i>=0; i--){ 2389 v = aa + bs2*(adiag[i+1]+1); 2390 vi = aj + adiag[i+1]+1; 2391 nz = adiag[i] - adiag[i+1] - 1; 2392 idt = bs*i; 2393 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 2394 s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 2395 s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 2396 2397 for(m=0;m<nz;m++){ 2398 idx = bs*vi[m]; 2399 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2400 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2401 x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 2402 2403 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + 
v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2404 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2405 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2406 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2407 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2408 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2409 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2410 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2411 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2412 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2413 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2414 s12 -= v[11]*x1 + 
v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2415 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2416 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2417 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2418 2419 v += bs2; 2420 } 2421 2422 x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 2423 x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 2424 x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 2425 x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 2426 x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 2427 x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 2428 x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + 
v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 2429 x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 2430 x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 2431 x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 2432 x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 2433 x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 2434 x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 2435 x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 2436 x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 2437 2438 } 2439 2440 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2441 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2442 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2443 PetscFunctionReturn(0); 2444 } 2445 2446 /* bs = 15 for PFLOTRAN. 
Block operations are done by accessing one column at at time */ 2447 /* Default MatSolve for block size 15 */ 2448 2449 #undef __FUNCT__ 2450 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1" 2451 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx) 2452 { 2453 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2454 PetscErrorCode ierr; 2455 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2456 PetscInt i,k,nz,idx,idt,m; 2457 const MatScalar *aa=a->a,*v; 2458 PetscScalar s[15]; 2459 PetscScalar *x,xv; 2460 const PetscScalar *b; 2461 2462 PetscFunctionBegin; 2463 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2464 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2465 2466 /* forward solve the lower triangular */ 2467 for (i=0; i<n; i++) { 2468 v = aa + bs2*ai[i]; 2469 vi = aj + ai[i]; 2470 nz = ai[i+1] - ai[i]; 2471 idt = bs*i; 2472 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2473 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2474 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt]; 2475 for(m=0;m<nz;m++){ 2476 idx = bs*vi[m]; 2477 for(k=0;k<15;k++){ 2478 xv = x[k + idx]; 2479 x[idt] -= v[0]*xv; 2480 x[1+idt] -= v[1]*xv; 2481 x[2+idt] -= v[2]*xv; 2482 x[3+idt] -= v[3]*xv; 2483 x[4+idt] -= v[4]*xv; 2484 x[5+idt] -= v[5]*xv; 2485 x[6+idt] -= v[6]*xv; 2486 x[7+idt] -= v[7]*xv; 2487 x[8+idt] -= v[8]*xv; 2488 x[9+idt] -= v[9]*xv; 2489 x[10+idt] -= v[10]*xv; 2490 x[11+idt] -= v[11]*xv; 2491 x[12+idt] -= v[12]*xv; 2492 x[13+idt] -= v[13]*xv; 2493 x[14+idt] -= v[14]*xv; 2494 v += 15; 2495 } 2496 } 2497 } 2498 /* backward solve the upper triangular */ 2499 for (i=n-1; i>=0; i--){ 2500 v = aa + bs2*(adiag[i+1]+1); 2501 vi = aj + adiag[i+1]+1; 2502 nz = adiag[i] - adiag[i+1] - 1; 2503 idt = bs*i; 2504 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; 
s[3] = x[3+idt]; s[4] = x[4+idt]; 2505 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 2506 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt]; 2507 2508 for(m=0;m<nz;m++){ 2509 idx = bs*vi[m]; 2510 for(k=0;k<15;k++){ 2511 xv = x[k + idx]; 2512 s[0] -= v[0]*xv; 2513 s[1] -= v[1]*xv; 2514 s[2] -= v[2]*xv; 2515 s[3] -= v[3]*xv; 2516 s[4] -= v[4]*xv; 2517 s[5] -= v[5]*xv; 2518 s[6] -= v[6]*xv; 2519 s[7] -= v[7]*xv; 2520 s[8] -= v[8]*xv; 2521 s[9] -= v[9]*xv; 2522 s[10] -= v[10]*xv; 2523 s[11] -= v[11]*xv; 2524 s[12] -= v[12]*xv; 2525 s[13] -= v[13]*xv; 2526 s[14] -= v[14]*xv; 2527 v += 15; 2528 } 2529 } 2530 ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr); 2531 for(k=0;k<15;k++){ 2532 x[idt] += v[0]*s[k]; 2533 x[1+idt] += v[1]*s[k]; 2534 x[2+idt] += v[2]*s[k]; 2535 x[3+idt] += v[3]*s[k]; 2536 x[4+idt] += v[4]*s[k]; 2537 x[5+idt] += v[5]*s[k]; 2538 x[6+idt] += v[6]*s[k]; 2539 x[7+idt] += v[7]*s[k]; 2540 x[8+idt] += v[8]*s[k]; 2541 x[9+idt] += v[9]*s[k]; 2542 x[10+idt] += v[10]*s[k]; 2543 x[11+idt] += v[11]*s[k]; 2544 x[12+idt] += v[12]*s[k]; 2545 x[13+idt] += v[13]*s[k]; 2546 x[14+idt] += v[14]*s[k]; 2547 v += 15; 2548 } 2549 } 2550 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2551 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2552 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2553 PetscFunctionReturn(0); 2554 } 2555 2556 2557 #undef __FUNCT__ 2558 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 2559 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 2560 { 2561 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2562 IS iscol=a->col,isrow=a->row; 2563 PetscErrorCode ierr; 2564 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2565 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2566 PetscInt i,nz,idx,idt,idc; 2567 const MatScalar *aa=a->a,*v; 2568 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2569 const PetscScalar *b; 2570 2571 
PetscFunctionBegin; 2572 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2573 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2574 t = a->solve_work; 2575 2576 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2577 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2578 2579 /* forward solve the lower triangular */ 2580 idx = 7*(*r++); 2581 t[0] = b[idx]; t[1] = b[1+idx]; 2582 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2583 t[5] = b[5+idx]; t[6] = b[6+idx]; 2584 2585 for (i=1; i<n; i++) { 2586 v = aa + 49*ai[i]; 2587 vi = aj + ai[i]; 2588 nz = diag[i] - ai[i]; 2589 idx = 7*(*r++); 2590 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2591 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2592 while (nz--) { 2593 idx = 7*(*vi++); 2594 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2595 x4 = t[3+idx];x5 = t[4+idx]; 2596 x6 = t[5+idx];x7 = t[6+idx]; 2597 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2598 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2599 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2600 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2601 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2602 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2603 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2604 v += 49; 2605 } 2606 idx = 7*i; 2607 t[idx] = s1;t[1+idx] = s2; 2608 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2609 t[5+idx] = s6;t[6+idx] = s7; 2610 } 2611 /* backward solve the upper triangular */ 2612 for (i=n-1; i>=0; i--){ 2613 v = aa + 49*diag[i] + 49; 2614 vi = aj + diag[i] + 1; 2615 nz = ai[i+1] - diag[i] - 1; 2616 idt = 7*i; 2617 s1 = t[idt]; s2 = t[1+idt]; 2618 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2619 s6 = t[5+idt];s7 = t[6+idt]; 2620 while (nz--) { 2621 idx = 7*(*vi++); 2622 x1 = t[idx]; x2 = 
t[1+idx]; 2623 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2624 x6 = t[5+idx]; x7 = t[6+idx]; 2625 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2626 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2627 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2628 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2629 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2630 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2631 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2632 v += 49; 2633 } 2634 idc = 7*(*c--); 2635 v = aa + 49*diag[i]; 2636 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2637 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2638 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2639 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2640 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2641 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2642 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2643 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2644 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2645 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2646 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2647 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2648 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2649 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2650 } 2651 2652 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2653 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2654 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2655 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2656 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2657 PetscFunctionReturn(0); 2658 } 2659 2660 #undef __FUNCT__ 2661 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 2662 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2663 { 2664 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2665 IS iscol=a->col,isrow=a->row; 2666 
PetscErrorCode ierr; 2667 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2668 const PetscInt n=a->mbs,*rout,*cout,*vi; 2669 PetscInt i,nz,idx,idt,idc,m; 2670 const MatScalar *aa=a->a,*v; 2671 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2672 const PetscScalar *b; 2673 2674 PetscFunctionBegin; 2675 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2676 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2677 t = a->solve_work; 2678 2679 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2680 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2681 2682 /* forward solve the lower triangular */ 2683 idx = 7*r[0]; 2684 t[0] = b[idx]; t[1] = b[1+idx]; 2685 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2686 t[5] = b[5+idx]; t[6] = b[6+idx]; 2687 2688 for (i=1; i<n; i++) { 2689 v = aa + 49*ai[i]; 2690 vi = aj + ai[i]; 2691 nz = ai[i+1] - ai[i]; 2692 idx = 7*r[i]; 2693 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2694 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2695 for(m=0;m<nz;m++){ 2696 idx = 7*vi[m]; 2697 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2698 x4 = t[3+idx];x5 = t[4+idx]; 2699 x6 = t[5+idx];x7 = t[6+idx]; 2700 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2701 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2702 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2703 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2704 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2705 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2706 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2707 v += 49; 2708 } 2709 idx = 7*i; 2710 t[idx] = s1;t[1+idx] = s2; 2711 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2712 t[5+idx] = s6;t[6+idx] = s7; 2713 } 2714 /* backward solve the upper triangular */ 2715 for (i=n-1; i>=0; i--){ 
2716 v = aa + 49*(adiag[i+1]+1); 2717 vi = aj + adiag[i+1]+1; 2718 nz = adiag[i] - adiag[i+1] - 1; 2719 idt = 7*i; 2720 s1 = t[idt]; s2 = t[1+idt]; 2721 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2722 s6 = t[5+idt];s7 = t[6+idt]; 2723 for(m=0;m<nz;m++){ 2724 idx = 7*vi[m]; 2725 x1 = t[idx]; x2 = t[1+idx]; 2726 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2727 x6 = t[5+idx]; x7 = t[6+idx]; 2728 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2729 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2730 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2731 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2732 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2733 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2734 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2735 v += 49; 2736 } 2737 idc = 7*c[i]; 2738 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2739 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2740 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2741 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2742 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2743 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2744 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2745 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2746 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2747 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2748 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2749 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2750 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2751 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2752 } 2753 2754 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2755 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2756 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2757 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2758 ierr = PetscLogFlops(2.0*49*(a->nz) - 
7.0*A->cmap->n);CHKERRQ(ierr); 2759 PetscFunctionReturn(0); 2760 } 2761 2762 #undef __FUNCT__ 2763 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 2764 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2765 { 2766 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2767 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2768 PetscErrorCode ierr; 2769 PetscInt i,nz,idx,idt,jdx; 2770 const MatScalar *aa=a->a,*v; 2771 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2772 const PetscScalar *b; 2773 2774 PetscFunctionBegin; 2775 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2776 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2777 /* forward solve the lower triangular */ 2778 idx = 0; 2779 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2780 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2781 x[6] = b[6+idx]; 2782 for (i=1; i<n; i++) { 2783 v = aa + 49*ai[i]; 2784 vi = aj + ai[i]; 2785 nz = diag[i] - ai[i]; 2786 idx = 7*i; 2787 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2788 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2789 s7 = b[6+idx]; 2790 while (nz--) { 2791 jdx = 7*(*vi++); 2792 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2793 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2794 x7 = x[6+jdx]; 2795 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2796 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2797 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2798 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2799 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2800 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2801 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2802 v += 49; 2803 } 2804 x[idx] = s1; 2805 x[1+idx] = s2; 2806 x[2+idx] = s3; 2807 x[3+idx] = s4; 2808 x[4+idx] = s5; 2809 x[5+idx] = s6; 
2810 x[6+idx] = s7; 2811 } 2812 /* backward solve the upper triangular */ 2813 for (i=n-1; i>=0; i--){ 2814 v = aa + 49*diag[i] + 49; 2815 vi = aj + diag[i] + 1; 2816 nz = ai[i+1] - diag[i] - 1; 2817 idt = 7*i; 2818 s1 = x[idt]; s2 = x[1+idt]; 2819 s3 = x[2+idt]; s4 = x[3+idt]; 2820 s5 = x[4+idt]; s6 = x[5+idt]; 2821 s7 = x[6+idt]; 2822 while (nz--) { 2823 idx = 7*(*vi++); 2824 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2825 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2826 x7 = x[6+idx]; 2827 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2828 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2829 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2830 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2831 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2832 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2833 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2834 v += 49; 2835 } 2836 v = aa + 49*diag[i]; 2837 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2838 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2839 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2840 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2841 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2842 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2843 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2844 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2845 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2846 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2847 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2848 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2849 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2850 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2851 } 2852 2853 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2854 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2855 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 
2856 PetscFunctionReturn(0); 2857 } 2858 2859 #undef __FUNCT__ 2860 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 2861 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 2862 { 2863 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2864 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 2865 PetscErrorCode ierr; 2866 PetscInt i,k,nz,idx,jdx,idt; 2867 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2868 const MatScalar *aa=a->a,*v; 2869 PetscScalar *x; 2870 const PetscScalar *b; 2871 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2872 2873 PetscFunctionBegin; 2874 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2875 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2876 /* forward solve the lower triangular */ 2877 idx = 0; 2878 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2879 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2880 for (i=1; i<n; i++) { 2881 v = aa + bs2*ai[i]; 2882 vi = aj + ai[i]; 2883 nz = ai[i+1] - ai[i]; 2884 idx = bs*i; 2885 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2886 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2887 for(k=0;k<nz;k++) { 2888 jdx = bs*vi[k]; 2889 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2890 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2891 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2892 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2893 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2894 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2895 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2896 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2897 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2898 v += bs2; 2899 } 2900 2901 x[idx] = s1; 2902 x[1+idx] = s2; 2903 x[2+idx] = s3; 2904 x[3+idx] = s4; 2905 x[4+idx] = s5; 2906 x[5+idx] 
= s6; 2907 x[6+idx] = s7; 2908 } 2909 2910 /* backward solve the upper triangular */ 2911 for (i=n-1; i>=0; i--){ 2912 v = aa + bs2*(adiag[i+1]+1); 2913 vi = aj + adiag[i+1]+1; 2914 nz = adiag[i] - adiag[i+1]-1; 2915 idt = bs*i; 2916 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2917 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2918 for(k=0;k<nz;k++) { 2919 idx = bs*vi[k]; 2920 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2921 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2922 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2923 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2924 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2925 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2926 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2927 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2928 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2929 v += bs2; 2930 } 2931 /* x = inv_diagonal*x */ 2932 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2933 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2934 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2935 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2936 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2937 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2938 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2939 } 2940 2941 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2942 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2943 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2944 PetscFunctionReturn(0); 
2945 } 2946 2947 #undef __FUNCT__ 2948 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 2949 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 2950 { 2951 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2952 IS iscol=a->col,isrow=a->row; 2953 PetscErrorCode ierr; 2954 const PetscInt *r,*c,*rout,*cout; 2955 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2956 PetscInt i,nz,idx,idt,idc; 2957 const MatScalar *aa=a->a,*v; 2958 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2959 const PetscScalar *b; 2960 2961 PetscFunctionBegin; 2962 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2963 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2964 t = a->solve_work; 2965 2966 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2967 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2968 2969 /* forward solve the lower triangular */ 2970 idx = 6*(*r++); 2971 t[0] = b[idx]; t[1] = b[1+idx]; 2972 t[2] = b[2+idx]; t[3] = b[3+idx]; 2973 t[4] = b[4+idx]; t[5] = b[5+idx]; 2974 for (i=1; i<n; i++) { 2975 v = aa + 36*ai[i]; 2976 vi = aj + ai[i]; 2977 nz = diag[i] - ai[i]; 2978 idx = 6*(*r++); 2979 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2980 s5 = b[4+idx]; s6 = b[5+idx]; 2981 while (nz--) { 2982 idx = 6*(*vi++); 2983 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2984 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2985 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2986 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2987 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2988 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2989 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2990 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2991 v += 36; 2992 } 2993 idx = 6*i; 2994 t[idx] = s1;t[1+idx] = s2; 2995 t[2+idx] = s3;t[3+idx] = s4; 2996 t[4+idx] = s5;t[5+idx] = s6; 2997 } 2998 /* backward solve the upper triangular */ 2999 
for (i=n-1; i>=0; i--){ 3000 v = aa + 36*diag[i] + 36; 3001 vi = aj + diag[i] + 1; 3002 nz = ai[i+1] - diag[i] - 1; 3003 idt = 6*i; 3004 s1 = t[idt]; s2 = t[1+idt]; 3005 s3 = t[2+idt];s4 = t[3+idt]; 3006 s5 = t[4+idt];s6 = t[5+idt]; 3007 while (nz--) { 3008 idx = 6*(*vi++); 3009 x1 = t[idx]; x2 = t[1+idx]; 3010 x3 = t[2+idx]; x4 = t[3+idx]; 3011 x5 = t[4+idx]; x6 = t[5+idx]; 3012 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3013 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3014 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3015 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3016 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3017 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3018 v += 36; 3019 } 3020 idc = 6*(*c--); 3021 v = aa + 36*diag[i]; 3022 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3023 v[18]*s4+v[24]*s5+v[30]*s6; 3024 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3025 v[19]*s4+v[25]*s5+v[31]*s6; 3026 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3027 v[20]*s4+v[26]*s5+v[32]*s6; 3028 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3029 v[21]*s4+v[27]*s5+v[33]*s6; 3030 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3031 v[22]*s4+v[28]*s5+v[34]*s6; 3032 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3033 v[23]*s4+v[29]*s5+v[35]*s6; 3034 } 3035 3036 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3037 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3038 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3039 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3040 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3041 PetscFunctionReturn(0); 3042 } 3043 3044 #undef __FUNCT__ 3045 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 3046 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 3047 { 3048 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3049 IS iscol=a->col,isrow=a->row; 3050 PetscErrorCode ierr; 3051 const 
PetscInt *r,*c,*rout,*cout; 3052 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3053 PetscInt i,nz,idx,idt,idc,m; 3054 const MatScalar *aa=a->a,*v; 3055 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 3056 const PetscScalar *b; 3057 3058 PetscFunctionBegin; 3059 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3060 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3061 t = a->solve_work; 3062 3063 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3064 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3065 3066 /* forward solve the lower triangular */ 3067 idx = 6*r[0]; 3068 t[0] = b[idx]; t[1] = b[1+idx]; 3069 t[2] = b[2+idx]; t[3] = b[3+idx]; 3070 t[4] = b[4+idx]; t[5] = b[5+idx]; 3071 for (i=1; i<n; i++) { 3072 v = aa + 36*ai[i]; 3073 vi = aj + ai[i]; 3074 nz = ai[i+1] - ai[i]; 3075 idx = 6*r[i]; 3076 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3077 s5 = b[4+idx]; s6 = b[5+idx]; 3078 for(m=0;m<nz;m++){ 3079 idx = 6*vi[m]; 3080 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3081 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 3082 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3083 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3084 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3085 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3086 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3087 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3088 v += 36; 3089 } 3090 idx = 6*i; 3091 t[idx] = s1;t[1+idx] = s2; 3092 t[2+idx] = s3;t[3+idx] = s4; 3093 t[4+idx] = s5;t[5+idx] = s6; 3094 } 3095 /* backward solve the upper triangular */ 3096 for (i=n-1; i>=0; i--){ 3097 v = aa + 36*(adiag[i+1]+1); 3098 vi = aj + adiag[i+1]+1; 3099 nz = adiag[i] - adiag[i+1] - 1; 3100 idt = 6*i; 3101 s1 = t[idt]; s2 = t[1+idt]; 3102 s3 = t[2+idt];s4 = t[3+idt]; 3103 s5 = t[4+idt];s6 = t[5+idt]; 3104 for(m=0;m<nz;m++){ 3105 idx = 6*vi[m]; 
3106 x1 = t[idx]; x2 = t[1+idx]; 3107 x3 = t[2+idx]; x4 = t[3+idx]; 3108 x5 = t[4+idx]; x6 = t[5+idx]; 3109 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3110 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3111 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3112 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3113 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3114 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3115 v += 36; 3116 } 3117 idc = 6*c[i]; 3118 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3119 v[18]*s4+v[24]*s5+v[30]*s6; 3120 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3121 v[19]*s4+v[25]*s5+v[31]*s6; 3122 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3123 v[20]*s4+v[26]*s5+v[32]*s6; 3124 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3125 v[21]*s4+v[27]*s5+v[33]*s6; 3126 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3127 v[22]*s4+v[28]*s5+v[34]*s6; 3128 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3129 v[23]*s4+v[29]*s5+v[35]*s6; 3130 } 3131 3132 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3133 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3134 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3135 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3136 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3137 PetscFunctionReturn(0); 3138 } 3139 3140 #undef __FUNCT__ 3141 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 3142 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3143 { 3144 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3145 PetscInt i,nz,idx,idt,jdx; 3146 PetscErrorCode ierr; 3147 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 3148 const MatScalar *aa=a->a,*v; 3149 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3150 const PetscScalar *b; 3151 3152 PetscFunctionBegin; 3153 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 
3154 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3155 /* forward solve the lower triangular */ 3156 idx = 0; 3157 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 3158 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 3159 for (i=1; i<n; i++) { 3160 v = aa + 36*ai[i]; 3161 vi = aj + ai[i]; 3162 nz = diag[i] - ai[i]; 3163 idx = 6*i; 3164 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3165 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 3166 while (nz--) { 3167 jdx = 6*(*vi++); 3168 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 3169 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 3170 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3171 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3172 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3173 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3174 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3175 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3176 v += 36; 3177 } 3178 x[idx] = s1; 3179 x[1+idx] = s2; 3180 x[2+idx] = s3; 3181 x[3+idx] = s4; 3182 x[4+idx] = s5; 3183 x[5+idx] = s6; 3184 } 3185 /* backward solve the upper triangular */ 3186 for (i=n-1; i>=0; i--){ 3187 v = aa + 36*diag[i] + 36; 3188 vi = aj + diag[i] + 1; 3189 nz = ai[i+1] - diag[i] - 1; 3190 idt = 6*i; 3191 s1 = x[idt]; s2 = x[1+idt]; 3192 s3 = x[2+idt]; s4 = x[3+idt]; 3193 s5 = x[4+idt]; s6 = x[5+idt]; 3194 while (nz--) { 3195 idx = 6*(*vi++); 3196 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3197 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3198 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3199 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3200 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3201 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3202 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3203 s6 -= v[5]*x1 + v[11]*x2 + 
v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3204 v += 36; 3205 } 3206 v = aa + 36*diag[i]; 3207 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3208 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3209 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3210 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3211 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3212 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3213 } 3214 3215 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3216 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3217 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3218 PetscFunctionReturn(0); 3219 } 3220 3221 #undef __FUNCT__ 3222 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 3223 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 3224 { 3225 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3226 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3227 PetscErrorCode ierr; 3228 PetscInt i,k,nz,idx,jdx,idt; 3229 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3230 const MatScalar *aa=a->a,*v; 3231 PetscScalar *x; 3232 const PetscScalar *b; 3233 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3234 3235 PetscFunctionBegin; 3236 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3237 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3238 /* forward solve the lower triangular */ 3239 idx = 0; 3240 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3241 x[4] = b[4+idx];x[5] = b[5+idx]; 3242 for (i=1; i<n; i++) { 3243 v = aa + bs2*ai[i]; 3244 vi = aj + ai[i]; 3245 nz = ai[i+1] - ai[i]; 3246 idx = bs*i; 3247 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3248 s5 = b[4+idx];s6 = b[5+idx]; 3249 for(k=0;k<nz;k++){ 3250 jdx = bs*vi[k]; 3251 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3252 x5 = x[4+jdx]; x6 = x[5+jdx]; 3253 s1 -= v[0]*x1 + v[6]*x2 + 
v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3254 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3255 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3256 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3257 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3258 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3259 v += bs2; 3260 } 3261 3262 x[idx] = s1; 3263 x[1+idx] = s2; 3264 x[2+idx] = s3; 3265 x[3+idx] = s4; 3266 x[4+idx] = s5; 3267 x[5+idx] = s6; 3268 } 3269 3270 /* backward solve the upper triangular */ 3271 for (i=n-1; i>=0; i--){ 3272 v = aa + bs2*(adiag[i+1]+1); 3273 vi = aj + adiag[i+1]+1; 3274 nz = adiag[i] - adiag[i+1]-1; 3275 idt = bs*i; 3276 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3277 s5 = x[4+idt];s6 = x[5+idt]; 3278 for(k=0;k<nz;k++){ 3279 idx = bs*vi[k]; 3280 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3281 x5 = x[4+idx];x6 = x[5+idx]; 3282 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3283 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3284 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3285 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3286 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3287 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3288 v += bs2; 3289 } 3290 /* x = inv_diagonal*x */ 3291 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3292 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3293 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3294 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3295 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3296 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3297 } 3298 3299 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3300 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3301 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3302 PetscFunctionReturn(0); 3303 } 3304 3305 #undef __FUNCT__ 3306 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 3307 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 3308 { 3309 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3310 IS iscol=a->col,isrow=a->row; 3311 PetscErrorCode ierr; 3312 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3313 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3314 PetscInt i,nz,idx,idt,idc; 3315 const MatScalar *aa=a->a,*v; 3316 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3317 const PetscScalar *b; 3318 3319 PetscFunctionBegin; 3320 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3321 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3322 t = a->solve_work; 3323 3324 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3325 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3326 3327 /* forward solve the lower triangular */ 3328 idx = 5*(*r++); 3329 t[0] = b[idx]; t[1] = b[1+idx]; 3330 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3331 for (i=1; i<n; i++) { 3332 v = aa + 25*ai[i]; 3333 vi = aj + ai[i]; 3334 nz = diag[i] - ai[i]; 3335 idx = 5*(*r++); 3336 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3337 s5 = b[4+idx]; 3338 while (nz--) { 3339 idx = 5*(*vi++); 3340 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3341 x4 = t[3+idx];x5 = t[4+idx]; 3342 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3343 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3344 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3345 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3346 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3347 v += 25; 3348 } 3349 idx = 5*i; 3350 t[idx] = s1;t[1+idx] = s2; 3351 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3352 } 3353 /* backward solve the upper triangular */ 3354 for (i=n-1; 
i>=0; i--){ 3355 v = aa + 25*diag[i] + 25; 3356 vi = aj + diag[i] + 1; 3357 nz = ai[i+1] - diag[i] - 1; 3358 idt = 5*i; 3359 s1 = t[idt]; s2 = t[1+idt]; 3360 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3361 while (nz--) { 3362 idx = 5*(*vi++); 3363 x1 = t[idx]; x2 = t[1+idx]; 3364 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3365 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3366 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3367 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3368 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3369 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3370 v += 25; 3371 } 3372 idc = 5*(*c--); 3373 v = aa + 25*diag[i]; 3374 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3375 v[15]*s4+v[20]*s5; 3376 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3377 v[16]*s4+v[21]*s5; 3378 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3379 v[17]*s4+v[22]*s5; 3380 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3381 v[18]*s4+v[23]*s5; 3382 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3383 v[19]*s4+v[24]*s5; 3384 } 3385 3386 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3387 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3388 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3389 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3390 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3391 PetscFunctionReturn(0); 3392 } 3393 3394 #undef __FUNCT__ 3395 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 3396 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 3397 { 3398 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3399 IS iscol=a->col,isrow=a->row; 3400 PetscErrorCode ierr; 3401 const PetscInt *r,*c,*rout,*cout; 3402 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3403 PetscInt i,nz,idx,idt,idc,m; 3404 const MatScalar *aa=a->a,*v; 3405 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3406 const PetscScalar *b; 3407 3408 PetscFunctionBegin; 3409 ierr = 
VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3410 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3411 t = a->solve_work; 3412 3413 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3414 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3415 3416 /* forward solve the lower triangular */ 3417 idx = 5*r[0]; 3418 t[0] = b[idx]; t[1] = b[1+idx]; 3419 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3420 for (i=1; i<n; i++) { 3421 v = aa + 25*ai[i]; 3422 vi = aj + ai[i]; 3423 nz = ai[i+1] - ai[i]; 3424 idx = 5*r[i]; 3425 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3426 s5 = b[4+idx]; 3427 for(m=0;m<nz;m++){ 3428 idx = 5*vi[m]; 3429 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3430 x4 = t[3+idx];x5 = t[4+idx]; 3431 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3432 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3433 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3434 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3435 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3436 v += 25; 3437 } 3438 idx = 5*i; 3439 t[idx] = s1;t[1+idx] = s2; 3440 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3441 } 3442 /* backward solve the upper triangular */ 3443 for (i=n-1; i>=0; i--){ 3444 v = aa + 25*(adiag[i+1]+1); 3445 vi = aj + adiag[i+1]+1; 3446 nz = adiag[i] - adiag[i+1] - 1; 3447 idt = 5*i; 3448 s1 = t[idt]; s2 = t[1+idt]; 3449 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3450 for(m=0;m<nz;m++){ 3451 idx = 5*vi[m]; 3452 x1 = t[idx]; x2 = t[1+idx]; 3453 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3454 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3455 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3456 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3457 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3458 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3459 v += 25; 3460 } 3461 idc = 5*c[i]; 3462 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3463 v[15]*s4+v[20]*s5; 3464 
x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3465 v[16]*s4+v[21]*s5; 3466 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3467 v[17]*s4+v[22]*s5; 3468 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3469 v[18]*s4+v[23]*s5; 3470 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3471 v[19]*s4+v[24]*s5; 3472 } 3473 3474 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3475 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3476 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3477 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3478 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3479 PetscFunctionReturn(0); 3480 } 3481 3482 #undef __FUNCT__ 3483 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 3484 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3485 { 3486 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3487 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3488 PetscInt i,nz,idx,idt,jdx; 3489 PetscErrorCode ierr; 3490 const MatScalar *aa=a->a,*v; 3491 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3492 const PetscScalar *b; 3493 3494 PetscFunctionBegin; 3495 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3496 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3497 /* forward solve the lower triangular */ 3498 idx = 0; 3499 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3500 for (i=1; i<n; i++) { 3501 v = aa + 25*ai[i]; 3502 vi = aj + ai[i]; 3503 nz = diag[i] - ai[i]; 3504 idx = 5*i; 3505 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3506 while (nz--) { 3507 jdx = 5*(*vi++); 3508 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3509 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3510 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3511 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3512 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3513 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + 
v[24]*x5; 3514 v += 25; 3515 } 3516 x[idx] = s1; 3517 x[1+idx] = s2; 3518 x[2+idx] = s3; 3519 x[3+idx] = s4; 3520 x[4+idx] = s5; 3521 } 3522 /* backward solve the upper triangular */ 3523 for (i=n-1; i>=0; i--){ 3524 v = aa + 25*diag[i] + 25; 3525 vi = aj + diag[i] + 1; 3526 nz = ai[i+1] - diag[i] - 1; 3527 idt = 5*i; 3528 s1 = x[idt]; s2 = x[1+idt]; 3529 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3530 while (nz--) { 3531 idx = 5*(*vi++); 3532 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3533 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3534 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3535 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3536 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3537 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3538 v += 25; 3539 } 3540 v = aa + 25*diag[i]; 3541 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3542 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3543 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3544 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3545 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3546 } 3547 3548 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3549 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3550 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3551 PetscFunctionReturn(0); 3552 } 3553 3554 #undef __FUNCT__ 3555 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 3556 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 3557 { 3558 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3559 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3560 PetscInt i,k,nz,idx,idt,jdx; 3561 PetscErrorCode ierr; 3562 const MatScalar *aa=a->a,*v; 3563 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3564 const PetscScalar *b; 3565 3566 PetscFunctionBegin; 3567 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3568 ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 3569 /* forward solve the lower triangular */ 3570 idx = 0; 3571 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3572 for (i=1; i<n; i++) { 3573 v = aa + 25*ai[i]; 3574 vi = aj + ai[i]; 3575 nz = ai[i+1] - ai[i]; 3576 idx = 5*i; 3577 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3578 for(k=0;k<nz;k++) { 3579 jdx = 5*vi[k]; 3580 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3581 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3582 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3583 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3584 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3585 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3586 v += 25; 3587 } 3588 x[idx] = s1; 3589 x[1+idx] = s2; 3590 x[2+idx] = s3; 3591 x[3+idx] = s4; 3592 x[4+idx] = s5; 3593 } 3594 3595 /* backward solve the upper triangular */ 3596 for (i=n-1; i>=0; i--){ 3597 v = aa + 25*(adiag[i+1]+1); 3598 vi = aj + adiag[i+1]+1; 3599 nz = adiag[i] - adiag[i+1]-1; 3600 idt = 5*i; 3601 s1 = x[idt]; s2 = x[1+idt]; 3602 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3603 for(k=0;k<nz;k++){ 3604 idx = 5*vi[k]; 3605 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3606 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3607 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3608 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3609 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3610 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3611 v += 25; 3612 } 3613 /* x = inv_diagonal*x */ 3614 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3615 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3616 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3617 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3618 x[4+idt] = v[4]*s1 + v[9]*s2 + 
v[14]*s3 + v[19]*s4 + v[24]*s5; 3619 } 3620 3621 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3622 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3623 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3624 PetscFunctionReturn(0); 3625 } 3626 3627 #undef __FUNCT__ 3628 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 3629 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 3630 { 3631 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3632 IS iscol=a->col,isrow=a->row; 3633 PetscErrorCode ierr; 3634 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3635 PetscInt i,nz,idx,idt,idc; 3636 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3637 const MatScalar *aa=a->a,*v; 3638 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3639 const PetscScalar *b; 3640 3641 PetscFunctionBegin; 3642 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3643 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3644 t = a->solve_work; 3645 3646 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3647 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3648 3649 /* forward solve the lower triangular */ 3650 idx = 4*(*r++); 3651 t[0] = b[idx]; t[1] = b[1+idx]; 3652 t[2] = b[2+idx]; t[3] = b[3+idx]; 3653 for (i=1; i<n; i++) { 3654 v = aa + 16*ai[i]; 3655 vi = aj + ai[i]; 3656 nz = diag[i] - ai[i]; 3657 idx = 4*(*r++); 3658 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3659 while (nz--) { 3660 idx = 4*(*vi++); 3661 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3662 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3663 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3664 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3665 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3666 v += 16; 3667 } 3668 idx = 4*i; 3669 t[idx] = s1;t[1+idx] = s2; 3670 t[2+idx] = s3;t[3+idx] = s4; 3671 } 3672 /* backward solve the upper triangular */ 3673 for (i=n-1; i>=0; i--){ 3674 v = aa + 16*diag[i] + 16; 3675 vi = aj + diag[i] + 1; 3676 nz = ai[i+1] - diag[i] - 1; 3677 idt = 4*i; 3678 s1 = 
t[idt]; s2 = t[1+idt]; 3679 s3 = t[2+idt];s4 = t[3+idt]; 3680 while (nz--) { 3681 idx = 4*(*vi++); 3682 x1 = t[idx]; x2 = t[1+idx]; 3683 x3 = t[2+idx]; x4 = t[3+idx]; 3684 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3685 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3686 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3687 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3688 v += 16; 3689 } 3690 idc = 4*(*c--); 3691 v = aa + 16*diag[i]; 3692 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3693 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3694 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3695 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3696 } 3697 3698 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3699 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3700 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3701 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3702 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3703 PetscFunctionReturn(0); 3704 } 3705 3706 #undef __FUNCT__ 3707 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 3708 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 3709 { 3710 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3711 IS iscol=a->col,isrow=a->row; 3712 PetscErrorCode ierr; 3713 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3714 PetscInt i,nz,idx,idt,idc,m; 3715 const PetscInt *r,*c,*rout,*cout; 3716 const MatScalar *aa=a->a,*v; 3717 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3718 const PetscScalar *b; 3719 3720 PetscFunctionBegin; 3721 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3722 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3723 t = a->solve_work; 3724 3725 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3726 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3727 3728 /* forward solve the lower triangular */ 3729 idx = 4*r[0]; 3730 t[0] = b[idx]; t[1] = b[1+idx]; 3731 t[2] = b[2+idx]; t[3] = b[3+idx]; 3732 for (i=1; i<n; i++) { 3733 v = aa + 16*ai[i]; 3734 
vi = aj + ai[i]; 3735 nz = ai[i+1] - ai[i]; 3736 idx = 4*r[i]; 3737 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3738 for(m=0;m<nz;m++){ 3739 idx = 4*vi[m]; 3740 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3741 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3742 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3743 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3744 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3745 v += 16; 3746 } 3747 idx = 4*i; 3748 t[idx] = s1;t[1+idx] = s2; 3749 t[2+idx] = s3;t[3+idx] = s4; 3750 } 3751 /* backward solve the upper triangular */ 3752 for (i=n-1; i>=0; i--){ 3753 v = aa + 16*(adiag[i+1]+1); 3754 vi = aj + adiag[i+1]+1; 3755 nz = adiag[i] - adiag[i+1] - 1; 3756 idt = 4*i; 3757 s1 = t[idt]; s2 = t[1+idt]; 3758 s3 = t[2+idt];s4 = t[3+idt]; 3759 for(m=0;m<nz;m++){ 3760 idx = 4*vi[m]; 3761 x1 = t[idx]; x2 = t[1+idx]; 3762 x3 = t[2+idx]; x4 = t[3+idx]; 3763 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3764 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3765 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3766 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3767 v += 16; 3768 } 3769 idc = 4*c[i]; 3770 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3771 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3772 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3773 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3774 } 3775 3776 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3777 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3778 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3779 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3780 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3781 PetscFunctionReturn(0); 3782 } 3783 3784 #undef __FUNCT__ 3785 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3786 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3787 { 3788 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3789 IS iscol=a->col,isrow=a->row; 3790 PetscErrorCode 
ierr; 3791 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3792 PetscInt i,nz,idx,idt,idc; 3793 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3794 const MatScalar *aa=a->a,*v; 3795 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3796 PetscScalar *x; 3797 const PetscScalar *b; 3798 3799 PetscFunctionBegin; 3800 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3801 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3802 t = (MatScalar *)a->solve_work; 3803 3804 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3805 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3806 3807 /* forward solve the lower triangular */ 3808 idx = 4*(*r++); 3809 t[0] = (MatScalar)b[idx]; 3810 t[1] = (MatScalar)b[1+idx]; 3811 t[2] = (MatScalar)b[2+idx]; 3812 t[3] = (MatScalar)b[3+idx]; 3813 for (i=1; i<n; i++) { 3814 v = aa + 16*ai[i]; 3815 vi = aj + ai[i]; 3816 nz = diag[i] - ai[i]; 3817 idx = 4*(*r++); 3818 s1 = (MatScalar)b[idx]; 3819 s2 = (MatScalar)b[1+idx]; 3820 s3 = (MatScalar)b[2+idx]; 3821 s4 = (MatScalar)b[3+idx]; 3822 while (nz--) { 3823 idx = 4*(*vi++); 3824 x1 = t[idx]; 3825 x2 = t[1+idx]; 3826 x3 = t[2+idx]; 3827 x4 = t[3+idx]; 3828 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3829 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3830 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3831 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3832 v += 16; 3833 } 3834 idx = 4*i; 3835 t[idx] = s1; 3836 t[1+idx] = s2; 3837 t[2+idx] = s3; 3838 t[3+idx] = s4; 3839 } 3840 /* backward solve the upper triangular */ 3841 for (i=n-1; i>=0; i--){ 3842 v = aa + 16*diag[i] + 16; 3843 vi = aj + diag[i] + 1; 3844 nz = ai[i+1] - diag[i] - 1; 3845 idt = 4*i; 3846 s1 = t[idt]; 3847 s2 = t[1+idt]; 3848 s3 = t[2+idt]; 3849 s4 = t[3+idt]; 3850 while (nz--) { 3851 idx = 4*(*vi++); 3852 x1 = t[idx]; 3853 x2 = t[1+idx]; 3854 x3 = t[2+idx]; 3855 x4 = t[3+idx]; 3856 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3857 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3858 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + 
v[14]*x4; 3859 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3860 v += 16; 3861 } 3862 idc = 4*(*c--); 3863 v = aa + 16*diag[i]; 3864 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3865 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3866 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3867 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3868 x[idc] = (PetscScalar)t[idt]; 3869 x[1+idc] = (PetscScalar)t[1+idt]; 3870 x[2+idc] = (PetscScalar)t[2+idt]; 3871 x[3+idc] = (PetscScalar)t[3+idt]; 3872 } 3873 3874 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3875 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3876 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3877 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3878 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3879 PetscFunctionReturn(0); 3880 } 3881 3882 #if defined (PETSC_HAVE_SSE) 3883 3884 #include PETSC_HAVE_SSE 3885 3886 #undef __FUNCT__ 3887 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3888 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3889 { 3890 /* 3891 Note: This code uses demotion of double 3892 to float when performing the mixed-mode computation. 3893 This may not be numerically reasonable for all applications. 
3894 */ 3895 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3896 IS iscol=a->col,isrow=a->row; 3897 PetscErrorCode ierr; 3898 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3899 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3900 MatScalar *aa=a->a,*v; 3901 PetscScalar *x,*b,*t; 3902 3903 /* Make space in temp stack for 16 Byte Aligned arrays */ 3904 float ssealignedspace[11],*tmps,*tmpx; 3905 unsigned long offset; 3906 3907 PetscFunctionBegin; 3908 SSE_SCOPE_BEGIN; 3909 3910 offset = (unsigned long)ssealignedspace % 16; 3911 if (offset) offset = (16 - offset)/4; 3912 tmps = &ssealignedspace[offset]; 3913 tmpx = &ssealignedspace[offset+4]; 3914 PREFETCH_NTA(aa+16*ai[1]); 3915 3916 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3917 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3918 t = a->solve_work; 3919 3920 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3921 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3922 3923 /* forward solve the lower triangular */ 3924 idx = 4*(*r++); 3925 t[0] = b[idx]; t[1] = b[1+idx]; 3926 t[2] = b[2+idx]; t[3] = b[3+idx]; 3927 v = aa + 16*ai[1]; 3928 3929 for (i=1; i<n;) { 3930 PREFETCH_NTA(&v[8]); 3931 vi = aj + ai[i]; 3932 nz = diag[i] - ai[i]; 3933 idx = 4*(*r++); 3934 3935 /* Demote sum from double to float */ 3936 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3937 LOAD_PS(tmps,XMM7); 3938 3939 while (nz--) { 3940 PREFETCH_NTA(&v[16]); 3941 idx = 4*(*vi++); 3942 3943 /* Demote solution (so far) from double to float */ 3944 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3945 3946 /* 4x4 Matrix-Vector product with negative accumulation: */ 3947 SSE_INLINE_BEGIN_2(tmpx,v) 3948 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3949 3950 /* First Column */ 3951 SSE_COPY_PS(XMM0,XMM6) 3952 SSE_SHUFFLE(XMM0,XMM0,0x00) 3953 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3954 SSE_SUB_PS(XMM7,XMM0) 3955 3956 /* Second Column */ 3957 SSE_COPY_PS(XMM1,XMM6) 3958 SSE_SHUFFLE(XMM1,XMM1,0x55) 3959 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3960 
SSE_SUB_PS(XMM7,XMM1) 3961 3962 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3963 3964 /* Third Column */ 3965 SSE_COPY_PS(XMM2,XMM6) 3966 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3967 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3968 SSE_SUB_PS(XMM7,XMM2) 3969 3970 /* Fourth Column */ 3971 SSE_COPY_PS(XMM3,XMM6) 3972 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3973 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3974 SSE_SUB_PS(XMM7,XMM3) 3975 SSE_INLINE_END_2 3976 3977 v += 16; 3978 } 3979 idx = 4*i; 3980 v = aa + 16*ai[++i]; 3981 PREFETCH_NTA(v); 3982 STORE_PS(tmps,XMM7); 3983 3984 /* Promote result from float to double */ 3985 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3986 } 3987 /* backward solve the upper triangular */ 3988 idt = 4*(n-1); 3989 ai16 = 16*diag[n-1]; 3990 v = aa + ai16 + 16; 3991 for (i=n-1; i>=0;){ 3992 PREFETCH_NTA(&v[8]); 3993 vi = aj + diag[i] + 1; 3994 nz = ai[i+1] - diag[i] - 1; 3995 3996 /* Demote accumulator from double to float */ 3997 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3998 LOAD_PS(tmps,XMM7); 3999 4000 while (nz--) { 4001 PREFETCH_NTA(&v[16]); 4002 idx = 4*(*vi++); 4003 4004 /* Demote solution (so far) from double to float */ 4005 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 4006 4007 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4008 SSE_INLINE_BEGIN_2(tmpx,v) 4009 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4010 4011 /* First Column */ 4012 SSE_COPY_PS(XMM0,XMM6) 4013 SSE_SHUFFLE(XMM0,XMM0,0x00) 4014 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4015 SSE_SUB_PS(XMM7,XMM0) 4016 4017 /* Second Column */ 4018 SSE_COPY_PS(XMM1,XMM6) 4019 SSE_SHUFFLE(XMM1,XMM1,0x55) 4020 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4021 SSE_SUB_PS(XMM7,XMM1) 4022 4023 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4024 4025 /* Third Column */ 4026 SSE_COPY_PS(XMM2,XMM6) 4027 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4028 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4029 SSE_SUB_PS(XMM7,XMM2) 4030 4031 /* Fourth Column */ 4032 SSE_COPY_PS(XMM3,XMM6) 4033 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4034 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4035 SSE_SUB_PS(XMM7,XMM3) 
4036 SSE_INLINE_END_2 4037 v += 16; 4038 } 4039 v = aa + ai16; 4040 ai16 = 16*diag[--i]; 4041 PREFETCH_NTA(aa+ai16+16); 4042 /* 4043 Scale the result by the diagonal 4x4 block, 4044 which was inverted as part of the factorization 4045 */ 4046 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 4047 /* First Column */ 4048 SSE_COPY_PS(XMM0,XMM7) 4049 SSE_SHUFFLE(XMM0,XMM0,0x00) 4050 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4051 4052 /* Second Column */ 4053 SSE_COPY_PS(XMM1,XMM7) 4054 SSE_SHUFFLE(XMM1,XMM1,0x55) 4055 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4056 SSE_ADD_PS(XMM0,XMM1) 4057 4058 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4059 4060 /* Third Column */ 4061 SSE_COPY_PS(XMM2,XMM7) 4062 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4063 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4064 SSE_ADD_PS(XMM0,XMM2) 4065 4066 /* Fourth Column */ 4067 SSE_COPY_PS(XMM3,XMM7) 4068 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4069 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4070 SSE_ADD_PS(XMM0,XMM3) 4071 4072 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4073 SSE_INLINE_END_3 4074 4075 /* Promote solution from float to double */ 4076 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 4077 4078 /* Apply reordering to t and stream into x. */ 4079 /* This way, x doesn't pollute the cache. */ 4080 /* Be careful with size: 2 doubles = 4 floats! 
*/ 4081 idc = 4*(*c--); 4082 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 4083 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 4084 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 4085 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 4086 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 4087 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 4088 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 4089 SSE_INLINE_END_2 4090 v = aa + ai16 + 16; 4091 idt -= 4; 4092 } 4093 4094 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4095 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4096 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4097 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4098 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4099 SSE_SCOPE_END; 4100 PetscFunctionReturn(0); 4101 } 4102 4103 #endif 4104 4105 4106 /* 4107 Special case where the matrix was ILU(0) factored in the natural 4108 ordering. This eliminates the need for the column and row permutation. 4109 */ 4110 #undef __FUNCT__ 4111 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 4112 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4113 { 4114 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4115 PetscInt n=a->mbs; 4116 const PetscInt *ai=a->i,*aj=a->j; 4117 PetscErrorCode ierr; 4118 const PetscInt *diag = a->diag; 4119 const MatScalar *aa=a->a; 4120 PetscScalar *x; 4121 const PetscScalar *b; 4122 4123 PetscFunctionBegin; 4124 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4125 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4126 4127 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 4128 { 4129 static PetscScalar w[2000]; /* very BAD need to fix */ 4130 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 4131 } 4132 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 4133 { 4134 static PetscScalar w[2000]; /* very BAD need to fix */ 4135 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 4136 } 4137 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 4138 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 
4139 #else 4140 { 4141 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4142 const MatScalar *v; 4143 PetscInt jdx,idt,idx,nz,i,ai16; 4144 const PetscInt *vi; 4145 4146 /* forward solve the lower triangular */ 4147 idx = 0; 4148 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 4149 for (i=1; i<n; i++) { 4150 v = aa + 16*ai[i]; 4151 vi = aj + ai[i]; 4152 nz = diag[i] - ai[i]; 4153 idx += 4; 4154 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4155 while (nz--) { 4156 jdx = 4*(*vi++); 4157 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4158 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4159 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4160 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4161 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4162 v += 16; 4163 } 4164 x[idx] = s1; 4165 x[1+idx] = s2; 4166 x[2+idx] = s3; 4167 x[3+idx] = s4; 4168 } 4169 /* backward solve the upper triangular */ 4170 idt = 4*(n-1); 4171 for (i=n-1; i>=0; i--){ 4172 ai16 = 16*diag[i]; 4173 v = aa + ai16 + 16; 4174 vi = aj + diag[i] + 1; 4175 nz = ai[i+1] - diag[i] - 1; 4176 s1 = x[idt]; s2 = x[1+idt]; 4177 s3 = x[2+idt];s4 = x[3+idt]; 4178 while (nz--) { 4179 idx = 4*(*vi++); 4180 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4181 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4182 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4183 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4184 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4185 v += 16; 4186 } 4187 v = aa + ai16; 4188 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4189 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4190 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4191 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4192 idt -= 4; 4193 } 4194 } 4195 #endif 4196 4197 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4198 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4199 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4200 PetscFunctionReturn(0); 4201 } 4202 4203 #undef __FUNCT__ 4204 
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 4205 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4206 { 4207 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4208 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4209 PetscInt i,k,nz,idx,jdx,idt; 4210 PetscErrorCode ierr; 4211 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4212 const MatScalar *aa=a->a,*v; 4213 PetscScalar *x; 4214 const PetscScalar *b; 4215 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4216 4217 PetscFunctionBegin; 4218 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4219 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4220 /* forward solve the lower triangular */ 4221 idx = 0; 4222 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4223 for (i=1; i<n; i++) { 4224 v = aa + bs2*ai[i]; 4225 vi = aj + ai[i]; 4226 nz = ai[i+1] - ai[i]; 4227 idx = bs*i; 4228 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4229 for(k=0;k<nz;k++) { 4230 jdx = bs*vi[k]; 4231 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4232 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4233 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4234 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4235 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4236 4237 v += bs2; 4238 } 4239 4240 x[idx] = s1; 4241 x[1+idx] = s2; 4242 x[2+idx] = s3; 4243 x[3+idx] = s4; 4244 } 4245 4246 /* backward solve the upper triangular */ 4247 for (i=n-1; i>=0; i--){ 4248 v = aa + bs2*(adiag[i+1]+1); 4249 vi = aj + adiag[i+1]+1; 4250 nz = adiag[i] - adiag[i+1]-1; 4251 idt = bs*i; 4252 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4253 4254 for(k=0;k<nz;k++){ 4255 idx = bs*vi[k]; 4256 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4257 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4258 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4259 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4260 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4261 4262 v += bs2; 4263 } 4264 /* x = inv_diagonal*x */ 4265 x[idt] = 
v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4266 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4267 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4268 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4269 4270 } 4271 4272 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4273 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4274 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4275 PetscFunctionReturn(0); 4276 } 4277 4278 #undef __FUNCT__ 4279 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4280 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4281 { 4282 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4283 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4284 PetscErrorCode ierr; 4285 const MatScalar *aa=a->a; 4286 const PetscScalar *b; 4287 PetscScalar *x; 4288 4289 PetscFunctionBegin; 4290 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4291 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4292 4293 { 4294 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4295 const MatScalar *v; 4296 MatScalar *t=(MatScalar *)x; 4297 PetscInt jdx,idt,idx,nz,i,ai16; 4298 const PetscInt *vi; 4299 4300 /* forward solve the lower triangular */ 4301 idx = 0; 4302 t[0] = (MatScalar)b[0]; 4303 t[1] = (MatScalar)b[1]; 4304 t[2] = (MatScalar)b[2]; 4305 t[3] = (MatScalar)b[3]; 4306 for (i=1; i<n; i++) { 4307 v = aa + 16*ai[i]; 4308 vi = aj + ai[i]; 4309 nz = diag[i] - ai[i]; 4310 idx += 4; 4311 s1 = (MatScalar)b[idx]; 4312 s2 = (MatScalar)b[1+idx]; 4313 s3 = (MatScalar)b[2+idx]; 4314 s4 = (MatScalar)b[3+idx]; 4315 while (nz--) { 4316 jdx = 4*(*vi++); 4317 x1 = t[jdx]; 4318 x2 = t[1+jdx]; 4319 x3 = t[2+jdx]; 4320 x4 = t[3+jdx]; 4321 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4322 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4323 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4324 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4325 v += 16; 4326 } 4327 t[idx] = s1; 4328 t[1+idx] = s2; 4329 t[2+idx] = s3; 4330 t[3+idx] = s4; 4331 } 4332 /* 
backward solve the upper triangular */ 4333 idt = 4*(n-1); 4334 for (i=n-1; i>=0; i--){ 4335 ai16 = 16*diag[i]; 4336 v = aa + ai16 + 16; 4337 vi = aj + diag[i] + 1; 4338 nz = ai[i+1] - diag[i] - 1; 4339 s1 = t[idt]; 4340 s2 = t[1+idt]; 4341 s3 = t[2+idt]; 4342 s4 = t[3+idt]; 4343 while (nz--) { 4344 idx = 4*(*vi++); 4345 x1 = (MatScalar)x[idx]; 4346 x2 = (MatScalar)x[1+idx]; 4347 x3 = (MatScalar)x[2+idx]; 4348 x4 = (MatScalar)x[3+idx]; 4349 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4350 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4351 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4352 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4353 v += 16; 4354 } 4355 v = aa + ai16; 4356 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4357 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4358 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4359 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4360 idt -= 4; 4361 } 4362 } 4363 4364 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4365 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4366 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4367 PetscFunctionReturn(0); 4368 } 4369 4370 #if defined (PETSC_HAVE_SSE) 4371 4372 #include PETSC_HAVE_SSE 4373 #undef __FUNCT__ 4374 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4375 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 4376 { 4377 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4378 unsigned short *aj=(unsigned short *)a->j; 4379 PetscErrorCode ierr; 4380 int *ai=a->i,n=a->mbs,*diag = a->diag; 4381 MatScalar *aa=a->a; 4382 PetscScalar *x,*b; 4383 4384 PetscFunctionBegin; 4385 SSE_SCOPE_BEGIN; 4386 /* 4387 Note: This code currently uses demotion of double 4388 to float when performing the mixed-mode computation. 4389 This may not be numerically reasonable for all applications. 
4390 */ 4391 PREFETCH_NTA(aa+16*ai[1]); 4392 4393 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4394 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4395 { 4396 /* x will first be computed in single precision then promoted inplace to double */ 4397 MatScalar *v,*t=(MatScalar *)x; 4398 int nz,i,idt,ai16; 4399 unsigned int jdx,idx; 4400 unsigned short *vi; 4401 /* Forward solve the lower triangular factor. */ 4402 4403 /* First block is the identity. */ 4404 idx = 0; 4405 CONVERT_DOUBLE4_FLOAT4(t,b); 4406 v = aa + 16*((unsigned int)ai[1]); 4407 4408 for (i=1; i<n;) { 4409 PREFETCH_NTA(&v[8]); 4410 vi = aj + ai[i]; 4411 nz = diag[i] - ai[i]; 4412 idx += 4; 4413 4414 /* Demote RHS from double to float. */ 4415 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4416 LOAD_PS(&t[idx],XMM7); 4417 4418 while (nz--) { 4419 PREFETCH_NTA(&v[16]); 4420 jdx = 4*((unsigned int)(*vi++)); 4421 4422 /* 4x4 Matrix-Vector product with negative accumulation: */ 4423 SSE_INLINE_BEGIN_2(&t[jdx],v) 4424 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4425 4426 /* First Column */ 4427 SSE_COPY_PS(XMM0,XMM6) 4428 SSE_SHUFFLE(XMM0,XMM0,0x00) 4429 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4430 SSE_SUB_PS(XMM7,XMM0) 4431 4432 /* Second Column */ 4433 SSE_COPY_PS(XMM1,XMM6) 4434 SSE_SHUFFLE(XMM1,XMM1,0x55) 4435 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4436 SSE_SUB_PS(XMM7,XMM1) 4437 4438 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4439 4440 /* Third Column */ 4441 SSE_COPY_PS(XMM2,XMM6) 4442 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4443 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4444 SSE_SUB_PS(XMM7,XMM2) 4445 4446 /* Fourth Column */ 4447 SSE_COPY_PS(XMM3,XMM6) 4448 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4449 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4450 SSE_SUB_PS(XMM7,XMM3) 4451 SSE_INLINE_END_2 4452 4453 v += 16; 4454 } 4455 v = aa + 16*ai[++i]; 4456 PREFETCH_NTA(v); 4457 STORE_PS(&t[idx],XMM7); 4458 } 4459 4460 /* Backward solve the upper triangular factor.*/ 4461 4462 idt = 4*(n-1); 4463 ai16 = 16*diag[n-1]; 4464 v = aa + ai16 + 16; 4465 for (i=n-1; i>=0;){ 4466 
PREFETCH_NTA(&v[8]); 4467 vi = aj + diag[i] + 1; 4468 nz = ai[i+1] - diag[i] - 1; 4469 4470 LOAD_PS(&t[idt],XMM7); 4471 4472 while (nz--) { 4473 PREFETCH_NTA(&v[16]); 4474 idx = 4*((unsigned int)(*vi++)); 4475 4476 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4477 SSE_INLINE_BEGIN_2(&t[idx],v) 4478 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4479 4480 /* First Column */ 4481 SSE_COPY_PS(XMM0,XMM6) 4482 SSE_SHUFFLE(XMM0,XMM0,0x00) 4483 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4484 SSE_SUB_PS(XMM7,XMM0) 4485 4486 /* Second Column */ 4487 SSE_COPY_PS(XMM1,XMM6) 4488 SSE_SHUFFLE(XMM1,XMM1,0x55) 4489 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4490 SSE_SUB_PS(XMM7,XMM1) 4491 4492 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4493 4494 /* Third Column */ 4495 SSE_COPY_PS(XMM2,XMM6) 4496 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4497 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4498 SSE_SUB_PS(XMM7,XMM2) 4499 4500 /* Fourth Column */ 4501 SSE_COPY_PS(XMM3,XMM6) 4502 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4503 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4504 SSE_SUB_PS(XMM7,XMM3) 4505 SSE_INLINE_END_2 4506 v += 16; 4507 } 4508 v = aa + ai16; 4509 ai16 = 16*diag[--i]; 4510 PREFETCH_NTA(aa+ai16+16); 4511 /* 4512 Scale the result by the diagonal 4x4 block, 4513 which was inverted as part of the factorization 4514 */ 4515 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4516 /* First Column */ 4517 SSE_COPY_PS(XMM0,XMM7) 4518 SSE_SHUFFLE(XMM0,XMM0,0x00) 4519 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4520 4521 /* Second Column */ 4522 SSE_COPY_PS(XMM1,XMM7) 4523 SSE_SHUFFLE(XMM1,XMM1,0x55) 4524 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4525 SSE_ADD_PS(XMM0,XMM1) 4526 4527 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4528 4529 /* Third Column */ 4530 SSE_COPY_PS(XMM2,XMM7) 4531 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4532 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4533 SSE_ADD_PS(XMM0,XMM2) 4534 4535 /* Fourth Column */ 4536 SSE_COPY_PS(XMM3,XMM7) 4537 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4538 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4539 SSE_ADD_PS(XMM0,XMM3) 4540 4541 
SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4542 SSE_INLINE_END_3 4543 4544 v = aa + ai16 + 16; 4545 idt -= 4; 4546 } 4547 4548 /* Convert t from single precision back to double precision (inplace)*/ 4549 idt = 4*(n-1); 4550 for (i=n-1;i>=0;i--) { 4551 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4552 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4553 PetscScalar *xtemp=&x[idt]; 4554 MatScalar *ttemp=&t[idt]; 4555 xtemp[3] = (PetscScalar)ttemp[3]; 4556 xtemp[2] = (PetscScalar)ttemp[2]; 4557 xtemp[1] = (PetscScalar)ttemp[1]; 4558 xtemp[0] = (PetscScalar)ttemp[0]; 4559 idt -= 4; 4560 } 4561 4562 } /* End of artificial scope. */ 4563 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4564 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4565 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4566 SSE_SCOPE_END; 4567 PetscFunctionReturn(0); 4568 } 4569 4570 #undef __FUNCT__ 4571 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4572 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 4573 { 4574 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4575 int *aj=a->j; 4576 PetscErrorCode ierr; 4577 int *ai=a->i,n=a->mbs,*diag = a->diag; 4578 MatScalar *aa=a->a; 4579 PetscScalar *x,*b; 4580 4581 PetscFunctionBegin; 4582 SSE_SCOPE_BEGIN; 4583 /* 4584 Note: This code currently uses demotion of double 4585 to float when performing the mixed-mode computation. 4586 This may not be numerically reasonable for all applications. 4587 */ 4588 PREFETCH_NTA(aa+16*ai[1]); 4589 4590 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4591 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4592 { 4593 /* x will first be computed in single precision then promoted inplace to double */ 4594 MatScalar *v,*t=(MatScalar *)x; 4595 int nz,i,idt,ai16; 4596 int jdx,idx; 4597 int *vi; 4598 /* Forward solve the lower triangular factor. */ 4599 4600 /* First block is the identity. 
*/ 4601 idx = 0; 4602 CONVERT_DOUBLE4_FLOAT4(t,b); 4603 v = aa + 16*ai[1]; 4604 4605 for (i=1; i<n;) { 4606 PREFETCH_NTA(&v[8]); 4607 vi = aj + ai[i]; 4608 nz = diag[i] - ai[i]; 4609 idx += 4; 4610 4611 /* Demote RHS from double to float. */ 4612 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4613 LOAD_PS(&t[idx],XMM7); 4614 4615 while (nz--) { 4616 PREFETCH_NTA(&v[16]); 4617 jdx = 4*(*vi++); 4618 /* jdx = *vi++; */ 4619 4620 /* 4x4 Matrix-Vector product with negative accumulation: */ 4621 SSE_INLINE_BEGIN_2(&t[jdx],v) 4622 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4623 4624 /* First Column */ 4625 SSE_COPY_PS(XMM0,XMM6) 4626 SSE_SHUFFLE(XMM0,XMM0,0x00) 4627 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4628 SSE_SUB_PS(XMM7,XMM0) 4629 4630 /* Second Column */ 4631 SSE_COPY_PS(XMM1,XMM6) 4632 SSE_SHUFFLE(XMM1,XMM1,0x55) 4633 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4634 SSE_SUB_PS(XMM7,XMM1) 4635 4636 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4637 4638 /* Third Column */ 4639 SSE_COPY_PS(XMM2,XMM6) 4640 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4641 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4642 SSE_SUB_PS(XMM7,XMM2) 4643 4644 /* Fourth Column */ 4645 SSE_COPY_PS(XMM3,XMM6) 4646 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4647 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4648 SSE_SUB_PS(XMM7,XMM3) 4649 SSE_INLINE_END_2 4650 4651 v += 16; 4652 } 4653 v = aa + 16*ai[++i]; 4654 PREFETCH_NTA(v); 4655 STORE_PS(&t[idx],XMM7); 4656 } 4657 4658 /* Backward solve the upper triangular factor.*/ 4659 4660 idt = 4*(n-1); 4661 ai16 = 16*diag[n-1]; 4662 v = aa + ai16 + 16; 4663 for (i=n-1; i>=0;){ 4664 PREFETCH_NTA(&v[8]); 4665 vi = aj + diag[i] + 1; 4666 nz = ai[i+1] - diag[i] - 1; 4667 4668 LOAD_PS(&t[idt],XMM7); 4669 4670 while (nz--) { 4671 PREFETCH_NTA(&v[16]); 4672 idx = 4*(*vi++); 4673 /* idx = *vi++; */ 4674 4675 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4676 SSE_INLINE_BEGIN_2(&t[idx],v) 4677 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4678 4679 /* First Column */ 4680 SSE_COPY_PS(XMM0,XMM6) 4681 SSE_SHUFFLE(XMM0,XMM0,0x00) 
4682 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4683 SSE_SUB_PS(XMM7,XMM0) 4684 4685 /* Second Column */ 4686 SSE_COPY_PS(XMM1,XMM6) 4687 SSE_SHUFFLE(XMM1,XMM1,0x55) 4688 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4689 SSE_SUB_PS(XMM7,XMM1) 4690 4691 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4692 4693 /* Third Column */ 4694 SSE_COPY_PS(XMM2,XMM6) 4695 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4696 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4697 SSE_SUB_PS(XMM7,XMM2) 4698 4699 /* Fourth Column */ 4700 SSE_COPY_PS(XMM3,XMM6) 4701 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4702 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4703 SSE_SUB_PS(XMM7,XMM3) 4704 SSE_INLINE_END_2 4705 v += 16; 4706 } 4707 v = aa + ai16; 4708 ai16 = 16*diag[--i]; 4709 PREFETCH_NTA(aa+ai16+16); 4710 /* 4711 Scale the result by the diagonal 4x4 block, 4712 which was inverted as part of the factorization 4713 */ 4714 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4715 /* First Column */ 4716 SSE_COPY_PS(XMM0,XMM7) 4717 SSE_SHUFFLE(XMM0,XMM0,0x00) 4718 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4719 4720 /* Second Column */ 4721 SSE_COPY_PS(XMM1,XMM7) 4722 SSE_SHUFFLE(XMM1,XMM1,0x55) 4723 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4724 SSE_ADD_PS(XMM0,XMM1) 4725 4726 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4727 4728 /* Third Column */ 4729 SSE_COPY_PS(XMM2,XMM7) 4730 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4731 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4732 SSE_ADD_PS(XMM0,XMM2) 4733 4734 /* Fourth Column */ 4735 SSE_COPY_PS(XMM3,XMM7) 4736 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4737 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4738 SSE_ADD_PS(XMM0,XMM3) 4739 4740 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4741 SSE_INLINE_END_3 4742 4743 v = aa + ai16 + 16; 4744 idt -= 4; 4745 } 4746 4747 /* Convert t from single precision back to double precision (inplace)*/ 4748 idt = 4*(n-1); 4749 for (i=n-1;i>=0;i--) { 4750 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4751 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 4752 PetscScalar *xtemp=&x[idt]; 4753 MatScalar *ttemp=&t[idt]; 4754 xtemp[3] = (PetscScalar)ttemp[3]; 4755 xtemp[2] = (PetscScalar)ttemp[2]; 4756 xtemp[1] = (PetscScalar)ttemp[1]; 4757 xtemp[0] = (PetscScalar)ttemp[0]; 4758 idt -= 4; 4759 } 4760 4761 } /* End of artificial scope. */ 4762 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4763 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4764 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4765 SSE_SCOPE_END; 4766 PetscFunctionReturn(0); 4767 } 4768 4769 #endif 4770 4771 #undef __FUNCT__ 4772 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 4773 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 4774 { 4775 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4776 IS iscol=a->col,isrow=a->row; 4777 PetscErrorCode ierr; 4778 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4779 PetscInt i,nz,idx,idt,idc; 4780 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4781 const MatScalar *aa=a->a,*v; 4782 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4783 const PetscScalar *b; 4784 4785 PetscFunctionBegin; 4786 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4787 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4788 t = a->solve_work; 4789 4790 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4791 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4792 4793 /* forward solve the lower triangular */ 4794 idx = 3*(*r++); 4795 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4796 for (i=1; i<n; i++) { 4797 v = aa + 9*ai[i]; 4798 vi = aj + ai[i]; 4799 nz = diag[i] - ai[i]; 4800 idx = 3*(*r++); 4801 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4802 while (nz--) { 4803 idx = 3*(*vi++); 4804 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4805 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4806 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4807 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4808 v += 9; 4809 } 4810 idx = 3*i; 4811 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4812 } 4813 /* backward solve the upper triangular */ 4814 for (i=n-1; i>=0; i--){ 
4815 v = aa + 9*diag[i] + 9; 4816 vi = aj + diag[i] + 1; 4817 nz = ai[i+1] - diag[i] - 1; 4818 idt = 3*i; 4819 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4820 while (nz--) { 4821 idx = 3*(*vi++); 4822 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4823 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4824 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4825 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4826 v += 9; 4827 } 4828 idc = 3*(*c--); 4829 v = aa + 9*diag[i]; 4830 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4831 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4832 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4833 } 4834 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4835 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4836 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4837 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4838 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4839 PetscFunctionReturn(0); 4840 } 4841 4842 #undef __FUNCT__ 4843 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4844 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4845 { 4846 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4847 IS iscol=a->col,isrow=a->row; 4848 PetscErrorCode ierr; 4849 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4850 PetscInt i,nz,idx,idt,idc,m; 4851 const PetscInt *r,*c,*rout,*cout; 4852 const MatScalar *aa=a->a,*v; 4853 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4854 const PetscScalar *b; 4855 4856 PetscFunctionBegin; 4857 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4858 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4859 t = a->solve_work; 4860 4861 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4862 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4863 4864 /* forward solve the lower triangular */ 4865 idx = 3*r[0]; 4866 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4867 for (i=1; i<n; i++) { 4868 v = aa + 9*ai[i]; 4869 vi = aj + ai[i]; 4870 nz = ai[i+1] - ai[i]; 4871 idx = 3*r[i]; 4872 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4873 
for(m=0;m<nz;m++){ 4874 idx = 3*vi[m]; 4875 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4876 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4877 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4878 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4879 v += 9; 4880 } 4881 idx = 3*i; 4882 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4883 } 4884 /* backward solve the upper triangular */ 4885 for (i=n-1; i>=0; i--){ 4886 v = aa + 9*(adiag[i+1]+1); 4887 vi = aj + adiag[i+1]+1; 4888 nz = adiag[i] - adiag[i+1] - 1; 4889 idt = 3*i; 4890 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4891 for(m=0;m<nz;m++){ 4892 idx = 3*vi[m]; 4893 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4894 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4895 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4896 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4897 v += 9; 4898 } 4899 idc = 3*c[i]; 4900 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4901 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4902 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4903 } 4904 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4905 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4906 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4907 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4908 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4909 PetscFunctionReturn(0); 4910 } 4911 4912 /* 4913 Special case where the matrix was ILU(0) factored in the natural 4914 ordering. This eliminates the need for the column and row permutation. 
4915 */ 4916 #undef __FUNCT__ 4917 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 4918 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4919 { 4920 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4921 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4922 PetscErrorCode ierr; 4923 const PetscInt *diag = a->diag,*vi; 4924 const MatScalar *aa=a->a,*v; 4925 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4926 const PetscScalar *b; 4927 PetscInt jdx,idt,idx,nz,i; 4928 4929 PetscFunctionBegin; 4930 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4931 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4932 4933 /* forward solve the lower triangular */ 4934 idx = 0; 4935 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4936 for (i=1; i<n; i++) { 4937 v = aa + 9*ai[i]; 4938 vi = aj + ai[i]; 4939 nz = diag[i] - ai[i]; 4940 idx += 3; 4941 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4942 while (nz--) { 4943 jdx = 3*(*vi++); 4944 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4945 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4946 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4947 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4948 v += 9; 4949 } 4950 x[idx] = s1; 4951 x[1+idx] = s2; 4952 x[2+idx] = s3; 4953 } 4954 /* backward solve the upper triangular */ 4955 for (i=n-1; i>=0; i--){ 4956 v = aa + 9*diag[i] + 9; 4957 vi = aj + diag[i] + 1; 4958 nz = ai[i+1] - diag[i] - 1; 4959 idt = 3*i; 4960 s1 = x[idt]; s2 = x[1+idt]; 4961 s3 = x[2+idt]; 4962 while (nz--) { 4963 idx = 3*(*vi++); 4964 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4965 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4966 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4967 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4968 v += 9; 4969 } 4970 v = aa + 9*diag[i]; 4971 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4972 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4973 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4974 } 4975 4976 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4977 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4978 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4979 
PetscFunctionReturn(0); 4980 } 4981 4982 #undef __FUNCT__ 4983 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4984 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4985 { 4986 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4987 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4988 PetscErrorCode ierr; 4989 PetscInt i,k,nz,idx,jdx,idt; 4990 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4991 const MatScalar *aa=a->a,*v; 4992 PetscScalar *x; 4993 const PetscScalar *b; 4994 PetscScalar s1,s2,s3,x1,x2,x3; 4995 4996 PetscFunctionBegin; 4997 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4998 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4999 /* forward solve the lower triangular */ 5000 idx = 0; 5001 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 5002 for (i=1; i<n; i++) { 5003 v = aa + bs2*ai[i]; 5004 vi = aj + ai[i]; 5005 nz = ai[i+1] - ai[i]; 5006 idx = bs*i; 5007 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 5008 for(k=0;k<nz;k++){ 5009 jdx = bs*vi[k]; 5010 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 5011 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5012 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5013 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5014 5015 v += bs2; 5016 } 5017 5018 x[idx] = s1; 5019 x[1+idx] = s2; 5020 x[2+idx] = s3; 5021 } 5022 5023 /* backward solve the upper triangular */ 5024 for (i=n-1; i>=0; i--){ 5025 v = aa + bs2*(adiag[i+1]+1); 5026 vi = aj + adiag[i+1]+1; 5027 nz = adiag[i] - adiag[i+1]-1; 5028 idt = bs*i; 5029 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 5030 5031 for(k=0;k<nz;k++){ 5032 idx = bs*vi[k]; 5033 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 5034 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5035 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5036 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5037 5038 v += bs2; 5039 } 5040 /* x = inv_diagonal*x */ 5041 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5042 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5043 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5044 5045 } 5046 5047 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5048 ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 5049 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5050 PetscFunctionReturn(0); 5051 } 5052 5053 #undef __FUNCT__ 5054 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 5055 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 5056 { 5057 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5058 IS iscol=a->col,isrow=a->row; 5059 PetscErrorCode ierr; 5060 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5061 PetscInt i,nz,idx,idt,idc; 5062 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5063 const MatScalar *aa=a->a,*v; 5064 PetscScalar *x,s1,s2,x1,x2,*t; 5065 const PetscScalar *b; 5066 5067 PetscFunctionBegin; 5068 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5069 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5070 t = a->solve_work; 5071 5072 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5073 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5074 5075 /* forward solve the lower triangular */ 5076 idx = 2*(*r++); 5077 t[0] = b[idx]; t[1] = b[1+idx]; 5078 for (i=1; i<n; i++) { 5079 v = aa + 4*ai[i]; 5080 vi = aj + ai[i]; 5081 nz = diag[i] - ai[i]; 5082 idx = 2*(*r++); 5083 s1 = b[idx]; s2 = b[1+idx]; 5084 while (nz--) { 5085 idx = 2*(*vi++); 5086 x1 = t[idx]; x2 = t[1+idx]; 5087 s1 -= v[0]*x1 + v[2]*x2; 5088 s2 -= v[1]*x1 + v[3]*x2; 5089 v += 4; 5090 } 5091 idx = 2*i; 5092 t[idx] = s1; t[1+idx] = s2; 5093 } 5094 /* backward solve the upper triangular */ 5095 for (i=n-1; i>=0; i--){ 5096 v = aa + 4*diag[i] + 4; 5097 vi = aj + diag[i] + 1; 5098 nz = ai[i+1] - diag[i] - 1; 5099 idt = 2*i; 5100 s1 = t[idt]; s2 = t[1+idt]; 5101 while (nz--) { 5102 idx = 2*(*vi++); 5103 x1 = t[idx]; x2 = t[1+idx]; 5104 s1 -= v[0]*x1 + v[2]*x2; 5105 s2 -= v[1]*x1 + v[3]*x2; 5106 v += 4; 5107 } 5108 idc = 2*(*c--); 5109 v = aa + 4*diag[i]; 5110 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5111 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5112 } 5113 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5114 ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5115 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5116 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5117 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5118 PetscFunctionReturn(0); 5119 } 5120 5121 #undef __FUNCT__ 5122 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 5123 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 5124 { 5125 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5126 IS iscol=a->col,isrow=a->row; 5127 PetscErrorCode ierr; 5128 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5129 PetscInt i,nz,idx,jdx,idt,idc,m; 5130 const PetscInt *r,*c,*rout,*cout; 5131 const MatScalar *aa=a->a,*v; 5132 PetscScalar *x,s1,s2,x1,x2,*t; 5133 const PetscScalar *b; 5134 5135 PetscFunctionBegin; 5136 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5137 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5138 t = a->solve_work; 5139 5140 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5141 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5142 5143 /* forward solve the lower triangular */ 5144 idx = 2*r[0]; 5145 t[0] = b[idx]; t[1] = b[1+idx]; 5146 for (i=1; i<n; i++) { 5147 v = aa + 4*ai[i]; 5148 vi = aj + ai[i]; 5149 nz = ai[i+1] - ai[i]; 5150 idx = 2*r[i]; 5151 s1 = b[idx]; s2 = b[1+idx]; 5152 for(m=0;m<nz;m++){ 5153 jdx = 2*vi[m]; 5154 x1 = t[jdx]; x2 = t[1+jdx]; 5155 s1 -= v[0]*x1 + v[2]*x2; 5156 s2 -= v[1]*x1 + v[3]*x2; 5157 v += 4; 5158 } 5159 idx = 2*i; 5160 t[idx] = s1; t[1+idx] = s2; 5161 } 5162 /* backward solve the upper triangular */ 5163 for (i=n-1; i>=0; i--){ 5164 v = aa + 4*(adiag[i+1]+1); 5165 vi = aj + adiag[i+1]+1; 5166 nz = adiag[i] - adiag[i+1] - 1; 5167 idt = 2*i; 5168 s1 = t[idt]; s2 = t[1+idt]; 5169 for(m=0;m<nz;m++){ 5170 idx = 2*vi[m]; 5171 x1 = t[idx]; x2 = t[1+idx]; 5172 s1 -= v[0]*x1 + v[2]*x2; 5173 s2 -= v[1]*x1 + v[3]*x2; 5174 v += 4; 5175 } 5176 idc = 2*c[i]; 5177 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5178 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5179 } 5180 ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5181 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5182 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5183 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5184 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5185 PetscFunctionReturn(0); 5186 } 5187 5188 /* 5189 Special case where the matrix was ILU(0) factored in the natural 5190 ordering. This eliminates the need for the column and row permutation. 5191 */ 5192 #undef __FUNCT__ 5193 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 5194 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5195 { 5196 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5197 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5198 PetscErrorCode ierr; 5199 const MatScalar *aa=a->a,*v; 5200 PetscScalar *x,s1,s2,x1,x2; 5201 const PetscScalar *b; 5202 PetscInt jdx,idt,idx,nz,i; 5203 5204 PetscFunctionBegin; 5205 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5206 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5207 5208 /* forward solve the lower triangular */ 5209 idx = 0; 5210 x[0] = b[0]; x[1] = b[1]; 5211 for (i=1; i<n; i++) { 5212 v = aa + 4*ai[i]; 5213 vi = aj + ai[i]; 5214 nz = diag[i] - ai[i]; 5215 idx += 2; 5216 s1 = b[idx];s2 = b[1+idx]; 5217 while (nz--) { 5218 jdx = 2*(*vi++); 5219 x1 = x[jdx];x2 = x[1+jdx]; 5220 s1 -= v[0]*x1 + v[2]*x2; 5221 s2 -= v[1]*x1 + v[3]*x2; 5222 v += 4; 5223 } 5224 x[idx] = s1; 5225 x[1+idx] = s2; 5226 } 5227 /* backward solve the upper triangular */ 5228 for (i=n-1; i>=0; i--){ 5229 v = aa + 4*diag[i] + 4; 5230 vi = aj + diag[i] + 1; 5231 nz = ai[i+1] - diag[i] - 1; 5232 idt = 2*i; 5233 s1 = x[idt]; s2 = x[1+idt]; 5234 while (nz--) { 5235 idx = 2*(*vi++); 5236 x1 = x[idx]; x2 = x[1+idx]; 5237 s1 -= v[0]*x1 + v[2]*x2; 5238 s2 -= v[1]*x1 + v[3]*x2; 5239 v += 4; 5240 } 5241 v = aa + 4*diag[i]; 5242 x[idt] = v[0]*s1 + v[2]*s2; 5243 x[1+idt] = v[1]*s1 + v[3]*s2; 5244 } 5245 5246 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5247 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5248 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5249 PetscFunctionReturn(0); 5250 } 5251 5252 #undef __FUNCT__ 5253 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 5254 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5255 { 5256 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5257 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5258 PetscInt i,k,nz,idx,idt,jdx; 5259 PetscErrorCode ierr; 5260 const MatScalar *aa=a->a,*v; 5261 PetscScalar *x,s1,s2,x1,x2; 5262 const PetscScalar *b; 5263 5264 PetscFunctionBegin; 5265 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5266 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5267 /* forward solve the lower triangular */ 5268 idx = 0; 5269 x[0] = b[idx]; x[1] = b[1+idx]; 5270 for (i=1; i<n; i++) { 5271 v = aa + 4*ai[i]; 5272 vi = aj + ai[i]; 5273 nz = ai[i+1] - ai[i]; 5274 idx = 2*i; 5275 s1 = b[idx];s2 = b[1+idx]; 5276 PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 5277 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5278 for(k=0;k<nz;k++){ 5279 jdx = 2*vi[k]; 5280 x1 = x[jdx];x2 = x[1+jdx]; 5281 s1 -= v[0]*x1 + v[2]*x2; 5282 s2 -= v[1]*x1 + v[3]*x2; 5283 v += 4; 5284 } 5285 x[idx] = s1; 5286 x[1+idx] = s2; 5287 } 5288 5289 /* backward solve the upper triangular */ 5290 for (i=n-1; i>=0; i--){ 5291 v = aa + 4*(adiag[i+1]+1); 5292 vi = aj + adiag[i+1]+1; 5293 nz = adiag[i] - adiag[i+1]-1; 5294 idt = 2*i; 5295 s1 = x[idt]; s2 = x[1+idt]; 5296 PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 5297 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5298 for(k=0;k<nz;k++){ 5299 idx = 2*vi[k]; 5300 x1 = x[idx]; x2 = x[1+idx]; 5301 s1 -= v[0]*x1 + v[2]*x2; 5302 s2 -= v[1]*x1 + v[3]*x2; 5303 v += 4; 5304 } 5305 /* x = inv_diagonal*x */ 5306 x[idt] = v[0]*s1 + v[2]*s2; 5307 x[1+idt] = v[1]*s1 + v[3]*s2; 5308 } 5309 5310 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5311 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5312 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5313 PetscFunctionReturn(0); 5314 } 5315 5316 #undef __FUNCT__ 5317 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 5318 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 5319 { 5320 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5321 IS iscol=a->col,isrow=a->row; 5322 PetscErrorCode ierr; 5323 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5324 PetscInt i,nz; 5325 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5326 const MatScalar *aa=a->a,*v; 5327 PetscScalar *x,s1,*t; 5328 const PetscScalar *b; 5329 5330 PetscFunctionBegin; 5331 if (!n) PetscFunctionReturn(0); 5332 5333 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5334 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5335 t = a->solve_work; 5336 5337 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5338 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5339 5340 /* forward solve the lower triangular */ 5341 t[0] = b[*r++]; 5342 for (i=1; i<n; i++) { 5343 v = aa + ai[i]; 5344 vi = aj + ai[i]; 5345 nz = diag[i] - ai[i]; 5346 s1 = b[*r++]; 5347 while (nz--) { 5348 s1 -= (*v++)*t[*vi++]; 5349 } 5350 t[i] = s1; 5351 } 5352 /* backward solve the upper triangular */ 5353 for (i=n-1; i>=0; i--){ 5354 v = aa + diag[i] + 1; 5355 vi = aj + diag[i] + 1; 5356 nz = ai[i+1] - diag[i] - 1; 5357 s1 = t[i]; 5358 while (nz--) { 5359 s1 -= (*v++)*t[*vi++]; 5360 } 5361 x[*c--] = t[i] = aa[diag[i]]*s1; 5362 } 5363 5364 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5365 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5366 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5367 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5368 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5369 PetscFunctionReturn(0); 5370 } 5371 5372 #undef __FUNCT__ 5373 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 5374 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec 
bb,Vec xx) 5375 { 5376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5377 IS iscol = a->col,isrow = a->row; 5378 PetscErrorCode ierr; 5379 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5380 const PetscInt *rout,*cout,*r,*c; 5381 PetscScalar *x,*tmp,sum; 5382 const PetscScalar *b; 5383 const MatScalar *aa = a->a,*v; 5384 5385 PetscFunctionBegin; 5386 if (!n) PetscFunctionReturn(0); 5387 5388 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5389 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5390 tmp = a->solve_work; 5391 5392 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5393 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5394 5395 /* forward solve the lower triangular */ 5396 tmp[0] = b[r[0]]; 5397 v = aa; 5398 vi = aj; 5399 for (i=1; i<n; i++) { 5400 nz = ai[i+1] - ai[i]; 5401 sum = b[r[i]]; 5402 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5403 tmp[i] = sum; 5404 v += nz; vi += nz; 5405 } 5406 5407 /* backward solve the upper triangular */ 5408 for (i=n-1; i>=0; i--){ 5409 v = aa + adiag[i+1]+1; 5410 vi = aj + adiag[i+1]+1; 5411 nz = adiag[i]-adiag[i+1]-1; 5412 sum = tmp[i]; 5413 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5414 x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5415 } 5416 5417 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5418 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5419 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5420 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5421 ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5422 PetscFunctionReturn(0); 5423 } 5424 5425 /* 5426 Special case where the matrix was ILU(0) factored in the natural 5427 ordering. This eliminates the need for the column and row permutation. 
5428 */ 5429 #undef __FUNCT__ 5430 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 5431 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5432 { 5433 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5434 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5435 PetscErrorCode ierr; 5436 const MatScalar *aa=a->a,*v; 5437 PetscScalar *x; 5438 const PetscScalar *b; 5439 PetscScalar s1,x1; 5440 PetscInt jdx,idt,idx,nz,i; 5441 5442 PetscFunctionBegin; 5443 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5444 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5445 5446 /* forward solve the lower triangular */ 5447 idx = 0; 5448 x[0] = b[0]; 5449 for (i=1; i<n; i++) { 5450 v = aa + ai[i]; 5451 vi = aj + ai[i]; 5452 nz = diag[i] - ai[i]; 5453 idx += 1; 5454 s1 = b[idx]; 5455 while (nz--) { 5456 jdx = *vi++; 5457 x1 = x[jdx]; 5458 s1 -= v[0]*x1; 5459 v += 1; 5460 } 5461 x[idx] = s1; 5462 } 5463 /* backward solve the upper triangular */ 5464 for (i=n-1; i>=0; i--){ 5465 v = aa + diag[i] + 1; 5466 vi = aj + diag[i] + 1; 5467 nz = ai[i+1] - diag[i] - 1; 5468 idt = i; 5469 s1 = x[idt]; 5470 while (nz--) { 5471 idx = *vi++; 5472 x1 = x[idx]; 5473 s1 -= v[0]*x1; 5474 v += 1; 5475 } 5476 v = aa + diag[i]; 5477 x[idt] = v[0]*s1; 5478 } 5479 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5480 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5481 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5482 PetscFunctionReturn(0); 5483 } 5484 5485 5486 #undef __FUNCT__ 5487 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5488 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5489 { 5490 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5491 PetscErrorCode ierr; 5492 const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5493 PetscScalar *x,sum; 5494 const PetscScalar *b; 5495 const MatScalar *aa = a->a,*v; 5496 PetscInt i,nz; 5497 5498 PetscFunctionBegin; 5499 if (!n) PetscFunctionReturn(0); 5500 5501 ierr = 
VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5502 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5503 5504 /* forward solve the lower triangular */ 5505 x[0] = b[0]; 5506 v = aa; 5507 vi = aj; 5508 for (i=1; i<n; i++) { 5509 nz = ai[i+1] - ai[i]; 5510 sum = b[i]; 5511 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5512 v += nz; 5513 vi += nz; 5514 x[i] = sum; 5515 } 5516 5517 /* backward solve the upper triangular */ 5518 for (i=n-1; i>=0; i--){ 5519 v = aa + adiag[i+1] + 1; 5520 vi = aj + adiag[i+1] + 1; 5521 nz = adiag[i] - adiag[i+1]-1; 5522 sum = x[i]; 5523 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5524 x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5525 } 5526 5527 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 5528 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5529 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5530 PetscFunctionReturn(0); 5531 } 5532 5533 /* ----------------------------------------------------------------*/ 5534 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool ); 5535 5536 #undef __FUNCT__ 5537 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5538 /* 5539 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5540 */ 5541 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 5542 { 5543 Mat C=B; 5544 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5545 PetscErrorCode ierr; 5546 PetscInt i,j,k,ipvt[15]; 5547 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5548 PetscInt nz,nzL,row; 5549 MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5550 const MatScalar *v,*aa=a->a; 5551 PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 5552 PetscInt sol_ver; 5553 5554 PetscFunctionBegin; 5555 5556 ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 5557 5558 /* generate work space needed by the factorization */ 5559 ierr = 
PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 5560 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5561 5562 for (i=0; i<n; i++){ 5563 /* zero rtmp */ 5564 /* L part */ 5565 nz = bi[i+1] - bi[i]; 5566 bjtmp = bj + bi[i]; 5567 for (j=0; j<nz; j++){ 5568 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5569 } 5570 5571 /* U part */ 5572 nz = bdiag[i] - bdiag[i+1]; 5573 bjtmp = bj + bdiag[i+1]+1; 5574 for (j=0; j<nz; j++){ 5575 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5576 } 5577 5578 /* load in initial (unfactored row) */ 5579 nz = ai[i+1] - ai[i]; 5580 ajtmp = aj + ai[i]; 5581 v = aa + bs2*ai[i]; 5582 for (j=0; j<nz; j++) { 5583 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5584 } 5585 5586 /* elimination */ 5587 bjtmp = bj + bi[i]; 5588 nzL = bi[i+1] - bi[i]; 5589 for(k=0;k < nzL;k++) { 5590 row = bjtmp[k]; 5591 pc = rtmp + bs2*row; 5592 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5593 if (flg) { 5594 pv = b->a + bs2*bdiag[row]; 5595 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); 5596 /*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 5597 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5598 pv = b->a + bs2*(bdiag[row+1]+1); 5599 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5600 for (j=0; j<nz; j++) { 5601 vv = rtmp + bs2*pj[j]; 5602 Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 5603 /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 5604 pv += bs2; 5605 } 5606 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5607 } 5608 } 5609 5610 /* finished row so stick it into b->a */ 5611 /* L part */ 5612 pv = b->a + bs2*bi[i] ; 5613 pj = b->j + bi[i] ; 5614 nz = bi[i+1] - bi[i]; 5615 for (j=0; j<nz; j++) { 5616 ierr = 
PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5617 } 5618 5619 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5620 pv = b->a + bs2*bdiag[i]; 5621 pj = b->j + bdiag[i]; 5622 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5623 /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */ 5624 ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 5625 5626 /* U part */ 5627 pv = b->a + bs2*(bdiag[i+1]+1); 5628 pj = b->j + bdiag[i+1]+1; 5629 nz = bdiag[i] - bdiag[i+1] - 1; 5630 for (j=0; j<nz; j++){ 5631 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5632 } 5633 } 5634 5635 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5636 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5637 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 5638 C->assembled = PETSC_TRUE; 5639 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5640 PetscFunctionReturn(0); 5641 } 5642 5643 #undef __FUNCT__ 5644 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 5645 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 5646 { 5647 Mat C=B; 5648 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5649 IS isrow = b->row,isicol = b->icol; 5650 PetscErrorCode ierr; 5651 const PetscInt *r,*ic; 5652 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5653 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5654 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5655 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5656 MatScalar *v_work; 5657 PetscBool col_identity,row_identity,both_identity; 5658 5659 PetscFunctionBegin; 5660 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5661 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5662 5663 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5664 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 
5665 5666 /* generate work space needed by dense LU factorization */ 5667 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5668 5669 for (i=0; i<n; i++){ 5670 /* zero rtmp */ 5671 /* L part */ 5672 nz = bi[i+1] - bi[i]; 5673 bjtmp = bj + bi[i]; 5674 for (j=0; j<nz; j++){ 5675 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5676 } 5677 5678 /* U part */ 5679 nz = bdiag[i] - bdiag[i+1]; 5680 bjtmp = bj + bdiag[i+1]+1; 5681 for (j=0; j<nz; j++){ 5682 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5683 } 5684 5685 /* load in initial (unfactored row) */ 5686 nz = ai[r[i]+1] - ai[r[i]]; 5687 ajtmp = aj + ai[r[i]]; 5688 v = aa + bs2*ai[r[i]]; 5689 for (j=0; j<nz; j++) { 5690 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5691 } 5692 5693 /* elimination */ 5694 bjtmp = bj + bi[i]; 5695 nzL = bi[i+1] - bi[i]; 5696 for(k=0;k < nzL;k++) { 5697 row = bjtmp[k]; 5698 pc = rtmp + bs2*row; 5699 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5700 if (flg) { 5701 pv = b->a + bs2*bdiag[row]; 5702 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5703 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5704 pv = b->a + bs2*(bdiag[row+1]+1); 5705 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5706 for (j=0; j<nz; j++) { 5707 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5708 } 5709 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5710 } 5711 } 5712 5713 /* finished row so stick it into b->a */ 5714 /* L part */ 5715 pv = b->a + bs2*bi[i] ; 5716 pj = b->j + bi[i] ; 5717 nz = bi[i+1] - bi[i]; 5718 for (j=0; j<nz; j++) { 5719 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5720 } 5721 5722 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5723 pv = 
b->a + bs2*bdiag[i]; 5724 pj = b->j + bdiag[i]; 5725 /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5726 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5727 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5728 5729 /* U part */ 5730 pv = b->a + bs2*(bdiag[i+1]+1); 5731 pj = b->j + bdiag[i+1]+1; 5732 nz = bdiag[i] - bdiag[i+1] - 1; 5733 for (j=0; j<nz; j++){ 5734 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5735 } 5736 } 5737 5738 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5739 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5740 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5741 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5742 5743 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5744 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5745 both_identity = (PetscBool) (row_identity && col_identity); 5746 if (both_identity){ 5747 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5748 } else { 5749 C->ops->solve = MatSolve_SeqBAIJ_N; 5750 } 5751 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5752 5753 C->assembled = PETSC_TRUE; 5754 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5755 PetscFunctionReturn(0); 5756 } 5757 5758 /* 5759 ilu(0) with natural ordering under new data structure. 5760 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 5761 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 
5762 */ 5763 5764 #undef __FUNCT__ 5765 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 5766 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5767 { 5768 5769 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5770 PetscErrorCode ierr; 5771 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5772 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5773 5774 PetscFunctionBegin; 5775 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5776 b = (Mat_SeqBAIJ*)(fact)->data; 5777 5778 /* allocate matrix arrays for new data structure */ 5779 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5780 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5781 b->singlemalloc = PETSC_TRUE; 5782 if (!b->diag){ 5783 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5784 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5785 } 5786 bdiag = b->diag; 5787 5788 if (n > 0) { 5789 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5790 } 5791 5792 /* set bi and bj with new data structure */ 5793 bi = b->i; 5794 bj = b->j; 5795 5796 /* L part */ 5797 bi[0] = 0; 5798 for (i=0; i<n; i++){ 5799 nz = adiag[i] - ai[i]; 5800 bi[i+1] = bi[i] + nz; 5801 aj = a->j + ai[i]; 5802 for (j=0; j<nz; j++){ 5803 *bj = aj[j]; bj++; 5804 } 5805 } 5806 5807 /* U part */ 5808 bi_temp = bi[n]; 5809 bdiag[n] = bi[n]-1; 5810 for (i=n-1; i>=0; i--){ 5811 nz = ai[i+1] - adiag[i] - 1; 5812 bi_temp = bi_temp + nz + 1; 5813 aj = a->j + adiag[i] + 1; 5814 for (j=0; j<nz; j++){ 5815 *bj = aj[j]; bj++; 5816 } 5817 /* diag[i] */ 5818 *bj = i; bj++; 5819 bdiag[i] = bi_temp - 1; 5820 } 5821 PetscFunctionReturn(0); 5822 } 5823 5824 #undef __FUNCT__ 5825 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5826 PetscErrorCode 
MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5827 { 5828 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5829 IS isicol; 5830 PetscErrorCode ierr; 5831 const PetscInt *r,*ic; 5832 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5833 PetscInt *bi,*cols,nnz,*cols_lvl; 5834 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5835 PetscInt i,levels,diagonal_fill; 5836 PetscBool col_identity,row_identity,both_identity; 5837 PetscReal f; 5838 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5839 PetscBT lnkbt; 5840 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5841 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5842 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5843 PetscBool missing; 5844 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5845 5846 PetscFunctionBegin; 5847 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5848 if (bs>1){ /* check shifttype */ 5849 if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 5850 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 5851 } 5852 5853 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5854 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5855 5856 f = info->fill; 5857 levels = (PetscInt)info->levels; 5858 diagonal_fill = (PetscInt)info->diagonal_fill; 5859 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5860 5861 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5862 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5863 both_identity = (PetscBool) (row_identity && col_identity); 5864 5865 if (!levels && both_identity) { 5866 /* special case: ilu(0) with natural ordering */ 5867 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5868 ierr = 
MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5869 5870 fact->factortype = MAT_FACTOR_ILU; 5871 (fact)->info.factor_mallocs = 0; 5872 (fact)->info.fill_ratio_given = info->fill; 5873 (fact)->info.fill_ratio_needed = 1.0; 5874 b = (Mat_SeqBAIJ*)(fact)->data; 5875 b->row = isrow; 5876 b->col = iscol; 5877 b->icol = isicol; 5878 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5879 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5880 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5881 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5882 PetscFunctionReturn(0); 5883 } 5884 5885 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5886 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5887 5888 /* get new row pointers */ 5889 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5890 bi[0] = 0; 5891 /* bdiag is location of diagonal in factor */ 5892 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5893 bdiag[0] = 0; 5894 5895 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5896 5897 /* create a linked list for storing column indices of the active row */ 5898 nlnk = n + 1; 5899 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5900 5901 /* initial FreeSpace size is f*(ai[n]+1) */ 5902 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5903 current_space = free_space; 5904 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5905 current_space_lvl = free_space_lvl; 5906 5907 for (i=0; i<n; i++) { 5908 nzi = 0; 5909 /* copy current row into linked list */ 5910 nnz = ai[r[i]+1] - ai[r[i]]; 5911 if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5912 cols = aj + ai[r[i]]; 5913 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5914 ierr = 
PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5915 nzi += nlnk; 5916 5917 /* make sure diagonal entry is included */ 5918 if (diagonal_fill && lnk[i] == -1) { 5919 fm = n; 5920 while (lnk[fm] < i) fm = lnk[fm]; 5921 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5922 lnk[fm] = i; 5923 lnk_lvl[i] = 0; 5924 nzi++; dcount++; 5925 } 5926 5927 /* add pivot rows into the active row */ 5928 nzbd = 0; 5929 prow = lnk[n]; 5930 while (prow < i) { 5931 nnz = bdiag[prow]; 5932 cols = bj_ptr[prow] + nnz + 1; 5933 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5934 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5935 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5936 nzi += nlnk; 5937 prow = lnk[prow]; 5938 nzbd++; 5939 } 5940 bdiag[i] = nzbd; 5941 bi[i+1] = bi[i] + nzi; 5942 5943 /* if free space is not available, make more free space */ 5944 if (current_space->local_remaining<nzi) { 5945 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5946 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5947 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5948 reallocs++; 5949 } 5950 5951 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5952 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5953 bj_ptr[i] = current_space->array; 5954 bjlvl_ptr[i] = current_space_lvl->array; 5955 5956 /* make sure the active row i has diagonal entry */ 5957 if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5958 5959 current_space->array += nzi; 5960 current_space->local_used += nzi; 5961 current_space->local_remaining -= nzi; 5962 current_space_lvl->array += nzi; 5963 current_space_lvl->local_used += nzi; 5964 current_space_lvl->local_remaining -= 
nzi; 5965 } 5966 5967 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5968 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5969 5970 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5971 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5972 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5973 5974 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5975 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5976 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 5977 5978 #if defined(PETSC_USE_INFO) 5979 { 5980 PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5981 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5982 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5983 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5984 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5985 if (diagonal_fill) { 5986 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5987 } 5988 } 5989 #endif 5990 5991 /* put together the new matrix */ 5992 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5993 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5994 b = (Mat_SeqBAIJ*)(fact)->data; 5995 b->free_a = PETSC_TRUE; 5996 b->free_ij = PETSC_TRUE; 5997 b->singlemalloc = PETSC_FALSE; 5998 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5999 b->j = bj; 6000 b->i = bi; 6001 b->diag = bdiag; 6002 b->free_diag = PETSC_TRUE; 6003 b->ilen = 0; 6004 b->imax = 0; 6005 b->row = isrow; 6006 b->col = iscol; 6007 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6008 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6009 b->icol = isicol; 6010 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6011 /* 
In b structure: Free imax, ilen, old a, old j. 6012 Allocate bdiag, solve_work, new a, new j */ 6013 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 6014 b->maxnz = b->nz = bdiag[0]+1; 6015 fact->info.factor_mallocs = reallocs; 6016 fact->info.fill_ratio_given = f; 6017 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 6018 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 6019 PetscFunctionReturn(0); 6020 } 6021 6022 /* 6023 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 6024 except that the data structure of Mat_SeqAIJ is slightly different. 6025 Not a good example of code reuse. 6026 */ 6027 #undef __FUNCT__ 6028 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 6029 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 6030 { 6031 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 6032 IS isicol; 6033 PetscErrorCode ierr; 6034 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 6035 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 6036 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 6037 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 6038 PetscBool col_identity,row_identity,both_identity,flg; 6039 PetscReal f; 6040 6041 PetscFunctionBegin; 6042 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 6043 if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 6044 6045 f = info->fill; 6046 levels = (PetscInt)info->levels; 6047 diagonal_fill = (PetscInt)info->diagonal_fill; 6048 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 6049 6050 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6051 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 6052 both_identity = (PetscBool) (row_identity && col_identity); 6053 6054 if 
(!levels && both_identity) { /* special case copy the nonzero structure */ 6055 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 6056 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6057 6058 fact->factortype = MAT_FACTOR_ILU; 6059 b = (Mat_SeqBAIJ*)fact->data; 6060 b->row = isrow; 6061 b->col = iscol; 6062 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6063 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6064 b->icol = isicol; 6065 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6066 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6067 PetscFunctionReturn(0); 6068 } 6069 6070 /* general case perform the symbolic factorization */ 6071 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 6072 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 6073 6074 /* get new row pointers */ 6075 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 6076 ainew[0] = 0; 6077 /* don't know how many column pointers are needed so estimate */ 6078 jmax = (PetscInt)(f*ai[n] + 1); 6079 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 6080 /* ajfill is level of fill for each fill entry */ 6081 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 6082 /* fill is a linked list of nonzeros in active row */ 6083 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 6084 /* im is level for each filled value */ 6085 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 6086 /* dloc is location of diagonal in factor */ 6087 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 6088 dloc[0] = 0; 6089 for (prow=0; prow<n; prow++) { 6090 6091 /* copy prow into linked list */ 6092 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 6093 if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 6094 xi = aj 
+ ai[r[prow]]; 6095 fill[n] = n; 6096 fill[prow] = -1; /* marker for diagonal entry */ 6097 while (nz--) { 6098 fm = n; 6099 idx = ic[*xi++]; 6100 do { 6101 m = fm; 6102 fm = fill[m]; 6103 } while (fm < idx); 6104 fill[m] = idx; 6105 fill[idx] = fm; 6106 im[idx] = 0; 6107 } 6108 6109 /* make sure diagonal entry is included */ 6110 if (diagonal_fill && fill[prow] == -1) { 6111 fm = n; 6112 while (fill[fm] < prow) fm = fill[fm]; 6113 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6114 fill[fm] = prow; 6115 im[prow] = 0; 6116 nzf++; 6117 dcount++; 6118 } 6119 6120 nzi = 0; 6121 row = fill[n]; 6122 while (row < prow) { 6123 incrlev = im[row] + 1; 6124 nz = dloc[row]; 6125 xi = ajnew + ainew[row] + nz + 1; 6126 flev = ajfill + ainew[row] + nz + 1; 6127 nnz = ainew[row+1] - ainew[row] - nz - 1; 6128 fm = row; 6129 while (nnz-- > 0) { 6130 idx = *xi++; 6131 if (*flev + incrlev > levels) { 6132 flev++; 6133 continue; 6134 } 6135 do { 6136 m = fm; 6137 fm = fill[m]; 6138 } while (fm < idx); 6139 if (fm != idx) { 6140 im[idx] = *flev + incrlev; 6141 fill[m] = idx; 6142 fill[idx] = fm; 6143 fm = idx; 6144 nzf++; 6145 } else { 6146 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 6147 } 6148 flev++; 6149 } 6150 row = fill[row]; 6151 nzi++; 6152 } 6153 /* copy new filled row into permanent storage */ 6154 ainew[prow+1] = ainew[prow] + nzf; 6155 if (ainew[prow+1] > jmax) { 6156 6157 /* estimate how much additional space we will need */ 6158 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6159 /* just double the memory each time */ 6160 PetscInt maxadd = jmax; 6161 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 6162 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 6163 jmax += maxadd; 6164 6165 /* allocate a longer ajnew and ajfill */ 6166 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6167 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6168 ierr = PetscFree(ajnew);CHKERRQ(ierr); 6169 
ajnew = xitmp; 6170 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6171 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6172 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6173 ajfill = xitmp; 6174 reallocate++; /* count how many reallocations are needed */ 6175 } 6176 xitmp = ajnew + ainew[prow]; 6177 flev = ajfill + ainew[prow]; 6178 dloc[prow] = nzi; 6179 fm = fill[n]; 6180 while (nzf--) { 6181 *xitmp++ = fm; 6182 *flev++ = im[fm]; 6183 fm = fill[fm]; 6184 } 6185 /* make sure row has diagonal entry */ 6186 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 6187 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 6188 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6189 } 6190 } 6191 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6192 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 6193 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6194 ierr = PetscFree(fill);CHKERRQ(ierr); 6195 ierr = PetscFree(im);CHKERRQ(ierr); 6196 6197 #if defined(PETSC_USE_INFO) 6198 { 6199 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 6200 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6201 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6202 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6203 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6204 if (diagonal_fill) { 6205 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6206 } 6207 } 6208 #endif 6209 6210 /* put together the new matrix */ 6211 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6212 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6213 b = (Mat_SeqBAIJ*)fact->data; 6214 b->free_a = PETSC_TRUE; 6215 b->free_ij = PETSC_TRUE; 6216 b->singlemalloc = PETSC_FALSE; 6217 ierr = 
PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6218 b->j = ajnew; 6219 b->i = ainew; 6220 for (i=0; i<n; i++) dloc[i] += ainew[i]; 6221 b->diag = dloc; 6222 b->free_diag = PETSC_TRUE; 6223 b->ilen = 0; 6224 b->imax = 0; 6225 b->row = isrow; 6226 b->col = iscol; 6227 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6228 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6229 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6230 b->icol = isicol; 6231 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6232 /* In b structure: Free imax, ilen, old a, old j. 6233 Allocate dloc, solve_work, new a, new j */ 6234 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 6235 b->maxnz = b->nz = ainew[n]; 6236 6237 fact->info.factor_mallocs = reallocate; 6238 fact->info.fill_ratio_given = f; 6239 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 6240 6241 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6242 PetscFunctionReturn(0); 6243 } 6244 6245 #undef __FUNCT__ 6246 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6247 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 6248 { 6249 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 6250 /* int i,*AJ=a->j,nz=a->nz; */ 6251 PetscFunctionBegin; 6252 /* Undo Column scaling */ 6253 /* while (nz--) { */ 6254 /* AJ[i] = AJ[i]/4; */ 6255 /* } */ 6256 /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 6257 A->ops->setunfactored = PETSC_NULL; 6258 PetscFunctionReturn(0); 6259 } 6260 6261 #undef __FUNCT__ 6262 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6263 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 6264 { 6265 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6266 PetscInt *AJ=a->j,nz=a->nz; 6267 unsigned short *aj=(unsigned short *)AJ; 6268 PetscFunctionBegin; 6269 /* Is this really necessary? */ 6270 while (nz--) { 6271 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 6272 } 6273 A->ops->setunfactored = PETSC_NULL; 6274 PetscFunctionReturn(0); 6275 } 6276 6277 6278