1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 7 #include "../src/mat/impls/baij/seq/baij.h" 8 #include "../src/mat/blockinvert.h" 9 #include "petscbt.h" 10 #include "../src/mat/utils/freespace.h" 11 12 #undef __FUNCT__ 13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 15 { 16 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 17 PetscErrorCode ierr; 18 const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 19 PetscInt i,n = a->mbs,j; 20 PetscInt nz; 21 PetscScalar *x,*tmp,s1; 22 const MatScalar *aa = a->a,*v; 23 const PetscScalar *b; 24 25 PetscFunctionBegin; 26 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 tmp = a->solve_work; 29 30 31 /* copy the b into temp work space according to permutation */ 32 for (i=0; i<n; i++) tmp[i] = b[i]; 33 34 /* forward solve the U^T */ 35 for (i=0; i<n; i++) { 36 v = aa + adiag[i+1] + 1; 37 vi = aj + adiag[i+1] + 1; 38 nz = adiag[i] - adiag[i+1] - 1; 39 s1 = tmp[i]; 40 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 41 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 42 tmp[i] = s1; 43 } 44 45 /* backward solve the L^T */ 46 for (i=n-1; i>=0; i--){ 47 v = aa + ai[i]; 48 vi = aj + ai[i]; 49 nz = ai[i+1] - ai[i]; 50 s1 = tmp[i]; 51 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 52 } 53 54 /* copy tmp into x according to permutation */ 55 for (i=0; i<n; i++) x[i] = tmp[i]; 56 57 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 58 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 59 60 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 61 PetscFunctionReturn(0); 62 } 63 64 #undef __FUNCT__ 65 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 66 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 67 { 68 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 69 PetscErrorCode ierr; 70 PetscInt i,nz; 71 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 72 const MatScalar *aa=a->a,*v; 73 PetscScalar s1,*x; 74 const PetscScalar *b; 75 76 PetscFunctionBegin; 77 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 78 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 79 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 80 81 /* forward solve the U^T */ 82 for (i=0; i<n; i++) { 83 84 v = aa + diag[i]; 85 /* multiply by the inverse of the block diagonal */ 86 s1 = (*v++)*x[i]; 87 vi = aj + diag[i] + 1; 88 nz = ai[i+1] - diag[i] - 1; 89 while (nz--) { 90 x[*vi++] -= (*v++)*s1; 91 } 92 x[i] = s1; 93 } 94 /* backward solve the L^T */ 95 for (i=n-1; i>=0; i--){ 96 v = aa + diag[i] - 1; 97 vi = aj + diag[i] - 1; 98 nz = diag[i] - ai[i]; 99 s1 = x[i]; 100 while (nz--) { 101 x[*vi--] -= (*v--)*s1; 102 } 103 } 104 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 105 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 106 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 107 PetscFunctionReturn(0); 108 } 109 110 #undef __FUNCT__ 111 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 112 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 113 { 114 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 115 PetscErrorCode ierr; 116 PetscInt i,nz,idx,idt,oidx; 117 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 118 const MatScalar *aa=a->a,*v; 119 PetscScalar s1,s2,x1,x2,*x; 120 const PetscScalar *b; 121 122 PetscFunctionBegin; 123 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 124 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 125 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 126 127 /* forward solve the U^T */ 128 idx = 0; 129 for (i=0; i<n; i++) { 130 131 v = aa + 4*diag[i]; 132 /* multiply by the inverse of the block diagonal */ 133 x1 = x[idx]; x2 = x[1+idx]; 134 s1 = v[0]*x1 + v[1]*x2; 135 s2 = v[2]*x1 + v[3]*x2; 136 v += 4; 137 138 vi = aj + diag[i] + 1; 139 nz = ai[i+1] - diag[i] - 1; 140 while (nz--) { 141 oidx = 2*(*vi++); 142 x[oidx] -= v[0]*s1 + v[1]*s2; 143 x[oidx+1] -= v[2]*s1 + v[3]*s2; 144 v += 4; 145 } 146 x[idx] = s1;x[1+idx] = s2; 147 idx += 2; 148 } 149 /* backward solve the L^T */ 150 for (i=n-1; i>=0; i--){ 151 v = aa + 4*diag[i] - 4; 152 vi = aj + diag[i] - 1; 153 nz = diag[i] - ai[i]; 154 idt = 2*i; 155 s1 = x[idt]; s2 = x[1+idt]; 156 while (nz--) { 157 idx = 2*(*vi--); 158 x[idx] -= v[0]*s1 + v[1]*s2; 159 x[idx+1] -= v[2]*s1 + v[3]*s2; 160 v -= 4; 161 } 162 } 163 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 164 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 165 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 166 PetscFunctionReturn(0); 167 } 168 169 #undef __FUNCT__ 170 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 171 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 172 { 173 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 174 PetscErrorCode ierr; 175 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 176 PetscInt nz,idx,idt,j,i,oidx; 177 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 178 const MatScalar *aa=a->a,*v; 179 PetscScalar s1,s2,x1,x2,*x; 180 const PetscScalar *b; 181 182 PetscFunctionBegin; 183 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 184 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 185 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 186 187 /* forward solve the U^T */ 188 idx = 0; 189 for (i=0; i<n; i++) { 190 v = aa + bs2*diag[i]; 191 /* multiply by the inverse of the block diagonal */ 192 x1 = x[idx]; x2 = x[1+idx]; 193 s1 = v[0]*x1 + v[1]*x2; 194 s2 = v[2]*x1 + v[3]*x2; 195 v -= bs2; 196 197 vi = aj + diag[i] - 1; 198 nz = diag[i] - diag[i+1] - 1; 199 for(j=0;j>-nz;j--){ 200 oidx = bs*vi[j]; 201 x[oidx] -= v[0]*s1 + v[1]*s2; 202 x[oidx+1] -= v[2]*s1 + v[3]*s2; 203 v -= bs2; 204 } 205 x[idx] = s1;x[1+idx] = s2; 206 idx += bs; 207 } 208 /* backward solve the L^T */ 209 for (i=n-1; i>=0; i--){ 210 v = aa + bs2*ai[i]; 211 vi = aj + ai[i]; 212 nz = ai[i+1] - ai[i]; 213 idt = bs*i; 214 s1 = x[idt]; s2 = x[1+idt]; 215 for(j=0;j<nz;j++){ 216 idx = bs*vi[j]; 217 x[idx] -= v[0]*s1 + v[1]*s2; 218 x[idx+1] -= v[2]*s1 + v[3]*s2; 219 v += bs2; 220 } 221 } 222 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 223 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 224 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 230 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 231 { 232 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 233 PetscErrorCode ierr; 234 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 235 PetscInt i,nz,idx,idt,oidx; 236 const MatScalar *aa=a->a,*v; 237 PetscScalar s1,s2,s3,x1,x2,x3,*x; 238 const PetscScalar *b; 239 240 PetscFunctionBegin; 241 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 242 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 243 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 244 245 /* forward solve the U^T */ 246 idx = 0; 247 for (i=0; i<n; i++) { 248 249 v = aa + 9*diag[i]; 250 /* multiply by the inverse of the block diagonal */ 251 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 252 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 253 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 254 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 255 v += 9; 256 257 vi = aj + diag[i] + 1; 258 nz = ai[i+1] - diag[i] - 1; 259 while (nz--) { 260 oidx = 3*(*vi++); 261 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 262 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 263 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 264 v += 9; 265 } 266 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 267 idx += 3; 268 } 269 /* backward solve the L^T */ 270 for (i=n-1; i>=0; i--){ 271 v = aa + 9*diag[i] - 9; 272 vi = aj + diag[i] - 1; 273 nz = diag[i] - ai[i]; 274 idt = 3*i; 275 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 276 while (nz--) { 277 idx = 3*(*vi--); 278 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 279 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 280 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 281 v -= 9; 282 } 283 } 284 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 285 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 286 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 287 PetscFunctionReturn(0); 288 } 289 290 #undef __FUNCT__ 291 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 292 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 293 { 294 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 295 PetscErrorCode ierr; 296 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 297 PetscInt nz,idx,idt,j,i,oidx; 298 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 299 const MatScalar *aa=a->a,*v; 300 PetscScalar s1,s2,s3,x1,x2,x3,*x; 301 const PetscScalar *b; 302 303 PetscFunctionBegin; 304 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 305 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 306 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 307 308 /* forward solve the U^T */ 309 idx = 0; 310 for (i=0; i<n; i++) { 311 v = aa + bs2*diag[i]; 312 /* multiply by the inverse of the block diagonal */ 313 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 314 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 315 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 316 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 317 v -= bs2; 318 319 vi = aj + diag[i] - 1; 320 nz = diag[i] - diag[i+1] - 1; 321 for(j=0;j>-nz;j--){ 322 oidx = bs*vi[j]; 323 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 324 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 325 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 326 v -= bs2; 327 } 328 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 329 idx += bs; 330 } 331 /* backward solve the L^T */ 332 for (i=n-1; i>=0; i--){ 333 v = aa + bs2*ai[i]; 334 vi = aj + ai[i]; 335 nz = ai[i+1] - ai[i]; 336 idt = bs*i; 337 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 338 for(j=0;j<nz;j++){ 339 idx = bs*vi[j]; 340 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 341 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 342 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 343 v += bs2; 344 } 345 } 346 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 347 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 348 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 349 PetscFunctionReturn(0); 350 } 351 352 #undef __FUNCT__ 353 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 354 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 355 { 356 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 357 PetscErrorCode ierr; 358 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 359 PetscInt i,nz,idx,idt,oidx; 360 const MatScalar *aa=a->a,*v; 361 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 362 const PetscScalar *b; 363 364 PetscFunctionBegin; 365 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 366 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 367 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 368 369 /* forward solve the U^T */ 370 idx = 0; 371 for (i=0; i<n; i++) { 372 373 v = aa + 16*diag[i]; 374 /* multiply by the inverse of the block diagonal */ 375 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 376 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 377 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 378 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 379 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 380 v += 16; 381 382 vi = aj + diag[i] + 1; 383 nz = ai[i+1] - diag[i] - 1; 384 while (nz--) { 385 oidx = 4*(*vi++); 386 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 387 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 388 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 389 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 390 v += 16; 391 } 392 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 393 idx += 4; 394 } 395 /* backward solve the L^T */ 396 for (i=n-1; i>=0; i--){ 397 v = aa + 16*diag[i] - 16; 398 vi = aj + diag[i] - 1; 399 nz = diag[i] - ai[i]; 400 idt = 4*i; 401 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 402 while (nz--) { 403 idx = 4*(*vi--); 404 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 405 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 406 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 407 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 408 v -= 16; 409 } 410 } 411 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 412 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 413 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 414 PetscFunctionReturn(0); 415 } 416 417 #undef __FUNCT__ 418 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 419 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 420 { 421 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 422 PetscErrorCode ierr; 423 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 424 PetscInt nz,idx,idt,j,i,oidx; 425 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 426 const MatScalar *aa=a->a,*v; 427 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 428 const PetscScalar *b; 429 430 PetscFunctionBegin; 431 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 432 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 433 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 434 435 /* forward solve the U^T */ 436 idx = 0; 437 for (i=0; i<n; i++) { 438 v = aa + bs2*diag[i]; 439 /* multiply by the inverse of the block diagonal */ 440 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 441 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 442 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 443 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 444 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 445 v -= bs2; 446 447 vi = aj + diag[i] - 1; 448 nz = diag[i] - diag[i+1] - 1; 449 for(j=0;j>-nz;j--){ 450 oidx = bs*vi[j]; 451 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 452 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 453 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 454 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 455 v -= bs2; 456 } 457 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 458 idx += bs; 459 } 460 /* backward solve the L^T */ 461 for (i=n-1; i>=0; i--){ 462 v = aa + bs2*ai[i]; 463 vi = aj + ai[i]; 464 nz = ai[i+1] - ai[i]; 465 idt = bs*i; 466 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 467 for(j=0;j<nz;j++){ 468 idx = bs*vi[j]; 469 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 470 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 471 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 472 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 473 v += bs2; 474 } 475 } 476 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 477 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 478 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 479 PetscFunctionReturn(0); 480 } 481 482 #undef __FUNCT__ 483 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 484 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 485 { 486 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 487 PetscErrorCode ierr; 488 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 489 PetscInt i,nz,idx,idt,oidx; 490 const MatScalar *aa=a->a,*v; 491 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 492 const PetscScalar *b; 493 494 PetscFunctionBegin; 495 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 496 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 497 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 498 499 /* forward solve the U^T */ 500 idx = 0; 501 for (i=0; i<n; i++) { 502 503 v = aa + 25*diag[i]; 504 /* multiply by the inverse of the block diagonal */ 505 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 506 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 507 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 508 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 509 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 510 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 511 v += 25; 512 513 vi = aj + diag[i] + 1; 514 nz = ai[i+1] - diag[i] - 1; 515 while (nz--) { 516 oidx = 5*(*vi++); 517 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 518 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 519 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 520 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 521 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 522 v += 25; 523 } 524 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 525 idx += 5; 526 } 527 /* backward solve the L^T */ 528 for (i=n-1; i>=0; i--){ 529 v = aa + 25*diag[i] - 25; 530 vi = aj + diag[i] - 1; 531 nz = diag[i] - ai[i]; 532 idt = 5*i; 533 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 534 while (nz--) { 535 idx = 5*(*vi--); 536 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 537 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 538 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 539 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 540 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 541 v -= 25; 542 } 543 } 544 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 545 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 546 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 547 PetscFunctionReturn(0); 548 } 549 550 #undef __FUNCT__ 551 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 552 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 553 { 554 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 555 PetscErrorCode ierr; 556 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 557 PetscInt nz,idx,idt,j,i,oidx; 558 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 559 const MatScalar *aa=a->a,*v; 560 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 561 const PetscScalar *b; 562 563 PetscFunctionBegin; 564 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 565 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 566 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 567 568 /* forward solve the U^T */ 569 idx = 0; 570 for (i=0; i<n; i++) { 571 v = aa + bs2*diag[i]; 572 /* multiply by the inverse of the block diagonal */ 573 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 574 x5 = x[4+idx]; 575 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 576 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 577 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 578 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 579 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 580 v -= bs2; 581 582 vi = aj + diag[i] - 1; 583 nz = diag[i] - diag[i+1] - 1; 584 for(j=0;j>-nz;j--){ 585 oidx = bs*vi[j]; 586 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 587 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 588 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 589 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 590 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 591 v -= bs2; 592 } 593 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 594 idx += bs; 595 } 596 /* backward solve the L^T */ 597 for (i=n-1; i>=0; i--){ 598 v = aa + bs2*ai[i]; 599 vi = aj + ai[i]; 600 nz = ai[i+1] - ai[i]; 601 idt = bs*i; 602 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 603 for(j=0;j<nz;j++){ 604 idx = bs*vi[j]; 605 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 606 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 607 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 608 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 609 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 610 v += bs2; 611 } 612 } 613 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 614 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 615 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 616 PetscFunctionReturn(0); 617 } 618 619 #undef __FUNCT__ 620 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 621 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 622 { 623 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 624 PetscErrorCode ierr; 625 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 626 PetscInt i,nz,idx,idt,oidx; 627 const MatScalar *aa=a->a,*v; 628 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 629 const PetscScalar *b; 630 631 PetscFunctionBegin; 632 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 633 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 634 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 635 636 /* forward solve the U^T */ 637 idx = 0; 638 for (i=0; i<n; i++) { 639 640 v = aa + 36*diag[i]; 641 /* multiply by the inverse of the block diagonal */ 642 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 643 x6 = x[5+idx]; 644 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 645 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 646 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 647 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 648 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 649 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 650 v += 36; 651 652 vi = aj + diag[i] + 1; 653 nz = ai[i+1] - diag[i] - 1; 654 while (nz--) { 655 oidx = 6*(*vi++); 656 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 657 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 658 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 659 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 660 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 661 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 662 v += 36; 663 } 664 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 665 x[5+idx] = s6; 666 idx += 6; 667 } 668 /* backward solve the L^T */ 669 for (i=n-1; i>=0; i--){ 670 v = aa + 36*diag[i] - 36; 671 vi = aj + diag[i] - 1; 672 nz = diag[i] - ai[i]; 673 idt = 6*i; 674 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 675 s6 = x[5+idt]; 676 while (nz--) { 677 idx = 6*(*vi--); 678 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 679 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 680 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 681 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 682 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 683 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 684 v -= 36; 685 } 686 } 687 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 688 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 689 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 690 PetscFunctionReturn(0); 691 } 692 693 #undef __FUNCT__ 694 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 695 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 696 { 697 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 698 PetscErrorCode ierr; 699 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 700 PetscInt nz,idx,idt,j,i,oidx; 701 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 702 const MatScalar *aa=a->a,*v; 703 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 704 const PetscScalar *b; 705 706 PetscFunctionBegin; 707 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 708 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 709 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 710 711 /* forward solve the U^T */ 712 idx = 0; 713 for (i=0; i<n; i++) { 714 v = aa + bs2*diag[i]; 715 /* multiply by the inverse of the block diagonal */ 716 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 717 x5 = x[4+idx]; x6 = x[5+idx]; 718 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 719 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 720 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 721 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 722 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 723 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 724 v -= bs2; 725 726 vi = aj + diag[i] - 1; 727 nz = diag[i] - diag[i+1] - 1; 728 for(j=0;j>-nz;j--){ 729 oidx = bs*vi[j]; 730 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 731 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 732 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 733 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 734 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 735 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 736 v -= bs2; 737 } 738 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 739 x[5+idx] = s6; 740 idx += bs; 741 } 742 /* backward solve the L^T */ 743 for (i=n-1; i>=0; i--){ 744 v = aa + bs2*ai[i]; 745 vi = aj + ai[i]; 746 nz = ai[i+1] - ai[i]; 747 idt = bs*i; 748 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 749 s6 = x[5+idt]; 750 for(j=0;j<nz;j++){ 751 idx = bs*vi[j]; 752 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 753 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 754 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 755 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 756 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 757 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 758 v += bs2; 759 } 760 } 761 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 762 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 763 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 764 PetscFunctionReturn(0); 765 } 766 767 #undef __FUNCT__ 768 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 769 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 770 { 771 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 772 PetscErrorCode ierr; 773 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 774 PetscInt i,nz,idx,idt,oidx; 775 const MatScalar *aa=a->a,*v; 776 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 777 const PetscScalar *b; 778 779 PetscFunctionBegin; 780 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 781 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 782 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 783 784 /* forward solve the U^T */ 785 idx = 0; 786 for (i=0; i<n; i++) { 787 788 v = aa + 49*diag[i]; 789 /* multiply by the inverse of the block diagonal */ 790 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 791 x6 = x[5+idx]; x7 = x[6+idx]; 792 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 793 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 794 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 795 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 796 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 797 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 798 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 799 v += 49; 800 801 vi = aj + diag[i] + 1; 802 nz = ai[i+1] - diag[i] - 1; 803 while (nz--) { 804 oidx = 7*(*vi++); 805 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 806 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 807 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 808 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 809 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 810 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 811 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 812 v += 49; 813 } 814 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 815 x[5+idx] = s6;x[6+idx] = s7; 816 idx += 7; 817 } 818 /* backward solve the L^T */ 819 for (i=n-1; i>=0; i--){ 820 v = aa + 49*diag[i] - 49; 821 vi = aj + diag[i] - 1; 822 nz = diag[i] - ai[i]; 823 idt = 7*i; 824 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 825 s6 = x[5+idt];s7 = x[6+idt]; 826 while (nz--) { 827 idx = 7*(*vi--); 828 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 829 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 830 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 831 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 832 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 833 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 834 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 835 v -= 49; 836 } 837 } 838 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 839 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 840 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 841 PetscFunctionReturn(0); 842 } 843 #undef __FUNCT__ 844 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 845 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 846 { 847 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 848 PetscErrorCode ierr; 849 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 850 PetscInt nz,idx,idt,j,i,oidx; 851 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 852 const MatScalar *aa=a->a,*v; 853 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 854 const PetscScalar *b; 855 856 PetscFunctionBegin; 857 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 858 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 859 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 860 861 /* forward solve the U^T */ 862 idx = 0; 863 for (i=0; i<n; i++) { 864 v = aa + bs2*diag[i]; 865 /* multiply by the inverse of the block diagonal */ 866 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 867 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 868 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 869 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 870 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 871 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 872 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 873 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 874 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 875 v -= bs2; 876 vi = aj + diag[i] - 1; 877 nz = diag[i] - diag[i+1] - 1; 878 for(j=0;j>-nz;j--){ 879 oidx = bs*vi[j]; 880 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 881 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 882 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 883 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 884 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 885 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 886 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 887 v -= bs2; 888 } 889 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 890 x[5+idx] = s6; x[6+idx] = s7; 891 idx += bs; 892 } 893 /* backward solve the L^T */ 894 for (i=n-1; i>=0; i--){ 895 v = aa + bs2*ai[i]; 896 vi = aj + ai[i]; 897 nz = ai[i+1] - ai[i]; 898 idt = bs*i; 899 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 900 s6 = x[5+idt]; s7 = x[6+idt]; 901 for(j=0;j<nz;j++){ 902 idx = bs*vi[j]; 903 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 904 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 905 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 906 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 907 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 908 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 909 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 910 v += bs2; 911 } 912 } 913 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 914 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 915 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 916 PetscFunctionReturn(0); 917 } 918 919 /*---------------------------------------------------------------------------------------------*/ 920 #undef __FUNCT__ 921 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 922 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 923 { 924 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 925 IS iscol = a->col,isrow = a->row; 926 PetscErrorCode ierr; 927 const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 928 PetscInt i,n = a->mbs,j; 929 PetscInt nz; 930 PetscScalar *x,*tmp,s1; 931 const MatScalar *aa = a->a,*v; 932 const PetscScalar *b; 933 934 PetscFunctionBegin; 935 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 936 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 937 tmp = a->solve_work; 938 939 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 940 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 941 942 /* copy the b into temp work space according to permutation */ 943 for (i=0; i<n; i++) tmp[i] = b[c[i]]; 944 945 /* forward solve the U^T */ 946 for (i=0; i<n; i++) { 947 v = aa + adiag[i+1] + 1; 948 vi = aj + adiag[i+1] + 1; 949 nz = adiag[i] - adiag[i+1] - 1; 950 s1 = tmp[i]; 951 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 952 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 953 tmp[i] = s1; 954 } 955 956 /* backward solve the L^T */ 957 for (i=n-1; i>=0; i--){ 958 v = aa + ai[i]; 959 vi = aj + ai[i]; 960 nz = ai[i+1] - ai[i]; 961 s1 = tmp[i]; 962 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 963 } 964 965 /* copy tmp into x according to permutation */ 966 for (i=0; i<n; i++) x[r[i]] = tmp[i]; 967 968 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 969 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 970 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 971 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 972 973 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 974 PetscFunctionReturn(0); 975 } 976 977 #undef __FUNCT__ 978 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 979 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 980 { 981 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 982 IS iscol=a->col,isrow=a->row; 983 PetscErrorCode ierr; 984 const PetscInt *r,*c,*rout,*cout; 985 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 986 PetscInt i,nz; 987 const MatScalar *aa=a->a,*v; 988 PetscScalar s1,*x,*t; 989 const PetscScalar *b; 990 991 PetscFunctionBegin; 992 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 993 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 994 t = a->solve_work; 995 996 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 997 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 998 999 /* copy the b into temp work space according to permutation */ 1000 for (i=0; i<n; i++) { 1001 t[i] = b[c[i]]; 1002 } 1003 1004 /* forward solve the U^T */ 1005 for (i=0; i<n; i++) { 1006 1007 v = aa + diag[i]; 1008 /* multiply by the inverse of the block diagonal */ 1009 s1 = (*v++)*t[i]; 1010 vi = aj + diag[i] + 1; 1011 nz = ai[i+1] - diag[i] - 1; 1012 while (nz--) { 1013 t[*vi++] -= (*v++)*s1; 1014 } 1015 t[i] = s1; 1016 } 1017 /* backward solve the L^T */ 1018 for (i=n-1; i>=0; i--){ 1019 v = aa + diag[i] - 1; 1020 vi = aj + diag[i] - 1; 1021 nz = diag[i] - ai[i]; 1022 s1 = t[i]; 1023 while (nz--) { 1024 t[*vi--] -= (*v--)*s1; 1025 } 1026 } 1027 1028 /* copy t into x according to permutation */ 1029 for (i=0; i<n; i++) { 1030 x[r[i]] = t[i]; 1031 } 1032 1033 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1034 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1035 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1036 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1037 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 1038 PetscFunctionReturn(0); 1039 } 1040 1041 #undef __FUNCT__ 1042 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 1043 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1044 { 1045 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1046 IS iscol=a->col,isrow=a->row; 1047 PetscErrorCode ierr; 1048 const PetscInt *r,*c,*rout,*cout; 1049 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1050 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1051 const MatScalar *aa=a->a,*v; 1052 PetscScalar s1,s2,x1,x2,*x,*t; 1053 const PetscScalar *b; 1054 1055 PetscFunctionBegin; 1056 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1057 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1058 t = a->solve_work; 1059 1060 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1061 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1062 1063 /* copy the b into temp work space according to permutation */ 1064 ii = 0; 1065 for (i=0; i<n; i++) { 1066 ic = 2*c[i]; 1067 t[ii] = b[ic]; 1068 t[ii+1] = b[ic+1]; 1069 ii += 2; 1070 } 1071 1072 /* forward solve the U^T */ 1073 idx = 0; 1074 for (i=0; i<n; i++) { 1075 1076 v = aa + 4*diag[i]; 1077 /* multiply by the inverse of the block diagonal */ 1078 x1 = t[idx]; x2 = t[1+idx]; 1079 s1 = v[0]*x1 + v[1]*x2; 1080 s2 = v[2]*x1 + v[3]*x2; 1081 v += 4; 1082 1083 vi = aj + diag[i] + 1; 1084 nz = ai[i+1] - diag[i] - 1; 1085 while (nz--) { 1086 oidx = 2*(*vi++); 1087 t[oidx] -= v[0]*s1 + v[1]*s2; 1088 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1089 v += 4; 1090 } 1091 t[idx] = s1;t[1+idx] = s2; 1092 idx += 2; 1093 } 1094 /* backward solve the L^T */ 1095 for (i=n-1; i>=0; i--){ 1096 v = aa + 4*diag[i] - 4; 1097 vi = aj + diag[i] - 1; 1098 nz = diag[i] - ai[i]; 1099 idt = 2*i; 1100 s1 = t[idt]; s2 = t[1+idt]; 1101 while (nz--) { 1102 idx = 2*(*vi--); 1103 t[idx] -= v[0]*s1 + v[1]*s2; 1104 t[idx+1] -= v[2]*s1 + v[3]*s2; 1105 v -= 4; 1106 } 1107 } 1108 1109 /* copy t into x according to permutation */ 1110 ii = 0; 1111 for (i=0; i<n; i++) { 1112 ir = 2*r[i]; 1113 x[ir] = t[ii]; 1114 x[ir+1] = t[ii+1]; 1115 ii += 2; 1116 } 1117 1118 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1119 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1120 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1121 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1122 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1123 PetscFunctionReturn(0); 1124 } 1125 1126 #undef __FUNCT__ 1127 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 1128 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1129 { 1130 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1131 PetscErrorCode ierr; 1132 IS iscol=a->col,isrow=a->row; 1133 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1134 const PetscInt *r,*c,*rout,*cout; 1135 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1136 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1137 const MatScalar *aa=a->a,*v; 1138 PetscScalar s1,s2,x1,x2,*x,*t; 1139 const PetscScalar *b; 1140 1141 PetscFunctionBegin; 1142 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1143 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1144 t = a->solve_work; 1145 1146 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1147 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1148 1149 /* copy b into temp work space according to permutation */ 1150 for(i=0;i<n;i++){ 1151 ii = bs*i; ic = bs*c[i]; 1152 t[ii] = b[ic]; t[ii+1] = b[ic+1]; 1153 } 1154 1155 /* forward solve the U^T */ 1156 idx = 0; 1157 for (i=0; i<n; i++) { 1158 v = aa + bs2*diag[i]; 1159 /* multiply by the inverse of the block diagonal */ 1160 x1 = t[idx]; x2 = t[1+idx]; 1161 s1 = v[0]*x1 + v[1]*x2; 1162 s2 = v[2]*x1 + v[3]*x2; 1163 v -= bs2; 1164 1165 vi = aj + diag[i] - 1; 1166 nz = diag[i] - diag[i+1] - 1; 1167 for(j=0;j>-nz;j--){ 1168 oidx = bs*vi[j]; 1169 t[oidx] -= v[0]*s1 + v[1]*s2; 1170 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1171 v -= bs2; 1172 } 1173 t[idx] = s1;t[1+idx] = s2; 1174 idx += bs; 1175 } 1176 /* backward solve the L^T */ 1177 for (i=n-1; i>=0; i--){ 1178 v = aa + bs2*ai[i]; 1179 vi = aj + ai[i]; 1180 nz = ai[i+1] - ai[i]; 1181 idt = bs*i; 1182 s1 = t[idt]; s2 = t[1+idt]; 1183 for(j=0;j<nz;j++){ 1184 idx = bs*vi[j]; 1185 t[idx] -= v[0]*s1 + v[1]*s2; 1186 t[idx+1] -= v[2]*s1 + v[3]*s2; 1187 v += bs2; 1188 } 1189 } 1190 1191 /* copy t into x according to permutation */ 1192 for(i=0;i<n;i++){ 1193 ii = bs*i; ir = bs*r[i]; 1194 x[ir] = t[ii]; x[ir+1] = t[ii+1]; 1195 } 1196 1197 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1198 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1199 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1200 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1201 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1202 PetscFunctionReturn(0); 1203 } 1204 1205 #undef __FUNCT__ 1206 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 1207 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1208 { 1209 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1210 IS iscol=a->col,isrow=a->row; 1211 PetscErrorCode ierr; 1212 const PetscInt *r,*c,*rout,*cout; 1213 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1214 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1215 const MatScalar *aa=a->a,*v; 1216 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1217 const PetscScalar *b; 1218 1219 PetscFunctionBegin; 1220 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1221 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1222 t = a->solve_work; 1223 1224 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1225 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1226 1227 /* copy the b into temp work space according to permutation */ 1228 ii = 0; 1229 for (i=0; i<n; i++) { 1230 ic = 3*c[i]; 1231 t[ii] = b[ic]; 1232 t[ii+1] = b[ic+1]; 1233 t[ii+2] = b[ic+2]; 1234 ii += 3; 1235 } 1236 1237 /* forward solve the U^T */ 1238 idx = 0; 1239 for (i=0; i<n; i++) { 1240 1241 v = aa + 9*diag[i]; 1242 /* multiply by the inverse of the block diagonal */ 1243 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1244 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1245 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1246 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1247 v += 9; 1248 1249 vi = aj + diag[i] + 1; 1250 nz = ai[i+1] - diag[i] - 1; 1251 while (nz--) { 1252 oidx = 3*(*vi++); 1253 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1254 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1255 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1256 v += 9; 1257 } 1258 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1259 idx += 3; 1260 } 1261 /* backward solve the L^T */ 1262 for (i=n-1; i>=0; i--){ 1263 v = aa + 9*diag[i] - 9; 1264 vi = aj + diag[i] - 1; 1265 nz = diag[i] - ai[i]; 1266 idt = 3*i; 1267 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1268 while (nz--) { 1269 idx = 3*(*vi--); 1270 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1271 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1272 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1273 v -= 9; 1274 } 1275 } 1276 1277 /* copy t into x according to permutation */ 1278 ii = 0; 1279 for (i=0; i<n; i++) { 1280 ir = 3*r[i]; 1281 x[ir] = t[ii]; 1282 x[ir+1] = t[ii+1]; 1283 x[ir+2] = t[ii+2]; 1284 ii += 3; 1285 } 1286 1287 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1288 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1289 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1290 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1291 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1292 PetscFunctionReturn(0); 1293 } 1294 1295 #undef __FUNCT__ 1296 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1297 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1298 { 1299 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1300 PetscErrorCode ierr; 1301 IS iscol=a->col,isrow=a->row; 1302 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1303 const PetscInt *r,*c,*rout,*cout; 1304 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1305 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1306 const MatScalar *aa=a->a,*v; 1307 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1308 const PetscScalar *b; 1309 1310 PetscFunctionBegin; 1311 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1312 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1313 t = a->solve_work; 1314 1315 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1316 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1317 1318 /* copy b into temp work space according to permutation */ 1319 for(i=0;i<n;i++){ 1320 ii = bs*i; ic = bs*c[i]; 1321 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 1322 } 1323 1324 /* forward solve the U^T */ 1325 idx = 0; 1326 for (i=0; i<n; i++) { 1327 v = aa + bs2*diag[i]; 1328 /* multiply by the inverse of the block diagonal */ 1329 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1330 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1331 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1332 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1333 v -= bs2; 1334 1335 vi = aj + diag[i] - 1; 1336 nz = diag[i] - diag[i+1] - 1; 1337 for(j=0;j>-nz;j--){ 1338 oidx = bs*vi[j]; 1339 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1340 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1341 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1342 v -= bs2; 1343 } 1344 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1345 idx += bs; 1346 } 1347 /* backward solve the L^T */ 1348 for (i=n-1; i>=0; i--){ 1349 v = aa + bs2*ai[i]; 1350 vi = aj + ai[i]; 1351 nz = ai[i+1] - ai[i]; 1352 idt = bs*i; 1353 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1354 for(j=0;j<nz;j++){ 1355 idx = bs*vi[j]; 1356 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1357 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1358 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1359 v += bs2; 1360 } 1361 } 1362 1363 /* copy t into x according to permutation */ 1364 for(i=0;i<n;i++){ 1365 ii = bs*i; ir = bs*r[i]; 1366 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 1367 } 1368 1369 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1370 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1371 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1372 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1373 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1374 PetscFunctionReturn(0); 1375 } 1376 1377 #undef __FUNCT__ 1378 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 1379 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1380 { 1381 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1382 IS iscol=a->col,isrow=a->row; 1383 PetscErrorCode ierr; 1384 const PetscInt *r,*c,*rout,*cout; 1385 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1386 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1387 const MatScalar *aa=a->a,*v; 1388 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1389 const PetscScalar *b; 1390 1391 PetscFunctionBegin; 1392 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1393 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1394 t = a->solve_work; 1395 1396 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1397 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1398 1399 /* copy the b into temp work space according to permutation */ 1400 ii = 0; 1401 for (i=0; i<n; i++) { 1402 ic = 4*c[i]; 1403 t[ii] = b[ic]; 1404 t[ii+1] = b[ic+1]; 1405 t[ii+2] = b[ic+2]; 1406 t[ii+3] = b[ic+3]; 1407 ii += 4; 1408 } 1409 1410 /* forward solve the U^T */ 1411 idx = 0; 1412 for (i=0; i<n; i++) { 1413 1414 v = aa + 16*diag[i]; 1415 /* multiply by the inverse of the block diagonal */ 1416 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1417 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1418 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1419 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1420 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1421 v += 16; 1422 1423 vi = aj + diag[i] + 1; 1424 nz = ai[i+1] - diag[i] - 1; 1425 while (nz--) { 1426 oidx = 4*(*vi++); 1427 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1428 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1429 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1430 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1431 v += 16; 1432 } 1433 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1434 idx += 4; 1435 } 1436 /* backward solve the L^T */ 1437 for (i=n-1; i>=0; i--){ 1438 v = aa + 16*diag[i] - 16; 1439 vi = aj + diag[i] - 1; 1440 nz = diag[i] - ai[i]; 1441 idt = 4*i; 1442 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1443 while (nz--) { 1444 idx = 4*(*vi--); 1445 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1446 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1447 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1448 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1449 v -= 16; 1450 } 1451 } 1452 1453 /* copy t into x according to permutation */ 1454 ii = 0; 1455 for (i=0; i<n; i++) { 1456 ir = 4*r[i]; 1457 x[ir] = t[ii]; 1458 x[ir+1] = t[ii+1]; 1459 x[ir+2] = t[ii+2]; 1460 x[ir+3] = t[ii+3]; 1461 ii += 4; 1462 } 1463 1464 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1465 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1466 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1467 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1468 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1469 PetscFunctionReturn(0); 1470 } 1471 1472 #undef __FUNCT__ 1473 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1474 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1475 { 1476 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1477 PetscErrorCode ierr; 1478 IS iscol=a->col,isrow=a->row; 1479 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1480 const PetscInt *r,*c,*rout,*cout; 1481 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1482 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1483 const MatScalar *aa=a->a,*v; 1484 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1485 const PetscScalar *b; 1486 1487 PetscFunctionBegin; 1488 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1489 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1490 t = a->solve_work; 1491 1492 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1493 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1494 1495 /* copy b into temp work space according to permutation */ 1496 for(i=0;i<n;i++){ 1497 ii = bs*i; ic = bs*c[i]; 1498 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1499 } 1500 1501 /* forward solve the U^T */ 1502 idx = 0; 1503 for (i=0; i<n; i++) { 1504 v = aa + bs2*diag[i]; 1505 /* multiply by the inverse of the block diagonal */ 1506 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1507 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1508 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1509 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1510 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1511 v -= bs2; 1512 1513 vi = aj + diag[i] - 1; 1514 nz = diag[i] - diag[i+1] - 1; 1515 for(j=0;j>-nz;j--){ 1516 oidx = bs*vi[j]; 1517 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1518 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1519 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1520 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1521 v -= bs2; 1522 } 1523 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 1524 idx += bs; 1525 } 1526 /* backward solve the L^T */ 1527 for (i=n-1; i>=0; i--){ 1528 v = aa + bs2*ai[i]; 1529 vi = aj + ai[i]; 1530 nz = ai[i+1] - ai[i]; 1531 idt = bs*i; 1532 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 1533 for(j=0;j<nz;j++){ 1534 idx = bs*vi[j]; 1535 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1536 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1537 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1538 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1539 v += bs2; 1540 } 1541 } 1542 1543 /* copy t into x according to permutation */ 1544 for(i=0;i<n;i++){ 1545 ii = bs*i; ir = bs*r[i]; 1546 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1547 } 1548 1549 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1550 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1551 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1552 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1553 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1554 PetscFunctionReturn(0); 1555 } 1556 1557 #undef __FUNCT__ 1558 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 1559 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1560 { 1561 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1562 IS iscol=a->col,isrow=a->row; 1563 PetscErrorCode ierr; 1564 const PetscInt *r,*c,*rout,*cout; 1565 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1566 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1567 const MatScalar *aa=a->a,*v; 1568 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1569 const PetscScalar *b; 1570 1571 PetscFunctionBegin; 1572 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1573 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1574 t = a->solve_work; 1575 1576 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1577 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1578 1579 /* copy the b into temp work space according to permutation */ 1580 ii = 0; 1581 for (i=0; i<n; i++) { 1582 ic = 5*c[i]; 1583 t[ii] = b[ic]; 1584 t[ii+1] = b[ic+1]; 1585 t[ii+2] = b[ic+2]; 1586 t[ii+3] = b[ic+3]; 1587 t[ii+4] = b[ic+4]; 1588 ii += 5; 1589 } 1590 1591 /* forward solve the U^T */ 1592 idx = 0; 1593 for (i=0; i<n; i++) { 1594 1595 v = aa + 25*diag[i]; 1596 /* multiply by the inverse of the block diagonal */ 1597 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1598 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1599 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1600 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1601 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1602 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1603 v += 25; 1604 1605 vi = aj + diag[i] + 1; 1606 nz = ai[i+1] - diag[i] - 1; 1607 while (nz--) { 1608 oidx = 5*(*vi++); 1609 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1610 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1611 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1612 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1613 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1614 v += 25; 1615 } 1616 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1617 idx += 5; 1618 } 1619 /* backward solve the L^T */ 1620 for (i=n-1; i>=0; i--){ 1621 v = aa + 25*diag[i] - 25; 1622 vi = aj + diag[i] - 1; 1623 nz = diag[i] - ai[i]; 1624 idt = 5*i; 1625 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1626 while (nz--) { 1627 idx = 5*(*vi--); 1628 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1629 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1630 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1631 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1632 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1633 v -= 25; 1634 } 1635 } 1636 1637 /* copy t into x according to permutation */ 1638 ii = 0; 1639 for (i=0; i<n; i++) { 1640 ir = 5*r[i]; 1641 x[ir] = t[ii]; 1642 x[ir+1] = t[ii+1]; 1643 x[ir+2] = t[ii+2]; 1644 x[ir+3] = t[ii+3]; 1645 x[ir+4] = t[ii+4]; 1646 ii += 5; 1647 } 1648 1649 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1650 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1651 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1652 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1653 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1654 PetscFunctionReturn(0); 1655 } 1656 1657 #undef __FUNCT__ 1658 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1659 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1660 { 1661 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1662 PetscErrorCode ierr; 1663 IS iscol=a->col,isrow=a->row; 1664 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1665 const PetscInt *r,*c,*rout,*cout; 1666 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1667 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1668 const MatScalar *aa=a->a,*v; 1669 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1670 const PetscScalar *b; 1671 1672 PetscFunctionBegin; 1673 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1674 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1675 t = a->solve_work; 1676 1677 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1678 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1679 1680 /* copy b into temp work space according to permutation */ 1681 for(i=0;i<n;i++){ 1682 ii = bs*i; ic = bs*c[i]; 1683 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1684 t[ii+4] = b[ic+4]; 1685 } 1686 1687 /* forward solve the U^T */ 1688 idx = 0; 1689 for (i=0; i<n; i++) { 1690 v = aa + bs2*diag[i]; 1691 /* multiply by the inverse of the block diagonal */ 1692 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1693 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1694 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1695 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1696 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1697 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1698 v -= bs2; 1699 1700 vi = aj + diag[i] - 1; 1701 nz = diag[i] - diag[i+1] - 1; 1702 for(j=0;j>-nz;j--){ 1703 oidx = bs*vi[j]; 1704 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1705 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1706 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1707 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1708 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1709 v -= bs2; 1710 } 1711 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1712 idx += bs; 1713 } 1714 /* backward solve the L^T */ 1715 for (i=n-1; i>=0; i--){ 1716 v = aa + bs2*ai[i]; 1717 vi = aj + ai[i]; 1718 nz = ai[i+1] - ai[i]; 1719 idt = bs*i; 1720 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1721 for(j=0;j<nz;j++){ 1722 idx = bs*vi[j]; 1723 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1724 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1725 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1726 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1727 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1728 v += bs2; 1729 } 1730 } 1731 1732 /* copy t into x according to permutation */ 1733 for(i=0;i<n;i++){ 1734 ii = bs*i; ir = bs*r[i]; 1735 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1736 x[ir+4] = t[ii+4]; 1737 } 1738 1739 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1740 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1741 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1742 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1743 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1744 PetscFunctionReturn(0); 1745 } 1746 1747 #undef __FUNCT__ 1748 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 1749 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1750 { 1751 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1752 IS iscol=a->col,isrow=a->row; 1753 PetscErrorCode ierr; 1754 const PetscInt *r,*c,*rout,*cout; 1755 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1756 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1757 const MatScalar *aa=a->a,*v; 1758 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1759 const PetscScalar *b; 1760 1761 PetscFunctionBegin; 1762 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1763 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1764 t = a->solve_work; 1765 1766 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1767 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1768 1769 /* copy the b into temp work space according to permutation */ 1770 ii = 0; 1771 for (i=0; i<n; i++) { 1772 ic = 6*c[i]; 1773 t[ii] = b[ic]; 1774 t[ii+1] = b[ic+1]; 1775 t[ii+2] = b[ic+2]; 1776 t[ii+3] = b[ic+3]; 1777 t[ii+4] = b[ic+4]; 1778 t[ii+5] = b[ic+5]; 1779 ii += 6; 1780 } 1781 1782 /* forward solve the U^T */ 1783 idx = 0; 1784 for (i=0; i<n; i++) { 1785 1786 v = aa + 36*diag[i]; 1787 /* multiply by the inverse of the block diagonal */ 1788 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1789 x6 = t[5+idx]; 1790 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1791 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1792 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1793 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1794 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1795 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1796 v += 36; 1797 1798 vi = aj + diag[i] + 1; 1799 nz = ai[i+1] - diag[i] - 1; 1800 while (nz--) { 1801 oidx = 6*(*vi++); 1802 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1803 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1804 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1805 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1806 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1807 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1808 v += 36; 1809 } 1810 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1811 t[5+idx] = s6; 1812 idx += 6; 1813 } 1814 /* backward solve the L^T */ 1815 for (i=n-1; i>=0; i--){ 1816 v = aa + 36*diag[i] - 36; 1817 vi = aj + diag[i] - 1; 1818 nz = diag[i] - ai[i]; 1819 idt = 6*i; 1820 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1821 s6 = t[5+idt]; 1822 while (nz--) { 1823 idx = 6*(*vi--); 1824 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1825 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1826 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1827 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1828 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1829 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1830 v -= 36; 1831 } 1832 } 1833 1834 /* copy t into x according to permutation */ 1835 ii = 0; 1836 for (i=0; i<n; i++) { 1837 ir = 6*r[i]; 1838 x[ir] = t[ii]; 1839 x[ir+1] = t[ii+1]; 1840 x[ir+2] = t[ii+2]; 1841 x[ir+3] = t[ii+3]; 1842 x[ir+4] = t[ii+4]; 1843 x[ir+5] = t[ii+5]; 1844 ii += 6; 1845 } 1846 1847 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1848 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1849 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1850 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1851 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1852 PetscFunctionReturn(0); 1853 } 1854 1855 #undef __FUNCT__ 1856 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1857 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1858 { 1859 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1860 PetscErrorCode ierr; 1861 IS iscol=a->col,isrow=a->row; 1862 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1863 const PetscInt *r,*c,*rout,*cout; 1864 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1865 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1866 const MatScalar *aa=a->a,*v; 1867 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1868 const PetscScalar *b; 1869 1870 PetscFunctionBegin; 1871 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1872 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1873 t = a->solve_work; 1874 1875 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1876 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1877 1878 /* copy b into temp work space according to permutation */ 1879 for(i=0;i<n;i++){ 1880 ii = bs*i; ic = bs*c[i]; 1881 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1882 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 1883 } 1884 1885 /* forward solve the U^T */ 1886 idx = 0; 1887 for (i=0; i<n; i++) { 1888 v = aa + bs2*diag[i]; 1889 /* multiply by the inverse of the block diagonal */ 1890 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1891 x6 = t[5+idx]; 1892 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1893 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1894 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1895 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1896 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1897 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1898 v -= bs2; 1899 1900 vi = aj + diag[i] - 1; 1901 nz = diag[i] - diag[i+1] - 1; 1902 for(j=0;j>-nz;j--){ 1903 oidx = bs*vi[j]; 1904 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1905 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1906 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1907 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1908 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1909 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1910 v -= bs2; 1911 } 1912 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1913 t[5+idx] = s6; 1914 idx += bs; 1915 } 1916 /* backward solve the L^T */ 1917 for (i=n-1; i>=0; i--){ 1918 v = aa + bs2*ai[i]; 1919 vi = aj + ai[i]; 1920 nz = ai[i+1] - ai[i]; 1921 idt = bs*i; 1922 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1923 s6 = t[5+idt]; 1924 for(j=0;j<nz;j++){ 1925 idx = bs*vi[j]; 1926 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1927 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1928 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1929 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1930 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1931 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1932 v += bs2; 1933 } 1934 } 1935 1936 /* copy t into x according to permutation */ 1937 for(i=0;i<n;i++){ 1938 ii = bs*i; ir = bs*r[i]; 1939 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1940 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 1941 } 1942 1943 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1944 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1945 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1946 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1947 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1948 PetscFunctionReturn(0); 1949 } 1950 1951 #undef __FUNCT__ 1952 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 1953 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1954 { 1955 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1956 IS iscol=a->col,isrow=a->row; 1957 PetscErrorCode ierr; 1958 const PetscInt *r,*c,*rout,*cout; 1959 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1960 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1961 const MatScalar *aa=a->a,*v; 1962 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1963 const PetscScalar *b; 1964 1965 PetscFunctionBegin; 1966 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1967 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1968 t = a->solve_work; 1969 1970 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1971 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1972 1973 /* copy the b into temp work space according to permutation */ 1974 ii = 0; 1975 for (i=0; i<n; i++) { 1976 ic = 7*c[i]; 1977 t[ii] = b[ic]; 1978 t[ii+1] = b[ic+1]; 1979 t[ii+2] = b[ic+2]; 1980 t[ii+3] = b[ic+3]; 1981 t[ii+4] = b[ic+4]; 1982 t[ii+5] = b[ic+5]; 1983 t[ii+6] = b[ic+6]; 1984 ii += 7; 1985 } 1986 1987 /* forward solve the U^T */ 1988 idx = 0; 1989 for (i=0; i<n; i++) { 1990 1991 v = aa + 49*diag[i]; 1992 /* multiply by the inverse of the block diagonal */ 1993 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1994 x6 = t[5+idx]; x7 = t[6+idx]; 1995 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1996 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1997 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1998 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1999 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 2000 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 2001 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 2002 v += 49; 2003 2004 vi = aj + diag[i] + 1; 2005 nz = ai[i+1] - diag[i] - 1; 2006 while (nz--) { 2007 oidx = 7*(*vi++); 2008 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2009 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2010 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2011 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2012 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2013 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2014 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2015 v += 49; 2016 } 2017 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2018 t[5+idx] = s6;t[6+idx] = s7; 2019 idx += 7; 2020 } 2021 /* backward solve the L^T */ 2022 for (i=n-1; i>=0; i--){ 2023 v = aa + 49*diag[i] - 49; 2024 vi = aj + diag[i] - 1; 2025 nz = diag[i] - ai[i]; 2026 idt = 7*i; 2027 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2028 s6 = t[5+idt];s7 = t[6+idt]; 2029 while (nz--) { 2030 idx = 7*(*vi--); 2031 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2032 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2033 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2034 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2035 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2036 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2037 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2038 v -= 49; 2039 } 2040 } 2041 2042 /* copy t into x according to permutation */ 2043 ii = 0; 2044 for (i=0; i<n; i++) { 2045 ir = 7*r[i]; 2046 x[ir] = t[ii]; 2047 x[ir+1] = t[ii+1]; 2048 x[ir+2] = t[ii+2]; 2049 x[ir+3] = t[ii+3]; 2050 x[ir+4] = t[ii+4]; 2051 x[ir+5] = t[ii+5]; 2052 x[ir+6] = t[ii+6]; 2053 ii += 7; 2054 } 2055 2056 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2057 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2058 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2059 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2060 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2061 PetscFunctionReturn(0); 2062 } 2063 #undef __FUNCT__ 2064 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 2065 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2066 { 2067 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2068 PetscErrorCode ierr; 2069 IS iscol=a->col,isrow=a->row; 2070 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2071 const PetscInt *r,*c,*rout,*cout; 2072 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 2073 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2074 const MatScalar *aa=a->a,*v; 2075 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2076 const PetscScalar *b; 2077 2078 PetscFunctionBegin; 2079 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2080 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2081 t = a->solve_work; 2082 2083 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2084 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2085 2086 /* copy b into temp work space according to permutation */ 2087 for(i=0;i<n;i++){ 2088 ii = bs*i; ic = bs*c[i]; 2089 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 2090 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 2091 } 2092 2093 /* forward solve the U^T */ 2094 idx = 0; 2095 for (i=0; i<n; i++) { 2096 v = aa + bs2*diag[i]; 2097 /* multiply by the inverse of the block diagonal */ 2098 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2099 x6 = t[5+idx]; x7 = t[6+idx]; 2100 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 2101 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 2102 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 2103 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 2104 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 2105 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 2106 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 2107 v -= bs2; 2108 2109 vi = aj + diag[i] - 1; 2110 nz = diag[i] - diag[i+1] - 1; 2111 for(j=0;j>-nz;j--){ 2112 oidx = bs*vi[j]; 2113 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2114 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2115 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2116 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2117 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2118 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2119 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2120 v -= bs2; 2121 } 2122 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 2123 t[5+idx] = s6; t[6+idx] = s7; 2124 idx += bs; 2125 } 2126 /* backward solve the L^T */ 2127 for (i=n-1; i>=0; i--){ 2128 v = aa + bs2*ai[i]; 2129 vi = aj + ai[i]; 2130 nz = ai[i+1] - ai[i]; 2131 idt = bs*i; 2132 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2133 s6 = t[5+idt]; s7 = t[6+idt]; 2134 for(j=0;j<nz;j++){ 2135 idx = bs*vi[j]; 2136 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2137 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2138 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2139 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2140 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2141 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2142 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2143 v += bs2; 2144 } 2145 } 2146 2147 /* copy t into x according to permutation */ 2148 for(i=0;i<n;i++){ 2149 ii = bs*i; ir = bs*r[i]; 2150 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 2151 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 2152 } 2153 2154 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2155 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2156 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2157 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2158 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2159 PetscFunctionReturn(0); 2160 } 2161 2162 /* ----------------------------------------------------------- */ 2163 #undef __FUNCT__ 2164 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 2165 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2166 { 2167 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2168 IS iscol=a->col,isrow=a->row; 2169 PetscErrorCode ierr; 2170 const PetscInt *r,*c,*rout,*cout; 2171 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2172 PetscInt i,nz; 2173 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2174 const MatScalar *aa=a->a,*v; 2175 PetscScalar *x,*s,*t,*ls; 2176 const PetscScalar *b; 2177 2178 PetscFunctionBegin; 2179 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2180 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2181 t = a->solve_work; 2182 2183 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2184 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2185 2186 /* forward solve the lower triangular */ 2187 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2188 for (i=1; i<n; i++) { 2189 v = aa + bs2*ai[i]; 2190 vi = aj + ai[i]; 2191 nz = a->diag[i] - ai[i]; 2192 s = t + bs*i; 2193 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2194 while (nz--) { 2195 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 2196 v += bs2; 2197 } 2198 } 2199 /* backward solve the upper triangular */ 2200 ls = a->solve_work + A->cmap->n; 2201 for (i=n-1; i>=0; i--){ 2202 v = aa + bs2*(a->diag[i] + 1); 2203 vi = aj + a->diag[i] + 1; 2204 nz = ai[i+1] - a->diag[i] - 1; 2205 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2206 while (nz--) { 2207 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 2208 v += bs2; 2209 } 2210 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2211 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2212 } 2213 2214 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2215 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2216 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2217 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2218 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2219 PetscFunctionReturn(0); 2220 } 2221 2222 /* ----------------------------------------------------------- */ 2223 #undef __FUNCT__ 2224 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 2225 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2226 { 2227 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2228 IS iscol=a->col,isrow=a->row; 2229 PetscErrorCode ierr; 2230 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2231 PetscInt i,nz,j; 2232 const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2; 2233 const MatScalar *aa=a->a,*v; 2234 PetscScalar *x,*t,*ls; 2235 const PetscScalar *b; 2236 PetscFunctionBegin; 2237 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2238 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2239 t = a->solve_work; 2240 2241 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2242 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2243 2244 /* copy the b into temp work space according to permutation */ 2245 for (i=0; i<n; i++) { 2246 for (j=0; j<bs; j++) { 2247 t[i*bs+j] = b[c[i]*bs+j]; 2248 } 2249 } 2250 2251 2252 /* forward solve the upper triangular transpose */ 2253 ls = a->solve_work + A->cmap->n; 2254 for (i=0; i<n; i++){ 2255 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2256 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2257 v = aa + bs2*(a->diag[i] + 1); 2258 vi = aj + a->diag[i] + 1; 2259 nz = ai[i+1] - a->diag[i] - 1; 2260 while (nz--) { 2261 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2262 v += bs2; 2263 } 2264 } 2265 2266 /* backward solve the lower triangular transpose */ 2267 for (i=n-1; i>=0; i--) { 2268 v = aa + bs2*ai[i]; 2269 vi = aj + ai[i]; 2270 nz = a->diag[i] - ai[i]; 2271 while (nz--) { 2272 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2273 v += bs2; 2274 } 2275 } 2276 2277 /* copy t into x according to permutation */ 2278 for (i=0; i<n; i++) { 2279 for (j=0; j<bs; j++) { 2280 x[bs*r[i]+j] = t[bs*i+j]; 2281 } 2282 } 2283 2284 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2285 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2286 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2287 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2288 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2289 PetscFunctionReturn(0); 2290 } 2291 2292 #undef __FUNCT__ 2293 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 2294 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2295 { 2296 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2297 IS iscol=a->col,isrow=a->row; 2298 PetscErrorCode ierr; 2299 const PetscInt *r,*c,*rout,*cout; 2300 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2301 PetscInt i,j,nz; 2302 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2303 const MatScalar *aa=a->a,*v; 2304 PetscScalar *x,*t,*ls; 2305 const PetscScalar *b; 2306 2307 PetscFunctionBegin; 2308 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2309 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2310 t = a->solve_work; 2311 2312 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2313 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2314 2315 /* copy the b into temp work space according to permutation */ 2316 for (i=0; i<n; i++) { 2317 for (j=0; j<bs; j++) { 2318 t[i*bs+j] = b[c[i]*bs+j]; 2319 } 2320 } 2321 2322 2323 /* forward solve the upper triangular transpose */ 2324 ls = a->solve_work + A->cmap->n; 2325 for (i=0; i<n; i++){ 2326 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2327 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 2328 v = aa + bs2*(diag[i] - 1); 2329 vi = aj + diag[i] - 1; 2330 nz = diag[i] - diag[i+1] - 1; 2331 for(j=0;j>-nz;j--){ 2332 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2333 v -= bs2; 2334 } 2335 } 2336 2337 /* backward solve the lower triangular transpose */ 2338 for (i=n-1; i>=0; i--) { 2339 v = aa + bs2*ai[i]; 2340 vi = aj + ai[i]; 2341 nz = ai[i+1] - ai[i]; 2342 for(j=0;j<nz;j++){ 2343 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2344 v += bs2; 2345 } 2346 } 2347 2348 /* copy t into x according to permutation */ 2349 for (i=0; i<n; i++) { 2350 for (j=0; j<bs; j++) { 2351 x[bs*r[i]+j] = t[bs*i+j]; 2352 } 2353 } 2354 2355 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2356 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2357 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2358 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2359 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2360 PetscFunctionReturn(0); 2361 } 2362 2363 /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */ 2364 2365 #undef __FUNCT__ 2366 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2367 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 2368 { 2369 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2370 PetscErrorCode ierr; 2371 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2372 PetscInt i,nz,idx,idt,m; 2373 const MatScalar *aa=a->a,*v; 2374 PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 2375 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 2376 PetscScalar *x; 2377 const PetscScalar *b; 2378 2379 PetscFunctionBegin; 2380 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2381 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2382 2383 /* forward solve the lower triangular */ 2384 idx = 0; 2385 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 2386 x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx]; 2387 x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 2388 2389 for (i=1; i<n; i++) { 2390 v = aa + bs2*ai[i]; 2391 vi = aj + ai[i]; 2392 nz = ai[i+1] - ai[i]; 2393 idt = bs*i; 2394 s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 2395 s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 2396 s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 2397 for(m=0;m<nz;m++){ 2398 idx = bs*vi[m]; 2399 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2400 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2401 x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 2402 2403 2404 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2405 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2406 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2407 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2408 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2409 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2410 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2411 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2412 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2413 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2414 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2415 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2416 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2417 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2418 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2419 2420 v += bs2; 2421 } 2422 x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 2423 x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 2424 x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 2425 2426 } 2427 /* backward solve the upper triangular */ 2428 for (i=n-1; i>=0; i--){ 2429 v = aa + bs2*(adiag[i+1]+1); 2430 vi = aj + adiag[i+1]+1; 2431 nz = adiag[i] - adiag[i+1] - 1; 2432 idt = bs*i; 2433 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 2434 s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 2435 s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 2436 2437 for(m=0;m<nz;m++){ 2438 idx = bs*vi[m]; 2439 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2440 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2441 x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 2442 2443 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2444 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2445 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2446 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2447 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2448 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2449 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2450 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2451 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2452 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2453 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2454 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2455 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2456 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2457 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2458 2459 v += bs2; 2460 } 2461 2462 x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 2463 x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 2464 x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 2465 x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 2466 x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 2467 x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 2468 x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 2469 x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 2470 x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 2471 x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 2472 x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 2473 x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 2474 x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 2475 x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 2476 x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 2477 2478 } 2479 2480 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2481 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2482 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2483 PetscFunctionReturn(0); 2484 } 2485 2486 /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */ 2487 /* Default MatSolve for block size 15 */ 2488 2489 #undef __FUNCT__ 2490 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1" 2491 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx) 2492 { 2493 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2494 PetscErrorCode ierr; 2495 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2496 PetscInt i,k,nz,idx,idt,m; 2497 const MatScalar *aa=a->a,*v; 2498 PetscScalar s[15]; 2499 PetscScalar *x,xv; 2500 const PetscScalar *b; 2501 2502 PetscFunctionBegin; 2503 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2504 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2505 2506 /* forward solve the lower triangular */ 2507 for (i=0; i<n; i++) { 2508 v = aa + bs2*ai[i]; 2509 vi = aj + ai[i]; 2510 nz = ai[i+1] - ai[i]; 2511 idt = bs*i; 2512 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2513 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2514 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt]; 2515 for(m=0;m<nz;m++){ 2516 idx = bs*vi[m]; 2517 for(k=0;k<15;k++){ 2518 xv = x[k + idx]; 2519 x[idt] -= v[0]*xv; 2520 x[1+idt] -= v[1]*xv; 2521 x[2+idt] -= v[2]*xv; 2522 x[3+idt] -= v[3]*xv; 2523 x[4+idt] -= v[4]*xv; 2524 x[5+idt] -= v[5]*xv; 2525 x[6+idt] -= v[6]*xv; 2526 x[7+idt] -= v[7]*xv; 2527 x[8+idt] -= v[8]*xv; 2528 x[9+idt] -= v[9]*xv; 2529 x[10+idt] -= v[10]*xv; 2530 x[11+idt] -= v[11]*xv; 2531 x[12+idt] -= v[12]*xv; 2532 x[13+idt] -= v[13]*xv; 2533 x[14+idt] -= v[14]*xv; 2534 v += 15; 2535 } 2536 } 2537 } 2538 /* backward solve the upper triangular */ 2539 for (i=n-1; i>=0; i--){ 2540 v = aa + bs2*(adiag[i+1]+1); 2541 vi = aj + adiag[i+1]+1; 2542 nz = adiag[i] - adiag[i+1] - 1; 2543 idt = bs*i; 2544 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 2545 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 2546 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt]; 2547 2548 for(m=0;m<nz;m++){ 2549 idx = bs*vi[m]; 2550 for(k=0;k<15;k++){ 2551 xv = x[k + idx]; 2552 s[0] -= v[0]*xv; 2553 s[1] -= v[1]*xv; 2554 s[2] -= v[2]*xv; 2555 s[3] -= v[3]*xv; 2556 s[4] -= v[4]*xv; 2557 s[5] -= v[5]*xv; 2558 s[6] -= v[6]*xv; 2559 s[7] -= v[7]*xv; 2560 s[8] -= v[8]*xv; 2561 s[9] -= v[9]*xv; 2562 s[10] -= v[10]*xv; 2563 s[11] -= v[11]*xv; 2564 s[12] -= v[12]*xv; 2565 s[13] -= v[13]*xv; 2566 s[14] -= v[14]*xv; 2567 v += 15; 2568 } 2569 } 2570 ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr); 2571 for(k=0;k<15;k++){ 2572 x[idt] += v[0]*s[k]; 2573 x[1+idt] += v[1]*s[k]; 2574 x[2+idt] += v[2]*s[k]; 2575 x[3+idt] += v[3]*s[k]; 2576 x[4+idt] += v[4]*s[k]; 2577 x[5+idt] += v[5]*s[k]; 2578 x[6+idt] += v[6]*s[k]; 2579 x[7+idt] += v[7]*s[k]; 2580 x[8+idt] += v[8]*s[k]; 2581 x[9+idt] += v[9]*s[k]; 2582 x[10+idt] += v[10]*s[k]; 2583 x[11+idt] += v[11]*s[k]; 2584 x[12+idt] += v[12]*s[k]; 2585 x[13+idt] += v[13]*s[k]; 2586 x[14+idt] += v[14]*s[k]; 2587 v += 15; 2588 } 2589 } 2590 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2591 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2592 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2593 PetscFunctionReturn(0); 2594 } 2595 2596 2597 #undef __FUNCT__ 2598 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 2599 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 2600 { 2601 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2602 IS iscol=a->col,isrow=a->row; 2603 PetscErrorCode ierr; 2604 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2605 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2606 PetscInt i,nz,idx,idt,idc; 2607 const MatScalar *aa=a->a,*v; 2608 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2609 const PetscScalar *b; 2610 2611 PetscFunctionBegin; 2612 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2613 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2614 t = a->solve_work; 2615 2616 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2617 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2618 2619 /* forward solve the lower triangular */ 2620 idx = 7*(*r++); 2621 t[0] = b[idx]; t[1] = b[1+idx]; 2622 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2623 t[5] = b[5+idx]; t[6] = b[6+idx]; 2624 2625 for (i=1; i<n; i++) { 2626 v = aa + 49*ai[i]; 2627 vi = aj + ai[i]; 2628 nz = diag[i] - ai[i]; 2629 idx = 7*(*r++); 2630 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2631 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2632 while (nz--) { 2633 idx = 7*(*vi++); 2634 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2635 x4 = t[3+idx];x5 = t[4+idx]; 2636 x6 = t[5+idx];x7 = t[6+idx]; 2637 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2638 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2639 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2640 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2641 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2642 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2643 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2644 v += 49; 2645 } 2646 idx = 7*i; 2647 t[idx] = s1;t[1+idx] = s2; 2648 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2649 t[5+idx] = s6;t[6+idx] = s7; 2650 } 2651 /* backward solve the upper triangular */ 2652 for (i=n-1; i>=0; i--){ 2653 v = aa + 49*diag[i] + 49; 2654 vi = aj + diag[i] + 1; 2655 nz = ai[i+1] - diag[i] - 1; 2656 idt = 7*i; 2657 s1 = t[idt]; s2 = t[1+idt]; 2658 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2659 s6 = t[5+idt];s7 = t[6+idt]; 2660 while (nz--) { 2661 idx = 7*(*vi++); 2662 x1 = t[idx]; x2 = t[1+idx]; 2663 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2664 x6 = t[5+idx]; x7 = t[6+idx]; 2665 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2666 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2667 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2668 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2669 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2670 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2671 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2672 v += 49; 2673 } 2674 idc = 7*(*c--); 2675 v = aa + 49*diag[i]; 2676 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2677 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2678 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2679 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2680 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2681 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2682 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2683 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2684 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2685 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2686 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2687 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2688 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2689 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2690 } 2691 2692 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2693 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2694 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2695 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2696 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2697 PetscFunctionReturn(0); 2698 } 2699 2700 #undef __FUNCT__ 2701 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 2702 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2703 { 2704 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2705 IS iscol=a->col,isrow=a->row; 2706 PetscErrorCode ierr; 2707 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2708 const PetscInt n=a->mbs,*rout,*cout,*vi; 2709 PetscInt i,nz,idx,idt,idc,m; 2710 const MatScalar *aa=a->a,*v; 2711 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2712 const PetscScalar *b; 2713 2714 PetscFunctionBegin; 2715 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2716 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2717 t = a->solve_work; 2718 2719 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2720 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2721 2722 /* forward solve the lower triangular */ 2723 idx = 7*r[0]; 2724 t[0] = b[idx]; t[1] = b[1+idx]; 2725 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2726 t[5] = b[5+idx]; t[6] = b[6+idx]; 2727 2728 for (i=1; i<n; i++) { 2729 v = aa + 49*ai[i]; 2730 vi = aj + ai[i]; 2731 nz = ai[i+1] - ai[i]; 2732 idx = 7*r[i]; 2733 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2734 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2735 for(m=0;m<nz;m++){ 2736 idx = 7*vi[m]; 2737 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2738 x4 = t[3+idx];x5 = t[4+idx]; 2739 x6 = t[5+idx];x7 = t[6+idx]; 2740 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2741 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2742 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2743 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2744 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2745 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2746 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2747 v += 49; 2748 } 2749 idx = 7*i; 2750 t[idx] = s1;t[1+idx] = s2; 2751 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2752 t[5+idx] = s6;t[6+idx] = s7; 2753 } 2754 /* backward solve the upper triangular */ 2755 for (i=n-1; i>=0; i--){ 2756 v = aa + 49*(adiag[i+1]+1); 2757 vi = aj + adiag[i+1]+1; 2758 nz = adiag[i] - adiag[i+1] - 1; 2759 idt = 7*i; 2760 s1 = t[idt]; s2 = t[1+idt]; 2761 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2762 s6 = t[5+idt];s7 = t[6+idt]; 2763 for(m=0;m<nz;m++){ 2764 idx = 7*vi[m]; 2765 x1 = t[idx]; x2 = t[1+idx]; 2766 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2767 x6 = t[5+idx]; x7 = t[6+idx]; 2768 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2769 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2770 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2771 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2772 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2773 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2774 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2775 v += 49; 2776 } 2777 idc = 7*c[i]; 2778 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2779 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2780 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2781 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2782 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2783 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2784 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2785 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2786 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2787 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2788 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2789 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2790 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2791 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2792 } 2793 2794 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2795 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2796 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2797 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2798 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2799 PetscFunctionReturn(0); 2800 } 2801 2802 #undef __FUNCT__ 2803 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 2804 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2805 { 2806 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2807 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2808 PetscErrorCode ierr; 2809 PetscInt i,nz,idx,idt,jdx; 2810 const MatScalar *aa=a->a,*v; 2811 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2812 const PetscScalar *b; 2813 2814 PetscFunctionBegin; 2815 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2816 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2817 /* forward solve the lower triangular */ 2818 idx = 0; 2819 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2820 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2821 x[6] = b[6+idx]; 2822 for (i=1; i<n; i++) { 2823 v = aa + 49*ai[i]; 2824 vi = aj + ai[i]; 2825 nz = diag[i] - ai[i]; 2826 idx = 7*i; 2827 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2828 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2829 s7 = b[6+idx]; 2830 while (nz--) { 2831 jdx = 7*(*vi++); 2832 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2833 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2834 x7 = x[6+jdx]; 2835 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2836 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2837 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2838 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2839 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2840 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2841 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2842 v += 49; 2843 } 2844 x[idx] = s1; 2845 x[1+idx] = s2; 2846 x[2+idx] = s3; 2847 x[3+idx] = s4; 2848 x[4+idx] = s5; 2849 x[5+idx] = s6; 2850 x[6+idx] = s7; 2851 } 2852 /* backward solve the upper triangular */ 2853 for (i=n-1; i>=0; i--){ 2854 v = aa + 49*diag[i] + 49; 2855 vi = aj + diag[i] + 1; 2856 nz = ai[i+1] - diag[i] - 1; 2857 idt = 7*i; 2858 s1 = x[idt]; s2 = x[1+idt]; 2859 s3 = x[2+idt]; s4 = x[3+idt]; 2860 s5 = x[4+idt]; s6 = x[5+idt]; 2861 s7 = x[6+idt]; 2862 while (nz--) { 2863 idx = 7*(*vi++); 2864 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2865 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2866 x7 = x[6+idx]; 2867 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2868 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2869 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2870 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2871 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2872 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2873 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2874 v += 49; 2875 } 2876 v = aa + 49*diag[i]; 2877 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2878 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2879 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2880 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2881 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2882 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2883 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2884 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2885 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2886 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2887 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2888 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2889 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2890 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2891 } 2892 2893 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2894 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2895 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2896 PetscFunctionReturn(0); 2897 } 2898 2899 #undef __FUNCT__ 2900 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 2901 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 2902 { 2903 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2904 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 2905 PetscErrorCode ierr; 2906 PetscInt i,k,nz,idx,jdx,idt; 2907 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2908 const MatScalar *aa=a->a,*v; 2909 PetscScalar *x; 2910 const PetscScalar *b; 2911 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2912 2913 PetscFunctionBegin; 2914 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2915 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2916 /* forward solve the lower triangular */ 2917 idx = 0; 2918 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2919 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2920 for (i=1; i<n; i++) { 2921 v = aa + bs2*ai[i]; 2922 vi = aj + ai[i]; 2923 nz = ai[i+1] - ai[i]; 2924 idx = bs*i; 2925 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2926 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2927 for(k=0;k<nz;k++) { 2928 jdx = bs*vi[k]; 2929 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2930 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2931 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2932 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2933 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2934 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2935 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2936 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2937 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2938 v += bs2; 2939 } 2940 2941 x[idx] = s1; 2942 x[1+idx] = s2; 2943 x[2+idx] = s3; 2944 x[3+idx] = s4; 2945 x[4+idx] = s5; 2946 x[5+idx] = s6; 2947 x[6+idx] = s7; 2948 } 2949 2950 /* backward solve the upper triangular */ 2951 for (i=n-1; i>=0; i--){ 2952 v = aa + bs2*(adiag[i+1]+1); 2953 vi = aj + adiag[i+1]+1; 2954 nz = adiag[i] - adiag[i+1]-1; 2955 idt = bs*i; 2956 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2957 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2958 for(k=0;k<nz;k++) { 2959 idx = bs*vi[k]; 2960 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2961 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2962 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2963 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2964 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2965 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2966 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2967 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2968 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2969 v += bs2; 2970 } 2971 /* x = inv_diagonal*x */ 2972 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2973 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2974 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2975 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2976 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2977 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2978 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2979 } 2980 2981 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2982 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2983 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2984 PetscFunctionReturn(0); 2985 } 2986 2987 #undef __FUNCT__ 2988 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 2989 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 2990 { 2991 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2992 IS iscol=a->col,isrow=a->row; 2993 PetscErrorCode ierr; 2994 const PetscInt *r,*c,*rout,*cout; 2995 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2996 PetscInt i,nz,idx,idt,idc; 2997 const MatScalar *aa=a->a,*v; 2998 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2999 const PetscScalar *b; 3000 3001 PetscFunctionBegin; 3002 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3003 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3004 t = a->solve_work; 3005 3006 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3007 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3008 3009 /* forward solve the lower triangular */ 3010 idx = 6*(*r++); 3011 t[0] = b[idx]; t[1] = b[1+idx]; 3012 t[2] = b[2+idx]; t[3] = b[3+idx]; 3013 t[4] = b[4+idx]; t[5] = b[5+idx]; 3014 for (i=1; i<n; i++) { 3015 v = aa + 36*ai[i]; 3016 vi = aj + ai[i]; 3017 nz = diag[i] - ai[i]; 3018 idx = 6*(*r++); 3019 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3020 s5 = b[4+idx]; s6 = b[5+idx]; 3021 while (nz--) { 3022 idx = 6*(*vi++); 3023 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3024 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 3025 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3026 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3027 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3028 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3029 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3030 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3031 v += 36; 3032 } 3033 idx = 6*i; 3034 t[idx] = s1;t[1+idx] = s2; 3035 t[2+idx] = s3;t[3+idx] = s4; 3036 t[4+idx] = s5;t[5+idx] = s6; 3037 } 3038 /* backward solve the upper triangular */ 3039 for (i=n-1; i>=0; i--){ 3040 v = aa + 36*diag[i] + 36; 3041 vi = aj + diag[i] + 1; 3042 nz = ai[i+1] - diag[i] - 1; 3043 idt = 6*i; 3044 s1 = t[idt]; s2 = t[1+idt]; 3045 s3 = t[2+idt];s4 = t[3+idt]; 3046 s5 = t[4+idt];s6 = t[5+idt]; 3047 while (nz--) { 3048 idx = 6*(*vi++); 3049 x1 = t[idx]; x2 = t[1+idx]; 3050 x3 = t[2+idx]; x4 = t[3+idx]; 3051 x5 = t[4+idx]; x6 = t[5+idx]; 3052 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3053 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3054 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3055 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3056 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3057 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3058 v += 36; 3059 } 3060 idc = 6*(*c--); 3061 v = aa + 36*diag[i]; 3062 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3063 v[18]*s4+v[24]*s5+v[30]*s6; 3064 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3065 v[19]*s4+v[25]*s5+v[31]*s6; 3066 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3067 v[20]*s4+v[26]*s5+v[32]*s6; 3068 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3069 v[21]*s4+v[27]*s5+v[33]*s6; 3070 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3071 v[22]*s4+v[28]*s5+v[34]*s6; 3072 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3073 v[23]*s4+v[29]*s5+v[35]*s6; 3074 } 3075 3076 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3077 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3078 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3079 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3080 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3081 PetscFunctionReturn(0); 3082 } 3083 3084 #undef __FUNCT__ 3085 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 3086 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 3087 { 3088 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3089 IS iscol=a->col,isrow=a->row; 3090 PetscErrorCode ierr; 3091 const PetscInt *r,*c,*rout,*cout; 3092 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3093 PetscInt i,nz,idx,idt,idc,m; 3094 const MatScalar *aa=a->a,*v; 3095 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 3096 const PetscScalar *b; 3097 3098 PetscFunctionBegin; 3099 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3100 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3101 t = a->solve_work; 3102 3103 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3104 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3105 3106 /* forward solve the lower triangular */ 3107 idx = 6*r[0]; 3108 t[0] = b[idx]; t[1] = b[1+idx]; 3109 t[2] = b[2+idx]; t[3] = b[3+idx]; 3110 t[4] = b[4+idx]; t[5] = b[5+idx]; 3111 for (i=1; i<n; i++) { 3112 v = aa + 36*ai[i]; 3113 vi = aj + ai[i]; 3114 nz = ai[i+1] - ai[i]; 3115 idx = 6*r[i]; 3116 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3117 s5 = b[4+idx]; s6 = b[5+idx]; 3118 for(m=0;m<nz;m++){ 3119 idx = 6*vi[m]; 3120 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3121 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 3122 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3123 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3124 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3125 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3126 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3127 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3128 v += 36; 3129 } 3130 idx = 6*i; 3131 t[idx] = s1;t[1+idx] = s2; 3132 t[2+idx] = s3;t[3+idx] = s4; 3133 t[4+idx] = s5;t[5+idx] = s6; 3134 } 3135 /* backward solve the upper triangular */ 3136 for (i=n-1; i>=0; i--){ 3137 v = aa + 36*(adiag[i+1]+1); 3138 vi = aj + adiag[i+1]+1; 3139 nz = adiag[i] - adiag[i+1] - 1; 3140 idt = 6*i; 3141 s1 = t[idt]; s2 = t[1+idt]; 3142 s3 = t[2+idt];s4 = t[3+idt]; 3143 s5 = t[4+idt];s6 = t[5+idt]; 3144 for(m=0;m<nz;m++){ 3145 idx = 6*vi[m]; 3146 x1 = t[idx]; x2 = t[1+idx]; 3147 x3 = t[2+idx]; x4 = t[3+idx]; 3148 x5 = t[4+idx]; x6 = t[5+idx]; 3149 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3150 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3151 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3152 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3153 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3154 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3155 v += 36; 3156 } 3157 idc = 6*c[i]; 3158 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3159 v[18]*s4+v[24]*s5+v[30]*s6; 3160 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3161 v[19]*s4+v[25]*s5+v[31]*s6; 3162 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3163 v[20]*s4+v[26]*s5+v[32]*s6; 3164 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3165 v[21]*s4+v[27]*s5+v[33]*s6; 3166 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3167 v[22]*s4+v[28]*s5+v[34]*s6; 3168 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3169 v[23]*s4+v[29]*s5+v[35]*s6; 3170 } 3171 3172 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3173 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3174 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3176 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3177 PetscFunctionReturn(0); 3178 } 3179 3180 #undef __FUNCT__ 3181 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 3182 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3183 { 3184 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3185 PetscInt i,nz,idx,idt,jdx; 3186 PetscErrorCode ierr; 3187 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 3188 const MatScalar *aa=a->a,*v; 3189 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3190 const PetscScalar *b; 3191 3192 PetscFunctionBegin; 3193 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3194 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3195 /* forward solve the lower triangular */ 3196 idx = 0; 3197 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 3198 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 3199 for (i=1; i<n; i++) { 3200 v = aa + 36*ai[i]; 3201 vi = aj + ai[i]; 3202 nz = diag[i] - ai[i]; 3203 idx = 6*i; 3204 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3205 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 3206 while (nz--) { 3207 jdx = 6*(*vi++); 3208 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 3209 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 3210 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3211 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3212 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3213 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3214 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3215 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3216 v += 36; 3217 } 3218 x[idx] = s1; 3219 x[1+idx] = s2; 3220 x[2+idx] = s3; 3221 x[3+idx] = s4; 3222 x[4+idx] = s5; 3223 x[5+idx] = s6; 3224 } 3225 /* backward solve the upper triangular */ 3226 for (i=n-1; i>=0; i--){ 3227 v = aa + 36*diag[i] + 36; 3228 vi = aj + diag[i] + 1; 3229 nz = ai[i+1] - diag[i] - 1; 3230 idt = 6*i; 3231 s1 = x[idt]; s2 = x[1+idt]; 3232 s3 = x[2+idt]; s4 = x[3+idt]; 3233 s5 = x[4+idt]; s6 = x[5+idt]; 3234 while (nz--) { 3235 idx = 6*(*vi++); 3236 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3237 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3238 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3239 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3240 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3241 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3242 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3243 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3244 v += 36; 3245 } 3246 v = aa + 36*diag[i]; 3247 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3248 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3249 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3250 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3251 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3252 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3253 } 3254 3255 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3256 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3257 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3258 PetscFunctionReturn(0); 3259 } 3260 3261 #undef __FUNCT__ 3262 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 3263 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 3264 { 3265 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3266 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3267 PetscErrorCode ierr; 3268 PetscInt i,k,nz,idx,jdx,idt; 3269 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3270 const MatScalar *aa=a->a,*v; 3271 PetscScalar *x; 3272 const PetscScalar *b; 3273 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3274 3275 PetscFunctionBegin; 3276 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3277 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3278 /* forward solve the lower triangular */ 3279 idx = 0; 3280 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3281 x[4] = b[4+idx];x[5] = b[5+idx]; 3282 for (i=1; i<n; i++) { 3283 v = aa + bs2*ai[i]; 3284 vi = aj + ai[i]; 3285 nz = ai[i+1] - ai[i]; 3286 idx = bs*i; 3287 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3288 s5 = b[4+idx];s6 = b[5+idx]; 3289 for(k=0;k<nz;k++){ 3290 jdx = bs*vi[k]; 3291 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3292 x5 = x[4+jdx]; x6 = x[5+jdx]; 3293 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3294 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3295 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3296 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3297 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3298 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3299 v += bs2; 3300 } 3301 3302 x[idx] = s1; 3303 x[1+idx] = s2; 3304 x[2+idx] = s3; 3305 x[3+idx] = s4; 3306 x[4+idx] = s5; 3307 x[5+idx] = s6; 3308 } 3309 3310 /* backward solve the upper triangular */ 3311 for (i=n-1; i>=0; i--){ 3312 v = aa + bs2*(adiag[i+1]+1); 3313 vi = aj + adiag[i+1]+1; 3314 nz = adiag[i] - adiag[i+1]-1; 3315 idt = bs*i; 3316 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3317 s5 = x[4+idt];s6 = x[5+idt]; 3318 for(k=0;k<nz;k++){ 3319 idx = bs*vi[k]; 3320 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3321 x5 = x[4+idx];x6 = x[5+idx]; 3322 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3323 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3324 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3325 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3326 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3327 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3328 v += bs2; 3329 } 3330 /* x = inv_diagonal*x */ 3331 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3332 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3333 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3334 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3335 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3336 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3337 } 3338 3339 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3340 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3341 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3342 PetscFunctionReturn(0); 3343 } 3344 3345 #undef __FUNCT__ 3346 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 3347 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 3348 { 3349 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3350 IS iscol=a->col,isrow=a->row; 3351 PetscErrorCode ierr; 3352 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3353 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3354 PetscInt i,nz,idx,idt,idc; 3355 const MatScalar *aa=a->a,*v; 3356 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3357 const PetscScalar *b; 3358 3359 PetscFunctionBegin; 3360 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3361 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3362 t = a->solve_work; 3363 3364 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3365 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3366 3367 /* forward solve the lower triangular */ 3368 idx = 5*(*r++); 3369 t[0] = b[idx]; t[1] = b[1+idx]; 3370 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3371 for (i=1; i<n; i++) { 3372 v = aa + 25*ai[i]; 3373 vi = aj + ai[i]; 3374 nz = diag[i] - ai[i]; 3375 idx = 5*(*r++); 3376 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3377 s5 = b[4+idx]; 3378 while (nz--) { 3379 idx = 5*(*vi++); 3380 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3381 x4 = t[3+idx];x5 = t[4+idx]; 3382 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3383 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3384 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3385 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3386 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3387 v += 25; 3388 } 3389 idx = 5*i; 3390 t[idx] = s1;t[1+idx] = s2; 3391 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3392 } 3393 /* backward solve the upper triangular */ 3394 for (i=n-1; i>=0; i--){ 3395 v = aa + 25*diag[i] + 25; 3396 vi = aj + diag[i] + 1; 3397 nz = ai[i+1] - diag[i] - 1; 3398 idt = 5*i; 3399 s1 = t[idt]; s2 = t[1+idt]; 3400 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3401 while (nz--) { 3402 idx = 5*(*vi++); 3403 x1 = t[idx]; x2 = t[1+idx]; 3404 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3405 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3406 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3407 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3408 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3409 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3410 v += 25; 3411 } 3412 idc = 5*(*c--); 3413 v = aa + 25*diag[i]; 3414 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3415 v[15]*s4+v[20]*s5; 3416 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3417 v[16]*s4+v[21]*s5; 3418 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3419 v[17]*s4+v[22]*s5; 3420 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3421 v[18]*s4+v[23]*s5; 3422 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3423 v[19]*s4+v[24]*s5; 3424 } 3425 3426 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3427 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3428 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3429 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3430 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3431 PetscFunctionReturn(0); 3432 } 3433 3434 #undef __FUNCT__ 3435 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 3436 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 3437 { 3438 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3439 IS iscol=a->col,isrow=a->row; 3440 PetscErrorCode ierr; 3441 const PetscInt *r,*c,*rout,*cout; 3442 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3443 PetscInt i,nz,idx,idt,idc,m; 3444 const MatScalar *aa=a->a,*v; 3445 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3446 const PetscScalar *b; 3447 3448 PetscFunctionBegin; 3449 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3450 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3451 t = a->solve_work; 3452 3453 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3454 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3455 3456 /* forward solve the lower triangular */ 3457 idx = 5*r[0]; 3458 t[0] = b[idx]; t[1] = b[1+idx]; 3459 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3460 for (i=1; i<n; i++) { 3461 v = aa + 25*ai[i]; 3462 vi = aj + ai[i]; 3463 nz = ai[i+1] - ai[i]; 3464 idx = 5*r[i]; 3465 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3466 s5 = b[4+idx]; 3467 for(m=0;m<nz;m++){ 3468 idx = 5*vi[m]; 3469 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3470 x4 = t[3+idx];x5 = t[4+idx]; 3471 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3472 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3473 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3474 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3475 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3476 v += 25; 3477 } 3478 idx = 5*i; 3479 t[idx] = s1;t[1+idx] = s2; 3480 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3481 } 3482 /* backward solve the upper triangular */ 3483 for (i=n-1; i>=0; i--){ 3484 v = aa + 25*(adiag[i+1]+1); 3485 vi = aj + adiag[i+1]+1; 3486 nz = adiag[i] - adiag[i+1] - 1; 3487 idt = 5*i; 3488 s1 = t[idt]; s2 = t[1+idt]; 3489 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3490 for(m=0;m<nz;m++){ 3491 idx = 5*vi[m]; 3492 x1 = t[idx]; x2 = t[1+idx]; 3493 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3494 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3495 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3496 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3497 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3498 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3499 v += 25; 3500 } 3501 idc = 5*c[i]; 3502 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3503 v[15]*s4+v[20]*s5; 3504 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3505 v[16]*s4+v[21]*s5; 3506 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3507 v[17]*s4+v[22]*s5; 3508 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3509 v[18]*s4+v[23]*s5; 3510 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3511 v[19]*s4+v[24]*s5; 3512 } 3513 3514 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3515 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3516 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3517 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3518 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3519 PetscFunctionReturn(0); 3520 } 3521 3522 #undef __FUNCT__ 3523 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 3524 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3525 { 3526 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3527 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3528 PetscInt i,nz,idx,idt,jdx; 3529 PetscErrorCode ierr; 3530 const MatScalar *aa=a->a,*v; 3531 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3532 const PetscScalar *b; 3533 3534 PetscFunctionBegin; 3535 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3536 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3537 /* forward solve the lower triangular */ 3538 idx = 0; 3539 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3540 for (i=1; i<n; i++) { 3541 v = aa + 25*ai[i]; 3542 vi = aj + ai[i]; 3543 nz = diag[i] - ai[i]; 3544 idx = 5*i; 3545 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3546 while (nz--) { 3547 jdx = 5*(*vi++); 3548 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3549 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3550 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3551 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3552 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3553 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3554 v += 25; 3555 } 3556 x[idx] = s1; 3557 x[1+idx] = s2; 3558 x[2+idx] = s3; 3559 x[3+idx] = s4; 3560 x[4+idx] = s5; 3561 } 3562 /* backward solve the upper triangular */ 3563 for (i=n-1; i>=0; i--){ 3564 v = aa + 25*diag[i] + 25; 3565 vi = aj + diag[i] + 1; 3566 nz = ai[i+1] - diag[i] - 1; 3567 idt = 5*i; 3568 s1 = x[idt]; s2 = x[1+idt]; 3569 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3570 while (nz--) { 3571 idx = 5*(*vi++); 3572 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3573 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3574 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3575 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3576 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3577 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3578 v += 25; 3579 } 3580 v = aa + 25*diag[i]; 3581 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3582 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3583 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3584 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3585 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3586 } 3587 3588 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3589 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3590 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3591 PetscFunctionReturn(0); 3592 } 3593 3594 #undef __FUNCT__ 3595 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 3596 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 3597 { 3598 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3599 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3600 PetscInt i,k,nz,idx,idt,jdx; 3601 PetscErrorCode ierr; 3602 const MatScalar *aa=a->a,*v; 3603 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3604 const PetscScalar *b; 3605 3606 PetscFunctionBegin; 3607 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3608 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3609 /* forward solve the lower triangular */ 3610 idx = 0; 3611 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3612 for (i=1; i<n; i++) { 3613 v = aa + 25*ai[i]; 3614 vi = aj + ai[i]; 3615 nz = ai[i+1] - ai[i]; 3616 idx = 5*i; 3617 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3618 for(k=0;k<nz;k++) { 3619 jdx = 5*vi[k]; 3620 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3621 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3622 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3623 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3624 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3625 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3626 v += 25; 3627 } 3628 x[idx] = s1; 3629 x[1+idx] = s2; 3630 x[2+idx] = s3; 3631 x[3+idx] = s4; 3632 x[4+idx] = s5; 3633 } 3634 3635 /* backward solve the upper triangular */ 3636 for (i=n-1; i>=0; i--){ 3637 v = aa + 25*(adiag[i+1]+1); 3638 vi = aj + adiag[i+1]+1; 3639 nz = adiag[i] - adiag[i+1]-1; 3640 idt = 5*i; 3641 s1 = x[idt]; s2 = x[1+idt]; 3642 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3643 for(k=0;k<nz;k++){ 3644 idx = 5*vi[k]; 3645 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3646 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3647 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3648 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3649 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3650 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3651 v += 25; 3652 } 3653 /* x = inv_diagonal*x */ 3654 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3655 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3656 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3657 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3658 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3659 } 3660 3661 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3662 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3663 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3664 PetscFunctionReturn(0); 3665 } 3666 3667 #undef __FUNCT__ 3668 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 3669 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 3670 { 3671 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3672 IS iscol=a->col,isrow=a->row; 3673 PetscErrorCode ierr; 3674 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3675 PetscInt i,nz,idx,idt,idc; 3676 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3677 const MatScalar *aa=a->a,*v; 3678 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3679 const PetscScalar *b; 3680 3681 PetscFunctionBegin; 3682 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3683 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3684 t = a->solve_work; 3685 3686 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3687 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3688 3689 /* forward solve the lower triangular */ 3690 idx = 4*(*r++); 3691 t[0] = b[idx]; t[1] = b[1+idx]; 3692 t[2] = b[2+idx]; t[3] = b[3+idx]; 3693 for (i=1; i<n; i++) { 3694 v = aa + 16*ai[i]; 3695 vi = aj + ai[i]; 3696 nz = diag[i] - ai[i]; 3697 idx = 4*(*r++); 3698 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3699 while (nz--) { 3700 idx = 4*(*vi++); 3701 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3702 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3703 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3704 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3705 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3706 v += 16; 3707 } 3708 idx = 4*i; 3709 t[idx] = s1;t[1+idx] = s2; 3710 t[2+idx] = s3;t[3+idx] = s4; 3711 } 3712 /* backward solve the upper triangular */ 3713 for (i=n-1; i>=0; i--){ 3714 v = aa + 16*diag[i] + 16; 3715 vi = aj + diag[i] + 1; 3716 nz = ai[i+1] - diag[i] - 1; 3717 idt = 4*i; 3718 s1 = t[idt]; s2 = t[1+idt]; 3719 s3 = t[2+idt];s4 = t[3+idt]; 3720 while (nz--) { 3721 idx = 4*(*vi++); 3722 x1 = t[idx]; x2 = t[1+idx]; 3723 x3 = t[2+idx]; x4 = t[3+idx]; 3724 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3725 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3726 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3727 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3728 v += 16; 3729 } 3730 idc = 4*(*c--); 3731 v = aa + 16*diag[i]; 3732 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3733 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3734 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3735 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3736 } 3737 3738 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3739 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3740 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3741 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3742 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3743 PetscFunctionReturn(0); 3744 } 3745 3746 #undef __FUNCT__ 3747 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 3748 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 3749 { 3750 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3751 IS iscol=a->col,isrow=a->row; 3752 PetscErrorCode ierr; 3753 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3754 PetscInt i,nz,idx,idt,idc,m; 3755 const PetscInt *r,*c,*rout,*cout; 3756 const MatScalar *aa=a->a,*v; 3757 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3758 const PetscScalar *b; 3759 3760 PetscFunctionBegin; 3761 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3762 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3763 t = a->solve_work; 3764 3765 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3766 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3767 3768 /* forward solve the lower triangular */ 3769 idx = 4*r[0]; 3770 t[0] = b[idx]; t[1] = b[1+idx]; 3771 t[2] = b[2+idx]; t[3] = b[3+idx]; 3772 for (i=1; i<n; i++) { 3773 v = aa + 16*ai[i]; 3774 vi = aj + ai[i]; 3775 nz = ai[i+1] - ai[i]; 3776 idx = 4*r[i]; 3777 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3778 for(m=0;m<nz;m++){ 3779 idx = 4*vi[m]; 3780 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3781 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3782 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3783 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3784 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3785 v += 16; 3786 } 3787 idx = 4*i; 3788 t[idx] = s1;t[1+idx] = s2; 3789 t[2+idx] = s3;t[3+idx] = s4; 3790 } 3791 /* backward solve the upper triangular */ 3792 for (i=n-1; i>=0; i--){ 3793 v = aa + 16*(adiag[i+1]+1); 3794 vi = aj + adiag[i+1]+1; 3795 nz = adiag[i] - adiag[i+1] - 1; 3796 idt = 4*i; 3797 s1 = t[idt]; s2 = t[1+idt]; 3798 s3 = t[2+idt];s4 = t[3+idt]; 3799 for(m=0;m<nz;m++){ 3800 idx = 4*vi[m]; 3801 x1 = t[idx]; x2 = t[1+idx]; 3802 x3 = t[2+idx]; x4 = t[3+idx]; 3803 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3804 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3805 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3806 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3807 v += 16; 3808 } 3809 idc = 4*c[i]; 3810 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3811 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3812 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3813 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3814 } 3815 3816 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3817 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3818 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3819 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3820 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3821 PetscFunctionReturn(0); 3822 } 3823 3824 #undef __FUNCT__ 3825 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3826 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3827 { 3828 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3829 IS iscol=a->col,isrow=a->row; 3830 PetscErrorCode ierr; 3831 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3832 PetscInt i,nz,idx,idt,idc; 3833 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3834 const MatScalar *aa=a->a,*v; 3835 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3836 PetscScalar *x; 3837 const PetscScalar *b; 3838 3839 PetscFunctionBegin; 3840 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3841 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3842 t = (MatScalar *)a->solve_work; 3843 3844 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3845 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3846 3847 /* forward solve the lower triangular */ 3848 idx = 4*(*r++); 3849 t[0] = (MatScalar)b[idx]; 3850 t[1] = (MatScalar)b[1+idx]; 3851 t[2] = (MatScalar)b[2+idx]; 3852 t[3] = (MatScalar)b[3+idx]; 3853 for (i=1; i<n; i++) { 3854 v = aa + 16*ai[i]; 3855 vi = aj + ai[i]; 3856 nz = diag[i] - ai[i]; 3857 idx = 4*(*r++); 3858 s1 = (MatScalar)b[idx]; 3859 s2 = (MatScalar)b[1+idx]; 3860 s3 = (MatScalar)b[2+idx]; 3861 s4 = (MatScalar)b[3+idx]; 3862 while (nz--) { 3863 idx = 4*(*vi++); 3864 x1 = t[idx]; 3865 x2 = t[1+idx]; 3866 x3 = t[2+idx]; 3867 x4 = t[3+idx]; 3868 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3869 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3870 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3871 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3872 v += 16; 3873 } 3874 idx = 4*i; 3875 t[idx] = s1; 3876 t[1+idx] = s2; 3877 t[2+idx] = s3; 3878 t[3+idx] = s4; 3879 } 3880 /* backward solve the upper triangular */ 3881 for (i=n-1; i>=0; i--){ 3882 v = aa + 16*diag[i] + 16; 3883 vi = aj + diag[i] + 1; 3884 nz = ai[i+1] - diag[i] - 1; 3885 idt = 4*i; 3886 s1 = t[idt]; 3887 s2 = t[1+idt]; 3888 s3 = t[2+idt]; 3889 s4 = t[3+idt]; 3890 while (nz--) { 3891 idx = 4*(*vi++); 3892 x1 = t[idx]; 3893 x2 = t[1+idx]; 3894 x3 = t[2+idx]; 3895 x4 = t[3+idx]; 3896 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3897 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3898 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3899 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3900 v += 16; 3901 } 3902 idc = 4*(*c--); 3903 v = aa + 16*diag[i]; 3904 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3905 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3906 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3907 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3908 x[idc] = (PetscScalar)t[idt]; 3909 x[1+idc] = (PetscScalar)t[1+idt]; 3910 x[2+idc] = (PetscScalar)t[2+idt]; 3911 x[3+idc] = (PetscScalar)t[3+idt]; 3912 } 3913 3914 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3915 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3916 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3917 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3918 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3919 PetscFunctionReturn(0); 3920 } 3921 3922 #if defined (PETSC_HAVE_SSE) 3923 3924 #include PETSC_HAVE_SSE 3925 3926 #undef __FUNCT__ 3927 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3928 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3929 { 3930 /* 3931 Note: This code uses demotion of double 3932 to float when performing the mixed-mode computation. 3933 This may not be numerically reasonable for all applications. 3934 */ 3935 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3936 IS iscol=a->col,isrow=a->row; 3937 PetscErrorCode ierr; 3938 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3939 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3940 MatScalar *aa=a->a,*v; 3941 PetscScalar *x,*b,*t; 3942 3943 /* Make space in temp stack for 16 Byte Aligned arrays */ 3944 float ssealignedspace[11],*tmps,*tmpx; 3945 unsigned long offset; 3946 3947 PetscFunctionBegin; 3948 SSE_SCOPE_BEGIN; 3949 3950 offset = (unsigned long)ssealignedspace % 16; 3951 if (offset) offset = (16 - offset)/4; 3952 tmps = &ssealignedspace[offset]; 3953 tmpx = &ssealignedspace[offset+4]; 3954 PREFETCH_NTA(aa+16*ai[1]); 3955 3956 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3957 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3958 t = a->solve_work; 3959 3960 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3961 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3962 3963 /* forward solve the lower triangular */ 3964 idx = 4*(*r++); 3965 t[0] = b[idx]; t[1] = b[1+idx]; 3966 t[2] = b[2+idx]; t[3] = b[3+idx]; 3967 v = aa + 16*ai[1]; 3968 3969 for (i=1; i<n;) { 3970 PREFETCH_NTA(&v[8]); 3971 vi = aj + ai[i]; 3972 nz = diag[i] - ai[i]; 3973 idx = 4*(*r++); 3974 3975 /* Demote sum from double to float */ 3976 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3977 LOAD_PS(tmps,XMM7); 3978 3979 while (nz--) { 3980 PREFETCH_NTA(&v[16]); 3981 idx = 4*(*vi++); 3982 3983 /* Demote solution (so far) from double to float */ 3984 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3985 3986 /* 4x4 Matrix-Vector product with negative accumulation: */ 3987 SSE_INLINE_BEGIN_2(tmpx,v) 3988 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3989 3990 /* First Column */ 3991 SSE_COPY_PS(XMM0,XMM6) 3992 SSE_SHUFFLE(XMM0,XMM0,0x00) 3993 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3994 SSE_SUB_PS(XMM7,XMM0) 3995 3996 /* Second Column */ 3997 SSE_COPY_PS(XMM1,XMM6) 3998 SSE_SHUFFLE(XMM1,XMM1,0x55) 3999 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4000 SSE_SUB_PS(XMM7,XMM1) 4001 4002 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4003 4004 /* Third Column */ 4005 SSE_COPY_PS(XMM2,XMM6) 4006 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4007 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4008 SSE_SUB_PS(XMM7,XMM2) 4009 4010 /* Fourth Column */ 4011 SSE_COPY_PS(XMM3,XMM6) 4012 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4013 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4014 SSE_SUB_PS(XMM7,XMM3) 4015 SSE_INLINE_END_2 4016 4017 v += 16; 4018 } 4019 idx = 4*i; 4020 v = aa + 16*ai[++i]; 4021 PREFETCH_NTA(v); 4022 STORE_PS(tmps,XMM7); 4023 4024 /* Promote result from float to double */ 4025 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 4026 } 4027 /* backward solve the upper triangular */ 4028 idt = 4*(n-1); 4029 ai16 = 16*diag[n-1]; 4030 v = aa + ai16 + 16; 4031 for (i=n-1; i>=0;){ 4032 PREFETCH_NTA(&v[8]); 4033 vi = aj + diag[i] + 1; 4034 nz = ai[i+1] - diag[i] - 1; 4035 4036 /* Demote accumulator from double to float */ 4037 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 4038 LOAD_PS(tmps,XMM7); 4039 4040 while (nz--) { 4041 PREFETCH_NTA(&v[16]); 4042 idx = 4*(*vi++); 4043 4044 /* Demote solution (so far) from double to float */ 4045 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 4046 4047 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4048 SSE_INLINE_BEGIN_2(tmpx,v) 4049 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4050 4051 /* First Column */ 4052 SSE_COPY_PS(XMM0,XMM6) 4053 SSE_SHUFFLE(XMM0,XMM0,0x00) 4054 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4055 SSE_SUB_PS(XMM7,XMM0) 4056 4057 /* Second Column */ 4058 SSE_COPY_PS(XMM1,XMM6) 4059 SSE_SHUFFLE(XMM1,XMM1,0x55) 4060 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4061 SSE_SUB_PS(XMM7,XMM1) 4062 4063 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4064 4065 /* Third Column */ 4066 SSE_COPY_PS(XMM2,XMM6) 4067 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4068 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4069 SSE_SUB_PS(XMM7,XMM2) 4070 4071 /* Fourth Column */ 4072 SSE_COPY_PS(XMM3,XMM6) 4073 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4074 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4075 SSE_SUB_PS(XMM7,XMM3) 4076 SSE_INLINE_END_2 4077 v += 16; 4078 } 4079 v = aa + ai16; 4080 ai16 = 16*diag[--i]; 4081 PREFETCH_NTA(aa+ai16+16); 4082 /* 4083 Scale the result by the diagonal 4x4 block, 4084 which was inverted as part of the factorization 4085 */ 4086 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 4087 /* First Column */ 4088 SSE_COPY_PS(XMM0,XMM7) 4089 SSE_SHUFFLE(XMM0,XMM0,0x00) 4090 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4091 4092 /* Second Column */ 4093 SSE_COPY_PS(XMM1,XMM7) 4094 SSE_SHUFFLE(XMM1,XMM1,0x55) 4095 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4096 SSE_ADD_PS(XMM0,XMM1) 4097 4098 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4099 4100 /* Third Column */ 4101 SSE_COPY_PS(XMM2,XMM7) 4102 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4103 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4104 SSE_ADD_PS(XMM0,XMM2) 4105 4106 /* Fourth Column */ 4107 SSE_COPY_PS(XMM3,XMM7) 4108 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4109 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4110 SSE_ADD_PS(XMM0,XMM3) 4111 4112 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4113 SSE_INLINE_END_3 4114 4115 /* Promote solution from float to double */ 4116 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 4117 4118 /* Apply reordering to t and stream into x. */ 4119 /* This way, x doesn't pollute the cache. */ 4120 /* Be careful with size: 2 doubles = 4 floats! */ 4121 idc = 4*(*c--); 4122 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 4123 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 4124 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 4125 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 4126 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 4127 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 4128 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 4129 SSE_INLINE_END_2 4130 v = aa + ai16 + 16; 4131 idt -= 4; 4132 } 4133 4134 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4135 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4136 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4137 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4138 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4139 SSE_SCOPE_END; 4140 PetscFunctionReturn(0); 4141 } 4142 4143 #endif 4144 4145 4146 /* 4147 Special case where the matrix was ILU(0) factored in the natural 4148 ordering. This eliminates the need for the column and row permutation. 4149 */ 4150 #undef __FUNCT__ 4151 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 4152 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4153 { 4154 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4155 PetscInt n=a->mbs; 4156 const PetscInt *ai=a->i,*aj=a->j; 4157 PetscErrorCode ierr; 4158 const PetscInt *diag = a->diag; 4159 const MatScalar *aa=a->a; 4160 PetscScalar *x; 4161 const PetscScalar *b; 4162 4163 PetscFunctionBegin; 4164 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4165 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4166 4167 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 4168 { 4169 static PetscScalar w[2000]; /* very BAD need to fix */ 4170 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 4171 } 4172 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 4173 { 4174 static PetscScalar w[2000]; /* very BAD need to fix */ 4175 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 4176 } 4177 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 4178 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 4179 #else 4180 { 4181 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4182 const MatScalar *v; 4183 PetscInt jdx,idt,idx,nz,i,ai16; 4184 const PetscInt *vi; 4185 4186 /* forward solve the lower triangular */ 4187 idx = 0; 4188 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 4189 for (i=1; i<n; i++) { 4190 v = aa + 16*ai[i]; 4191 vi = aj + ai[i]; 4192 nz = diag[i] - ai[i]; 4193 idx += 4; 4194 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4195 while (nz--) { 4196 jdx = 4*(*vi++); 4197 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4198 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4199 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4200 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4201 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4202 v += 16; 4203 } 4204 x[idx] = s1; 4205 x[1+idx] = s2; 4206 x[2+idx] = s3; 4207 x[3+idx] = s4; 4208 } 4209 /* backward solve the upper triangular */ 4210 idt = 4*(n-1); 4211 for (i=n-1; i>=0; i--){ 4212 ai16 = 16*diag[i]; 4213 v = aa + ai16 + 16; 4214 vi = aj + diag[i] + 1; 4215 nz = ai[i+1] - diag[i] - 1; 4216 s1 = x[idt]; s2 = x[1+idt]; 4217 s3 = x[2+idt];s4 = x[3+idt]; 4218 while (nz--) { 4219 idx = 4*(*vi++); 4220 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4221 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4222 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4223 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4224 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4225 v += 16; 4226 } 4227 v = aa + ai16; 4228 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4229 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4230 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4231 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4232 idt -= 4; 4233 } 4234 } 4235 #endif 4236 4237 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4238 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4239 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4240 PetscFunctionReturn(0); 4241 } 4242 4243 #undef __FUNCT__ 4244 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 4245 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4246 { 4247 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4248 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4249 PetscInt i,k,nz,idx,jdx,idt; 4250 PetscErrorCode ierr; 4251 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4252 const MatScalar *aa=a->a,*v; 4253 PetscScalar *x; 4254 const PetscScalar *b; 4255 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4256 4257 PetscFunctionBegin; 4258 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4259 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4260 /* forward solve the lower triangular */ 4261 idx = 0; 4262 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4263 for (i=1; i<n; i++) { 4264 v = aa + bs2*ai[i]; 4265 vi = aj + ai[i]; 4266 nz = ai[i+1] - ai[i]; 4267 idx = bs*i; 4268 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4269 for(k=0;k<nz;k++) { 4270 jdx = bs*vi[k]; 4271 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4272 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4273 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4274 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4275 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4276 4277 v += bs2; 4278 } 4279 4280 x[idx] = s1; 4281 x[1+idx] = s2; 4282 x[2+idx] = s3; 4283 x[3+idx] = s4; 4284 } 4285 4286 /* backward solve the upper triangular */ 4287 for (i=n-1; i>=0; i--){ 4288 v = aa + bs2*(adiag[i+1]+1); 4289 vi = aj + adiag[i+1]+1; 4290 nz = adiag[i] - adiag[i+1]-1; 4291 idt = bs*i; 4292 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4293 4294 for(k=0;k<nz;k++){ 4295 idx = bs*vi[k]; 4296 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4297 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4298 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4299 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4300 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4301 4302 v += bs2; 4303 } 4304 /* x = inv_diagonal*x */ 4305 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4306 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4307 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4308 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4309 4310 } 4311 4312 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4313 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4314 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4315 PetscFunctionReturn(0); 4316 } 4317 4318 #undef __FUNCT__ 4319 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4320 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4321 { 4322 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4323 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4324 PetscErrorCode ierr; 4325 const MatScalar *aa=a->a; 4326 const PetscScalar *b; 4327 PetscScalar *x; 4328 4329 PetscFunctionBegin; 4330 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4331 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4332 4333 { 4334 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4335 const MatScalar *v; 4336 MatScalar *t=(MatScalar *)x; 4337 PetscInt jdx,idt,idx,nz,i,ai16; 4338 const PetscInt *vi; 4339 4340 /* forward solve the lower triangular */ 4341 idx = 0; 4342 t[0] = (MatScalar)b[0]; 4343 t[1] = (MatScalar)b[1]; 4344 t[2] = (MatScalar)b[2]; 4345 t[3] = (MatScalar)b[3]; 4346 for (i=1; i<n; i++) { 4347 v = aa + 16*ai[i]; 4348 vi = aj + ai[i]; 4349 nz = diag[i] - ai[i]; 4350 idx += 4; 4351 s1 = (MatScalar)b[idx]; 4352 s2 = (MatScalar)b[1+idx]; 4353 s3 = (MatScalar)b[2+idx]; 4354 s4 = (MatScalar)b[3+idx]; 4355 while (nz--) { 4356 jdx = 4*(*vi++); 4357 x1 = t[jdx]; 4358 x2 = t[1+jdx]; 4359 x3 = t[2+jdx]; 4360 x4 = t[3+jdx]; 4361 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4362 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4363 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4364 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4365 v += 16; 4366 } 4367 t[idx] = s1; 4368 t[1+idx] = s2; 4369 t[2+idx] = s3; 4370 t[3+idx] = s4; 4371 } 4372 /* backward solve the upper triangular */ 4373 idt = 4*(n-1); 4374 for (i=n-1; i>=0; i--){ 4375 ai16 = 16*diag[i]; 4376 v = aa + ai16 + 16; 4377 vi = aj + diag[i] + 1; 4378 nz = ai[i+1] - diag[i] - 1; 4379 s1 = t[idt]; 4380 s2 = t[1+idt]; 4381 s3 = t[2+idt]; 4382 s4 = t[3+idt]; 4383 while (nz--) { 4384 idx = 4*(*vi++); 4385 x1 = (MatScalar)x[idx]; 4386 x2 = (MatScalar)x[1+idx]; 4387 x3 = (MatScalar)x[2+idx]; 4388 x4 = (MatScalar)x[3+idx]; 4389 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4390 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4391 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4392 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4393 v += 16; 4394 } 4395 v = aa + ai16; 4396 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4397 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4398 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4399 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4400 idt -= 4; 4401 } 4402 } 4403 4404 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4405 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4406 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4407 PetscFunctionReturn(0); 4408 } 4409 4410 #if defined (PETSC_HAVE_SSE) 4411 4412 #include PETSC_HAVE_SSE 4413 #undef __FUNCT__ 4414 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4415 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 4416 { 4417 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4418 unsigned short *aj=(unsigned short *)a->j; 4419 PetscErrorCode ierr; 4420 int *ai=a->i,n=a->mbs,*diag = a->diag; 4421 MatScalar *aa=a->a; 4422 PetscScalar *x,*b; 4423 4424 PetscFunctionBegin; 4425 SSE_SCOPE_BEGIN; 4426 /* 4427 Note: This code currently uses demotion of double 4428 to float when performing the mixed-mode computation. 4429 This may not be numerically reasonable for all applications. 4430 */ 4431 PREFETCH_NTA(aa+16*ai[1]); 4432 4433 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4434 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4435 { 4436 /* x will first be computed in single precision then promoted inplace to double */ 4437 MatScalar *v,*t=(MatScalar *)x; 4438 int nz,i,idt,ai16; 4439 unsigned int jdx,idx; 4440 unsigned short *vi; 4441 /* Forward solve the lower triangular factor. */ 4442 4443 /* First block is the identity. */ 4444 idx = 0; 4445 CONVERT_DOUBLE4_FLOAT4(t,b); 4446 v = aa + 16*((unsigned int)ai[1]); 4447 4448 for (i=1; i<n;) { 4449 PREFETCH_NTA(&v[8]); 4450 vi = aj + ai[i]; 4451 nz = diag[i] - ai[i]; 4452 idx += 4; 4453 4454 /* Demote RHS from double to float. */ 4455 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4456 LOAD_PS(&t[idx],XMM7); 4457 4458 while (nz--) { 4459 PREFETCH_NTA(&v[16]); 4460 jdx = 4*((unsigned int)(*vi++)); 4461 4462 /* 4x4 Matrix-Vector product with negative accumulation: */ 4463 SSE_INLINE_BEGIN_2(&t[jdx],v) 4464 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4465 4466 /* First Column */ 4467 SSE_COPY_PS(XMM0,XMM6) 4468 SSE_SHUFFLE(XMM0,XMM0,0x00) 4469 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4470 SSE_SUB_PS(XMM7,XMM0) 4471 4472 /* Second Column */ 4473 SSE_COPY_PS(XMM1,XMM6) 4474 SSE_SHUFFLE(XMM1,XMM1,0x55) 4475 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4476 SSE_SUB_PS(XMM7,XMM1) 4477 4478 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4479 4480 /* Third Column */ 4481 SSE_COPY_PS(XMM2,XMM6) 4482 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4483 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4484 SSE_SUB_PS(XMM7,XMM2) 4485 4486 /* Fourth Column */ 4487 SSE_COPY_PS(XMM3,XMM6) 4488 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4489 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4490 SSE_SUB_PS(XMM7,XMM3) 4491 SSE_INLINE_END_2 4492 4493 v += 16; 4494 } 4495 v = aa + 16*ai[++i]; 4496 PREFETCH_NTA(v); 4497 STORE_PS(&t[idx],XMM7); 4498 } 4499 4500 /* Backward solve the upper triangular factor.*/ 4501 4502 idt = 4*(n-1); 4503 ai16 = 16*diag[n-1]; 4504 v = aa + ai16 + 16; 4505 for (i=n-1; i>=0;){ 4506 PREFETCH_NTA(&v[8]); 4507 vi = aj + diag[i] + 1; 4508 nz = ai[i+1] - diag[i] - 1; 4509 4510 LOAD_PS(&t[idt],XMM7); 4511 4512 while (nz--) { 4513 PREFETCH_NTA(&v[16]); 4514 idx = 4*((unsigned int)(*vi++)); 4515 4516 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4517 SSE_INLINE_BEGIN_2(&t[idx],v) 4518 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4519 4520 /* First Column */ 4521 SSE_COPY_PS(XMM0,XMM6) 4522 SSE_SHUFFLE(XMM0,XMM0,0x00) 4523 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4524 SSE_SUB_PS(XMM7,XMM0) 4525 4526 /* Second Column */ 4527 SSE_COPY_PS(XMM1,XMM6) 4528 SSE_SHUFFLE(XMM1,XMM1,0x55) 4529 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4530 SSE_SUB_PS(XMM7,XMM1) 4531 4532 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4533 4534 /* Third Column */ 4535 SSE_COPY_PS(XMM2,XMM6) 4536 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4537 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4538 SSE_SUB_PS(XMM7,XMM2) 4539 4540 /* Fourth Column */ 4541 SSE_COPY_PS(XMM3,XMM6) 4542 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4543 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4544 SSE_SUB_PS(XMM7,XMM3) 4545 SSE_INLINE_END_2 4546 v += 16; 4547 } 4548 v = aa + ai16; 4549 ai16 = 16*diag[--i]; 4550 PREFETCH_NTA(aa+ai16+16); 4551 /* 4552 Scale the result by the diagonal 4x4 block, 4553 which was inverted as part of the factorization 4554 */ 4555 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4556 /* First Column */ 4557 SSE_COPY_PS(XMM0,XMM7) 4558 SSE_SHUFFLE(XMM0,XMM0,0x00) 4559 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4560 4561 /* Second Column */ 4562 SSE_COPY_PS(XMM1,XMM7) 4563 SSE_SHUFFLE(XMM1,XMM1,0x55) 4564 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4565 SSE_ADD_PS(XMM0,XMM1) 4566 4567 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4568 4569 /* Third Column */ 4570 SSE_COPY_PS(XMM2,XMM7) 4571 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4572 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4573 SSE_ADD_PS(XMM0,XMM2) 4574 4575 /* Fourth Column */ 4576 SSE_COPY_PS(XMM3,XMM7) 4577 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4578 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4579 SSE_ADD_PS(XMM0,XMM3) 4580 4581 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4582 SSE_INLINE_END_3 4583 4584 v = aa + ai16 + 16; 4585 idt -= 4; 4586 } 4587 4588 /* Convert t from single precision back to double precision (inplace)*/ 4589 idt = 4*(n-1); 4590 for (i=n-1;i>=0;i--) { 4591 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4592 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4593 PetscScalar *xtemp=&x[idt]; 4594 MatScalar *ttemp=&t[idt]; 4595 xtemp[3] = (PetscScalar)ttemp[3]; 4596 xtemp[2] = (PetscScalar)ttemp[2]; 4597 xtemp[1] = (PetscScalar)ttemp[1]; 4598 xtemp[0] = (PetscScalar)ttemp[0]; 4599 idt -= 4; 4600 } 4601 4602 } /* End of artificial scope. */ 4603 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4604 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4605 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4606 SSE_SCOPE_END; 4607 PetscFunctionReturn(0); 4608 } 4609 4610 #undef __FUNCT__ 4611 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4612 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 4613 { 4614 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4615 int *aj=a->j; 4616 PetscErrorCode ierr; 4617 int *ai=a->i,n=a->mbs,*diag = a->diag; 4618 MatScalar *aa=a->a; 4619 PetscScalar *x,*b; 4620 4621 PetscFunctionBegin; 4622 SSE_SCOPE_BEGIN; 4623 /* 4624 Note: This code currently uses demotion of double 4625 to float when performing the mixed-mode computation. 4626 This may not be numerically reasonable for all applications. 4627 */ 4628 PREFETCH_NTA(aa+16*ai[1]); 4629 4630 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4631 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4632 { 4633 /* x will first be computed in single precision then promoted inplace to double */ 4634 MatScalar *v,*t=(MatScalar *)x; 4635 int nz,i,idt,ai16; 4636 int jdx,idx; 4637 int *vi; 4638 /* Forward solve the lower triangular factor. */ 4639 4640 /* First block is the identity. */ 4641 idx = 0; 4642 CONVERT_DOUBLE4_FLOAT4(t,b); 4643 v = aa + 16*ai[1]; 4644 4645 for (i=1; i<n;) { 4646 PREFETCH_NTA(&v[8]); 4647 vi = aj + ai[i]; 4648 nz = diag[i] - ai[i]; 4649 idx += 4; 4650 4651 /* Demote RHS from double to float. */ 4652 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4653 LOAD_PS(&t[idx],XMM7); 4654 4655 while (nz--) { 4656 PREFETCH_NTA(&v[16]); 4657 jdx = 4*(*vi++); 4658 /* jdx = *vi++; */ 4659 4660 /* 4x4 Matrix-Vector product with negative accumulation: */ 4661 SSE_INLINE_BEGIN_2(&t[jdx],v) 4662 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4663 4664 /* First Column */ 4665 SSE_COPY_PS(XMM0,XMM6) 4666 SSE_SHUFFLE(XMM0,XMM0,0x00) 4667 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4668 SSE_SUB_PS(XMM7,XMM0) 4669 4670 /* Second Column */ 4671 SSE_COPY_PS(XMM1,XMM6) 4672 SSE_SHUFFLE(XMM1,XMM1,0x55) 4673 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4674 SSE_SUB_PS(XMM7,XMM1) 4675 4676 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4677 4678 /* Third Column */ 4679 SSE_COPY_PS(XMM2,XMM6) 4680 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4681 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4682 SSE_SUB_PS(XMM7,XMM2) 4683 4684 /* Fourth Column */ 4685 SSE_COPY_PS(XMM3,XMM6) 4686 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4687 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4688 SSE_SUB_PS(XMM7,XMM3) 4689 SSE_INLINE_END_2 4690 4691 v += 16; 4692 } 4693 v = aa + 16*ai[++i]; 4694 PREFETCH_NTA(v); 4695 STORE_PS(&t[idx],XMM7); 4696 } 4697 4698 /* Backward solve the upper triangular factor.*/ 4699 4700 idt = 4*(n-1); 4701 ai16 = 16*diag[n-1]; 4702 v = aa + ai16 + 16; 4703 for (i=n-1; i>=0;){ 4704 PREFETCH_NTA(&v[8]); 4705 vi = aj + diag[i] + 1; 4706 nz = ai[i+1] - diag[i] - 1; 4707 4708 LOAD_PS(&t[idt],XMM7); 4709 4710 while (nz--) { 4711 PREFETCH_NTA(&v[16]); 4712 idx = 4*(*vi++); 4713 /* idx = *vi++; */ 4714 4715 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4716 SSE_INLINE_BEGIN_2(&t[idx],v) 4717 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4718 4719 /* First Column */ 4720 SSE_COPY_PS(XMM0,XMM6) 4721 SSE_SHUFFLE(XMM0,XMM0,0x00) 4722 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4723 SSE_SUB_PS(XMM7,XMM0) 4724 4725 /* Second Column */ 4726 SSE_COPY_PS(XMM1,XMM6) 4727 SSE_SHUFFLE(XMM1,XMM1,0x55) 4728 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4729 SSE_SUB_PS(XMM7,XMM1) 4730 4731 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4732 4733 /* Third Column */ 4734 SSE_COPY_PS(XMM2,XMM6) 4735 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4736 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4737 SSE_SUB_PS(XMM7,XMM2) 4738 4739 /* Fourth Column */ 4740 SSE_COPY_PS(XMM3,XMM6) 4741 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4742 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4743 SSE_SUB_PS(XMM7,XMM3) 4744 SSE_INLINE_END_2 4745 v += 16; 4746 } 4747 v = aa + ai16; 4748 ai16 = 16*diag[--i]; 4749 PREFETCH_NTA(aa+ai16+16); 4750 /* 4751 Scale the result by the diagonal 4x4 block, 4752 which was inverted as part of the factorization 4753 */ 4754 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4755 /* First Column */ 4756 SSE_COPY_PS(XMM0,XMM7) 4757 SSE_SHUFFLE(XMM0,XMM0,0x00) 4758 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4759 4760 /* Second Column */ 4761 SSE_COPY_PS(XMM1,XMM7) 4762 SSE_SHUFFLE(XMM1,XMM1,0x55) 4763 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4764 SSE_ADD_PS(XMM0,XMM1) 4765 4766 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4767 4768 /* Third Column */ 4769 SSE_COPY_PS(XMM2,XMM7) 4770 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4771 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4772 SSE_ADD_PS(XMM0,XMM2) 4773 4774 /* Fourth Column */ 4775 SSE_COPY_PS(XMM3,XMM7) 4776 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4777 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4778 SSE_ADD_PS(XMM0,XMM3) 4779 4780 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4781 SSE_INLINE_END_3 4782 4783 v = aa + ai16 + 16; 4784 idt -= 4; 4785 } 4786 4787 /* Convert t from single precision back to double precision (inplace)*/ 4788 idt = 4*(n-1); 4789 for (i=n-1;i>=0;i--) { 4790 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4791 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4792 PetscScalar *xtemp=&x[idt]; 4793 MatScalar *ttemp=&t[idt]; 4794 xtemp[3] = (PetscScalar)ttemp[3]; 4795 xtemp[2] = (PetscScalar)ttemp[2]; 4796 xtemp[1] = (PetscScalar)ttemp[1]; 4797 xtemp[0] = (PetscScalar)ttemp[0]; 4798 idt -= 4; 4799 } 4800 4801 } /* End of artificial scope. */ 4802 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4803 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4804 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4805 SSE_SCOPE_END; 4806 PetscFunctionReturn(0); 4807 } 4808 4809 #endif 4810 4811 #undef __FUNCT__ 4812 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 4813 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 4814 { 4815 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4816 IS iscol=a->col,isrow=a->row; 4817 PetscErrorCode ierr; 4818 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4819 PetscInt i,nz,idx,idt,idc; 4820 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4821 const MatScalar *aa=a->a,*v; 4822 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4823 const PetscScalar *b; 4824 4825 PetscFunctionBegin; 4826 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4827 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4828 t = a->solve_work; 4829 4830 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4831 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4832 4833 /* forward solve the lower triangular */ 4834 idx = 3*(*r++); 4835 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4836 for (i=1; i<n; i++) { 4837 v = aa + 9*ai[i]; 4838 vi = aj + ai[i]; 4839 nz = diag[i] - ai[i]; 4840 idx = 3*(*r++); 4841 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4842 while (nz--) { 4843 idx = 3*(*vi++); 4844 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4845 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4846 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4847 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4848 v += 9; 4849 } 4850 idx = 3*i; 4851 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4852 } 4853 /* backward solve the upper triangular */ 4854 for (i=n-1; i>=0; i--){ 4855 v = aa + 9*diag[i] + 9; 4856 vi = aj + diag[i] + 1; 4857 nz = ai[i+1] - diag[i] - 1; 4858 idt = 3*i; 4859 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4860 while (nz--) { 4861 idx = 3*(*vi++); 4862 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4863 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4864 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4865 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4866 v += 9; 4867 } 4868 idc = 3*(*c--); 4869 v = aa + 9*diag[i]; 4870 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4871 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4872 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4873 } 4874 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4875 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4876 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4877 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4878 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4879 PetscFunctionReturn(0); 4880 } 4881 4882 #undef __FUNCT__ 4883 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4884 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4885 { 4886 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4887 IS iscol=a->col,isrow=a->row; 4888 PetscErrorCode ierr; 4889 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4890 PetscInt i,nz,idx,idt,idc,m; 4891 const PetscInt *r,*c,*rout,*cout; 4892 const MatScalar *aa=a->a,*v; 4893 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4894 const PetscScalar *b; 4895 4896 PetscFunctionBegin; 4897 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4898 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4899 t = a->solve_work; 4900 4901 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4902 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4903 4904 /* forward solve the lower triangular */ 4905 idx = 3*r[0]; 4906 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4907 for (i=1; i<n; i++) { 4908 v = aa + 9*ai[i]; 4909 vi = aj + ai[i]; 4910 nz = ai[i+1] - ai[i]; 4911 idx = 3*r[i]; 4912 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4913 for(m=0;m<nz;m++){ 4914 idx = 3*vi[m]; 4915 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4916 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4917 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4918 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4919 v += 9; 4920 } 4921 idx = 3*i; 4922 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4923 } 4924 /* backward solve the upper triangular */ 4925 for (i=n-1; i>=0; i--){ 4926 v = aa + 9*(adiag[i+1]+1); 4927 vi = aj + adiag[i+1]+1; 4928 nz = adiag[i] - adiag[i+1] - 1; 4929 idt = 3*i; 4930 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4931 for(m=0;m<nz;m++){ 4932 idx = 3*vi[m]; 4933 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4934 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4935 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4936 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4937 v += 9; 4938 } 4939 idc = 3*c[i]; 4940 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4941 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4942 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4943 } 4944 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4945 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4946 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4947 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4948 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4949 PetscFunctionReturn(0); 4950 } 4951 4952 /* 4953 Special case where the matrix was ILU(0) factored in the natural 4954 ordering. This eliminates the need for the column and row permutation. 4955 */ 4956 #undef __FUNCT__ 4957 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 4958 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4959 { 4960 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4961 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4962 PetscErrorCode ierr; 4963 const PetscInt *diag = a->diag,*vi; 4964 const MatScalar *aa=a->a,*v; 4965 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4966 const PetscScalar *b; 4967 PetscInt jdx,idt,idx,nz,i; 4968 4969 PetscFunctionBegin; 4970 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4971 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4972 4973 /* forward solve the lower triangular */ 4974 idx = 0; 4975 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4976 for (i=1; i<n; i++) { 4977 v = aa + 9*ai[i]; 4978 vi = aj + ai[i]; 4979 nz = diag[i] - ai[i]; 4980 idx += 3; 4981 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4982 while (nz--) { 4983 jdx = 3*(*vi++); 4984 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4985 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4986 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4987 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4988 v += 9; 4989 } 4990 x[idx] = s1; 4991 x[1+idx] = s2; 4992 x[2+idx] = s3; 4993 } 4994 /* backward solve the upper triangular */ 4995 for (i=n-1; i>=0; i--){ 4996 v = aa + 9*diag[i] + 9; 4997 vi = aj + diag[i] + 1; 4998 nz = ai[i+1] - diag[i] - 1; 4999 idt = 3*i; 5000 s1 = x[idt]; s2 = x[1+idt]; 5001 s3 = x[2+idt]; 5002 while (nz--) { 5003 idx = 3*(*vi++); 5004 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 5005 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5006 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5007 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5008 v += 9; 5009 } 5010 v = aa + 9*diag[i]; 5011 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5012 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5013 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5014 } 5015 5016 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5017 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5018 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 5019 PetscFunctionReturn(0); 5020 } 5021 5022 #undef __FUNCT__ 5023 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 5024 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 5025 { 5026 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5027 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5028 PetscErrorCode ierr; 5029 PetscInt i,k,nz,idx,jdx,idt; 5030 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 5031 const MatScalar *aa=a->a,*v; 5032 PetscScalar *x; 5033 const PetscScalar *b; 5034 PetscScalar s1,s2,s3,x1,x2,x3; 5035 5036 PetscFunctionBegin; 5037 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5038 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5039 /* forward solve the lower triangular */ 5040 idx = 0; 5041 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 5042 for (i=1; i<n; i++) { 5043 v = aa + bs2*ai[i]; 5044 vi = aj + ai[i]; 5045 nz = ai[i+1] - ai[i]; 5046 idx = bs*i; 5047 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 5048 for(k=0;k<nz;k++){ 5049 jdx = bs*vi[k]; 5050 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 5051 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5052 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5053 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5054 5055 v += bs2; 5056 } 5057 5058 x[idx] = s1; 5059 x[1+idx] = s2; 5060 x[2+idx] = s3; 5061 } 5062 5063 /* backward solve the upper triangular */ 5064 for (i=n-1; i>=0; i--){ 5065 v = aa + bs2*(adiag[i+1]+1); 5066 vi = aj + adiag[i+1]+1; 5067 nz = adiag[i] - adiag[i+1]-1; 5068 idt = bs*i; 5069 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 5070 5071 for(k=0;k<nz;k++){ 5072 idx = bs*vi[k]; 5073 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 5074 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5075 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5076 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5077 5078 v += bs2; 5079 } 5080 /* x = inv_diagonal*x */ 5081 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5082 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5083 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5084 5085 } 5086 5087 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5088 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5089 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5090 PetscFunctionReturn(0); 5091 } 5092 5093 #undef __FUNCT__ 5094 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 5095 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 5096 { 5097 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5098 IS iscol=a->col,isrow=a->row; 5099 PetscErrorCode ierr; 5100 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5101 PetscInt i,nz,idx,idt,idc; 5102 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5103 const MatScalar *aa=a->a,*v; 5104 PetscScalar *x,s1,s2,x1,x2,*t; 5105 const PetscScalar *b; 5106 5107 PetscFunctionBegin; 5108 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5109 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5110 t = a->solve_work; 5111 5112 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5113 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5114 5115 /* forward solve the lower triangular */ 5116 idx = 2*(*r++); 5117 t[0] = b[idx]; t[1] = b[1+idx]; 5118 for (i=1; i<n; i++) { 5119 v = aa + 4*ai[i]; 5120 vi = aj + ai[i]; 5121 nz = diag[i] - ai[i]; 5122 idx = 2*(*r++); 5123 s1 = b[idx]; s2 = b[1+idx]; 5124 while (nz--) { 5125 idx = 2*(*vi++); 5126 x1 = t[idx]; x2 = t[1+idx]; 5127 s1 -= v[0]*x1 + v[2]*x2; 5128 s2 -= v[1]*x1 + v[3]*x2; 5129 v += 4; 5130 } 5131 idx = 2*i; 5132 t[idx] = s1; t[1+idx] = s2; 5133 } 5134 /* backward solve the upper triangular */ 5135 for (i=n-1; i>=0; i--){ 5136 v = aa + 4*diag[i] + 4; 5137 vi = aj + diag[i] + 1; 5138 nz = ai[i+1] - diag[i] - 1; 5139 idt = 2*i; 5140 s1 = t[idt]; s2 = t[1+idt]; 5141 while (nz--) { 5142 idx = 2*(*vi++); 5143 x1 = t[idx]; x2 = t[1+idx]; 5144 s1 -= v[0]*x1 + v[2]*x2; 5145 s2 -= v[1]*x1 + v[3]*x2; 5146 v += 4; 5147 } 5148 idc = 2*(*c--); 5149 v = aa + 4*diag[i]; 5150 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5151 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5152 } 5153 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5154 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5155 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5156 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5157 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5158 PetscFunctionReturn(0); 5159 } 5160 5161 #undef __FUNCT__ 5162 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 5163 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 5164 { 5165 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5166 IS iscol=a->col,isrow=a->row; 5167 PetscErrorCode ierr; 5168 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5169 PetscInt i,nz,idx,jdx,idt,idc,m; 5170 const PetscInt *r,*c,*rout,*cout; 5171 const MatScalar *aa=a->a,*v; 5172 PetscScalar *x,s1,s2,x1,x2,*t; 5173 const PetscScalar *b; 5174 5175 PetscFunctionBegin; 5176 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5177 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5178 t = a->solve_work; 5179 5180 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5181 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5182 5183 /* forward solve the lower triangular */ 5184 idx = 2*r[0]; 5185 t[0] = b[idx]; t[1] = b[1+idx]; 5186 for (i=1; i<n; i++) { 5187 v = aa + 4*ai[i]; 5188 vi = aj + ai[i]; 5189 nz = ai[i+1] - ai[i]; 5190 idx = 2*r[i]; 5191 s1 = b[idx]; s2 = b[1+idx]; 5192 for(m=0;m<nz;m++){ 5193 jdx = 2*vi[m]; 5194 x1 = t[jdx]; x2 = t[1+jdx]; 5195 s1 -= v[0]*x1 + v[2]*x2; 5196 s2 -= v[1]*x1 + v[3]*x2; 5197 v += 4; 5198 } 5199 idx = 2*i; 5200 t[idx] = s1; t[1+idx] = s2; 5201 } 5202 /* backward solve the upper triangular */ 5203 for (i=n-1; i>=0; i--){ 5204 v = aa + 4*(adiag[i+1]+1); 5205 vi = aj + adiag[i+1]+1; 5206 nz = adiag[i] - adiag[i+1] - 1; 5207 idt = 2*i; 5208 s1 = t[idt]; s2 = t[1+idt]; 5209 for(m=0;m<nz;m++){ 5210 idx = 2*vi[m]; 5211 x1 = t[idx]; x2 = t[1+idx]; 5212 s1 -= v[0]*x1 + v[2]*x2; 5213 s2 -= v[1]*x1 + v[3]*x2; 5214 v += 4; 5215 } 5216 idc = 2*c[i]; 5217 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5218 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5219 } 5220 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5221 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5222 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5223 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5224 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5225 PetscFunctionReturn(0); 5226 } 5227 5228 /* 5229 Special case where the matrix was ILU(0) factored in the natural 5230 ordering. This eliminates the need for the column and row permutation. 5231 */ 5232 #undef __FUNCT__ 5233 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 5234 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5235 { 5236 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5237 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5238 PetscErrorCode ierr; 5239 const MatScalar *aa=a->a,*v; 5240 PetscScalar *x,s1,s2,x1,x2; 5241 const PetscScalar *b; 5242 PetscInt jdx,idt,idx,nz,i; 5243 5244 PetscFunctionBegin; 5245 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5246 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5247 5248 /* forward solve the lower triangular */ 5249 idx = 0; 5250 x[0] = b[0]; x[1] = b[1]; 5251 for (i=1; i<n; i++) { 5252 v = aa + 4*ai[i]; 5253 vi = aj + ai[i]; 5254 nz = diag[i] - ai[i]; 5255 idx += 2; 5256 s1 = b[idx];s2 = b[1+idx]; 5257 while (nz--) { 5258 jdx = 2*(*vi++); 5259 x1 = x[jdx];x2 = x[1+jdx]; 5260 s1 -= v[0]*x1 + v[2]*x2; 5261 s2 -= v[1]*x1 + v[3]*x2; 5262 v += 4; 5263 } 5264 x[idx] = s1; 5265 x[1+idx] = s2; 5266 } 5267 /* backward solve the upper triangular */ 5268 for (i=n-1; i>=0; i--){ 5269 v = aa + 4*diag[i] + 4; 5270 vi = aj + diag[i] + 1; 5271 nz = ai[i+1] - diag[i] - 1; 5272 idt = 2*i; 5273 s1 = x[idt]; s2 = x[1+idt]; 5274 while (nz--) { 5275 idx = 2*(*vi++); 5276 x1 = x[idx]; x2 = x[1+idx]; 5277 s1 -= v[0]*x1 + v[2]*x2; 5278 s2 -= v[1]*x1 + v[3]*x2; 5279 v += 4; 5280 } 5281 v = aa + 4*diag[i]; 5282 x[idt] = v[0]*s1 + v[2]*s2; 5283 x[1+idt] = v[1]*s1 + v[3]*s2; 5284 } 5285 5286 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5287 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5288 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5289 PetscFunctionReturn(0); 5290 } 5291 5292 #undef __FUNCT__ 5293 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 5294 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5295 { 5296 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5297 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5298 PetscInt i,k,nz,idx,idt,jdx; 5299 PetscErrorCode ierr; 5300 const MatScalar *aa=a->a,*v; 5301 PetscScalar *x,s1,s2,x1,x2; 5302 const PetscScalar *b; 5303 5304 PetscFunctionBegin; 5305 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5306 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5307 /* forward solve the lower triangular */ 5308 idx = 0; 5309 x[0] = b[idx]; x[1] = b[1+idx]; 5310 for (i=1; i<n; i++) { 5311 v = aa + 4*ai[i]; 5312 vi = aj + ai[i]; 5313 nz = ai[i+1] - ai[i]; 5314 idx = 2*i; 5315 s1 = b[idx];s2 = b[1+idx]; 5316 for(k=0;k<nz;k++){ 5317 jdx = 2*vi[k]; 5318 x1 = x[jdx];x2 = x[1+jdx]; 5319 s1 -= v[0]*x1 + v[2]*x2; 5320 s2 -= v[1]*x1 + v[3]*x2; 5321 v += 4; 5322 } 5323 x[idx] = s1; 5324 x[1+idx] = s2; 5325 } 5326 5327 /* backward solve the upper triangular */ 5328 for (i=n-1; i>=0; i--){ 5329 v = aa + 4*(adiag[i+1]+1); 5330 vi = aj + adiag[i+1]+1; 5331 nz = adiag[i] - adiag[i+1]-1; 5332 idt = 2*i; 5333 s1 = x[idt]; s2 = x[1+idt]; 5334 for(k=0;k<nz;k++){ 5335 idx = 2*vi[k]; 5336 x1 = x[idx]; x2 = x[1+idx]; 5337 s1 -= v[0]*x1 + v[2]*x2; 5338 s2 -= v[1]*x1 + v[3]*x2; 5339 v += 4; 5340 } 5341 /* x = inv_diagonal*x */ 5342 x[idt] = v[0]*s1 + v[2]*s2; 5343 x[1+idt] = v[1]*s1 + v[3]*s2; 5344 } 5345 5346 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5347 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5348 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5349 PetscFunctionReturn(0); 5350 } 5351 5352 #undef __FUNCT__ 5353 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 5354 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 5355 { 5356 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5357 IS iscol=a->col,isrow=a->row; 5358 PetscErrorCode ierr; 5359 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5360 PetscInt i,nz; 5361 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5362 const MatScalar *aa=a->a,*v; 5363 PetscScalar *x,s1,*t; 5364 const PetscScalar *b; 5365 5366 PetscFunctionBegin; 5367 if (!n) PetscFunctionReturn(0); 5368 5369 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5370 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5371 t = a->solve_work; 5372 5373 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5374 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5375 5376 /* forward solve the lower triangular */ 5377 t[0] = b[*r++]; 5378 for (i=1; i<n; i++) { 5379 v = aa + ai[i]; 5380 vi = aj + ai[i]; 5381 nz = diag[i] - ai[i]; 5382 s1 = b[*r++]; 5383 while (nz--) { 5384 s1 -= (*v++)*t[*vi++]; 5385 } 5386 t[i] = s1; 5387 } 5388 /* backward solve the upper triangular */ 5389 for (i=n-1; i>=0; i--){ 5390 v = aa + diag[i] + 1; 5391 vi = aj + diag[i] + 1; 5392 nz = ai[i+1] - diag[i] - 1; 5393 s1 = t[i]; 5394 while (nz--) { 5395 s1 -= (*v++)*t[*vi++]; 5396 } 5397 x[*c--] = t[i] = aa[diag[i]]*s1; 5398 } 5399 5400 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5401 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5402 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5403 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5404 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5405 PetscFunctionReturn(0); 5406 } 5407 5408 #undef __FUNCT__ 5409 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 5410 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 5411 { 5412 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5413 IS iscol = a->col,isrow = a->row; 5414 PetscErrorCode ierr; 5415 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5416 const PetscInt *rout,*cout,*r,*c; 5417 PetscScalar *x,*tmp,sum; 5418 const PetscScalar *b; 5419 const MatScalar *aa = a->a,*v; 5420 5421 PetscFunctionBegin; 5422 if (!n) PetscFunctionReturn(0); 5423 5424 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5425 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5426 tmp = a->solve_work; 5427 5428 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5429 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5430 5431 /* forward solve the lower triangular */ 5432 tmp[0] = b[r[0]]; 5433 v = aa; 5434 vi = aj; 5435 for (i=1; i<n; i++) { 5436 nz = ai[i+1] - ai[i]; 5437 sum = b[r[i]]; 5438 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5439 tmp[i] = sum; 5440 v += nz; vi += nz; 5441 } 5442 5443 /* backward solve the upper triangular */ 5444 for (i=n-1; i>=0; i--){ 5445 v = aa + adiag[i+1]+1; 5446 vi = aj + adiag[i+1]+1; 5447 nz = adiag[i]-adiag[i+1]-1; 5448 sum = tmp[i]; 5449 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5450 x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5451 } 5452 5453 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5454 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5455 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5457 ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5458 PetscFunctionReturn(0); 5459 } 5460 5461 /* 5462 Special case where the matrix was ILU(0) factored in the natural 5463 ordering. This eliminates the need for the column and row permutation. 5464 */ 5465 #undef __FUNCT__ 5466 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 5467 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5468 { 5469 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5470 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5471 PetscErrorCode ierr; 5472 const MatScalar *aa=a->a,*v; 5473 PetscScalar *x; 5474 const PetscScalar *b; 5475 PetscScalar s1,x1; 5476 PetscInt jdx,idt,idx,nz,i; 5477 5478 PetscFunctionBegin; 5479 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5480 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5481 5482 /* forward solve the lower triangular */ 5483 idx = 0; 5484 x[0] = b[0]; 5485 for (i=1; i<n; i++) { 5486 v = aa + ai[i]; 5487 vi = aj + ai[i]; 5488 nz = diag[i] - ai[i]; 5489 idx += 1; 5490 s1 = b[idx]; 5491 while (nz--) { 5492 jdx = *vi++; 5493 x1 = x[jdx]; 5494 s1 -= v[0]*x1; 5495 v += 1; 5496 } 5497 x[idx] = s1; 5498 } 5499 /* backward solve the upper triangular */ 5500 for (i=n-1; i>=0; i--){ 5501 v = aa + diag[i] + 1; 5502 vi = aj + diag[i] + 1; 5503 nz = ai[i+1] - diag[i] - 1; 5504 idt = i; 5505 s1 = x[idt]; 5506 while (nz--) { 5507 idx = *vi++; 5508 x1 = x[idx]; 5509 s1 -= v[0]*x1; 5510 v += 1; 5511 } 5512 v = aa + diag[i]; 5513 x[idt] = v[0]*s1; 5514 } 5515 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5516 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5517 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5518 PetscFunctionReturn(0); 5519 } 5520 5521 5522 #undef __FUNCT__ 5523 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5524 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5525 { 5526 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5527 PetscErrorCode ierr; 5528 const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5529 PetscScalar *x,sum; 5530 const PetscScalar *b; 5531 const MatScalar *aa = a->a,*v; 5532 PetscInt i,nz; 5533 5534 PetscFunctionBegin; 5535 if (!n) PetscFunctionReturn(0); 5536 5537 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5538 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5539 5540 /* forward solve the lower triangular */ 5541 x[0] = b[0]; 5542 v = aa; 5543 vi = aj; 5544 for (i=1; i<n; i++) { 5545 nz = ai[i+1] - ai[i]; 5546 sum = b[i]; 5547 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5548 v += nz; 5549 vi += nz; 5550 x[i] = sum; 5551 } 5552 5553 /* backward solve the upper triangular */ 5554 for (i=n-1; i>=0; i--){ 5555 v = aa + adiag[i+1] + 1; 5556 vi = aj + adiag[i+1] + 1; 5557 nz = adiag[i] - adiag[i+1]-1; 5558 sum = x[i]; 5559 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5560 x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5561 } 5562 5563 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 5564 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5565 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5566 PetscFunctionReturn(0); 5567 } 5568 5569 /* ----------------------------------------------------------------*/ 5570 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 5571 5572 #undef __FUNCT__ 5573 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5574 /* 5575 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5576 */ 5577 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 5578 { 5579 Mat C=B; 5580 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5581 PetscErrorCode ierr; 5582 PetscInt i,j,k,ipvt[15]; 5583 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5584 PetscInt nz,nzL,row; 5585 MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5586 const MatScalar *v,*aa=a->a; 5587 PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 5588 PetscInt sol_ver; 5589 5590 PetscFunctionBegin; 5591 5592 ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 5593 5594 /* generate work space needed by the factorization */ 5595 ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 5596 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5597 5598 for (i=0; i<n; i++){ 5599 /* zero rtmp */ 5600 /* L part */ 5601 nz = bi[i+1] - bi[i]; 5602 bjtmp = bj + bi[i]; 5603 for (j=0; j<nz; j++){ 5604 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5605 } 5606 5607 /* U part */ 5608 nz = bdiag[i] - bdiag[i+1]; 5609 bjtmp = bj + bdiag[i+1]+1; 5610 for (j=0; j<nz; j++){ 5611 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5612 } 5613 5614 /* load in initial (unfactored row) */ 5615 nz = ai[i+1] - ai[i]; 5616 ajtmp = aj + ai[i]; 5617 v = aa + bs2*ai[i]; 5618 for (j=0; j<nz; j++) { 5619 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5620 } 5621 5622 /* elimination */ 5623 bjtmp = bj + bi[i]; 5624 nzL = bi[i+1] - bi[i]; 5625 for(k=0;k < nzL;k++) { 5626 row = bjtmp[k]; 5627 pc = rtmp + bs2*row; 5628 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5629 if (flg) { 5630 pv = b->a + bs2*bdiag[row]; 5631 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); 5632 /*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 5633 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5634 pv = b->a + bs2*(bdiag[row+1]+1); 5635 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5636 for (j=0; j<nz; j++) { 5637 vv = rtmp + bs2*pj[j]; 5638 Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 5639 /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 5640 pv += bs2; 5641 } 5642 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5643 } 5644 } 5645 5646 /* finished row so stick it into b->a */ 5647 /* L part */ 5648 pv = b->a + bs2*bi[i] ; 5649 pj = b->j + bi[i] ; 5650 nz = bi[i+1] - bi[i]; 5651 for (j=0; j<nz; j++) { 5652 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5653 } 5654 5655 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5656 pv = b->a + bs2*bdiag[i]; 5657 pj = b->j + bdiag[i]; 5658 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5659 /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */ 5660 ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 5661 5662 /* U part */ 5663 pv = b->a + bs2*(bdiag[i+1]+1); 5664 pj = b->j + bdiag[i+1]+1; 5665 nz = bdiag[i] - bdiag[i+1] - 1; 5666 for (j=0; j<nz; j++){ 5667 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5668 } 5669 } 5670 5671 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5672 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5673 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 5674 C->assembled = PETSC_TRUE; 5675 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5676 PetscFunctionReturn(0); 5677 } 5678 5679 #undef __FUNCT__ 5680 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 5681 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 5682 { 5683 Mat C=B; 5684 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5685 IS isrow = b->row,isicol = b->icol; 5686 PetscErrorCode ierr; 5687 const PetscInt *r,*ic; 5688 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5689 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5690 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5691 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5692 MatScalar *v_work; 5693 PetscTruth col_identity,row_identity,both_identity; 5694 5695 PetscFunctionBegin; 5696 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5697 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5698 5699 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5700 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5701 5702 /* generate work space needed by dense LU factorization */ 5703 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5704 5705 for (i=0; i<n; i++){ 5706 /* zero rtmp */ 5707 /* L part */ 5708 nz = bi[i+1] - bi[i]; 5709 bjtmp = bj + bi[i]; 5710 for (j=0; j<nz; j++){ 5711 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5712 } 5713 5714 /* U part */ 5715 nz = bdiag[i] - bdiag[i+1]; 5716 bjtmp = bj + bdiag[i+1]+1; 5717 for (j=0; j<nz; j++){ 5718 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5719 } 5720 5721 /* load in initial (unfactored row) */ 5722 nz = ai[r[i]+1] - ai[r[i]]; 5723 ajtmp = aj + ai[r[i]]; 5724 v = aa + bs2*ai[r[i]]; 5725 for (j=0; j<nz; j++) { 5726 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5727 } 5728 5729 /* elimination */ 5730 bjtmp = bj + bi[i]; 5731 nzL = bi[i+1] - bi[i]; 5732 for(k=0;k < nzL;k++) { 5733 row = bjtmp[k]; 5734 pc = rtmp + bs2*row; 5735 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5736 if (flg) { 5737 pv = b->a + bs2*bdiag[row]; 5738 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5739 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5740 pv = b->a + bs2*(bdiag[row+1]+1); 5741 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5742 for (j=0; j<nz; j++) { 5743 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5744 } 5745 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5746 } 5747 } 5748 5749 /* finished row so stick it into b->a */ 5750 /* L part */ 5751 pv = b->a + bs2*bi[i] ; 5752 pj = b->j + bi[i] ; 5753 nz = bi[i+1] - bi[i]; 5754 for (j=0; j<nz; j++) { 5755 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5756 } 5757 5758 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5759 pv = b->a + bs2*bdiag[i]; 5760 pj = b->j + bdiag[i]; 5761 /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5762 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5763 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5764 5765 /* U part */ 5766 pv = b->a + bs2*(bdiag[i+1]+1); 5767 pj = b->j + bdiag[i+1]+1; 5768 nz = bdiag[i] - bdiag[i+1] - 1; 5769 for (j=0; j<nz; j++){ 5770 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5771 } 5772 } 5773 5774 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5775 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5776 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5777 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5778 5779 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5780 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5781 both_identity = (PetscTruth) (row_identity && col_identity); 5782 if (both_identity){ 5783 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5784 } else { 5785 C->ops->solve = MatSolve_SeqBAIJ_N; 5786 } 5787 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5788 5789 C->assembled = PETSC_TRUE; 5790 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5791 PetscFunctionReturn(0); 5792 } 5793 5794 /* 5795 ilu(0) with natural ordering under new data structure. 5796 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 5797 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 5798 */ 5799 5800 #undef __FUNCT__ 5801 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 5802 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5803 { 5804 5805 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5806 PetscErrorCode ierr; 5807 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5808 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5809 5810 PetscFunctionBegin; 5811 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5812 b = (Mat_SeqBAIJ*)(fact)->data; 5813 5814 /* allocate matrix arrays for new data structure */ 5815 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5816 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5817 b->singlemalloc = PETSC_TRUE; 5818 if (!b->diag){ 5819 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5820 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5821 } 5822 bdiag = b->diag; 5823 5824 if (n > 0) { 5825 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5826 } 5827 5828 /* set bi and bj with new data structure */ 5829 bi = b->i; 5830 bj = b->j; 5831 5832 /* L part */ 5833 bi[0] = 0; 5834 for (i=0; i<n; i++){ 5835 nz = adiag[i] - ai[i]; 5836 bi[i+1] = bi[i] + nz; 5837 aj = a->j + ai[i]; 5838 for (j=0; j<nz; j++){ 5839 *bj = aj[j]; bj++; 5840 } 5841 } 5842 5843 /* U part */ 5844 bi_temp = bi[n]; 5845 bdiag[n] = bi[n]-1; 5846 for (i=n-1; i>=0; i--){ 5847 nz = ai[i+1] - adiag[i] - 1; 5848 bi_temp = bi_temp + nz + 1; 5849 aj = a->j + adiag[i] + 1; 5850 for (j=0; j<nz; j++){ 5851 *bj = aj[j]; bj++; 5852 } 5853 /* diag[i] */ 5854 *bj = i; bj++; 5855 bdiag[i] = bi_temp - 1; 5856 } 5857 PetscFunctionReturn(0); 5858 } 5859 5860 #undef __FUNCT__ 5861 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5862 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5863 { 5864 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5865 IS isicol; 5866 PetscErrorCode ierr; 5867 const PetscInt *r,*ic; 5868 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5869 PetscInt *bi,*cols,nnz,*cols_lvl; 5870 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5871 PetscInt i,levels,diagonal_fill; 5872 PetscTruth col_identity,row_identity,both_identity; 5873 PetscReal f; 5874 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5875 PetscBT lnkbt; 5876 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5877 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5878 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5879 PetscTruth missing; 5880 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5881 5882 PetscFunctionBegin; 5883 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5884 if (bs>1){ /* check shifttype */ 5885 if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 5886 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 5887 } 5888 5889 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5890 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5891 5892 f = info->fill; 5893 levels = (PetscInt)info->levels; 5894 diagonal_fill = (PetscInt)info->diagonal_fill; 5895 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5896 5897 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5898 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5899 both_identity = (PetscTruth) (row_identity && col_identity); 5900 5901 if (!levels && both_identity) { 5902 /* special case: ilu(0) with natural ordering */ 5903 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5904 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5905 5906 fact->factortype = MAT_FACTOR_ILU; 5907 (fact)->info.factor_mallocs = 0; 5908 (fact)->info.fill_ratio_given = info->fill; 5909 (fact)->info.fill_ratio_needed = 1.0; 5910 b = (Mat_SeqBAIJ*)(fact)->data; 5911 b->row = isrow; 5912 b->col = iscol; 5913 b->icol = isicol; 5914 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5915 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5916 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5917 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5918 PetscFunctionReturn(0); 5919 } 5920 5921 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5922 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5923 5924 /* get new row pointers */ 5925 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5926 bi[0] = 0; 5927 /* bdiag is location of diagonal in factor */ 5928 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5929 bdiag[0] = 0; 5930 5931 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5932 5933 /* create a linked list for storing column indices of the active row */ 5934 nlnk = n + 1; 5935 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5936 5937 /* initial FreeSpace size is f*(ai[n]+1) */ 5938 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5939 current_space = free_space; 5940 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5941 current_space_lvl = free_space_lvl; 5942 5943 for (i=0; i<n; i++) { 5944 nzi = 0; 5945 /* copy current row into linked list */ 5946 nnz = ai[r[i]+1] - ai[r[i]]; 5947 if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5948 cols = aj + ai[r[i]]; 5949 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5950 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5951 nzi += nlnk; 5952 5953 /* make sure diagonal entry is included */ 5954 if (diagonal_fill && lnk[i] == -1) { 5955 fm = n; 5956 while (lnk[fm] < i) fm = lnk[fm]; 5957 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5958 lnk[fm] = i; 5959 lnk_lvl[i] = 0; 5960 nzi++; dcount++; 5961 } 5962 5963 /* add pivot rows into the active row */ 5964 nzbd = 0; 5965 prow = lnk[n]; 5966 while (prow < i) { 5967 nnz = bdiag[prow]; 5968 cols = bj_ptr[prow] + nnz + 1; 5969 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5970 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5971 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5972 nzi += nlnk; 5973 prow = lnk[prow]; 5974 nzbd++; 5975 } 5976 bdiag[i] = nzbd; 5977 bi[i+1] = bi[i] + nzi; 5978 5979 /* if free space is not available, make more free space */ 5980 if (current_space->local_remaining<nzi) { 5981 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5982 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5983 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5984 reallocs++; 5985 } 5986 5987 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5988 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5989 bj_ptr[i] = current_space->array; 5990 bjlvl_ptr[i] = current_space_lvl->array; 5991 5992 /* make sure the active row i has diagonal entry */ 5993 if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5994 5995 current_space->array += nzi; 5996 current_space->local_used += nzi; 5997 current_space->local_remaining -= nzi; 5998 current_space_lvl->array += nzi; 5999 current_space_lvl->local_used += nzi; 6000 current_space_lvl->local_remaining -= nzi; 6001 } 6002 6003 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 6004 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6005 6006 /* destroy list of free space and other temporary arrays */ 6007 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 6008 6009 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 6010 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 6011 6012 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 6013 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 6014 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 6015 6016 #if defined(PETSC_USE_INFO) 6017 { 6018 PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 6019 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 6020 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6021 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 6022 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6023 if (diagonal_fill) { 6024 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 6025 } 6026 } 6027 #endif 6028 6029 /* put together the new matrix */ 6030 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6031 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6032 b = (Mat_SeqBAIJ*)(fact)->data; 6033 b->free_a = PETSC_TRUE; 6034 b->free_ij = PETSC_TRUE; 6035 b->singlemalloc = PETSC_FALSE; 6036 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6037 b->j = bj; 6038 b->i = bi; 6039 b->diag = bdiag; 6040 b->free_diag = PETSC_TRUE; 6041 b->ilen = 0; 6042 b->imax = 0; 6043 b->row = isrow; 6044 b->col = iscol; 6045 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6046 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6047 b->icol = isicol; 6048 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6049 /* In b structure: Free imax, ilen, old a, old j. 6050 Allocate bdiag, solve_work, new a, new j */ 6051 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 6052 b->maxnz = b->nz = bdiag[0]+1; 6053 fact->info.factor_mallocs = reallocs; 6054 fact->info.fill_ratio_given = f; 6055 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 6056 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 6057 PetscFunctionReturn(0); 6058 } 6059 6060 /* 6061 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 6062 except that the data structure of Mat_SeqAIJ is slightly different. 6063 Not a good example of code reuse. 6064 */ 6065 #undef __FUNCT__ 6066 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 6067 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 6068 { 6069 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 6070 IS isicol; 6071 PetscErrorCode ierr; 6072 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 6073 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 6074 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 6075 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 6076 PetscTruth col_identity,row_identity,both_identity,flg; 6077 PetscReal f; 6078 6079 PetscFunctionBegin; 6080 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 6081 if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 6082 6083 f = info->fill; 6084 levels = (PetscInt)info->levels; 6085 diagonal_fill = (PetscInt)info->diagonal_fill; 6086 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 6087 6088 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6089 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 6090 both_identity = (PetscTruth) (row_identity && col_identity); 6091 6092 if (!levels && both_identity) { /* special case copy the nonzero structure */ 6093 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 6094 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6095 6096 fact->factortype = MAT_FACTOR_ILU; 6097 b = (Mat_SeqBAIJ*)fact->data; 6098 b->row = isrow; 6099 b->col = iscol; 6100 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6101 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6102 b->icol = isicol; 6103 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6104 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6105 PetscFunctionReturn(0); 6106 } 6107 6108 /* general case perform the symbolic factorization */ 6109 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 6110 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 6111 6112 /* get new row pointers */ 6113 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 6114 ainew[0] = 0; 6115 /* don't know how many column pointers are needed so estimate */ 6116 jmax = (PetscInt)(f*ai[n] + 1); 6117 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 6118 /* ajfill is level of fill for each fill entry */ 6119 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 6120 /* fill is a linked list of nonzeros in active row */ 6121 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 6122 /* im is level for each filled value */ 6123 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 6124 /* dloc is location of diagonal in factor */ 6125 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 6126 dloc[0] = 0; 6127 for (prow=0; prow<n; prow++) { 6128 6129 /* copy prow into linked list */ 6130 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 6131 if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 6132 xi = aj + ai[r[prow]]; 6133 fill[n] = n; 6134 fill[prow] = -1; /* marker for diagonal entry */ 6135 while (nz--) { 6136 fm = n; 6137 idx = ic[*xi++]; 6138 do { 6139 m = fm; 6140 fm = fill[m]; 6141 } while (fm < idx); 6142 fill[m] = idx; 6143 fill[idx] = fm; 6144 im[idx] = 0; 6145 } 6146 6147 /* make sure diagonal entry is included */ 6148 if (diagonal_fill && fill[prow] == -1) { 6149 fm = n; 6150 while (fill[fm] < prow) fm = fill[fm]; 6151 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6152 fill[fm] = prow; 6153 im[prow] = 0; 6154 nzf++; 6155 dcount++; 6156 } 6157 6158 nzi = 0; 6159 row = fill[n]; 6160 while (row < prow) { 6161 incrlev = im[row] + 1; 6162 nz = dloc[row]; 6163 xi = ajnew + ainew[row] + nz + 1; 6164 flev = ajfill + ainew[row] + nz + 1; 6165 nnz = ainew[row+1] - ainew[row] - nz - 1; 6166 fm = row; 6167 while (nnz-- > 0) { 6168 idx = *xi++; 6169 if (*flev + incrlev > levels) { 6170 flev++; 6171 continue; 6172 } 6173 do { 6174 m = fm; 6175 fm = fill[m]; 6176 } while (fm < idx); 6177 if (fm != idx) { 6178 im[idx] = *flev + incrlev; 6179 fill[m] = idx; 6180 fill[idx] = fm; 6181 fm = idx; 6182 nzf++; 6183 } else { 6184 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 6185 } 6186 flev++; 6187 } 6188 row = fill[row]; 6189 nzi++; 6190 } 6191 /* copy new filled row into permanent storage */ 6192 ainew[prow+1] = ainew[prow] + nzf; 6193 if (ainew[prow+1] > jmax) { 6194 6195 /* estimate how much additional space we will need */ 6196 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6197 /* just double the memory each time */ 6198 PetscInt maxadd = jmax; 6199 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 6200 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 6201 jmax += maxadd; 6202 6203 /* allocate a longer ajnew and ajfill */ 6204 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6205 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6206 ierr = PetscFree(ajnew);CHKERRQ(ierr); 6207 ajnew = xitmp; 6208 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6209 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6210 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6211 ajfill = xitmp; 6212 reallocate++; /* count how many reallocations are needed */ 6213 } 6214 xitmp = ajnew + ainew[prow]; 6215 flev = ajfill + ainew[prow]; 6216 dloc[prow] = nzi; 6217 fm = fill[n]; 6218 while (nzf--) { 6219 *xitmp++ = fm; 6220 *flev++ = im[fm]; 6221 fm = fill[fm]; 6222 } 6223 /* make sure row has diagonal entry */ 6224 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 6225 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 6226 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6227 } 6228 } 6229 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6230 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 6231 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6232 ierr = PetscFree(fill);CHKERRQ(ierr); 6233 ierr = PetscFree(im);CHKERRQ(ierr); 6234 6235 #if defined(PETSC_USE_INFO) 6236 { 6237 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 6238 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6239 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6240 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6241 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6242 if (diagonal_fill) { 6243 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6244 } 6245 } 6246 #endif 6247 6248 /* put together the new matrix */ 6249 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6250 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6251 b = (Mat_SeqBAIJ*)fact->data; 6252 b->free_a = PETSC_TRUE; 6253 b->free_ij = PETSC_TRUE; 6254 b->singlemalloc = PETSC_FALSE; 6255 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6256 b->j = ajnew; 6257 b->i = ainew; 6258 for (i=0; i<n; i++) dloc[i] += ainew[i]; 6259 b->diag = dloc; 6260 b->free_diag = PETSC_TRUE; 6261 b->ilen = 0; 6262 b->imax = 0; 6263 b->row = isrow; 6264 b->col = iscol; 6265 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6266 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6267 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6268 b->icol = isicol; 6269 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6270 /* In b structure: Free imax, ilen, old a, old j. 6271 Allocate dloc, solve_work, new a, new j */ 6272 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 6273 b->maxnz = b->nz = ainew[n]; 6274 6275 fact->info.factor_mallocs = reallocate; 6276 fact->info.fill_ratio_given = f; 6277 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 6278 6279 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6280 PetscFunctionReturn(0); 6281 } 6282 6283 #undef __FUNCT__ 6284 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6285 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 6286 { 6287 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 6288 /* int i,*AJ=a->j,nz=a->nz; */ 6289 PetscFunctionBegin; 6290 /* Undo Column scaling */ 6291 /* while (nz--) { */ 6292 /* AJ[i] = AJ[i]/4; */ 6293 /* } */ 6294 /* This should really invoke a push/pop logic, but we don't have that yet. */ 6295 A->ops->setunfactored = PETSC_NULL; 6296 PetscFunctionReturn(0); 6297 } 6298 6299 #undef __FUNCT__ 6300 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6301 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 6302 { 6303 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6304 PetscInt *AJ=a->j,nz=a->nz; 6305 unsigned short *aj=(unsigned short *)AJ; 6306 PetscFunctionBegin; 6307 /* Is this really necessary? */ 6308 while (nz--) { 6309 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 6310 } 6311 A->ops->setunfactored = PETSC_NULL; 6312 PetscFunctionReturn(0); 6313 } 6314 6315 6316