1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include <../src/mat/impls/baij/seq/baij.h> 7 #include <../src/mat/blockinvert.h> 8 #include <petscbt.h> 9 #include <../src/mat/utils/freespace.h> 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 16 PetscErrorCode ierr; 17 const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 18 PetscInt i,n = a->mbs,j; 19 PetscInt nz; 20 PetscScalar *x,*tmp,s1; 21 const MatScalar *aa = a->a,*v; 22 const PetscScalar *b; 23 24 PetscFunctionBegin; 25 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 26 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27 tmp = a->solve_work; 28 29 30 /* copy the b into temp work space according to permutation */ 31 for (i=0; i<n; i++) tmp[i] = b[i]; 32 33 /* forward solve the U^T */ 34 for (i=0; i<n; i++) { 35 v = aa + adiag[i+1] + 1; 36 vi = aj + adiag[i+1] + 1; 37 nz = adiag[i] - adiag[i+1] - 1; 38 s1 = tmp[i]; 39 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 40 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 41 tmp[i] = s1; 42 } 43 44 /* backward solve the L^T */ 45 for (i=n-1; i>=0; i--) { 46 v = aa + ai[i]; 47 vi = aj + ai[i]; 48 nz = ai[i+1] - ai[i]; 49 s1 = tmp[i]; 50 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 51 } 52 53 /* copy tmp into x according to permutation */ 54 for (i=0; i<n; i++) x[i] = tmp[i]; 55 56 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 57 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 58 59 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 60 PetscFunctionReturn(0); 61 } 62 63 #undef __FUNCT__ 64 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 65 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 66 { 67 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 68 PetscErrorCode ierr; 69 PetscInt i,nz; 70 const PetscInt *diag = 
a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 71 const MatScalar *aa =a->a,*v; 72 PetscScalar s1,*x; 73 74 PetscFunctionBegin; 75 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 76 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 77 78 /* forward solve the U^T */ 79 for (i=0; i<n; i++) { 80 81 v = aa + diag[i]; 82 /* multiply by the inverse of the block diagonal */ 83 s1 = (*v++)*x[i]; 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 x[*vi++] -= (*v++)*s1; 88 } 89 x[i] = s1; 90 } 91 /* backward solve the L^T */ 92 for (i=n-1; i>=0; i--) { 93 v = aa + diag[i] - 1; 94 vi = aj + diag[i] - 1; 95 nz = diag[i] - ai[i]; 96 s1 = x[i]; 97 while (nz--) { 98 x[*vi--] -= (*v--)*s1; 99 } 100 } 101 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 102 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 103 PetscFunctionReturn(0); 104 } 105 106 #undef __FUNCT__ 107 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 108 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 109 { 110 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 111 PetscErrorCode ierr; 112 PetscInt i,nz,idx,idt,oidx; 113 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 114 const MatScalar *aa =a->a,*v; 115 PetscScalar s1,s2,x1,x2,*x; 116 117 PetscFunctionBegin; 118 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 119 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 120 121 /* forward solve the U^T */ 122 idx = 0; 123 for (i=0; i<n; i++) { 124 125 v = aa + 4*diag[i]; 126 /* multiply by the inverse of the block diagonal */ 127 x1 = x[idx]; x2 = x[1+idx]; 128 s1 = v[0]*x1 + v[1]*x2; 129 s2 = v[2]*x1 + v[3]*x2; 130 v += 4; 131 132 vi = aj + diag[i] + 1; 133 nz = ai[i+1] - diag[i] - 1; 134 while (nz--) { 135 oidx = 2*(*vi++); 136 x[oidx] -= v[0]*s1 + v[1]*s2; 137 x[oidx+1] -= v[2]*s1 + v[3]*s2; 138 v += 4; 139 } 140 x[idx] = s1;x[1+idx] = s2; 141 idx += 2; 142 } 143 /* backward solve the L^T */ 144 for (i=n-1; i>=0; i--) { 145 v = aa + 4*diag[i] - 4; 146 vi = aj + 
diag[i] - 1; 147 nz = diag[i] - ai[i]; 148 idt = 2*i; 149 s1 = x[idt]; s2 = x[1+idt]; 150 while (nz--) { 151 idx = 2*(*vi--); 152 x[idx] -= v[0]*s1 + v[1]*s2; 153 x[idx+1] -= v[2]*s1 + v[3]*s2; 154 v -= 4; 155 } 156 } 157 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 158 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 159 PetscFunctionReturn(0); 160 } 161 162 #undef __FUNCT__ 163 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 164 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 165 { 166 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 167 PetscErrorCode ierr; 168 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 169 PetscInt nz,idx,idt,j,i,oidx; 170 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 171 const MatScalar *aa=a->a,*v; 172 PetscScalar s1,s2,x1,x2,*x; 173 174 PetscFunctionBegin; 175 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 176 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 177 178 /* forward solve the U^T */ 179 idx = 0; 180 for (i=0; i<n; i++) { 181 v = aa + bs2*diag[i]; 182 /* multiply by the inverse of the block diagonal */ 183 x1 = x[idx]; x2 = x[1+idx]; 184 s1 = v[0]*x1 + v[1]*x2; 185 s2 = v[2]*x1 + v[3]*x2; 186 v -= bs2; 187 188 vi = aj + diag[i] - 1; 189 nz = diag[i] - diag[i+1] - 1; 190 for (j=0; j>-nz; j--) { 191 oidx = bs*vi[j]; 192 x[oidx] -= v[0]*s1 + v[1]*s2; 193 x[oidx+1] -= v[2]*s1 + v[3]*s2; 194 v -= bs2; 195 } 196 x[idx] = s1;x[1+idx] = s2; 197 idx += bs; 198 } 199 /* backward solve the L^T */ 200 for (i=n-1; i>=0; i--) { 201 v = aa + bs2*ai[i]; 202 vi = aj + ai[i]; 203 nz = ai[i+1] - ai[i]; 204 idt = bs*i; 205 s1 = x[idt]; s2 = x[1+idt]; 206 for (j=0; j<nz; j++) { 207 idx = bs*vi[j]; 208 x[idx] -= v[0]*s1 + v[1]*s2; 209 x[idx+1] -= v[2]*s1 + v[3]*s2; 210 v += bs2; 211 } 212 } 213 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 214 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 215 PetscFunctionReturn(0); 216 } 217 218 #undef __FUNCT__ 219 #define __FUNCT__ 
"MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 220 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 221 { 222 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 223 PetscErrorCode ierr; 224 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 225 PetscInt i,nz,idx,idt,oidx; 226 const MatScalar *aa=a->a,*v; 227 PetscScalar s1,s2,s3,x1,x2,x3,*x; 228 229 PetscFunctionBegin; 230 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 231 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 232 233 /* forward solve the U^T */ 234 idx = 0; 235 for (i=0; i<n; i++) { 236 237 v = aa + 9*diag[i]; 238 /* multiply by the inverse of the block diagonal */ 239 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 240 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 241 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 242 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 243 v += 9; 244 245 vi = aj + diag[i] + 1; 246 nz = ai[i+1] - diag[i] - 1; 247 while (nz--) { 248 oidx = 3*(*vi++); 249 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 250 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 251 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 252 v += 9; 253 } 254 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 255 idx += 3; 256 } 257 /* backward solve the L^T */ 258 for (i=n-1; i>=0; i--) { 259 v = aa + 9*diag[i] - 9; 260 vi = aj + diag[i] - 1; 261 nz = diag[i] - ai[i]; 262 idt = 3*i; 263 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 264 while (nz--) { 265 idx = 3*(*vi--); 266 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 267 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 268 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 269 v -= 9; 270 } 271 } 272 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 273 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 279 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 280 { 281 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 282 PetscErrorCode ierr; 283 const PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 284 PetscInt nz,idx,idt,j,i,oidx; 285 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 286 const MatScalar *aa=a->a,*v; 287 PetscScalar s1,s2,s3,x1,x2,x3,*x; 288 289 PetscFunctionBegin; 290 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 291 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 292 293 /* forward solve the U^T */ 294 idx = 0; 295 for (i=0; i<n; i++) { 296 v = aa + bs2*diag[i]; 297 /* multiply by the inverse of the block diagonal */ 298 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 299 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 300 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 301 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 302 v -= bs2; 303 304 vi = aj + diag[i] - 1; 305 nz = diag[i] - diag[i+1] - 1; 306 for (j=0; j>-nz; j--) { 307 oidx = bs*vi[j]; 308 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 309 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 310 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 311 v -= bs2; 312 } 313 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 314 idx += bs; 315 } 316 /* backward solve the L^T */ 317 for (i=n-1; i>=0; i--) { 318 v = aa + bs2*ai[i]; 319 vi = aj + ai[i]; 320 nz = ai[i+1] - ai[i]; 321 idt = bs*i; 322 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 323 for (j=0; j<nz; j++) { 324 idx = bs*vi[j]; 325 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 326 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 327 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 328 v += bs2; 329 } 330 } 331 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 332 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 333 PetscFunctionReturn(0); 334 } 335 336 #undef __FUNCT__ 337 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 338 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 339 { 340 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 341 PetscErrorCode ierr; 342 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 343 PetscInt i,nz,idx,idt,oidx; 344 const MatScalar *aa=a->a,*v; 345 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 346 347 
PetscFunctionBegin; 348 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 349 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 350 351 /* forward solve the U^T */ 352 idx = 0; 353 for (i=0; i<n; i++) { 354 355 v = aa + 16*diag[i]; 356 /* multiply by the inverse of the block diagonal */ 357 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 358 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 359 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 360 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 361 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 362 v += 16; 363 364 vi = aj + diag[i] + 1; 365 nz = ai[i+1] - diag[i] - 1; 366 while (nz--) { 367 oidx = 4*(*vi++); 368 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 369 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 370 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 371 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 372 v += 16; 373 } 374 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 375 idx += 4; 376 } 377 /* backward solve the L^T */ 378 for (i=n-1; i>=0; i--) { 379 v = aa + 16*diag[i] - 16; 380 vi = aj + diag[i] - 1; 381 nz = diag[i] - ai[i]; 382 idt = 4*i; 383 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 384 while (nz--) { 385 idx = 4*(*vi--); 386 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 387 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 388 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 389 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 390 v -= 16; 391 } 392 } 393 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 394 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 395 PetscFunctionReturn(0); 396 } 397 398 #undef __FUNCT__ 399 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 400 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 401 { 402 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 403 PetscErrorCode ierr; 404 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 405 PetscInt nz,idx,idt,j,i,oidx; 406 const 
PetscInt bs =A->rmap->bs,bs2=a->bs2; 407 const MatScalar *aa=a->a,*v; 408 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 409 410 PetscFunctionBegin; 411 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 412 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 413 414 /* forward solve the U^T */ 415 idx = 0; 416 for (i=0; i<n; i++) { 417 v = aa + bs2*diag[i]; 418 /* multiply by the inverse of the block diagonal */ 419 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 420 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 421 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 422 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 423 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 424 v -= bs2; 425 426 vi = aj + diag[i] - 1; 427 nz = diag[i] - diag[i+1] - 1; 428 for (j=0; j>-nz; j--) { 429 oidx = bs*vi[j]; 430 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 431 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 432 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 433 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 434 v -= bs2; 435 } 436 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 437 idx += bs; 438 } 439 /* backward solve the L^T */ 440 for (i=n-1; i>=0; i--) { 441 v = aa + bs2*ai[i]; 442 vi = aj + ai[i]; 443 nz = ai[i+1] - ai[i]; 444 idt = bs*i; 445 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 446 for (j=0; j<nz; j++) { 447 idx = bs*vi[j]; 448 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 449 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 450 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 451 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 452 v += bs2; 453 } 454 } 455 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 456 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 457 PetscFunctionReturn(0); 458 } 459 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 462 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 463 { 464 Mat_SeqBAIJ 
*a=(Mat_SeqBAIJ*)A->data; 465 PetscErrorCode ierr; 466 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 467 PetscInt i,nz,idx,idt,oidx; 468 const MatScalar *aa=a->a,*v; 469 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 470 471 PetscFunctionBegin; 472 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 473 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 474 475 /* forward solve the U^T */ 476 idx = 0; 477 for (i=0; i<n; i++) { 478 479 v = aa + 25*diag[i]; 480 /* multiply by the inverse of the block diagonal */ 481 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 482 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 483 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 484 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 485 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 486 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 487 v += 25; 488 489 vi = aj + diag[i] + 1; 490 nz = ai[i+1] - diag[i] - 1; 491 while (nz--) { 492 oidx = 5*(*vi++); 493 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 494 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 495 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 496 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 497 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 498 v += 25; 499 } 500 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 501 idx += 5; 502 } 503 /* backward solve the L^T */ 504 for (i=n-1; i>=0; i--) { 505 v = aa + 25*diag[i] - 25; 506 vi = aj + diag[i] - 1; 507 nz = diag[i] - ai[i]; 508 idt = 5*i; 509 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 510 while (nz--) { 511 idx = 5*(*vi--); 512 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 513 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 514 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 515 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + 
v[19]*s5; 516 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 517 v -= 25; 518 } 519 } 520 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 521 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 522 PetscFunctionReturn(0); 523 } 524 525 #undef __FUNCT__ 526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 527 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 528 { 529 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 530 PetscErrorCode ierr; 531 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 532 PetscInt nz,idx,idt,j,i,oidx; 533 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 534 const MatScalar *aa=a->a,*v; 535 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 536 537 PetscFunctionBegin; 538 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 539 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 540 541 /* forward solve the U^T */ 542 idx = 0; 543 for (i=0; i<n; i++) { 544 v = aa + bs2*diag[i]; 545 /* multiply by the inverse of the block diagonal */ 546 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 547 x5 = x[4+idx]; 548 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 549 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 550 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 551 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 552 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 553 v -= bs2; 554 555 vi = aj + diag[i] - 1; 556 nz = diag[i] - diag[i+1] - 1; 557 for (j=0; j>-nz; j--) { 558 oidx = bs*vi[j]; 559 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 560 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 561 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 562 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 563 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 564 v -= bs2; 565 } 566 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 567 idx += bs; 568 } 569 /* 
backward solve the L^T */ 570 for (i=n-1; i>=0; i--) { 571 v = aa + bs2*ai[i]; 572 vi = aj + ai[i]; 573 nz = ai[i+1] - ai[i]; 574 idt = bs*i; 575 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 576 for (j=0; j<nz; j++) { 577 idx = bs*vi[j]; 578 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 579 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 580 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 581 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 582 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 583 v += bs2; 584 } 585 } 586 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 587 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 588 PetscFunctionReturn(0); 589 } 590 591 #undef __FUNCT__ 592 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 593 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 594 { 595 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 596 PetscErrorCode ierr; 597 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 598 PetscInt i,nz,idx,idt,oidx; 599 const MatScalar *aa=a->a,*v; 600 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 601 602 PetscFunctionBegin; 603 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 604 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 605 606 /* forward solve the U^T */ 607 idx = 0; 608 for (i=0; i<n; i++) { 609 610 v = aa + 36*diag[i]; 611 /* multiply by the inverse of the block diagonal */ 612 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 613 x6 = x[5+idx]; 614 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 615 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 616 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 617 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 618 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 619 s6 = v[30]*x1 + v[31]*x2 + 
v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 620 v += 36; 621 622 vi = aj + diag[i] + 1; 623 nz = ai[i+1] - diag[i] - 1; 624 while (nz--) { 625 oidx = 6*(*vi++); 626 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632 v += 36; 633 } 634 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 635 x[5+idx] = s6; 636 idx += 6; 637 } 638 /* backward solve the L^T */ 639 for (i=n-1; i>=0; i--) { 640 v = aa + 36*diag[i] - 36; 641 vi = aj + diag[i] - 1; 642 nz = diag[i] - ai[i]; 643 idt = 6*i; 644 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 645 s6 = x[5+idt]; 646 while (nz--) { 647 idx = 6*(*vi--); 648 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 649 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 650 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 651 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 652 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 653 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 654 v -= 36; 655 } 656 } 657 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 658 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 659 PetscFunctionReturn(0); 660 } 661 662 #undef __FUNCT__ 663 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 664 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 665 { 666 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 667 PetscErrorCode ierr; 668 const PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 669 PetscInt nz,idx,idt,j,i,oidx; 670 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 671 const MatScalar *aa=a->a,*v; 672 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 673 674 PetscFunctionBegin; 675 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 676 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 677 678 /* forward solve the U^T */ 679 idx = 0; 680 for (i=0; i<n; i++) { 681 v = aa + bs2*diag[i]; 682 /* multiply by the inverse of the block diagonal */ 683 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 684 x5 = x[4+idx]; x6 = x[5+idx]; 685 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 686 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 687 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 688 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 689 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 690 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 691 v -= bs2; 692 693 vi = aj + diag[i] - 1; 694 nz = diag[i] - diag[i+1] - 1; 695 for (j=0; j>-nz; j--) { 696 oidx = bs*vi[j]; 697 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 698 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 699 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 700 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 701 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 702 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 703 v -= bs2; 704 } 705 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 706 x[5+idx] = s6; 707 idx += bs; 708 } 709 /* backward solve the L^T */ 710 for (i=n-1; i>=0; i--) { 711 v = aa + bs2*ai[i]; 712 vi = aj + ai[i]; 713 nz = ai[i+1] - ai[i]; 714 idt = bs*i; 715 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 716 s6 = 
x[5+idt]; 717 for (j=0; j<nz; j++) { 718 idx = bs*vi[j]; 719 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 720 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 721 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 722 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 723 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 724 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 725 v += bs2; 726 } 727 } 728 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 729 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 730 PetscFunctionReturn(0); 731 } 732 733 #undef __FUNCT__ 734 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 735 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 736 { 737 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 738 PetscErrorCode ierr; 739 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 740 PetscInt i,nz,idx,idt,oidx; 741 const MatScalar *aa=a->a,*v; 742 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 743 744 PetscFunctionBegin; 745 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 746 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 747 748 /* forward solve the U^T */ 749 idx = 0; 750 for (i=0; i<n; i++) { 751 752 v = aa + 49*diag[i]; 753 /* multiply by the inverse of the block diagonal */ 754 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 755 x6 = x[5+idx]; x7 = x[6+idx]; 756 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 757 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 758 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 759 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 760 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 761 s6 = v[35]*x1 + v[36]*x2 
+ v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 762 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 763 v += 49; 764 765 vi = aj + diag[i] + 1; 766 nz = ai[i+1] - diag[i] - 1; 767 while (nz--) { 768 oidx = 7*(*vi++); 769 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 770 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 771 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 772 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 773 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 774 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 775 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 776 v += 49; 777 } 778 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 779 x[5+idx] = s6;x[6+idx] = s7; 780 idx += 7; 781 } 782 /* backward solve the L^T */ 783 for (i=n-1; i>=0; i--) { 784 v = aa + 49*diag[i] - 49; 785 vi = aj + diag[i] - 1; 786 nz = diag[i] - ai[i]; 787 idt = 7*i; 788 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 789 s6 = x[5+idt];s7 = x[6+idt]; 790 while (nz--) { 791 idx = 7*(*vi--); 792 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 793 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 794 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 795 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 796 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 797 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 798 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 799 v -= 49; 
800 } 801 } 802 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 803 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 804 PetscFunctionReturn(0); 805 } 806 #undef __FUNCT__ 807 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 808 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 809 { 810 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 811 PetscErrorCode ierr; 812 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 813 PetscInt nz,idx,idt,j,i,oidx; 814 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 815 const MatScalar *aa=a->a,*v; 816 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 817 818 PetscFunctionBegin; 819 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 820 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 821 822 /* forward solve the U^T */ 823 idx = 0; 824 for (i=0; i<n; i++) { 825 v = aa + bs2*diag[i]; 826 /* multiply by the inverse of the block diagonal */ 827 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 828 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 829 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 830 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 831 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 832 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 833 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 834 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 835 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 836 v -= bs2; 837 vi = aj + diag[i] - 1; 838 nz = diag[i] - diag[i+1] - 1; 839 for (j=0; j>-nz; j--) { 840 oidx = bs*vi[j]; 841 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 842 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 843 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + 
v[19]*s6 + v[20]*s7; 844 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 845 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 846 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 847 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 848 v -= bs2; 849 } 850 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 851 x[5+idx] = s6; x[6+idx] = s7; 852 idx += bs; 853 } 854 /* backward solve the L^T */ 855 for (i=n-1; i>=0; i--) { 856 v = aa + bs2*ai[i]; 857 vi = aj + ai[i]; 858 nz = ai[i+1] - ai[i]; 859 idt = bs*i; 860 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 861 s6 = x[5+idt]; s7 = x[6+idt]; 862 for (j=0; j<nz; j++) { 863 idx = bs*vi[j]; 864 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 865 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 866 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 867 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 868 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 869 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 870 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 871 v += bs2; 872 } 873 } 874 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 875 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 876 PetscFunctionReturn(0); 877 } 878 879 /*---------------------------------------------------------------------------------------------*/ 880 #undef __FUNCT__ 881 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 882 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 883 { 884 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 885 IS iscol = a->col,isrow = 
a->row; 886 PetscErrorCode ierr; 887 const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 888 PetscInt i,n = a->mbs,j; 889 PetscInt nz; 890 PetscScalar *x,*tmp,s1; 891 const MatScalar *aa = a->a,*v; 892 const PetscScalar *b; 893 894 PetscFunctionBegin; 895 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 896 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 897 tmp = a->solve_work; 898 899 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 900 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 901 902 /* copy the b into temp work space according to permutation: tmp[i] takes b at index c[i] */ 903 for (i=0; i<n; i++) tmp[i] = b[c[i]]; 904 905 /* forward solve the U^T: v and vi start at offset adiag[i+1]+1 and span nz = adiag[i]-adiag[i+1]-1 entries, so v[nz] is the entry stored at adiag[i]; adiag[i+1] is read for i = n-1 too, so adiag is assumed to hold n+1 entries -- TODO confirm */ 906 for (i=0; i<n; i++) { 907 v = aa + adiag[i+1] + 1; 908 vi = aj + adiag[i+1] + 1; 909 nz = adiag[i] - adiag[i+1] - 1; 910 s1 = tmp[i]; 911 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 912 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 913 tmp[i] = s1; 914 } 915 916 /* backward solve the L^T: row i is the range ai[i] .. ai[i+1]-1, read forwards; nothing is scaled in this sweep */ 917 for (i=n-1; i>=0; i--) { 918 v = aa + ai[i]; 919 vi = aj + ai[i]; 920 nz = ai[i+1] - ai[i]; 921 s1 = tmp[i]; 922 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 923 } 924 925 /* copy tmp into x according to permutation: x at index r[i] receives tmp[i] */ 926 for (i=0; i<n; i++) x[r[i]] = tmp[i]; 927 928 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 929 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 930 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 931 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 932 933 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 934 PetscFunctionReturn(0); 935 } 936 937 #undef __FUNCT__ 938 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 939 /* MatSolveTranspose_SeqBAIJ_1_inplace - transpose-solve kernel for scalar (1x1) blocks on a factor whose rows, delimited by a->i, hold both triangular parts with a->diag[i] marking the diagonal entry of row i. Input: A (its data is cast to Mat_SeqBAIJ), bb (only read). Output: xx (overwritten). a->solve_work is used as scratch for a->mbs scalars; it is assumed to be allocated elsewhere, which is not visible here. Steps: gather b through the indices of a->col, sweep U^T forward, sweep L^T backward, scatter into x through the indices of a->row. Every PETSc call's return code is passed to CHKERRQ; the normal exit is PetscFunctionReturn(0). */ PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 940 { 941 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 942 IS iscol=a->col,isrow=a->row; 943 PetscErrorCode ierr; 944 const PetscInt *r,*c,*rout,*cout; 945 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 946 PetscInt i,nz; 947 const MatScalar *aa=a->a,*v; 948
PetscScalar s1,*x,*t; 949 const PetscScalar *b; 950 951 PetscFunctionBegin; 952 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 953 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 954 t = a->solve_work; 955 956 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 957 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 958 959 /* copy the b into temp work space according to permutation: t[i] takes b at index c[i] */ 960 for (i=0; i<n; i++) t[i] = b[c[i]]; 961 962 /* forward solve the U^T: after scaling t[i], the entries stored after diag[i] up to the end of the row (ai[i+1]) are subtracted from t at their column indices */ 963 for (i=0; i<n; i++) { 964 965 v = aa + diag[i]; 966 /* multiply by the inverse of the block diagonal (the single value at diag[i]; v then advances past it) */ 967 s1 = (*v++)*t[i]; 968 vi = aj + diag[i] + 1; 969 nz = ai[i+1] - diag[i] - 1; 970 while (nz--) { 971 t[*vi++] -= (*v++)*s1; 972 } 973 t[i] = s1; 974 } 975 /* backward solve the L^T: the entries stored before diag[i] are walked in reverse, from diag[i]-1 down to ai[i] */ 976 for (i=n-1; i>=0; i--) { 977 v = aa + diag[i] - 1; 978 vi = aj + diag[i] - 1; 979 nz = diag[i] - ai[i]; 980 s1 = t[i]; 981 while (nz--) { 982 t[*vi--] -= (*v--)*s1; 983 } 984 } 985 986 /* copy t into x according to permutation: x at index r[i] receives t[i] */ 987 for (i=0; i<n; i++) x[r[i]] = t[i]; 988 989 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 990 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 991 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 992 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 993 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 994 PetscFunctionReturn(0); 995 } 996 997 #undef __FUNCT__ 998 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 999 /* MatSolveTranspose_SeqBAIJ_2_inplace - 2x2-block counterpart of the scalar in-place-layout transpose solve: each block row owns two consecutive scalars of the vectors and each stored block is 4 consecutive MatScalars. Input: A, bb (only read). Output: xx (overwritten). a->solve_work is used as scratch. */ PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1000 { 1001 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1002 IS iscol=a->col,isrow=a->row; 1003 PetscErrorCode ierr; 1004 const PetscInt *r,*c,*rout,*cout; 1005 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1006 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1007 const MatScalar *aa=a->a,*v; 1008 PetscScalar s1,s2,x1,x2,*x,*t; 1009 const PetscScalar *b; 1010 1011 PetscFunctionBegin; 1012 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1013 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1014 t = a->solve_work; 1015 1016 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1017 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1018 1019 /* copy the b into temp work space according to permutation: the two scalars of block c[i] in b become block i of t */ 1020 ii = 0; 1021 for (i=0; i<n; i++) { 1022 ic = 2*c[i]; 1023 t[ii] = b[ic]; 1024 t[ii+1] = b[ic+1]; 1025 ii += 2; 1026 } 1027 1028 /* forward solve the U^T: blocks are 4 consecutive MatScalars, hence the factor 4 on block offsets; the blocks stored after diag[i] up to the end of the row (ai[i+1]) are applied to t at 2*(column index) */ 1029 idx = 0; 1030 for (i=0; i<n; i++) { 1031 1032 v = aa + 4*diag[i]; 1033 /* multiply by the inverse of the block diagonal: (s1,s2) is formed from (x1,x2) with the block at diag[i], then v moves to the next block */ 1034 x1 = t[idx]; x2 = t[1+idx]; 1035 s1 = v[0]*x1 + v[1]*x2; 1036 s2 = v[2]*x1 + v[3]*x2; 1037 v += 4; 1038 1039 vi = aj + diag[i] + 1; 1040 nz = ai[i+1] - diag[i] - 1; 1041 while (nz--) { 1042 oidx = 2*(*vi++); 1043 t[oidx] -= v[0]*s1 + v[1]*s2; 1044 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1045 v += 4; 1046 } 1047 t[idx] = s1;t[1+idx] = s2; 1048 idx += 2; 1049 } 1050 /* backward solve the L^T: v and vi step backwards together, from the block just before diag[i] down to ai[i] */ 1051 for (i=n-1; i>=0; i--) { 1052 v = aa + 4*diag[i] - 4; 1053 vi = aj + diag[i] - 1; 1054 nz = diag[i] - ai[i]; 1055 idt = 2*i; 1056 s1 = t[idt]; s2 = t[1+idt]; 1057 while (nz--) { 1058 idx = 2*(*vi--); 1059 t[idx] -= v[0]*s1 + v[1]*s2; 1060 t[idx+1] -= v[2]*s1 + v[3]*s2; 1061 v -= 4; 1062 } 1063 } 1064 1065 /* copy t into x according to permutation: block i of t goes to block r[i] of x */ 1066 ii = 0; 1067 for (i=0; i<n; i++) { 1068 ir = 2*r[i]; 1069 x[ir] = t[ii]; 1070 x[ir+1] = t[ii+1]; 1071 ii += 2; 1072 } 1073 1074 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1075 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1076 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1077 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1078 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1079 PetscFunctionReturn(0); 1080 } 1081 1082 #undef __FUNCT__ 1083 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 1084 /* MatSolveTranspose_SeqBAIJ_2 - 2x2-block transpose-solve kernel for the layout in which the U part of block row i is located through a->diag (diagonal block at diag[i], remaining blocks at diag[i]-1 down to diag[i+1]+1) and the L part through a->i (blocks ai[i] .. ai[i+1]-1). Input: A (its data is cast to Mat_SeqBAIJ), bb (only read). Output: xx (overwritten). a->solve_work is used as scratch. The arithmetic is unrolled for two components per block row while the strides come from bs = A->rmap->bs and bs2 = a->bs2, so the routine presumes bs == 2 and bs2 == 4 -- TODO confirm that callers guarantee this. Every PETSc call's return code is passed to CHKERRQ; the normal exit is PetscFunctionReturn(0). */ PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1085 { 1086 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 1087 PetscErrorCode ierr; 1088 IS iscol=a->col,isrow=a->row; 1089 const
PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1090 const PetscInt *r,*c,*rout,*cout; 1091 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1092 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1093 const MatScalar *aa=a->a,*v; 1094 PetscScalar s1,s2,x1,x2,*x,*t; 1095 const PetscScalar *b; 1096 1097 PetscFunctionBegin; 1098 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1099 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1100 t = a->solve_work; 1101 1102 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1103 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1104 1105 /* copy b into temp work space according to permutation: two scalars per block row, block c[i] of b becomes block i of t */ 1106 for (i=0; i<n; i++) { 1107 ii = bs*i; ic = bs*c[i]; 1108 t[ii] = b[ic]; t[ii+1] = b[ic+1]; 1109 } 1110 1111 /* forward solve the U^T: the diagonal block is at diag[i] and the other blocks of the row are read at diag[i]-1 down to diag[i+1]+1, which is why j and v run backwards; diag[i+1] is read for i = n-1 too, so diag is assumed to hold n+1 entries -- TODO confirm */ 1112 idx = 0; 1113 for (i=0; i<n; i++) { 1114 v = aa + bs2*diag[i]; 1115 /* multiply by the inverse of the block diagonal: (s1,s2) is formed from (x1,x2) with the block at diag[i] */ 1116 x1 = t[idx]; x2 = t[1+idx]; 1117 s1 = v[0]*x1 + v[1]*x2; 1118 s2 = v[2]*x1 + v[3]*x2; 1119 v -= bs2; 1120 1121 vi = aj + diag[i] - 1; 1122 nz = diag[i] - diag[i+1] - 1; 1123 for (j=0; j>-nz; j--) { 1124 oidx = bs*vi[j]; 1125 t[oidx] -= v[0]*s1 + v[1]*s2; 1126 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1127 v -= bs2; 1128 } 1129 t[idx] = s1;t[1+idx] = s2; 1130 idx += bs; 1131 } 1132 /* backward solve the L^T: block row i is the range ai[i] .. ai[i+1]-1, read forwards; nothing is scaled in this sweep */ 1133 for (i=n-1; i>=0; i--) { 1134 v = aa + bs2*ai[i]; 1135 vi = aj + ai[i]; 1136 nz = ai[i+1] - ai[i]; 1137 idt = bs*i; 1138 s1 = t[idt]; s2 = t[1+idt]; 1139 for (j=0; j<nz; j++) { 1140 idx = bs*vi[j]; 1141 t[idx] -= v[0]*s1 + v[1]*s2; 1142 t[idx+1] -= v[2]*s1 + v[3]*s2; 1143 v += bs2; 1144 } 1145 } 1146 1147 /* copy t into x according to permutation: block i of t goes to block r[i] of x */ 1148 for (i=0; i<n; i++) { 1149 ii = bs*i; ir = bs*r[i]; 1150 x[ir] = t[ii]; x[ir+1] = t[ii+1]; 1151 } 1152 1153 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1154 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1155 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1156 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1157
ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1158 PetscFunctionReturn(0); 1159 } 1160 1161 #undef __FUNCT__ 1162 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 1163 /* MatSolveTranspose_SeqBAIJ_3_inplace - 3x3-block transpose-solve kernel for the layout in which each block row, delimited by a->i, holds both triangular parts with a->diag[i] marking its diagonal block. Each block row owns three consecutive scalars of the vectors and each stored block is 9 consecutive MatScalars. Input: A, bb (only read). Output: xx (overwritten). a->solve_work is used as scratch. */ PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1164 { 1165 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1166 IS iscol=a->col,isrow=a->row; 1167 PetscErrorCode ierr; 1168 const PetscInt *r,*c,*rout,*cout; 1169 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1170 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1171 const MatScalar *aa=a->a,*v; 1172 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1173 const PetscScalar *b; 1174 1175 PetscFunctionBegin; 1176 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1177 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1178 t = a->solve_work; 1179 1180 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1181 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1182 1183 /* copy the b into temp work space according to permutation: the three scalars of block c[i] in b become block i of t */ 1184 ii = 0; 1185 for (i=0; i<n; i++) { 1186 ic = 3*c[i]; 1187 t[ii] = b[ic]; 1188 t[ii+1] = b[ic+1]; 1189 t[ii+2] = b[ic+2]; 1190 ii += 3; 1191 } 1192 1193 /* forward solve the U^T: the blocks stored after diag[i] up to the end of the row (ai[i+1]) are applied to t at 3*(column index) */ 1194 idx = 0; 1195 for (i=0; i<n; i++) { 1196 1197 v = aa + 9*diag[i]; 1198 /* multiply by the inverse of the block diagonal: (s1,s2,s3) is formed from (x1,x2,x3) with the block at diag[i], then v moves to the next block */ 1199 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1200 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1201 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1202 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1203 v += 9; 1204 1205 vi = aj + diag[i] + 1; 1206 nz = ai[i+1] - diag[i] - 1; 1207 while (nz--) { 1208 oidx = 3*(*vi++); 1209 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1210 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1211 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1212 v += 9; 1213 } 1214 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1215 idx += 3; 1216 } 1217 /* backward solve the L^T: v and vi step backwards together, from the block just before diag[i] down to ai[i] */ 1218 for (i=n-1; i>=0; i--) { 1219 v = aa + 9*diag[i] - 9; 1220 vi = aj + diag[i] - 1; 1221 nz = diag[i] - ai[i]; 1222 idt = 3*i; 1223 s1 = t[idt]; s2
= t[1+idt]; s3 = t[2+idt]; 1224 while (nz--) { 1225 idx = 3*(*vi--); 1226 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1227 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1228 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1229 v -= 9; 1230 } 1231 } 1232 1233 /* copy t into x according to permutation: block i of t (three scalars) goes to block r[i] of x */ 1234 ii = 0; 1235 for (i=0; i<n; i++) { 1236 ir = 3*r[i]; 1237 x[ir] = t[ii]; 1238 x[ir+1] = t[ii+1]; 1239 x[ir+2] = t[ii+2]; 1240 ii += 3; 1241 } 1242 1243 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1244 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1245 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1246 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1247 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1248 PetscFunctionReturn(0); 1249 } 1250 1251 #undef __FUNCT__ 1252 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1253 /* MatSolveTranspose_SeqBAIJ_3 - 3x3-block transpose-solve kernel for the layout in which the U part of block row i is located through a->diag (diagonal block at diag[i], remaining blocks at diag[i]-1 down to diag[i+1]+1) and the L part through a->i (blocks ai[i] .. ai[i+1]-1). Input: A (its data is cast to Mat_SeqBAIJ), bb (only read). Output: xx (overwritten). a->solve_work is used as scratch. The arithmetic is unrolled for three components per block row while the strides come from bs = A->rmap->bs and bs2 = a->bs2, so the routine presumes bs == 3 and bs2 == 9 -- TODO confirm that callers guarantee this. Every PETSc call's return code is passed to CHKERRQ; the normal exit is PetscFunctionReturn(0). */ PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1254 { 1255 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 1256 PetscErrorCode ierr; 1257 IS iscol=a->col,isrow=a->row; 1258 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1259 const PetscInt *r,*c,*rout,*cout; 1260 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1261 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1262 const MatScalar *aa=a->a,*v; 1263 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1264 const PetscScalar *b; 1265 1266 PetscFunctionBegin; 1267 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1268 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1269 t = a->solve_work; 1270 1271 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1272 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1273 1274 /* copy b into temp work space according to permutation: three scalars per block row, block c[i] of b becomes block i of t */ 1275 for (i=0; i<n; i++) { 1276 ii = bs*i; ic = bs*c[i]; 1277 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 1278 } 1279 1280 /* forward solve the U^T: the diagonal block is at diag[i] and the other blocks of the row are read at diag[i]-1 down to diag[i+1]+1, which is why j and v run backwards; diag[i+1] is read for i = n-1 too, so diag is assumed to hold n+1 entries -- TODO confirm */ 1281 idx = 0; 1282 for (i=0; i<n; i++) { 1283 v = aa + bs2*diag[i]; 1284 /* multiply by the inverse of the block diagonal: (s1,s2,s3) is formed from (x1,x2,x3) with the block at diag[i] */ 1285 x1 = t[idx]; x2 = t[1+idx]; x3 =
t[2+idx]; 1286 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1287 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1288 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1289 v -= bs2; 1290 1291 vi = aj + diag[i] - 1; 1292 nz = diag[i] - diag[i+1] - 1; 1293 for (j=0; j>-nz; j--) { 1294 oidx = bs*vi[j]; 1295 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1296 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1297 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1298 v -= bs2; 1299 } 1300 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1301 idx += bs; 1302 } 1303 /* backward solve the L^T: block row i is the range ai[i] .. ai[i+1]-1, read forwards; nothing is scaled in this sweep */ 1304 for (i=n-1; i>=0; i--) { 1305 v = aa + bs2*ai[i]; 1306 vi = aj + ai[i]; 1307 nz = ai[i+1] - ai[i]; 1308 idt = bs*i; 1309 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1310 for (j=0; j<nz; j++) { 1311 idx = bs*vi[j]; 1312 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1313 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1314 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1315 v += bs2; 1316 } 1317 } 1318 1319 /* copy t into x according to permutation: block i of t goes to block r[i] of x */ 1320 for (i=0; i<n; i++) { 1321 ii = bs*i; ir = bs*r[i]; 1322 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 1323 } 1324 1325 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1326 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1327 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1328 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1329 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1330 PetscFunctionReturn(0); 1331 } 1332 1333 #undef __FUNCT__ 1334 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 1335 /* MatSolveTranspose_SeqBAIJ_4_inplace - 4x4-block transpose-solve kernel for the layout in which each block row, delimited by a->i, holds both triangular parts with a->diag[i] marking its diagonal block. Input: A, bb (only read). Output: xx (overwritten). a->solve_work is used as scratch. */ PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1336 { 1337 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1338 IS iscol=a->col,isrow=a->row; 1339 PetscErrorCode ierr; 1340 const PetscInt *r,*c,*rout,*cout; 1341 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1342 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1343 const MatScalar *aa=a->a,*v; 1344 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1345 const PetscScalar *b; 1346 1347 PetscFunctionBegin; 1348 ierr =
VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1349 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1350 t = a->solve_work; 1351 1352 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1353 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1354 1355 /* copy the b into temp work space according to permutation */ 1356 ii = 0; 1357 for (i=0; i<n; i++) { 1358 ic = 4*c[i]; 1359 t[ii] = b[ic]; 1360 t[ii+1] = b[ic+1]; 1361 t[ii+2] = b[ic+2]; 1362 t[ii+3] = b[ic+3]; 1363 ii += 4; 1364 } 1365 1366 /* forward solve the U^T */ 1367 idx = 0; 1368 for (i=0; i<n; i++) { 1369 1370 v = aa + 16*diag[i]; 1371 /* multiply by the inverse of the block diagonal */ 1372 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1373 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1374 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1375 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1376 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1377 v += 16; 1378 1379 vi = aj + diag[i] + 1; 1380 nz = ai[i+1] - diag[i] - 1; 1381 while (nz--) { 1382 oidx = 4*(*vi++); 1383 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1384 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1385 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1386 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1387 v += 16; 1388 } 1389 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1390 idx += 4; 1391 } 1392 /* backward solve the L^T */ 1393 for (i=n-1; i>=0; i--) { 1394 v = aa + 16*diag[i] - 16; 1395 vi = aj + diag[i] - 1; 1396 nz = diag[i] - ai[i]; 1397 idt = 4*i; 1398 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1399 while (nz--) { 1400 idx = 4*(*vi--); 1401 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1402 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1403 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1404 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1405 v -= 16; 1406 } 1407 } 1408 1409 /* copy t into x according to permutation */ 1410 ii = 0; 1411 for (i=0; i<n; i++) { 1412 
ir = 4*r[i]; 1413 x[ir] = t[ii]; 1414 x[ir+1] = t[ii+1]; 1415 x[ir+2] = t[ii+2]; 1416 x[ir+3] = t[ii+3]; 1417 ii += 4; 1418 } 1419 1420 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1421 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1422 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1423 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1424 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1425 PetscFunctionReturn(0); 1426 } 1427 1428 #undef __FUNCT__ 1429 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1430 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1431 { 1432 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 1433 PetscErrorCode ierr; 1434 IS iscol=a->col,isrow=a->row; 1435 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1436 const PetscInt *r,*c,*rout,*cout; 1437 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1438 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1439 const MatScalar *aa=a->a,*v; 1440 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1441 const PetscScalar *b; 1442 1443 PetscFunctionBegin; 1444 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1445 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1446 t = a->solve_work; 1447 1448 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1449 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1450 1451 /* copy b into temp work space according to permutation */ 1452 for (i=0; i<n; i++) { 1453 ii = bs*i; ic = bs*c[i]; 1454 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1455 } 1456 1457 /* forward solve the U^T */ 1458 idx = 0; 1459 for (i=0; i<n; i++) { 1460 v = aa + bs2*diag[i]; 1461 /* multiply by the inverse of the block diagonal */ 1462 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1463 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1464 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1465 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1466 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1467 v -= bs2; 1468 1469 vi = aj + diag[i] - 1; 1470 
nz = diag[i] - diag[i+1] - 1; 1471 for (j=0; j>-nz; j--) { 1472 oidx = bs*vi[j]; 1473 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1474 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1475 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1476 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1477 v -= bs2; 1478 } 1479 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 1480 idx += bs; 1481 } 1482 /* backward solve the L^T */ 1483 for (i=n-1; i>=0; i--) { 1484 v = aa + bs2*ai[i]; 1485 vi = aj + ai[i]; 1486 nz = ai[i+1] - ai[i]; 1487 idt = bs*i; 1488 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 1489 for (j=0; j<nz; j++) { 1490 idx = bs*vi[j]; 1491 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1492 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1493 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1494 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1495 v += bs2; 1496 } 1497 } 1498 1499 /* copy t into x according to permutation */ 1500 for (i=0; i<n; i++) { 1501 ii = bs*i; ir = bs*r[i]; 1502 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1503 } 1504 1505 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1506 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1507 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1508 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1509 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1510 PetscFunctionReturn(0); 1511 } 1512 1513 #undef __FUNCT__ 1514 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 1515 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1516 { 1517 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1518 IS iscol=a->col,isrow=a->row; 1519 PetscErrorCode ierr; 1520 const PetscInt *r,*c,*rout,*cout; 1521 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1522 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1523 const MatScalar *aa=a->a,*v; 1524 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1525 const 
PetscScalar *b; 1526 1527 PetscFunctionBegin; 1528 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1529 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1530 t = a->solve_work; 1531 1532 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1533 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1534 1535 /* copy the b into temp work space according to permutation */ 1536 ii = 0; 1537 for (i=0; i<n; i++) { 1538 ic = 5*c[i]; 1539 t[ii] = b[ic]; 1540 t[ii+1] = b[ic+1]; 1541 t[ii+2] = b[ic+2]; 1542 t[ii+3] = b[ic+3]; 1543 t[ii+4] = b[ic+4]; 1544 ii += 5; 1545 } 1546 1547 /* forward solve the U^T */ 1548 idx = 0; 1549 for (i=0; i<n; i++) { 1550 1551 v = aa + 25*diag[i]; 1552 /* multiply by the inverse of the block diagonal */ 1553 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1554 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1555 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1556 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1557 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1558 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1559 v += 25; 1560 1561 vi = aj + diag[i] + 1; 1562 nz = ai[i+1] - diag[i] - 1; 1563 while (nz--) { 1564 oidx = 5*(*vi++); 1565 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1566 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1567 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1568 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1569 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1570 v += 25; 1571 } 1572 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1573 idx += 5; 1574 } 1575 /* backward solve the L^T */ 1576 for (i=n-1; i>=0; i--) { 1577 v = aa + 25*diag[i] - 25; 1578 vi = aj + diag[i] - 1; 1579 nz = diag[i] - ai[i]; 1580 idt = 5*i; 1581 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1582 while (nz--) { 1583 idx = 5*(*vi--); 1584 
t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1585 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1586 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1587 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1588 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1589 v -= 25; 1590 } 1591 } 1592 1593 /* copy t into x according to permutation */ 1594 ii = 0; 1595 for (i=0; i<n; i++) { 1596 ir = 5*r[i]; 1597 x[ir] = t[ii]; 1598 x[ir+1] = t[ii+1]; 1599 x[ir+2] = t[ii+2]; 1600 x[ir+3] = t[ii+3]; 1601 x[ir+4] = t[ii+4]; 1602 ii += 5; 1603 } 1604 1605 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1606 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1607 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1608 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1609 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1610 PetscFunctionReturn(0); 1611 } 1612 1613 #undef __FUNCT__ 1614 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1615 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1616 { 1617 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 1618 PetscErrorCode ierr; 1619 IS iscol=a->col,isrow=a->row; 1620 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1621 const PetscInt *r,*c,*rout,*cout; 1622 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1623 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1624 const MatScalar *aa=a->a,*v; 1625 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1626 const PetscScalar *b; 1627 1628 PetscFunctionBegin; 1629 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1630 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1631 t = a->solve_work; 1632 1633 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1634 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1635 1636 /* copy b into temp work space according to permutation */ 1637 for (i=0; i<n; i++) { 1638 ii = bs*i; ic = bs*c[i]; 1639 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = 
b[ic+3]; 1640 t[ii+4] = b[ic+4]; 1641 } 1642 1643 /* forward solve the U^T */ 1644 idx = 0; 1645 for (i=0; i<n; i++) { 1646 v = aa + bs2*diag[i]; 1647 /* multiply by the inverse of the block diagonal */ 1648 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1649 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1650 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1651 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1652 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1653 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1654 v -= bs2; 1655 1656 vi = aj + diag[i] - 1; 1657 nz = diag[i] - diag[i+1] - 1; 1658 for (j=0; j>-nz; j--) { 1659 oidx = bs*vi[j]; 1660 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1661 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1662 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1663 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1664 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1665 v -= bs2; 1666 } 1667 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1668 idx += bs; 1669 } 1670 /* backward solve the L^T */ 1671 for (i=n-1; i>=0; i--) { 1672 v = aa + bs2*ai[i]; 1673 vi = aj + ai[i]; 1674 nz = ai[i+1] - ai[i]; 1675 idt = bs*i; 1676 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1677 for (j=0; j<nz; j++) { 1678 idx = bs*vi[j]; 1679 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1680 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1681 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1682 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1683 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1684 v += bs2; 1685 } 1686 } 1687 1688 /* copy t into x according to permutation */ 1689 for (i=0; i<n; i++) { 1690 ii = bs*i; ir = bs*r[i]; 1691 x[ir] = t[ii]; x[ir+1] = t[ii+1]; 
x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1692 x[ir+4] = t[ii+4]; 1693 } 1694 1695 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1696 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1697 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1698 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1699 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1700 PetscFunctionReturn(0); 1701 } 1702 1703 #undef __FUNCT__ 1704 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 1705 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1706 { 1707 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1708 IS iscol=a->col,isrow=a->row; 1709 PetscErrorCode ierr; 1710 const PetscInt *r,*c,*rout,*cout; 1711 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1712 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1713 const MatScalar *aa=a->a,*v; 1714 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1715 const PetscScalar *b; 1716 1717 PetscFunctionBegin; 1718 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1719 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1720 t = a->solve_work; 1721 1722 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1723 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1724 1725 /* copy the b into temp work space according to permutation */ 1726 ii = 0; 1727 for (i=0; i<n; i++) { 1728 ic = 6*c[i]; 1729 t[ii] = b[ic]; 1730 t[ii+1] = b[ic+1]; 1731 t[ii+2] = b[ic+2]; 1732 t[ii+3] = b[ic+3]; 1733 t[ii+4] = b[ic+4]; 1734 t[ii+5] = b[ic+5]; 1735 ii += 6; 1736 } 1737 1738 /* forward solve the U^T */ 1739 idx = 0; 1740 for (i=0; i<n; i++) { 1741 1742 v = aa + 36*diag[i]; 1743 /* multiply by the inverse of the block diagonal */ 1744 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1745 x6 = t[5+idx]; 1746 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1747 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1748 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 
1749 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1750 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1751 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1752 v += 36; 1753 1754 vi = aj + diag[i] + 1; 1755 nz = ai[i+1] - diag[i] - 1; 1756 while (nz--) { 1757 oidx = 6*(*vi++); 1758 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1759 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1760 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1761 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1762 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1763 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1764 v += 36; 1765 } 1766 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1767 t[5+idx] = s6; 1768 idx += 6; 1769 } 1770 /* backward solve the L^T */ 1771 for (i=n-1; i>=0; i--) { 1772 v = aa + 36*diag[i] - 36; 1773 vi = aj + diag[i] - 1; 1774 nz = diag[i] - ai[i]; 1775 idt = 6*i; 1776 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1777 s6 = t[5+idt]; 1778 while (nz--) { 1779 idx = 6*(*vi--); 1780 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1781 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1782 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1783 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1784 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1785 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1786 v -= 36; 1787 } 1788 } 1789 1790 /* copy t into x according to permutation */ 1791 ii = 0; 1792 for (i=0; i<n; i++) { 1793 ir = 6*r[i]; 1794 x[ir] = t[ii]; 1795 x[ir+1] = t[ii+1]; 1796 x[ir+2] = t[ii+2]; 1797 x[ir+3] = t[ii+3]; 1798 
x[ir+4] = t[ii+4]; 1799 x[ir+5] = t[ii+5]; 1800 ii += 6; 1801 } 1802 1803 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1804 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1805 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1806 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1807 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1808 PetscFunctionReturn(0); 1809 } 1810 1811 #undef __FUNCT__ 1812 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1813 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1814 { 1815 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 1816 PetscErrorCode ierr; 1817 IS iscol=a->col,isrow=a->row; 1818 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1819 const PetscInt *r,*c,*rout,*cout; 1820 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1821 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1822 const MatScalar *aa=a->a,*v; 1823 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1824 const PetscScalar *b; 1825 1826 PetscFunctionBegin; 1827 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1828 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1829 t = a->solve_work; 1830 1831 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1832 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1833 1834 /* copy b into temp work space according to permutation */ 1835 for (i=0; i<n; i++) { 1836 ii = bs*i; ic = bs*c[i]; 1837 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1838 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 1839 } 1840 1841 /* forward solve the U^T */ 1842 idx = 0; 1843 for (i=0; i<n; i++) { 1844 v = aa + bs2*diag[i]; 1845 /* multiply by the inverse of the block diagonal */ 1846 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1847 x6 = t[5+idx]; 1848 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1849 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1850 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1851 s4 = 
v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1852 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1853 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1854 v -= bs2; 1855 1856 vi = aj + diag[i] - 1; 1857 nz = diag[i] - diag[i+1] - 1; 1858 for (j=0; j>-nz; j--) { 1859 oidx = bs*vi[j]; 1860 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1861 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1862 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1863 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1864 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1865 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1866 v -= bs2; 1867 } 1868 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1869 t[5+idx] = s6; 1870 idx += bs; 1871 } 1872 /* backward solve the L^T */ 1873 for (i=n-1; i>=0; i--) { 1874 v = aa + bs2*ai[i]; 1875 vi = aj + ai[i]; 1876 nz = ai[i+1] - ai[i]; 1877 idt = bs*i; 1878 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1879 s6 = t[5+idt]; 1880 for (j=0; j<nz; j++) { 1881 idx = bs*vi[j]; 1882 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1883 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1884 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1885 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1886 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1887 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1888 v += bs2; 1889 } 1890 } 1891 1892 /* copy t into x according to permutation */ 1893 for (i=0; i<n; i++) { 1894 ii = bs*i; ir = bs*r[i]; 1895 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1896 x[ir+4] = t[ii+4]; 
x[ir+5] = t[ii+5]; 1897 } 1898 1899 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1900 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1901 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1902 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1903 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1904 PetscFunctionReturn(0); 1905 } 1906 1907 #undef __FUNCT__ 1908 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 1909 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1910 { 1911 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1912 IS iscol=a->col,isrow=a->row; 1913 PetscErrorCode ierr; 1914 const PetscInt *r,*c,*rout,*cout; 1915 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1916 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1917 const MatScalar *aa=a->a,*v; 1918 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1919 const PetscScalar *b; 1920 1921 PetscFunctionBegin; 1922 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1923 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1924 t = a->solve_work; 1925 1926 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1927 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1928 1929 /* copy the b into temp work space according to permutation */ 1930 ii = 0; 1931 for (i=0; i<n; i++) { 1932 ic = 7*c[i]; 1933 t[ii] = b[ic]; 1934 t[ii+1] = b[ic+1]; 1935 t[ii+2] = b[ic+2]; 1936 t[ii+3] = b[ic+3]; 1937 t[ii+4] = b[ic+4]; 1938 t[ii+5] = b[ic+5]; 1939 t[ii+6] = b[ic+6]; 1940 ii += 7; 1941 } 1942 1943 /* forward solve the U^T */ 1944 idx = 0; 1945 for (i=0; i<n; i++) { 1946 1947 v = aa + 49*diag[i]; 1948 /* multiply by the inverse of the block diagonal */ 1949 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1950 x6 = t[5+idx]; x7 = t[6+idx]; 1951 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1952 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1953 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + 
v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1954 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1955 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1956 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1957 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1958 v += 49; 1959 1960 vi = aj + diag[i] + 1; 1961 nz = ai[i+1] - diag[i] - 1; 1962 while (nz--) { 1963 oidx = 7*(*vi++); 1964 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1965 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1966 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1967 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1968 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1969 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1970 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1971 v += 49; 1972 } 1973 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1974 t[5+idx] = s6;t[6+idx] = s7; 1975 idx += 7; 1976 } 1977 /* backward solve the L^T */ 1978 for (i=n-1; i>=0; i--) { 1979 v = aa + 49*diag[i] - 49; 1980 vi = aj + diag[i] - 1; 1981 nz = diag[i] - ai[i]; 1982 idt = 7*i; 1983 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1984 s6 = t[5+idt];s7 = t[6+idt]; 1985 while (nz--) { 1986 idx = 7*(*vi--); 1987 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1988 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1989 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1990 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1991 
t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1992 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1993 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1994 v -= 49; 1995 } 1996 } 1997 1998 /* copy t into x according to permutation */ 1999 ii = 0; 2000 for (i=0; i<n; i++) { 2001 ir = 7*r[i]; 2002 x[ir] = t[ii]; 2003 x[ir+1] = t[ii+1]; 2004 x[ir+2] = t[ii+2]; 2005 x[ir+3] = t[ii+3]; 2006 x[ir+4] = t[ii+4]; 2007 x[ir+5] = t[ii+5]; 2008 x[ir+6] = t[ii+6]; 2009 ii += 7; 2010 } 2011 2012 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2013 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2014 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2015 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2016 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2017 PetscFunctionReturn(0); 2018 } 2019 #undef __FUNCT__ 2020 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 2021 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2022 { 2023 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 2024 PetscErrorCode ierr; 2025 IS iscol=a->col,isrow=a->row; 2026 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2027 const PetscInt *r,*c,*rout,*cout; 2028 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 2029 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 2030 const MatScalar *aa=a->a,*v; 2031 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2032 const PetscScalar *b; 2033 2034 PetscFunctionBegin; 2035 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2036 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2037 t = a->solve_work; 2038 2039 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2040 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2041 2042 /* copy b into temp work space according to permutation */ 2043 for (i=0; i<n; i++) { 2044 ii = bs*i; ic = bs*c[i]; 2045 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 
2046 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 2047 } 2048 2049 /* forward solve the U^T */ 2050 idx = 0; 2051 for (i=0; i<n; i++) { 2052 v = aa + bs2*diag[i]; 2053 /* multiply by the inverse of the block diagonal */ 2054 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2055 x6 = t[5+idx]; x7 = t[6+idx]; 2056 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 2057 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 2058 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 2059 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 2060 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 2061 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 2062 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 2063 v -= bs2; 2064 2065 vi = aj + diag[i] - 1; 2066 nz = diag[i] - diag[i+1] - 1; 2067 for (j=0; j>-nz; j--) { 2068 oidx = bs*vi[j]; 2069 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2070 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2071 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2072 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2073 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2074 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2075 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2076 v -= bs2; 2077 } 2078 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 2079 t[5+idx] = s6; t[6+idx] = s7; 2080 idx += bs; 2081 } 2082 /* backward solve the L^T */ 2083 for (i=n-1; i>=0; i--) { 2084 v = aa + bs2*ai[i]; 2085 vi = aj + ai[i]; 2086 nz = 
ai[i+1] - ai[i]; 2087 idt = bs*i; 2088 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2089 s6 = t[5+idt]; s7 = t[6+idt]; 2090 for (j=0; j<nz; j++) { 2091 idx = bs*vi[j]; 2092 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2093 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2094 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2095 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2096 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2097 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2098 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2099 v += bs2; 2100 } 2101 } 2102 2103 /* copy t into x according to permutation */ 2104 for (i=0; i<n; i++) { 2105 ii = bs*i; ir = bs*r[i]; 2106 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 2107 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 2108 } 2109 2110 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2111 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2112 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2113 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2114 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2115 PetscFunctionReturn(0); 2116 } 2117 2118 /* ----------------------------------------------------------- */ 2119 #undef __FUNCT__ 2120 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 2121 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2122 { 2123 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 2124 IS iscol=a->col,isrow=a->row; 2125 PetscErrorCode ierr; 2126 const PetscInt *r,*c,*rout,*cout; 2127 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2128 PetscInt i,nz; 2129 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 2130 const MatScalar *aa=a->a,*v; 2131 PetscScalar 
*x,*s,*t,*ls; 2132 const PetscScalar *b; 2133 2134 PetscFunctionBegin; 2135 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2136 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2137 t = a->solve_work; 2138 2139 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2140 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2141 2142 /* forward solve the lower triangular */ 2143 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2144 for (i=1; i<n; i++) { 2145 v = aa + bs2*ai[i]; 2146 vi = aj + ai[i]; 2147 nz = a->diag[i] - ai[i]; 2148 s = t + bs*i; 2149 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2150 while (nz--) { 2151 PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 2152 v += bs2; 2153 } 2154 } 2155 /* backward solve the upper triangular */ 2156 ls = a->solve_work + A->cmap->n; 2157 for (i=n-1; i>=0; i--) { 2158 v = aa + bs2*(a->diag[i] + 1); 2159 vi = aj + a->diag[i] + 1; 2160 nz = ai[i+1] - a->diag[i] - 1; 2161 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2162 while (nz--) { 2163 PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 2164 v += bs2; 2165 } 2166 PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2167 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2168 } 2169 2170 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2171 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2172 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2173 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2174 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2175 PetscFunctionReturn(0); 2176 } 2177 2178 /* ----------------------------------------------------------- */ 2179 #undef __FUNCT__ 2180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 2181 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2182 { 2183 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 2184 IS iscol=a->col,isrow=a->row; 
2185 PetscErrorCode ierr; 2186 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2187 PetscInt i,nz,j; 2188 const PetscInt n =a->mbs,bs=A->rmap->bs,bs2=a->bs2; 2189 const MatScalar *aa=a->a,*v; 2190 PetscScalar *x,*t,*ls; 2191 const PetscScalar *b; 2192 2193 PetscFunctionBegin; 2194 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2195 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2196 t = a->solve_work; 2197 2198 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2199 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2200 2201 /* copy the b into temp work space according to permutation */ 2202 for (i=0; i<n; i++) { 2203 for (j=0; j<bs; j++) { 2204 t[i*bs+j] = b[c[i]*bs+j]; 2205 } 2206 } 2207 2208 2209 /* forward solve the upper triangular transpose */ 2210 ls = a->solve_work + A->cmap->n; 2211 for (i=0; i<n; i++) { 2212 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2213 PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2214 v = aa + bs2*(a->diag[i] + 1); 2215 vi = aj + a->diag[i] + 1; 2216 nz = ai[i+1] - a->diag[i] - 1; 2217 while (nz--) { 2218 PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2219 v += bs2; 2220 } 2221 } 2222 2223 /* backward solve the lower triangular transpose */ 2224 for (i=n-1; i>=0; i--) { 2225 v = aa + bs2*ai[i]; 2226 vi = aj + ai[i]; 2227 nz = a->diag[i] - ai[i]; 2228 while (nz--) { 2229 PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2230 v += bs2; 2231 } 2232 } 2233 2234 /* copy t into x according to permutation */ 2235 for (i=0; i<n; i++) { 2236 for (j=0; j<bs; j++) { 2237 x[bs*r[i]+j] = t[bs*i+j]; 2238 } 2239 } 2240 2241 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2242 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2243 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2244 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2245 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2246 PetscFunctionReturn(0); 
2247 } 2248 2249 #undef __FUNCT__ 2250 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 2251 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2252 { 2253 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 2254 IS iscol=a->col,isrow=a->row; 2255 PetscErrorCode ierr; 2256 const PetscInt *r,*c,*rout,*cout; 2257 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2258 PetscInt i,j,nz; 2259 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 2260 const MatScalar *aa=a->a,*v; 2261 PetscScalar *x,*t,*ls; 2262 const PetscScalar *b; 2263 2264 PetscFunctionBegin; 2265 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2266 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2267 t = a->solve_work; 2268 2269 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2270 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2271 2272 /* copy the b into temp work space according to permutation */ 2273 for (i=0; i<n; i++) { 2274 for (j=0; j<bs; j++) { 2275 t[i*bs+j] = b[c[i]*bs+j]; 2276 } 2277 } 2278 2279 2280 /* forward solve the upper triangular transpose */ 2281 ls = a->solve_work + A->cmap->n; 2282 for (i=0; i<n; i++) { 2283 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2284 PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 2285 v = aa + bs2*(diag[i] - 1); 2286 vi = aj + diag[i] - 1; 2287 nz = diag[i] - diag[i+1] - 1; 2288 for (j=0; j>-nz; j--) { 2289 PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2290 v -= bs2; 2291 } 2292 } 2293 2294 /* backward solve the lower triangular transpose */ 2295 for (i=n-1; i>=0; i--) { 2296 v = aa + bs2*ai[i]; 2297 vi = aj + ai[i]; 2298 nz = ai[i+1] - ai[i]; 2299 for (j=0; j<nz; j++) { 2300 PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2301 v += bs2; 2302 } 2303 } 2304 2305 /* copy t into x according to permutation */ 2306 for (i=0; i<n; i++) { 2307 for (j=0; j<bs; j++) { 2308 x[bs*r[i]+j] = t[bs*i+j]; 2309 } 2310 } 2311 2312 ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2313 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2314 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2315 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2316 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2317 PetscFunctionReturn(0); 2318 } 2319 2320 /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */ 2321 2322 #undef __FUNCT__ 2323 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2324 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 2325 { 2326 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 2327 PetscErrorCode ierr; 2328 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2329 PetscInt i,nz,idx,idt,m; 2330 const MatScalar *aa=a->a,*v; 2331 PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 2332 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 2333 PetscScalar *x; 2334 const PetscScalar *b; 2335 2336 PetscFunctionBegin; 2337 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2338 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2339 2340 /* forward solve the lower triangular */ 2341 idx = 0; 2342 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 2343 x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx]; 2344 x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 2345 2346 for (i=1; i<n; i++) { 2347 v = aa + bs2*ai[i]; 2348 vi = aj + ai[i]; 2349 nz = ai[i+1] - ai[i]; 2350 idt = bs*i; 2351 s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 2352 s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 2353 s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 2354 for (m=0; m<nz; m++) { 2355 idx = bs*vi[m]; 2356 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = 
x[4+idx]; 2357 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2358 x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 2359 2360 2361 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2362 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2363 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2364 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2365 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2366 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2367 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2368 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2369 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2370 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + 
v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2371 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2372 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2373 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2374 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2375 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2376 2377 v += bs2; 2378 } 2379 x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 2380 x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 2381 x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 2382 2383 } 2384 /* backward solve the upper triangular */ 2385 for (i=n-1; i>=0; i--) { 2386 v = aa + bs2*(adiag[i+1]+1); 2387 vi = aj + adiag[i+1]+1; 2388 nz = adiag[i] - adiag[i+1] - 1; 2389 idt = bs*i; 2390 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 2391 s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 2392 s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 2393 2394 for (m=0; m<nz; m++) { 2395 idx = bs*vi[m]; 2396 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2397 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2398 x11 = x[10+idx]; 
x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 2399 2400 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2401 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2402 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2403 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2404 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2405 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2406 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2407 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2408 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2409 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2410 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + 
v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2411 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2412 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2413 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2414 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2415 2416 v += bs2; 2417 } 2418 2419 x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 2420 x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 2421 x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 2422 x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 2423 x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 2424 x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + 
v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 2425 x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 2426 x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 2427 x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 2428 x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 2429 x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 2430 x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 2431 x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 2432 x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 2433 x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 2434 2435 } 2436 2437 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2438 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2439 ierr = 
PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2440 PetscFunctionReturn(0); 2441 } 2442 2443 /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */ 2444 /* Default MatSolve for block size 15 */ 2445 2446 #undef __FUNCT__ 2447 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1" 2448 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx) 2449 { 2450 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 2451 PetscErrorCode ierr; 2452 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2453 PetscInt i,k,nz,idx,idt,m; 2454 const MatScalar *aa=a->a,*v; 2455 PetscScalar s[15]; 2456 PetscScalar *x,xv; 2457 const PetscScalar *b; 2458 2459 PetscFunctionBegin; 2460 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2461 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2462 2463 /* forward solve the lower triangular */ 2464 for (i=0; i<n; i++) { 2465 v = aa + bs2*ai[i]; 2466 vi = aj + ai[i]; 2467 nz = ai[i+1] - ai[i]; 2468 idt = bs*i; 2469 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2470 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2471 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt]; 2472 for (m=0; m<nz; m++) { 2473 idx = bs*vi[m]; 2474 for (k=0; k<15; k++) { 2475 xv = x[k + idx]; 2476 x[idt] -= v[0]*xv; 2477 x[1+idt] -= v[1]*xv; 2478 x[2+idt] -= v[2]*xv; 2479 x[3+idt] -= v[3]*xv; 2480 x[4+idt] -= v[4]*xv; 2481 x[5+idt] -= v[5]*xv; 2482 x[6+idt] -= v[6]*xv; 2483 x[7+idt] -= v[7]*xv; 2484 x[8+idt] -= v[8]*xv; 2485 x[9+idt] -= v[9]*xv; 2486 x[10+idt] -= v[10]*xv; 2487 x[11+idt] -= v[11]*xv; 2488 x[12+idt] -= v[12]*xv; 2489 x[13+idt] -= v[13]*xv; 2490 x[14+idt] -= v[14]*xv; 2491 v += 15; 2492 } 2493 } 2494 } 2495 /* backward solve the upper triangular */ 2496 for (i=n-1; i>=0; i--) { 2497 v = aa + 
bs2*(adiag[i+1]+1); 2498 vi = aj + adiag[i+1]+1; 2499 nz = adiag[i] - adiag[i+1] - 1; 2500 idt = bs*i; 2501 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 2502 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 2503 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt]; 2504 2505 for (m=0; m<nz; m++) { 2506 idx = bs*vi[m]; 2507 for (k=0; k<15; k++) { 2508 xv = x[k + idx]; 2509 s[0] -= v[0]*xv; 2510 s[1] -= v[1]*xv; 2511 s[2] -= v[2]*xv; 2512 s[3] -= v[3]*xv; 2513 s[4] -= v[4]*xv; 2514 s[5] -= v[5]*xv; 2515 s[6] -= v[6]*xv; 2516 s[7] -= v[7]*xv; 2517 s[8] -= v[8]*xv; 2518 s[9] -= v[9]*xv; 2519 s[10] -= v[10]*xv; 2520 s[11] -= v[11]*xv; 2521 s[12] -= v[12]*xv; 2522 s[13] -= v[13]*xv; 2523 s[14] -= v[14]*xv; 2524 v += 15; 2525 } 2526 } 2527 ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr); 2528 for (k=0; k<15; k++) { 2529 x[idt] += v[0]*s[k]; 2530 x[1+idt] += v[1]*s[k]; 2531 x[2+idt] += v[2]*s[k]; 2532 x[3+idt] += v[3]*s[k]; 2533 x[4+idt] += v[4]*s[k]; 2534 x[5+idt] += v[5]*s[k]; 2535 x[6+idt] += v[6]*s[k]; 2536 x[7+idt] += v[7]*s[k]; 2537 x[8+idt] += v[8]*s[k]; 2538 x[9+idt] += v[9]*s[k]; 2539 x[10+idt] += v[10]*s[k]; 2540 x[11+idt] += v[11]*s[k]; 2541 x[12+idt] += v[12]*s[k]; 2542 x[13+idt] += v[13]*s[k]; 2543 x[14+idt] += v[14]*s[k]; 2544 v += 15; 2545 } 2546 } 2547 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2548 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2549 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2550 PetscFunctionReturn(0); 2551 } 2552 2553 2554 #undef __FUNCT__ 2555 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 2556 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 2557 { 2558 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 2559 IS iscol=a->col,isrow=a->row; 2560 PetscErrorCode ierr; 2561 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2562 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2563 
PetscInt i,nz,idx,idt,idc; 2564 const MatScalar *aa=a->a,*v; 2565 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2566 const PetscScalar *b; 2567 2568 PetscFunctionBegin; 2569 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2570 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2571 t = a->solve_work; 2572 2573 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2574 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2575 2576 /* forward solve the lower triangular */ 2577 idx = 7*(*r++); 2578 t[0] = b[idx]; t[1] = b[1+idx]; 2579 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2580 t[5] = b[5+idx]; t[6] = b[6+idx]; 2581 2582 for (i=1; i<n; i++) { 2583 v = aa + 49*ai[i]; 2584 vi = aj + ai[i]; 2585 nz = diag[i] - ai[i]; 2586 idx = 7*(*r++); 2587 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2588 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2589 while (nz--) { 2590 idx = 7*(*vi++); 2591 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2592 x4 = t[3+idx];x5 = t[4+idx]; 2593 x6 = t[5+idx];x7 = t[6+idx]; 2594 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2595 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2596 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2597 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2598 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2599 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2600 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2601 v += 49; 2602 } 2603 idx = 7*i; 2604 t[idx] = s1;t[1+idx] = s2; 2605 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2606 t[5+idx] = s6;t[6+idx] = s7; 2607 } 2608 /* backward solve the upper triangular */ 2609 for (i=n-1; i>=0; i--) { 2610 v = aa + 49*diag[i] + 49; 2611 vi = aj + diag[i] + 1; 2612 nz = ai[i+1] - diag[i] - 1; 2613 idt = 7*i; 2614 s1 = t[idt]; 
s2 = t[1+idt]; 2615 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2616 s6 = t[5+idt];s7 = t[6+idt]; 2617 while (nz--) { 2618 idx = 7*(*vi++); 2619 x1 = t[idx]; x2 = t[1+idx]; 2620 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2621 x6 = t[5+idx]; x7 = t[6+idx]; 2622 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2623 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2624 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2625 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2626 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2627 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2628 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2629 v += 49; 2630 } 2631 idc = 7*(*c--); 2632 v = aa + 49*diag[i]; 2633 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2634 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2635 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2636 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2637 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2638 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2639 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2640 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2641 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2642 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2643 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2644 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2645 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2646 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2647 } 2648 2649 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2650 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2651 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2652 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2653 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2654 PetscFunctionReturn(0); 2655 } 2656 2657 #undef __FUNCT__ 2658 #define __FUNCT__ 
"MatSolve_SeqBAIJ_7" 2659 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2660 { 2661 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 2662 IS iscol=a->col,isrow=a->row; 2663 PetscErrorCode ierr; 2664 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2665 const PetscInt n=a->mbs,*rout,*cout,*vi; 2666 PetscInt i,nz,idx,idt,idc,m; 2667 const MatScalar *aa=a->a,*v; 2668 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2669 const PetscScalar *b; 2670 2671 PetscFunctionBegin; 2672 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2673 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2674 t = a->solve_work; 2675 2676 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2677 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2678 2679 /* forward solve the lower triangular */ 2680 idx = 7*r[0]; 2681 t[0] = b[idx]; t[1] = b[1+idx]; 2682 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2683 t[5] = b[5+idx]; t[6] = b[6+idx]; 2684 2685 for (i=1; i<n; i++) { 2686 v = aa + 49*ai[i]; 2687 vi = aj + ai[i]; 2688 nz = ai[i+1] - ai[i]; 2689 idx = 7*r[i]; 2690 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2691 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2692 for (m=0; m<nz; m++) { 2693 idx = 7*vi[m]; 2694 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2695 x4 = t[3+idx];x5 = t[4+idx]; 2696 x6 = t[5+idx];x7 = t[6+idx]; 2697 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2698 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2699 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2700 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2701 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2702 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2703 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2704 v += 49; 2705 } 2706 idx = 7*i; 2707 t[idx] = s1;t[1+idx] 
= s2; 2708 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2709 t[5+idx] = s6;t[6+idx] = s7; 2710 } 2711 /* backward solve the upper triangular */ 2712 for (i=n-1; i>=0; i--) { 2713 v = aa + 49*(adiag[i+1]+1); 2714 vi = aj + adiag[i+1]+1; 2715 nz = adiag[i] - adiag[i+1] - 1; 2716 idt = 7*i; 2717 s1 = t[idt]; s2 = t[1+idt]; 2718 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2719 s6 = t[5+idt];s7 = t[6+idt]; 2720 for (m=0; m<nz; m++) { 2721 idx = 7*vi[m]; 2722 x1 = t[idx]; x2 = t[1+idx]; 2723 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2724 x6 = t[5+idx]; x7 = t[6+idx]; 2725 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2726 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2727 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2728 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2729 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2730 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2731 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2732 v += 49; 2733 } 2734 idc = 7*c[i]; 2735 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2736 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2737 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2738 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2739 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2740 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2741 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2742 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2743 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2744 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2745 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2746 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2747 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2748 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2749 } 2750 2751 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2752 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 
2753 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2754 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2755 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2756 PetscFunctionReturn(0); 2757 } 2758 2759 #undef __FUNCT__ 2760 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 2761 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2762 { 2763 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2764 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2765 PetscErrorCode ierr; 2766 PetscInt i,nz,idx,idt,jdx; 2767 const MatScalar *aa=a->a,*v; 2768 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2769 const PetscScalar *b; 2770 2771 PetscFunctionBegin; 2772 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2773 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2774 /* forward solve the lower triangular */ 2775 idx = 0; 2776 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2777 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2778 x[6] = b[6+idx]; 2779 for (i=1; i<n; i++) { 2780 v = aa + 49*ai[i]; 2781 vi = aj + ai[i]; 2782 nz = diag[i] - ai[i]; 2783 idx = 7*i; 2784 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2785 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2786 s7 = b[6+idx]; 2787 while (nz--) { 2788 jdx = 7*(*vi++); 2789 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2790 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2791 x7 = x[6+jdx]; 2792 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2793 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2794 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2795 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2796 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2797 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2798 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + 
v[48]*x7; 2799 v += 49; 2800 } 2801 x[idx] = s1; 2802 x[1+idx] = s2; 2803 x[2+idx] = s3; 2804 x[3+idx] = s4; 2805 x[4+idx] = s5; 2806 x[5+idx] = s6; 2807 x[6+idx] = s7; 2808 } 2809 /* backward solve the upper triangular */ 2810 for (i=n-1; i>=0; i--) { 2811 v = aa + 49*diag[i] + 49; 2812 vi = aj + diag[i] + 1; 2813 nz = ai[i+1] - diag[i] - 1; 2814 idt = 7*i; 2815 s1 = x[idt]; s2 = x[1+idt]; 2816 s3 = x[2+idt]; s4 = x[3+idt]; 2817 s5 = x[4+idt]; s6 = x[5+idt]; 2818 s7 = x[6+idt]; 2819 while (nz--) { 2820 idx = 7*(*vi++); 2821 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2822 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2823 x7 = x[6+idx]; 2824 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2825 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2826 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2827 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2828 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2829 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2830 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2831 v += 49; 2832 } 2833 v = aa + 49*diag[i]; 2834 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2835 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2836 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2837 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2838 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2839 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2840 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2841 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2842 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2843 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2844 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2845 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2846 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2847 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2848 } 2849 2850 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2851 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2852 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2853 PetscFunctionReturn(0); 2854 } 2855 2856 #undef __FUNCT__ 2857 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 2858 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 2859 { 2860 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2861 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 2862 PetscErrorCode ierr; 2863 PetscInt i,k,nz,idx,jdx,idt; 2864 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2865 const MatScalar *aa=a->a,*v; 2866 PetscScalar *x; 2867 const PetscScalar *b; 2868 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2869 2870 PetscFunctionBegin; 2871 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2872 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2873 /* forward solve the lower triangular */ 2874 idx = 0; 2875 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2876 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2877 for (i=1; i<n; i++) { 2878 v = aa + bs2*ai[i]; 2879 vi = aj + ai[i]; 2880 nz = ai[i+1] - ai[i]; 2881 idx = bs*i; 2882 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2883 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2884 for (k=0; k<nz; k++) { 2885 jdx = bs*vi[k]; 2886 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2887 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2888 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2889 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2890 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2891 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2892 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2893 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2894 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + 
v[34]*x5 + v[41]*x6 + v[48]*x7; 2895 v += bs2; 2896 } 2897 2898 x[idx] = s1; 2899 x[1+idx] = s2; 2900 x[2+idx] = s3; 2901 x[3+idx] = s4; 2902 x[4+idx] = s5; 2903 x[5+idx] = s6; 2904 x[6+idx] = s7; 2905 } 2906 2907 /* backward solve the upper triangular */ 2908 for (i=n-1; i>=0; i--) { 2909 v = aa + bs2*(adiag[i+1]+1); 2910 vi = aj + adiag[i+1]+1; 2911 nz = adiag[i] - adiag[i+1]-1; 2912 idt = bs*i; 2913 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2914 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2915 for (k=0; k<nz; k++) { 2916 idx = bs*vi[k]; 2917 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2918 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2919 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2920 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2921 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2922 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2923 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2924 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2925 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2926 v += bs2; 2927 } 2928 /* x = inv_diagonal*x */ 2929 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2930 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2931 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2932 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2933 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2934 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2935 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2936 } 2937 2938 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2939 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2940 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2941 PetscFunctionReturn(0); 2942 } 2943 2944 #undef __FUNCT__ 2945 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 2946 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 2947 { 2948 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 2949 IS iscol=a->col,isrow=a->row; 2950 PetscErrorCode ierr; 2951 const PetscInt *r,*c,*rout,*cout; 2952 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2953 PetscInt i,nz,idx,idt,idc; 2954 const MatScalar *aa=a->a,*v; 2955 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2956 const PetscScalar *b; 2957 2958 PetscFunctionBegin; 2959 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2960 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2961 t = a->solve_work; 2962 2963 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2964 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2965 2966 /* forward solve the lower triangular */ 2967 idx = 6*(*r++); 2968 t[0] = b[idx]; t[1] = b[1+idx]; 2969 t[2] = b[2+idx]; t[3] = b[3+idx]; 2970 t[4] = b[4+idx]; t[5] = b[5+idx]; 2971 for (i=1; i<n; i++) { 2972 v = aa + 36*ai[i]; 2973 vi = aj + ai[i]; 2974 nz = diag[i] - ai[i]; 2975 idx = 6*(*r++); 2976 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2977 s5 = b[4+idx]; s6 = b[5+idx]; 2978 while (nz--) { 2979 idx = 6*(*vi++); 2980 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2981 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2982 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2983 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2984 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2985 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2986 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2987 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 
2988 v += 36; 2989 } 2990 idx = 6*i; 2991 t[idx] = s1;t[1+idx] = s2; 2992 t[2+idx] = s3;t[3+idx] = s4; 2993 t[4+idx] = s5;t[5+idx] = s6; 2994 } 2995 /* backward solve the upper triangular */ 2996 for (i=n-1; i>=0; i--) { 2997 v = aa + 36*diag[i] + 36; 2998 vi = aj + diag[i] + 1; 2999 nz = ai[i+1] - diag[i] - 1; 3000 idt = 6*i; 3001 s1 = t[idt]; s2 = t[1+idt]; 3002 s3 = t[2+idt];s4 = t[3+idt]; 3003 s5 = t[4+idt];s6 = t[5+idt]; 3004 while (nz--) { 3005 idx = 6*(*vi++); 3006 x1 = t[idx]; x2 = t[1+idx]; 3007 x3 = t[2+idx]; x4 = t[3+idx]; 3008 x5 = t[4+idx]; x6 = t[5+idx]; 3009 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3010 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3011 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3012 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3013 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3014 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3015 v += 36; 3016 } 3017 idc = 6*(*c--); 3018 v = aa + 36*diag[i]; 3019 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3020 v[18]*s4+v[24]*s5+v[30]*s6; 3021 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3022 v[19]*s4+v[25]*s5+v[31]*s6; 3023 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3024 v[20]*s4+v[26]*s5+v[32]*s6; 3025 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3026 v[21]*s4+v[27]*s5+v[33]*s6; 3027 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3028 v[22]*s4+v[28]*s5+v[34]*s6; 3029 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3030 v[23]*s4+v[29]*s5+v[35]*s6; 3031 } 3032 3033 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3034 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3035 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3036 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3037 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3038 PetscFunctionReturn(0); 3039 } 3040 3041 #undef __FUNCT__ 3042 #define __FUNCT__ 
"MatSolve_SeqBAIJ_6" 3043 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 3044 { 3045 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 3046 IS iscol=a->col,isrow=a->row; 3047 PetscErrorCode ierr; 3048 const PetscInt *r,*c,*rout,*cout; 3049 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3050 PetscInt i,nz,idx,idt,idc,m; 3051 const MatScalar *aa=a->a,*v; 3052 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 3053 const PetscScalar *b; 3054 3055 PetscFunctionBegin; 3056 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3057 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3058 t = a->solve_work; 3059 3060 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3061 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3062 3063 /* forward solve the lower triangular */ 3064 idx = 6*r[0]; 3065 t[0] = b[idx]; t[1] = b[1+idx]; 3066 t[2] = b[2+idx]; t[3] = b[3+idx]; 3067 t[4] = b[4+idx]; t[5] = b[5+idx]; 3068 for (i=1; i<n; i++) { 3069 v = aa + 36*ai[i]; 3070 vi = aj + ai[i]; 3071 nz = ai[i+1] - ai[i]; 3072 idx = 6*r[i]; 3073 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3074 s5 = b[4+idx]; s6 = b[5+idx]; 3075 for (m=0; m<nz; m++) { 3076 idx = 6*vi[m]; 3077 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3078 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 3079 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3080 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3081 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3082 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3083 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3084 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3085 v += 36; 3086 } 3087 idx = 6*i; 3088 t[idx] = s1;t[1+idx] = s2; 3089 t[2+idx] = s3;t[3+idx] = s4; 3090 t[4+idx] = s5;t[5+idx] = s6; 3091 } 3092 /* backward solve the upper triangular */ 3093 for (i=n-1; i>=0; i--) { 3094 v = aa + 36*(adiag[i+1]+1); 3095 vi = aj + 
adiag[i+1]+1; 3096 nz = adiag[i] - adiag[i+1] - 1; 3097 idt = 6*i; 3098 s1 = t[idt]; s2 = t[1+idt]; 3099 s3 = t[2+idt];s4 = t[3+idt]; 3100 s5 = t[4+idt];s6 = t[5+idt]; 3101 for (m=0; m<nz; m++) { 3102 idx = 6*vi[m]; 3103 x1 = t[idx]; x2 = t[1+idx]; 3104 x3 = t[2+idx]; x4 = t[3+idx]; 3105 x5 = t[4+idx]; x6 = t[5+idx]; 3106 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3107 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3108 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3109 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3110 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3111 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3112 v += 36; 3113 } 3114 idc = 6*c[i]; 3115 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3116 v[18]*s4+v[24]*s5+v[30]*s6; 3117 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3118 v[19]*s4+v[25]*s5+v[31]*s6; 3119 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3120 v[20]*s4+v[26]*s5+v[32]*s6; 3121 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3122 v[21]*s4+v[27]*s5+v[33]*s6; 3123 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3124 v[22]*s4+v[28]*s5+v[34]*s6; 3125 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3126 v[23]*s4+v[29]*s5+v[35]*s6; 3127 } 3128 3129 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3130 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3131 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3132 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3133 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3134 PetscFunctionReturn(0); 3135 } 3136 3137 #undef __FUNCT__ 3138 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 3139 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3140 { 3141 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3142 PetscInt i,nz,idx,idt,jdx; 3143 PetscErrorCode ierr; 3144 const PetscInt *diag = 
a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 3145 const MatScalar *aa =a->a,*v; 3146 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3147 const PetscScalar *b; 3148 3149 PetscFunctionBegin; 3150 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3151 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3152 /* forward solve the lower triangular */ 3153 idx = 0; 3154 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 3155 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 3156 for (i=1; i<n; i++) { 3157 v = aa + 36*ai[i]; 3158 vi = aj + ai[i]; 3159 nz = diag[i] - ai[i]; 3160 idx = 6*i; 3161 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3162 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 3163 while (nz--) { 3164 jdx = 6*(*vi++); 3165 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 3166 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 3167 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3168 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3169 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3170 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3171 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3172 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3173 v += 36; 3174 } 3175 x[idx] = s1; 3176 x[1+idx] = s2; 3177 x[2+idx] = s3; 3178 x[3+idx] = s4; 3179 x[4+idx] = s5; 3180 x[5+idx] = s6; 3181 } 3182 /* backward solve the upper triangular */ 3183 for (i=n-1; i>=0; i--) { 3184 v = aa + 36*diag[i] + 36; 3185 vi = aj + diag[i] + 1; 3186 nz = ai[i+1] - diag[i] - 1; 3187 idt = 6*i; 3188 s1 = x[idt]; s2 = x[1+idt]; 3189 s3 = x[2+idt]; s4 = x[3+idt]; 3190 s5 = x[4+idt]; s6 = x[5+idt]; 3191 while (nz--) { 3192 idx = 6*(*vi++); 3193 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3194 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3195 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3196 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3197 s3 -= v[2]*x1 + 
v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3198 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3199 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3200 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3201 v += 36; 3202 } 3203 v = aa + 36*diag[i]; 3204 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3205 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3206 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3207 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3208 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3209 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3210 } 3211 3212 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3213 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3214 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3215 PetscFunctionReturn(0); 3216 } 3217 3218 #undef __FUNCT__ 3219 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 3220 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 3221 { 3222 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3223 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3224 PetscErrorCode ierr; 3225 PetscInt i,k,nz,idx,jdx,idt; 3226 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3227 const MatScalar *aa=a->a,*v; 3228 PetscScalar *x; 3229 const PetscScalar *b; 3230 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3231 3232 PetscFunctionBegin; 3233 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3234 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3235 /* forward solve the lower triangular */ 3236 idx = 0; 3237 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3238 x[4] = b[4+idx];x[5] = b[5+idx]; 3239 for (i=1; i<n; i++) { 3240 v = aa + bs2*ai[i]; 3241 vi = aj + ai[i]; 3242 nz = ai[i+1] - ai[i]; 3243 idx = bs*i; 3244 s1 = b[idx];s2 = b[1+idx];s3 
= b[2+idx];s4 = b[3+idx]; 3245 s5 = b[4+idx];s6 = b[5+idx]; 3246 for (k=0; k<nz; k++) { 3247 jdx = bs*vi[k]; 3248 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3249 x5 = x[4+jdx]; x6 = x[5+jdx]; 3250 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3251 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3252 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3253 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3254 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3255 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3256 v += bs2; 3257 } 3258 3259 x[idx] = s1; 3260 x[1+idx] = s2; 3261 x[2+idx] = s3; 3262 x[3+idx] = s4; 3263 x[4+idx] = s5; 3264 x[5+idx] = s6; 3265 } 3266 3267 /* backward solve the upper triangular */ 3268 for (i=n-1; i>=0; i--) { 3269 v = aa + bs2*(adiag[i+1]+1); 3270 vi = aj + adiag[i+1]+1; 3271 nz = adiag[i] - adiag[i+1]-1; 3272 idt = bs*i; 3273 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3274 s5 = x[4+idt];s6 = x[5+idt]; 3275 for (k=0; k<nz; k++) { 3276 idx = bs*vi[k]; 3277 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3278 x5 = x[4+idx];x6 = x[5+idx]; 3279 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3280 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3281 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3282 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3283 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3284 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3285 v += bs2; 3286 } 3287 /* x = inv_diagonal*x */ 3288 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3289 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3290 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3291 x[3+idt] = v[3]*s1 + 
v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3292 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3293 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3294 } 3295 3296 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3297 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3298 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3299 PetscFunctionReturn(0); 3300 } 3301 3302 #undef __FUNCT__ 3303 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 3304 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 3305 { 3306 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 3307 IS iscol=a->col,isrow=a->row; 3308 PetscErrorCode ierr; 3309 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3310 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3311 PetscInt i,nz,idx,idt,idc; 3312 const MatScalar *aa=a->a,*v; 3313 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3314 const PetscScalar *b; 3315 3316 PetscFunctionBegin; 3317 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3318 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3319 t = a->solve_work; 3320 3321 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3322 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3323 3324 /* forward solve the lower triangular */ 3325 idx = 5*(*r++); 3326 t[0] = b[idx]; t[1] = b[1+idx]; 3327 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3328 for (i=1; i<n; i++) { 3329 v = aa + 25*ai[i]; 3330 vi = aj + ai[i]; 3331 nz = diag[i] - ai[i]; 3332 idx = 5*(*r++); 3333 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3334 s5 = b[4+idx]; 3335 while (nz--) { 3336 idx = 5*(*vi++); 3337 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3338 x4 = t[3+idx];x5 = t[4+idx]; 3339 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3340 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3341 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3342 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3343 s5 -= 
v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3344 v += 25; 3345 } 3346 idx = 5*i; 3347 t[idx] = s1;t[1+idx] = s2; 3348 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3349 } 3350 /* backward solve the upper triangular */ 3351 for (i=n-1; i>=0; i--) { 3352 v = aa + 25*diag[i] + 25; 3353 vi = aj + diag[i] + 1; 3354 nz = ai[i+1] - diag[i] - 1; 3355 idt = 5*i; 3356 s1 = t[idt]; s2 = t[1+idt]; 3357 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3358 while (nz--) { 3359 idx = 5*(*vi++); 3360 x1 = t[idx]; x2 = t[1+idx]; 3361 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3362 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3363 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3364 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3365 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3366 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3367 v += 25; 3368 } 3369 idc = 5*(*c--); 3370 v = aa + 25*diag[i]; 3371 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3372 v[15]*s4+v[20]*s5; 3373 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3374 v[16]*s4+v[21]*s5; 3375 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3376 v[17]*s4+v[22]*s5; 3377 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3378 v[18]*s4+v[23]*s5; 3379 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3380 v[19]*s4+v[24]*s5; 3381 } 3382 3383 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3384 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3385 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3386 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3387 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3388 PetscFunctionReturn(0); 3389 } 3390 3391 #undef __FUNCT__ 3392 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 3393 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 3394 { 3395 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 3396 IS iscol=a->col,isrow=a->row; 3397 PetscErrorCode ierr; 3398 const PetscInt *r,*c,*rout,*cout; 3399 const PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3400 PetscInt i,nz,idx,idt,idc,m; 3401 const MatScalar *aa=a->a,*v; 3402 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3403 const PetscScalar *b; 3404 3405 PetscFunctionBegin; 3406 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3407 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3408 t = a->solve_work; 3409 3410 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3411 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3412 3413 /* forward solve the lower triangular */ 3414 idx = 5*r[0]; 3415 t[0] = b[idx]; t[1] = b[1+idx]; 3416 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3417 for (i=1; i<n; i++) { 3418 v = aa + 25*ai[i]; 3419 vi = aj + ai[i]; 3420 nz = ai[i+1] - ai[i]; 3421 idx = 5*r[i]; 3422 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3423 s5 = b[4+idx]; 3424 for (m=0; m<nz; m++) { 3425 idx = 5*vi[m]; 3426 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3427 x4 = t[3+idx];x5 = t[4+idx]; 3428 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3429 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3430 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3431 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3432 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3433 v += 25; 3434 } 3435 idx = 5*i; 3436 t[idx] = s1;t[1+idx] = s2; 3437 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3438 } 3439 /* backward solve the upper triangular */ 3440 for (i=n-1; i>=0; i--) { 3441 v = aa + 25*(adiag[i+1]+1); 3442 vi = aj + adiag[i+1]+1; 3443 nz = adiag[i] - adiag[i+1] - 1; 3444 idt = 5*i; 3445 s1 = t[idt]; s2 = t[1+idt]; 3446 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3447 for (m=0; m<nz; m++) { 3448 idx = 5*vi[m]; 3449 x1 = t[idx]; x2 = t[1+idx]; 3450 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3451 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3452 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3453 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + 
v[22]*x5; 3454 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3455 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3456 v += 25; 3457 } 3458 idc = 5*c[i]; 3459 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3460 v[15]*s4+v[20]*s5; 3461 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3462 v[16]*s4+v[21]*s5; 3463 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3464 v[17]*s4+v[22]*s5; 3465 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3466 v[18]*s4+v[23]*s5; 3467 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3468 v[19]*s4+v[24]*s5; 3469 } 3470 3471 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3472 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3473 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3474 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3475 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3476 PetscFunctionReturn(0); 3477 } 3478 3479 #undef __FUNCT__ 3480 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 3481 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3482 { 3483 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3484 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3485 PetscInt i,nz,idx,idt,jdx; 3486 PetscErrorCode ierr; 3487 const MatScalar *aa=a->a,*v; 3488 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3489 const PetscScalar *b; 3490 3491 PetscFunctionBegin; 3492 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3493 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3494 /* forward solve the lower triangular */ 3495 idx = 0; 3496 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3497 for (i=1; i<n; i++) { 3498 v = aa + 25*ai[i]; 3499 vi = aj + ai[i]; 3500 nz = diag[i] - ai[i]; 3501 idx = 5*i; 3502 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3503 while (nz--) { 3504 jdx = 5*(*vi++); 3505 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3506 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + 
v[20]*x5; 3507 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3508 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3509 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3510 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3511 v += 25; 3512 } 3513 x[idx] = s1; 3514 x[1+idx] = s2; 3515 x[2+idx] = s3; 3516 x[3+idx] = s4; 3517 x[4+idx] = s5; 3518 } 3519 /* backward solve the upper triangular */ 3520 for (i=n-1; i>=0; i--) { 3521 v = aa + 25*diag[i] + 25; 3522 vi = aj + diag[i] + 1; 3523 nz = ai[i+1] - diag[i] - 1; 3524 idt = 5*i; 3525 s1 = x[idt]; s2 = x[1+idt]; 3526 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3527 while (nz--) { 3528 idx = 5*(*vi++); 3529 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3530 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3531 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3532 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3533 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3534 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3535 v += 25; 3536 } 3537 v = aa + 25*diag[i]; 3538 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3539 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3540 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3541 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3542 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3543 } 3544 3545 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3546 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3547 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3548 PetscFunctionReturn(0); 3549 } 3550 3551 #undef __FUNCT__ 3552 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 3553 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 3554 { 3555 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3556 const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3557 PetscInt 
i,k,nz,idx,idt,jdx; 3558 PetscErrorCode ierr; 3559 const MatScalar *aa=a->a,*v; 3560 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3561 const PetscScalar *b; 3562 3563 PetscFunctionBegin; 3564 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3565 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3566 /* forward solve the lower triangular */ 3567 idx = 0; 3568 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3569 for (i=1; i<n; i++) { 3570 v = aa + 25*ai[i]; 3571 vi = aj + ai[i]; 3572 nz = ai[i+1] - ai[i]; 3573 idx = 5*i; 3574 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3575 for (k=0; k<nz; k++) { 3576 jdx = 5*vi[k]; 3577 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3578 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3579 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3580 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3581 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3582 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3583 v += 25; 3584 } 3585 x[idx] = s1; 3586 x[1+idx] = s2; 3587 x[2+idx] = s3; 3588 x[3+idx] = s4; 3589 x[4+idx] = s5; 3590 } 3591 3592 /* backward solve the upper triangular */ 3593 for (i=n-1; i>=0; i--) { 3594 v = aa + 25*(adiag[i+1]+1); 3595 vi = aj + adiag[i+1]+1; 3596 nz = adiag[i] - adiag[i+1]-1; 3597 idt = 5*i; 3598 s1 = x[idt]; s2 = x[1+idt]; 3599 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3600 for (k=0; k<nz; k++) { 3601 idx = 5*vi[k]; 3602 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3603 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3604 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3605 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3606 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3607 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3608 v += 25; 3609 } 3610 /* x = inv_diagonal*x */ 3611 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + 
v[15]*s4 + v[20]*s5; 3612 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3613 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3614 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3615 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3616 } 3617 3618 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3619 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3620 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3621 PetscFunctionReturn(0); 3622 } 3623 3624 #undef __FUNCT__ 3625 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 3626 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 3627 { 3628 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3629 IS iscol=a->col,isrow=a->row; 3630 PetscErrorCode ierr; 3631 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3632 PetscInt i,nz,idx,idt,idc; 3633 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3634 const MatScalar *aa=a->a,*v; 3635 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3636 const PetscScalar *b; 3637 3638 PetscFunctionBegin; 3639 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3640 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3641 t = a->solve_work; 3642 3643 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3644 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3645 3646 /* forward solve the lower triangular */ 3647 idx = 4*(*r++); 3648 t[0] = b[idx]; t[1] = b[1+idx]; 3649 t[2] = b[2+idx]; t[3] = b[3+idx]; 3650 for (i=1; i<n; i++) { 3651 v = aa + 16*ai[i]; 3652 vi = aj + ai[i]; 3653 nz = diag[i] - ai[i]; 3654 idx = 4*(*r++); 3655 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3656 while (nz--) { 3657 idx = 4*(*vi++); 3658 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3659 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3660 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3661 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3662 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3663 v += 16; 3664 } 3665 idx = 4*i; 3666 
t[idx] = s1;t[1+idx] = s2; 3667 t[2+idx] = s3;t[3+idx] = s4; 3668 } 3669 /* backward solve the upper triangular */ 3670 for (i=n-1; i>=0; i--) { 3671 v = aa + 16*diag[i] + 16; 3672 vi = aj + diag[i] + 1; 3673 nz = ai[i+1] - diag[i] - 1; 3674 idt = 4*i; 3675 s1 = t[idt]; s2 = t[1+idt]; 3676 s3 = t[2+idt];s4 = t[3+idt]; 3677 while (nz--) { 3678 idx = 4*(*vi++); 3679 x1 = t[idx]; x2 = t[1+idx]; 3680 x3 = t[2+idx]; x4 = t[3+idx]; 3681 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3682 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3683 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3684 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3685 v += 16; 3686 } 3687 idc = 4*(*c--); 3688 v = aa + 16*diag[i]; 3689 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3690 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3691 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3692 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3693 } 3694 3695 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3696 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3697 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3698 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3699 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3700 PetscFunctionReturn(0); 3701 } 3702 3703 #undef __FUNCT__ 3704 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 3705 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 3706 { 3707 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3708 IS iscol=a->col,isrow=a->row; 3709 PetscErrorCode ierr; 3710 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3711 PetscInt i,nz,idx,idt,idc,m; 3712 const PetscInt *r,*c,*rout,*cout; 3713 const MatScalar *aa=a->a,*v; 3714 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3715 const PetscScalar *b; 3716 3717 PetscFunctionBegin; 3718 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3719 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3720 t = a->solve_work; 3721 3722 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 
3723 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3724 3725 /* forward solve the lower triangular */ 3726 idx = 4*r[0]; 3727 t[0] = b[idx]; t[1] = b[1+idx]; 3728 t[2] = b[2+idx]; t[3] = b[3+idx]; 3729 for (i=1; i<n; i++) { 3730 v = aa + 16*ai[i]; 3731 vi = aj + ai[i]; 3732 nz = ai[i+1] - ai[i]; 3733 idx = 4*r[i]; 3734 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3735 for (m=0; m<nz; m++) { 3736 idx = 4*vi[m]; 3737 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3738 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3739 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3740 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3741 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3742 v += 16; 3743 } 3744 idx = 4*i; 3745 t[idx] = s1;t[1+idx] = s2; 3746 t[2+idx] = s3;t[3+idx] = s4; 3747 } 3748 /* backward solve the upper triangular */ 3749 for (i=n-1; i>=0; i--) { 3750 v = aa + 16*(adiag[i+1]+1); 3751 vi = aj + adiag[i+1]+1; 3752 nz = adiag[i] - adiag[i+1] - 1; 3753 idt = 4*i; 3754 s1 = t[idt]; s2 = t[1+idt]; 3755 s3 = t[2+idt];s4 = t[3+idt]; 3756 for (m=0; m<nz; m++) { 3757 idx = 4*vi[m]; 3758 x1 = t[idx]; x2 = t[1+idx]; 3759 x3 = t[2+idx]; x4 = t[3+idx]; 3760 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3761 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3762 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3763 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3764 v += 16; 3765 } 3766 idc = 4*c[i]; 3767 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3768 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3769 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3770 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3771 } 3772 3773 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3774 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3775 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3776 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3777 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3778 
PetscFunctionReturn(0); 3779 } 3780 3781 #undef __FUNCT__ 3782 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3783 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3784 { 3785 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3786 IS iscol=a->col,isrow=a->row; 3787 PetscErrorCode ierr; 3788 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3789 PetscInt i,nz,idx,idt,idc; 3790 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3791 const MatScalar *aa=a->a,*v; 3792 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3793 PetscScalar *x; 3794 const PetscScalar *b; 3795 3796 PetscFunctionBegin; 3797 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3799 t = (MatScalar*)a->solve_work; 3800 3801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3803 3804 /* forward solve the lower triangular */ 3805 idx = 4*(*r++); 3806 t[0] = (MatScalar)b[idx]; 3807 t[1] = (MatScalar)b[1+idx]; 3808 t[2] = (MatScalar)b[2+idx]; 3809 t[3] = (MatScalar)b[3+idx]; 3810 for (i=1; i<n; i++) { 3811 v = aa + 16*ai[i]; 3812 vi = aj + ai[i]; 3813 nz = diag[i] - ai[i]; 3814 idx = 4*(*r++); 3815 s1 = (MatScalar)b[idx]; 3816 s2 = (MatScalar)b[1+idx]; 3817 s3 = (MatScalar)b[2+idx]; 3818 s4 = (MatScalar)b[3+idx]; 3819 while (nz--) { 3820 idx = 4*(*vi++); 3821 x1 = t[idx]; 3822 x2 = t[1+idx]; 3823 x3 = t[2+idx]; 3824 x4 = t[3+idx]; 3825 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3826 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3827 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3828 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3829 v += 16; 3830 } 3831 idx = 4*i; 3832 t[idx] = s1; 3833 t[1+idx] = s2; 3834 t[2+idx] = s3; 3835 t[3+idx] = s4; 3836 } 3837 /* backward solve the upper triangular */ 3838 for (i=n-1; i>=0; i--) { 3839 v = aa + 16*diag[i] + 16; 3840 vi = aj + diag[i] + 1; 3841 nz = ai[i+1] - diag[i] - 1; 3842 idt = 4*i; 3843 s1 = t[idt]; 3844 s2 = t[1+idt]; 3845 s3 = t[2+idt]; 3846 
s4 = t[3+idt]; 3847 while (nz--) { 3848 idx = 4*(*vi++); 3849 x1 = t[idx]; 3850 x2 = t[1+idx]; 3851 x3 = t[2+idx]; 3852 x4 = t[3+idx]; 3853 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3854 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3855 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3856 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3857 v += 16; 3858 } 3859 idc = 4*(*c--); 3860 v = aa + 16*diag[i]; 3861 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3862 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3863 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3864 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3865 x[idc] = (PetscScalar)t[idt]; 3866 x[1+idc] = (PetscScalar)t[1+idt]; 3867 x[2+idc] = (PetscScalar)t[2+idt]; 3868 x[3+idc] = (PetscScalar)t[3+idt]; 3869 } 3870 3871 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3872 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3873 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3874 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3875 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3876 PetscFunctionReturn(0); 3877 } 3878 3879 #if defined(PETSC_HAVE_SSE) 3880 3881 #include PETSC_HAVE_SSE 3882 3883 #undef __FUNCT__ 3884 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3885 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3886 { 3887 /* 3888 Note: This code uses demotion of double 3889 to float when performing the mixed-mode computation. 3890 This may not be numerically reasonable for all applications. 
3891 */ 3892 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3893 IS iscol=a->col,isrow=a->row; 3894 PetscErrorCode ierr; 3895 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3896 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3897 MatScalar *aa=a->a,*v; 3898 PetscScalar *x,*b,*t; 3899 3900 /* Make space in temp stack for 16 Byte Aligned arrays */ 3901 float ssealignedspace[11],*tmps,*tmpx; 3902 unsigned long offset; 3903 3904 PetscFunctionBegin; 3905 SSE_SCOPE_BEGIN; 3906 3907 offset = (unsigned long)ssealignedspace % 16; 3908 if (offset) offset = (16 - offset)/4; 3909 tmps = &ssealignedspace[offset]; 3910 tmpx = &ssealignedspace[offset+4]; 3911 PREFETCH_NTA(aa+16*ai[1]); 3912 3913 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3914 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3915 t = a->solve_work; 3916 3917 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3918 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3919 3920 /* forward solve the lower triangular */ 3921 idx = 4*(*r++); 3922 t[0] = b[idx]; t[1] = b[1+idx]; 3923 t[2] = b[2+idx]; t[3] = b[3+idx]; 3924 v = aa + 16*ai[1]; 3925 3926 for (i=1; i<n; ) { 3927 PREFETCH_NTA(&v[8]); 3928 vi = aj + ai[i]; 3929 nz = diag[i] - ai[i]; 3930 idx = 4*(*r++); 3931 3932 /* Demote sum from double to float */ 3933 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3934 LOAD_PS(tmps,XMM7); 3935 3936 while (nz--) { 3937 PREFETCH_NTA(&v[16]); 3938 idx = 4*(*vi++); 3939 3940 /* Demote solution (so far) from double to float */ 3941 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3942 3943 /* 4x4 Matrix-Vector product with negative accumulation: */ 3944 SSE_INLINE_BEGIN_2(tmpx,v) 3945 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3946 3947 /* First Column */ 3948 SSE_COPY_PS(XMM0,XMM6) 3949 SSE_SHUFFLE(XMM0,XMM0,0x00) 3950 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3951 SSE_SUB_PS(XMM7,XMM0) 3952 3953 /* Second Column */ 3954 SSE_COPY_PS(XMM1,XMM6) 3955 SSE_SHUFFLE(XMM1,XMM1,0x55) 3956 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3957 
SSE_SUB_PS(XMM7,XMM1) 3958 3959 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3960 3961 /* Third Column */ 3962 SSE_COPY_PS(XMM2,XMM6) 3963 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3964 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3965 SSE_SUB_PS(XMM7,XMM2) 3966 3967 /* Fourth Column */ 3968 SSE_COPY_PS(XMM3,XMM6) 3969 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3970 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3971 SSE_SUB_PS(XMM7,XMM3) 3972 SSE_INLINE_END_2 3973 3974 v += 16; 3975 } 3976 idx = 4*i; 3977 v = aa + 16*ai[++i]; 3978 PREFETCH_NTA(v); 3979 STORE_PS(tmps,XMM7); 3980 3981 /* Promote result from float to double */ 3982 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3983 } 3984 /* backward solve the upper triangular */ 3985 idt = 4*(n-1); 3986 ai16 = 16*diag[n-1]; 3987 v = aa + ai16 + 16; 3988 for (i=n-1; i>=0; ) { 3989 PREFETCH_NTA(&v[8]); 3990 vi = aj + diag[i] + 1; 3991 nz = ai[i+1] - diag[i] - 1; 3992 3993 /* Demote accumulator from double to float */ 3994 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3995 LOAD_PS(tmps,XMM7); 3996 3997 while (nz--) { 3998 PREFETCH_NTA(&v[16]); 3999 idx = 4*(*vi++); 4000 4001 /* Demote solution (so far) from double to float */ 4002 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 4003 4004 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4005 SSE_INLINE_BEGIN_2(tmpx,v) 4006 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4007 4008 /* First Column */ 4009 SSE_COPY_PS(XMM0,XMM6) 4010 SSE_SHUFFLE(XMM0,XMM0,0x00) 4011 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4012 SSE_SUB_PS(XMM7,XMM0) 4013 4014 /* Second Column */ 4015 SSE_COPY_PS(XMM1,XMM6) 4016 SSE_SHUFFLE(XMM1,XMM1,0x55) 4017 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4018 SSE_SUB_PS(XMM7,XMM1) 4019 4020 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4021 4022 /* Third Column */ 4023 SSE_COPY_PS(XMM2,XMM6) 4024 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4025 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4026 SSE_SUB_PS(XMM7,XMM2) 4027 4028 /* Fourth Column */ 4029 SSE_COPY_PS(XMM3,XMM6) 4030 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4031 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4032 SSE_SUB_PS(XMM7,XMM3) 
4033 SSE_INLINE_END_2 4034 v += 16; 4035 } 4036 v = aa + ai16; 4037 ai16 = 16*diag[--i]; 4038 PREFETCH_NTA(aa+ai16+16); 4039 /* 4040 Scale the result by the diagonal 4x4 block, 4041 which was inverted as part of the factorization 4042 */ 4043 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 4044 /* First Column */ 4045 SSE_COPY_PS(XMM0,XMM7) 4046 SSE_SHUFFLE(XMM0,XMM0,0x00) 4047 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4048 4049 /* Second Column */ 4050 SSE_COPY_PS(XMM1,XMM7) 4051 SSE_SHUFFLE(XMM1,XMM1,0x55) 4052 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4053 SSE_ADD_PS(XMM0,XMM1) 4054 4055 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4056 4057 /* Third Column */ 4058 SSE_COPY_PS(XMM2,XMM7) 4059 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4060 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4061 SSE_ADD_PS(XMM0,XMM2) 4062 4063 /* Fourth Column */ 4064 SSE_COPY_PS(XMM3,XMM7) 4065 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4066 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4067 SSE_ADD_PS(XMM0,XMM3) 4068 4069 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4070 SSE_INLINE_END_3 4071 4072 /* Promote solution from float to double */ 4073 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 4074 4075 /* Apply reordering to t and stream into x. */ 4076 /* This way, x doesn't pollute the cache. */ 4077 /* Be careful with size: 2 doubles = 4 floats! 
*/ 4078 idc = 4*(*c--); 4079 SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc]) 4080 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 4081 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 4082 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 4083 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 4084 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 4085 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 4086 SSE_INLINE_END_2 4087 v = aa + ai16 + 16; 4088 idt -= 4; 4089 } 4090 4091 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4092 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4093 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4094 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4095 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4096 SSE_SCOPE_END; 4097 PetscFunctionReturn(0); 4098 } 4099 4100 #endif 4101 4102 4103 /* 4104 Special case where the matrix was ILU(0) factored in the natural 4105 ordering. This eliminates the need for the column and row permutation. 4106 */ 4107 #undef __FUNCT__ 4108 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 4109 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4110 { 4111 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4112 PetscInt n =a->mbs; 4113 const PetscInt *ai=a->i,*aj=a->j; 4114 PetscErrorCode ierr; 4115 const PetscInt *diag = a->diag; 4116 const MatScalar *aa =a->a; 4117 PetscScalar *x; 4118 const PetscScalar *b; 4119 4120 PetscFunctionBegin; 4121 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4122 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4123 4124 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 4125 { 4126 static PetscScalar w[2000]; /* very BAD need to fix */ 4127 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 4128 } 4129 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 4130 { 4131 static PetscScalar w[2000]; /* very BAD need to fix */ 4132 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 4133 } 4134 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 4135 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 
4136 #else 4137 { 4138 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4139 const MatScalar *v; 4140 PetscInt jdx,idt,idx,nz,i,ai16; 4141 const PetscInt *vi; 4142 4143 /* forward solve the lower triangular */ 4144 idx = 0; 4145 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 4146 for (i=1; i<n; i++) { 4147 v = aa + 16*ai[i]; 4148 vi = aj + ai[i]; 4149 nz = diag[i] - ai[i]; 4150 idx += 4; 4151 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4152 while (nz--) { 4153 jdx = 4*(*vi++); 4154 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4155 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4156 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4157 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4158 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4159 v += 16; 4160 } 4161 x[idx] = s1; 4162 x[1+idx] = s2; 4163 x[2+idx] = s3; 4164 x[3+idx] = s4; 4165 } 4166 /* backward solve the upper triangular */ 4167 idt = 4*(n-1); 4168 for (i=n-1; i>=0; i--) { 4169 ai16 = 16*diag[i]; 4170 v = aa + ai16 + 16; 4171 vi = aj + diag[i] + 1; 4172 nz = ai[i+1] - diag[i] - 1; 4173 s1 = x[idt]; s2 = x[1+idt]; 4174 s3 = x[2+idt];s4 = x[3+idt]; 4175 while (nz--) { 4176 idx = 4*(*vi++); 4177 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4178 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4179 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4180 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4181 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4182 v += 16; 4183 } 4184 v = aa + ai16; 4185 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4186 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4187 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4188 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4189 idt -= 4; 4190 } 4191 } 4192 #endif 4193 4194 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4195 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4196 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4197 PetscFunctionReturn(0); 4198 } 4199 4200 #undef __FUNCT__ 4201 
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 4202 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4203 { 4204 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4205 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4206 PetscInt i,k,nz,idx,jdx,idt; 4207 PetscErrorCode ierr; 4208 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4209 const MatScalar *aa=a->a,*v; 4210 PetscScalar *x; 4211 const PetscScalar *b; 4212 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4213 4214 PetscFunctionBegin; 4215 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4216 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4217 /* forward solve the lower triangular */ 4218 idx = 0; 4219 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4220 for (i=1; i<n; i++) { 4221 v = aa + bs2*ai[i]; 4222 vi = aj + ai[i]; 4223 nz = ai[i+1] - ai[i]; 4224 idx = bs*i; 4225 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4226 for (k=0; k<nz; k++) { 4227 jdx = bs*vi[k]; 4228 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4229 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4230 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4231 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4232 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4233 4234 v += bs2; 4235 } 4236 4237 x[idx] = s1; 4238 x[1+idx] = s2; 4239 x[2+idx] = s3; 4240 x[3+idx] = s4; 4241 } 4242 4243 /* backward solve the upper triangular */ 4244 for (i=n-1; i>=0; i--) { 4245 v = aa + bs2*(adiag[i+1]+1); 4246 vi = aj + adiag[i+1]+1; 4247 nz = adiag[i] - adiag[i+1]-1; 4248 idt = bs*i; 4249 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4250 4251 for (k=0; k<nz; k++) { 4252 idx = bs*vi[k]; 4253 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4254 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4255 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4256 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4257 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4258 4259 v += bs2; 4260 } 4261 /* x = inv_diagonal*x */ 4262 x[idt] = 
v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4263 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4264 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4265 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4266 4267 } 4268 4269 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4270 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4271 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4272 PetscFunctionReturn(0); 4273 } 4274 4275 #undef __FUNCT__ 4276 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4277 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4278 { 4279 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4280 const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4281 PetscErrorCode ierr; 4282 const MatScalar *aa=a->a; 4283 const PetscScalar *b; 4284 PetscScalar *x; 4285 4286 PetscFunctionBegin; 4287 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4288 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4289 4290 { 4291 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4292 const MatScalar *v; 4293 MatScalar *t=(MatScalar*)x; 4294 PetscInt jdx,idt,idx,nz,i,ai16; 4295 const PetscInt *vi; 4296 4297 /* forward solve the lower triangular */ 4298 idx = 0; 4299 t[0] = (MatScalar)b[0]; 4300 t[1] = (MatScalar)b[1]; 4301 t[2] = (MatScalar)b[2]; 4302 t[3] = (MatScalar)b[3]; 4303 for (i=1; i<n; i++) { 4304 v = aa + 16*ai[i]; 4305 vi = aj + ai[i]; 4306 nz = diag[i] - ai[i]; 4307 idx += 4; 4308 s1 = (MatScalar)b[idx]; 4309 s2 = (MatScalar)b[1+idx]; 4310 s3 = (MatScalar)b[2+idx]; 4311 s4 = (MatScalar)b[3+idx]; 4312 while (nz--) { 4313 jdx = 4*(*vi++); 4314 x1 = t[jdx]; 4315 x2 = t[1+jdx]; 4316 x3 = t[2+jdx]; 4317 x4 = t[3+jdx]; 4318 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4319 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4320 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4321 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4322 v += 16; 4323 } 4324 t[idx] = s1; 4325 t[1+idx] = s2; 4326 t[2+idx] = s3; 4327 t[3+idx] = s4; 4328 } 4329 /* 
backward solve the upper triangular */ 4330 idt = 4*(n-1); 4331 for (i=n-1; i>=0; i--) { 4332 ai16 = 16*diag[i]; 4333 v = aa + ai16 + 16; 4334 vi = aj + diag[i] + 1; 4335 nz = ai[i+1] - diag[i] - 1; 4336 s1 = t[idt]; 4337 s2 = t[1+idt]; 4338 s3 = t[2+idt]; 4339 s4 = t[3+idt]; 4340 while (nz--) { 4341 idx = 4*(*vi++); 4342 x1 = (MatScalar)x[idx]; 4343 x2 = (MatScalar)x[1+idx]; 4344 x3 = (MatScalar)x[2+idx]; 4345 x4 = (MatScalar)x[3+idx]; 4346 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4347 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4348 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4349 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4350 v += 16; 4351 } 4352 v = aa + ai16; 4353 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4354 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4355 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4356 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4357 idt -= 4; 4358 } 4359 } 4360 4361 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4362 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4363 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4364 PetscFunctionReturn(0); 4365 } 4366 4367 #if defined(PETSC_HAVE_SSE) 4368 4369 #include PETSC_HAVE_SSE 4370 #undef __FUNCT__ 4371 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4372 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 4373 { 4374 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4375 unsigned short *aj=(unsigned short*)a->j; 4376 PetscErrorCode ierr; 4377 int *ai=a->i,n=a->mbs,*diag = a->diag; 4378 MatScalar *aa=a->a; 4379 PetscScalar *x,*b; 4380 4381 PetscFunctionBegin; 4382 SSE_SCOPE_BEGIN; 4383 /* 4384 Note: This code currently uses demotion of double 4385 to float when performing the mixed-mode computation. 4386 This may not be numerically reasonable for all applications. 
4387 */ 4388 PREFETCH_NTA(aa+16*ai[1]); 4389 4390 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4391 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4392 { 4393 /* x will first be computed in single precision then promoted inplace to double */ 4394 MatScalar *v,*t=(MatScalar*)x; 4395 int nz,i,idt,ai16; 4396 unsigned int jdx,idx; 4397 unsigned short *vi; 4398 /* Forward solve the lower triangular factor. */ 4399 4400 /* First block is the identity. */ 4401 idx = 0; 4402 CONVERT_DOUBLE4_FLOAT4(t,b); 4403 v = aa + 16*((unsigned int)ai[1]); 4404 4405 for (i=1; i<n; ) { 4406 PREFETCH_NTA(&v[8]); 4407 vi = aj + ai[i]; 4408 nz = diag[i] - ai[i]; 4409 idx += 4; 4410 4411 /* Demote RHS from double to float. */ 4412 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4413 LOAD_PS(&t[idx],XMM7); 4414 4415 while (nz--) { 4416 PREFETCH_NTA(&v[16]); 4417 jdx = 4*((unsigned int)(*vi++)); 4418 4419 /* 4x4 Matrix-Vector product with negative accumulation: */ 4420 SSE_INLINE_BEGIN_2(&t[jdx],v) 4421 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4422 4423 /* First Column */ 4424 SSE_COPY_PS(XMM0,XMM6) 4425 SSE_SHUFFLE(XMM0,XMM0,0x00) 4426 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4427 SSE_SUB_PS(XMM7,XMM0) 4428 4429 /* Second Column */ 4430 SSE_COPY_PS(XMM1,XMM6) 4431 SSE_SHUFFLE(XMM1,XMM1,0x55) 4432 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4433 SSE_SUB_PS(XMM7,XMM1) 4434 4435 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4436 4437 /* Third Column */ 4438 SSE_COPY_PS(XMM2,XMM6) 4439 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4440 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4441 SSE_SUB_PS(XMM7,XMM2) 4442 4443 /* Fourth Column */ 4444 SSE_COPY_PS(XMM3,XMM6) 4445 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4446 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4447 SSE_SUB_PS(XMM7,XMM3) 4448 SSE_INLINE_END_2 4449 4450 v += 16; 4451 } 4452 v = aa + 16*ai[++i]; 4453 PREFETCH_NTA(v); 4454 STORE_PS(&t[idx],XMM7); 4455 } 4456 4457 /* Backward solve the upper triangular factor.*/ 4458 4459 idt = 4*(n-1); 4460 ai16 = 16*diag[n-1]; 4461 v = aa + ai16 + 16; 4462 for (i=n-1; i>=0; ) { 4463 
PREFETCH_NTA(&v[8]); 4464 vi = aj + diag[i] + 1; 4465 nz = ai[i+1] - diag[i] - 1; 4466 4467 LOAD_PS(&t[idt],XMM7); 4468 4469 while (nz--) { 4470 PREFETCH_NTA(&v[16]); 4471 idx = 4*((unsigned int)(*vi++)); 4472 4473 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4474 SSE_INLINE_BEGIN_2(&t[idx],v) 4475 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4476 4477 /* First Column */ 4478 SSE_COPY_PS(XMM0,XMM6) 4479 SSE_SHUFFLE(XMM0,XMM0,0x00) 4480 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4481 SSE_SUB_PS(XMM7,XMM0) 4482 4483 /* Second Column */ 4484 SSE_COPY_PS(XMM1,XMM6) 4485 SSE_SHUFFLE(XMM1,XMM1,0x55) 4486 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4487 SSE_SUB_PS(XMM7,XMM1) 4488 4489 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4490 4491 /* Third Column */ 4492 SSE_COPY_PS(XMM2,XMM6) 4493 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4494 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4495 SSE_SUB_PS(XMM7,XMM2) 4496 4497 /* Fourth Column */ 4498 SSE_COPY_PS(XMM3,XMM6) 4499 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4500 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4501 SSE_SUB_PS(XMM7,XMM3) 4502 SSE_INLINE_END_2 4503 v += 16; 4504 } 4505 v = aa + ai16; 4506 ai16 = 16*diag[--i]; 4507 PREFETCH_NTA(aa+ai16+16); 4508 /* 4509 Scale the result by the diagonal 4x4 block, 4510 which was inverted as part of the factorization 4511 */ 4512 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4513 /* First Column */ 4514 SSE_COPY_PS(XMM0,XMM7) 4515 SSE_SHUFFLE(XMM0,XMM0,0x00) 4516 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4517 4518 /* Second Column */ 4519 SSE_COPY_PS(XMM1,XMM7) 4520 SSE_SHUFFLE(XMM1,XMM1,0x55) 4521 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4522 SSE_ADD_PS(XMM0,XMM1) 4523 4524 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4525 4526 /* Third Column */ 4527 SSE_COPY_PS(XMM2,XMM7) 4528 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4529 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4530 SSE_ADD_PS(XMM0,XMM2) 4531 4532 /* Fourth Column */ 4533 SSE_COPY_PS(XMM3,XMM7) 4534 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4535 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4536 SSE_ADD_PS(XMM0,XMM3) 4537 4538 
SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4539 SSE_INLINE_END_3 4540 4541 v = aa + ai16 + 16; 4542 idt -= 4; 4543 } 4544 4545 /* Convert t from single precision back to double precision (inplace)*/ 4546 idt = 4*(n-1); 4547 for (i=n-1; i>=0; i--) { 4548 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4549 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4550 PetscScalar *xtemp=&x[idt]; 4551 MatScalar *ttemp=&t[idt]; 4552 xtemp[3] = (PetscScalar)ttemp[3]; 4553 xtemp[2] = (PetscScalar)ttemp[2]; 4554 xtemp[1] = (PetscScalar)ttemp[1]; 4555 xtemp[0] = (PetscScalar)ttemp[0]; 4556 idt -= 4; 4557 } 4558 4559 } /* End of artificial scope. */ 4560 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4561 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4562 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4563 SSE_SCOPE_END; 4564 PetscFunctionReturn(0); 4565 } 4566 4567 #undef __FUNCT__ 4568 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4569 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 4570 { 4571 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4572 int *aj=a->j; 4573 PetscErrorCode ierr; 4574 int *ai=a->i,n=a->mbs,*diag = a->diag; 4575 MatScalar *aa=a->a; 4576 PetscScalar *x,*b; 4577 4578 PetscFunctionBegin; 4579 SSE_SCOPE_BEGIN; 4580 /* 4581 Note: This code currently uses demotion of double 4582 to float when performing the mixed-mode computation. 4583 This may not be numerically reasonable for all applications. 4584 */ 4585 PREFETCH_NTA(aa+16*ai[1]); 4586 4587 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4588 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4589 { 4590 /* x will first be computed in single precision then promoted inplace to double */ 4591 MatScalar *v,*t=(MatScalar*)x; 4592 int nz,i,idt,ai16; 4593 int jdx,idx; 4594 int *vi; 4595 /* Forward solve the lower triangular factor. */ 4596 4597 /* First block is the identity. 
*/ 4598 idx = 0; 4599 CONVERT_DOUBLE4_FLOAT4(t,b); 4600 v = aa + 16*ai[1]; 4601 4602 for (i=1; i<n; ) { 4603 PREFETCH_NTA(&v[8]); 4604 vi = aj + ai[i]; 4605 nz = diag[i] - ai[i]; 4606 idx += 4; 4607 4608 /* Demote RHS from double to float. */ 4609 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4610 LOAD_PS(&t[idx],XMM7); 4611 4612 while (nz--) { 4613 PREFETCH_NTA(&v[16]); 4614 jdx = 4*(*vi++); 4615 /* jdx = *vi++; */ 4616 4617 /* 4x4 Matrix-Vector product with negative accumulation: */ 4618 SSE_INLINE_BEGIN_2(&t[jdx],v) 4619 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4620 4621 /* First Column */ 4622 SSE_COPY_PS(XMM0,XMM6) 4623 SSE_SHUFFLE(XMM0,XMM0,0x00) 4624 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4625 SSE_SUB_PS(XMM7,XMM0) 4626 4627 /* Second Column */ 4628 SSE_COPY_PS(XMM1,XMM6) 4629 SSE_SHUFFLE(XMM1,XMM1,0x55) 4630 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4631 SSE_SUB_PS(XMM7,XMM1) 4632 4633 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4634 4635 /* Third Column */ 4636 SSE_COPY_PS(XMM2,XMM6) 4637 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4638 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4639 SSE_SUB_PS(XMM7,XMM2) 4640 4641 /* Fourth Column */ 4642 SSE_COPY_PS(XMM3,XMM6) 4643 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4644 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4645 SSE_SUB_PS(XMM7,XMM3) 4646 SSE_INLINE_END_2 4647 4648 v += 16; 4649 } 4650 v = aa + 16*ai[++i]; 4651 PREFETCH_NTA(v); 4652 STORE_PS(&t[idx],XMM7); 4653 } 4654 4655 /* Backward solve the upper triangular factor.*/ 4656 4657 idt = 4*(n-1); 4658 ai16 = 16*diag[n-1]; 4659 v = aa + ai16 + 16; 4660 for (i=n-1; i>=0; ) { 4661 PREFETCH_NTA(&v[8]); 4662 vi = aj + diag[i] + 1; 4663 nz = ai[i+1] - diag[i] - 1; 4664 4665 LOAD_PS(&t[idt],XMM7); 4666 4667 while (nz--) { 4668 PREFETCH_NTA(&v[16]); 4669 idx = 4*(*vi++); 4670 /* idx = *vi++; */ 4671 4672 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4673 SSE_INLINE_BEGIN_2(&t[idx],v) 4674 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4675 4676 /* First Column */ 4677 SSE_COPY_PS(XMM0,XMM6) 4678 SSE_SHUFFLE(XMM0,XMM0,0x00) 
4679 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4680 SSE_SUB_PS(XMM7,XMM0) 4681 4682 /* Second Column */ 4683 SSE_COPY_PS(XMM1,XMM6) 4684 SSE_SHUFFLE(XMM1,XMM1,0x55) 4685 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4686 SSE_SUB_PS(XMM7,XMM1) 4687 4688 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4689 4690 /* Third Column */ 4691 SSE_COPY_PS(XMM2,XMM6) 4692 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4693 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4694 SSE_SUB_PS(XMM7,XMM2) 4695 4696 /* Fourth Column */ 4697 SSE_COPY_PS(XMM3,XMM6) 4698 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4699 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4700 SSE_SUB_PS(XMM7,XMM3) 4701 SSE_INLINE_END_2 4702 v += 16; 4703 } 4704 v = aa + ai16; 4705 ai16 = 16*diag[--i]; 4706 PREFETCH_NTA(aa+ai16+16); 4707 /* 4708 Scale the result by the diagonal 4x4 block, 4709 which was inverted as part of the factorization 4710 */ 4711 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4712 /* First Column */ 4713 SSE_COPY_PS(XMM0,XMM7) 4714 SSE_SHUFFLE(XMM0,XMM0,0x00) 4715 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4716 4717 /* Second Column */ 4718 SSE_COPY_PS(XMM1,XMM7) 4719 SSE_SHUFFLE(XMM1,XMM1,0x55) 4720 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4721 SSE_ADD_PS(XMM0,XMM1) 4722 4723 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4724 4725 /* Third Column */ 4726 SSE_COPY_PS(XMM2,XMM7) 4727 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4728 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4729 SSE_ADD_PS(XMM0,XMM2) 4730 4731 /* Fourth Column */ 4732 SSE_COPY_PS(XMM3,XMM7) 4733 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4734 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4735 SSE_ADD_PS(XMM0,XMM3) 4736 4737 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4738 SSE_INLINE_END_3 4739 4740 v = aa + ai16 + 16; 4741 idt -= 4; 4742 } 4743 4744 /* Convert t from single precision back to double precision (inplace)*/ 4745 idt = 4*(n-1); 4746 for (i=n-1; i>=0; i--) { 4747 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4748 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 4749 PetscScalar *xtemp=&x[idt]; 4750 MatScalar *ttemp=&t[idt]; 4751 xtemp[3] = (PetscScalar)ttemp[3]; 4752 xtemp[2] = (PetscScalar)ttemp[2]; 4753 xtemp[1] = (PetscScalar)ttemp[1]; 4754 xtemp[0] = (PetscScalar)ttemp[0]; 4755 idt -= 4; 4756 } 4757 4758 } /* End of artificial scope. */ 4759 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4760 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4761 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4762 SSE_SCOPE_END; 4763 PetscFunctionReturn(0); 4764 } 4765 4766 #endif 4767 4768 #undef __FUNCT__ 4769 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 4770 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 4771 { 4772 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 4773 IS iscol=a->col,isrow=a->row; 4774 PetscErrorCode ierr; 4775 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4776 PetscInt i,nz,idx,idt,idc; 4777 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4778 const MatScalar *aa=a->a,*v; 4779 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4780 const PetscScalar *b; 4781 4782 PetscFunctionBegin; 4783 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4784 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4785 t = a->solve_work; 4786 4787 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4788 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4789 4790 /* forward solve the lower triangular */ 4791 idx = 3*(*r++); 4792 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4793 for (i=1; i<n; i++) { 4794 v = aa + 9*ai[i]; 4795 vi = aj + ai[i]; 4796 nz = diag[i] - ai[i]; 4797 idx = 3*(*r++); 4798 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4799 while (nz--) { 4800 idx = 3*(*vi++); 4801 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4802 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4803 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4804 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4805 v += 9; 4806 } 4807 idx = 3*i; 4808 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4809 } 4810 /* backward solve the upper triangular */ 4811 for (i=n-1; i>=0; i--) { 
4812 v = aa + 9*diag[i] + 9; 4813 vi = aj + diag[i] + 1; 4814 nz = ai[i+1] - diag[i] - 1; 4815 idt = 3*i; 4816 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4817 while (nz--) { 4818 idx = 3*(*vi++); 4819 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4820 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4821 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4822 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4823 v += 9; 4824 } 4825 idc = 3*(*c--); 4826 v = aa + 9*diag[i]; 4827 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4828 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4829 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4830 } 4831 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4832 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4833 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4834 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4835 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4836 PetscFunctionReturn(0); 4837 } 4838 4839 #undef __FUNCT__ 4840 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4841 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4842 { 4843 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 4844 IS iscol=a->col,isrow=a->row; 4845 PetscErrorCode ierr; 4846 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4847 PetscInt i,nz,idx,idt,idc,m; 4848 const PetscInt *r,*c,*rout,*cout; 4849 const MatScalar *aa=a->a,*v; 4850 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4851 const PetscScalar *b; 4852 4853 PetscFunctionBegin; 4854 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4855 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4856 t = a->solve_work; 4857 4858 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4859 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4860 4861 /* forward solve the lower triangular */ 4862 idx = 3*r[0]; 4863 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4864 for (i=1; i<n; i++) { 4865 v = aa + 9*ai[i]; 4866 vi = aj + ai[i]; 4867 nz = ai[i+1] - ai[i]; 4868 idx = 3*r[i]; 4869 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4870 for (m=0; 
m<nz; m++) { 4871 idx = 3*vi[m]; 4872 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4873 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4874 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4875 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4876 v += 9; 4877 } 4878 idx = 3*i; 4879 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4880 } 4881 /* backward solve the upper triangular */ 4882 for (i=n-1; i>=0; i--) { 4883 v = aa + 9*(adiag[i+1]+1); 4884 vi = aj + adiag[i+1]+1; 4885 nz = adiag[i] - adiag[i+1] - 1; 4886 idt = 3*i; 4887 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4888 for (m=0; m<nz; m++) { 4889 idx = 3*vi[m]; 4890 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4891 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4892 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4893 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4894 v += 9; 4895 } 4896 idc = 3*c[i]; 4897 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4898 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4899 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4900 } 4901 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4902 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4903 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4904 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4905 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4906 PetscFunctionReturn(0); 4907 } 4908 4909 /* 4910 Special case where the matrix was ILU(0) factored in the natural 4911 ordering. This eliminates the need for the column and row permutation. 
4912 */ 4913 #undef __FUNCT__ 4914 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 4915 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4916 { 4917 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4918 const PetscInt n =a->mbs,*ai=a->i,*aj=a->j; 4919 PetscErrorCode ierr; 4920 const PetscInt *diag = a->diag,*vi; 4921 const MatScalar *aa =a->a,*v; 4922 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4923 const PetscScalar *b; 4924 PetscInt jdx,idt,idx,nz,i; 4925 4926 PetscFunctionBegin; 4927 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4928 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4929 4930 /* forward solve the lower triangular */ 4931 idx = 0; 4932 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4933 for (i=1; i<n; i++) { 4934 v = aa + 9*ai[i]; 4935 vi = aj + ai[i]; 4936 nz = diag[i] - ai[i]; 4937 idx += 3; 4938 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4939 while (nz--) { 4940 jdx = 3*(*vi++); 4941 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4942 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4943 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4944 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4945 v += 9; 4946 } 4947 x[idx] = s1; 4948 x[1+idx] = s2; 4949 x[2+idx] = s3; 4950 } 4951 /* backward solve the upper triangular */ 4952 for (i=n-1; i>=0; i--) { 4953 v = aa + 9*diag[i] + 9; 4954 vi = aj + diag[i] + 1; 4955 nz = ai[i+1] - diag[i] - 1; 4956 idt = 3*i; 4957 s1 = x[idt]; s2 = x[1+idt]; 4958 s3 = x[2+idt]; 4959 while (nz--) { 4960 idx = 3*(*vi++); 4961 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4962 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4963 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4964 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4965 v += 9; 4966 } 4967 v = aa + 9*diag[i]; 4968 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4969 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4970 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4971 } 4972 4973 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4974 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4975 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4976 
PetscFunctionReturn(0); 4977 } 4978 4979 #undef __FUNCT__ 4980 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4981 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4982 { 4983 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4984 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4985 PetscErrorCode ierr; 4986 PetscInt i,k,nz,idx,jdx,idt; 4987 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4988 const MatScalar *aa=a->a,*v; 4989 PetscScalar *x; 4990 const PetscScalar *b; 4991 PetscScalar s1,s2,s3,x1,x2,x3; 4992 4993 PetscFunctionBegin; 4994 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4995 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4996 /* forward solve the lower triangular */ 4997 idx = 0; 4998 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4999 for (i=1; i<n; i++) { 5000 v = aa + bs2*ai[i]; 5001 vi = aj + ai[i]; 5002 nz = ai[i+1] - ai[i]; 5003 idx = bs*i; 5004 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 5005 for (k=0; k<nz; k++) { 5006 jdx = bs*vi[k]; 5007 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 5008 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5009 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5010 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5011 5012 v += bs2; 5013 } 5014 5015 x[idx] = s1; 5016 x[1+idx] = s2; 5017 x[2+idx] = s3; 5018 } 5019 5020 /* backward solve the upper triangular */ 5021 for (i=n-1; i>=0; i--) { 5022 v = aa + bs2*(adiag[i+1]+1); 5023 vi = aj + adiag[i+1]+1; 5024 nz = adiag[i] - adiag[i+1]-1; 5025 idt = bs*i; 5026 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 5027 5028 for (k=0; k<nz; k++) { 5029 idx = bs*vi[k]; 5030 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 5031 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5032 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5033 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5034 5035 v += bs2; 5036 } 5037 /* x = inv_diagonal*x */ 5038 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5039 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5040 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5041 5042 } 5043 5044 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5045 ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 5046 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5047 PetscFunctionReturn(0); 5048 } 5049 5050 #undef __FUNCT__ 5051 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 5052 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 5053 { 5054 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 5055 IS iscol=a->col,isrow=a->row; 5056 PetscErrorCode ierr; 5057 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5058 PetscInt i,nz,idx,idt,idc; 5059 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5060 const MatScalar *aa=a->a,*v; 5061 PetscScalar *x,s1,s2,x1,x2,*t; 5062 const PetscScalar *b; 5063 5064 PetscFunctionBegin; 5065 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5066 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5067 t = a->solve_work; 5068 5069 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5070 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5071 5072 /* forward solve the lower triangular */ 5073 idx = 2*(*r++); 5074 t[0] = b[idx]; t[1] = b[1+idx]; 5075 for (i=1; i<n; i++) { 5076 v = aa + 4*ai[i]; 5077 vi = aj + ai[i]; 5078 nz = diag[i] - ai[i]; 5079 idx = 2*(*r++); 5080 s1 = b[idx]; s2 = b[1+idx]; 5081 while (nz--) { 5082 idx = 2*(*vi++); 5083 x1 = t[idx]; x2 = t[1+idx]; 5084 s1 -= v[0]*x1 + v[2]*x2; 5085 s2 -= v[1]*x1 + v[3]*x2; 5086 v += 4; 5087 } 5088 idx = 2*i; 5089 t[idx] = s1; t[1+idx] = s2; 5090 } 5091 /* backward solve the upper triangular */ 5092 for (i=n-1; i>=0; i--) { 5093 v = aa + 4*diag[i] + 4; 5094 vi = aj + diag[i] + 1; 5095 nz = ai[i+1] - diag[i] - 1; 5096 idt = 2*i; 5097 s1 = t[idt]; s2 = t[1+idt]; 5098 while (nz--) { 5099 idx = 2*(*vi++); 5100 x1 = t[idx]; x2 = t[1+idx]; 5101 s1 -= v[0]*x1 + v[2]*x2; 5102 s2 -= v[1]*x1 + v[3]*x2; 5103 v += 4; 5104 } 5105 idc = 2*(*c--); 5106 v = aa + 4*diag[i]; 5107 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5108 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5109 } 5110 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5111 ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5112 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5113 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5114 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5115 PetscFunctionReturn(0); 5116 } 5117 5118 #undef __FUNCT__ 5119 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 5120 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 5121 { 5122 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 5123 IS iscol=a->col,isrow=a->row; 5124 PetscErrorCode ierr; 5125 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5126 PetscInt i,nz,idx,jdx,idt,idc,m; 5127 const PetscInt *r,*c,*rout,*cout; 5128 const MatScalar *aa=a->a,*v; 5129 PetscScalar *x,s1,s2,x1,x2,*t; 5130 const PetscScalar *b; 5131 5132 PetscFunctionBegin; 5133 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5134 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5135 t = a->solve_work; 5136 5137 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5138 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5139 5140 /* forward solve the lower triangular */ 5141 idx = 2*r[0]; 5142 t[0] = b[idx]; t[1] = b[1+idx]; 5143 for (i=1; i<n; i++) { 5144 v = aa + 4*ai[i]; 5145 vi = aj + ai[i]; 5146 nz = ai[i+1] - ai[i]; 5147 idx = 2*r[i]; 5148 s1 = b[idx]; s2 = b[1+idx]; 5149 for (m=0; m<nz; m++) { 5150 jdx = 2*vi[m]; 5151 x1 = t[jdx]; x2 = t[1+jdx]; 5152 s1 -= v[0]*x1 + v[2]*x2; 5153 s2 -= v[1]*x1 + v[3]*x2; 5154 v += 4; 5155 } 5156 idx = 2*i; 5157 t[idx] = s1; t[1+idx] = s2; 5158 } 5159 /* backward solve the upper triangular */ 5160 for (i=n-1; i>=0; i--) { 5161 v = aa + 4*(adiag[i+1]+1); 5162 vi = aj + adiag[i+1]+1; 5163 nz = adiag[i] - adiag[i+1] - 1; 5164 idt = 2*i; 5165 s1 = t[idt]; s2 = t[1+idt]; 5166 for (m=0; m<nz; m++) { 5167 idx = 2*vi[m]; 5168 x1 = t[idx]; x2 = t[1+idx]; 5169 s1 -= v[0]*x1 + v[2]*x2; 5170 s2 -= v[1]*x1 + v[3]*x2; 5171 v += 4; 5172 } 5173 idc = 2*c[i]; 5174 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5175 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5176 } 
5177 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5178 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5179 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5180 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5181 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5182 PetscFunctionReturn(0); 5183 } 5184 5185 /* 5186 Special case where the matrix was ILU(0) factored in the natural 5187 ordering. This eliminates the need for the column and row permutation. 5188 */ 5189 #undef __FUNCT__ 5190 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 5191 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5192 { 5193 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5194 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5195 PetscErrorCode ierr; 5196 const MatScalar *aa=a->a,*v; 5197 PetscScalar *x,s1,s2,x1,x2; 5198 const PetscScalar *b; 5199 PetscInt jdx,idt,idx,nz,i; 5200 5201 PetscFunctionBegin; 5202 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5203 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5204 5205 /* forward solve the lower triangular */ 5206 idx = 0; 5207 x[0] = b[0]; x[1] = b[1]; 5208 for (i=1; i<n; i++) { 5209 v = aa + 4*ai[i]; 5210 vi = aj + ai[i]; 5211 nz = diag[i] - ai[i]; 5212 idx += 2; 5213 s1 = b[idx];s2 = b[1+idx]; 5214 while (nz--) { 5215 jdx = 2*(*vi++); 5216 x1 = x[jdx];x2 = x[1+jdx]; 5217 s1 -= v[0]*x1 + v[2]*x2; 5218 s2 -= v[1]*x1 + v[3]*x2; 5219 v += 4; 5220 } 5221 x[idx] = s1; 5222 x[1+idx] = s2; 5223 } 5224 /* backward solve the upper triangular */ 5225 for (i=n-1; i>=0; i--) { 5226 v = aa + 4*diag[i] + 4; 5227 vi = aj + diag[i] + 1; 5228 nz = ai[i+1] - diag[i] - 1; 5229 idt = 2*i; 5230 s1 = x[idt]; s2 = x[1+idt]; 5231 while (nz--) { 5232 idx = 2*(*vi++); 5233 x1 = x[idx]; x2 = x[1+idx]; 5234 s1 -= v[0]*x1 + v[2]*x2; 5235 s2 -= v[1]*x1 + v[3]*x2; 5236 v += 4; 5237 } 5238 v = aa + 4*diag[i]; 5239 x[idt] = v[0]*s1 + v[2]*s2; 5240 x[1+idt] = v[1]*s1 + v[3]*s2; 5241 } 5242 5243 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5244 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5245 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5246 PetscFunctionReturn(0); 5247 } 5248 5249 #undef __FUNCT__ 5250 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 5251 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5252 { 5253 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5254 const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5255 PetscInt i,k,nz,idx,idt,jdx; 5256 PetscErrorCode ierr; 5257 const MatScalar *aa=a->a,*v; 5258 PetscScalar *x,s1,s2,x1,x2; 5259 const PetscScalar *b; 5260 5261 PetscFunctionBegin; 5262 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5263 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5264 /* forward solve the lower triangular */ 5265 idx = 0; 5266 x[0] = b[idx]; x[1] = b[1+idx]; 5267 for (i=1; i<n; i++) { 5268 v = aa + 4*ai[i]; 5269 vi = aj + ai[i]; 5270 nz = ai[i+1] - ai[i]; 5271 idx = 2*i; 5272 s1 = b[idx];s2 = b[1+idx]; 5273 PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 5274 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5275 for (k=0; k<nz; k++) { 5276 jdx = 2*vi[k]; 5277 x1 = x[jdx];x2 = x[1+jdx]; 5278 s1 -= v[0]*x1 + v[2]*x2; 5279 s2 -= v[1]*x1 + v[3]*x2; 5280 v += 4; 5281 } 5282 x[idx] = s1; 5283 x[1+idx] = s2; 5284 } 5285 5286 /* backward solve the upper triangular */ 5287 for (i=n-1; i>=0; i--) { 5288 v = aa + 4*(adiag[i+1]+1); 5289 vi = aj + adiag[i+1]+1; 5290 nz = adiag[i] - adiag[i+1]-1; 5291 idt = 2*i; 5292 s1 = x[idt]; s2 = x[1+idt]; 5293 PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 5294 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5295 for (k=0; k<nz; k++) { 5296 idx = 2*vi[k]; 5297 x1 = x[idx]; x2 = x[1+idx]; 5298 s1 -= v[0]*x1 + v[2]*x2; 5299 s2 -= v[1]*x1 + v[3]*x2; 5300 v += 4; 5301 } 5302 /* x = inv_diagonal*x */ 5303 x[idt] = v[0]*s1 + v[2]*s2; 5304 x[1+idt] = v[1]*s1 + v[3]*s2; 5305 } 5306 5307 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5308 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5309 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5310 PetscFunctionReturn(0); 5311 } 5312 5313 #undef __FUNCT__ 5314 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 5315 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 5316 { 5317 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 5318 IS iscol=a->col,isrow=a->row; 5319 PetscErrorCode ierr; 5320 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5321 PetscInt i,nz; 5322 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5323 const MatScalar *aa=a->a,*v; 5324 PetscScalar *x,s1,*t; 5325 const PetscScalar *b; 5326 5327 PetscFunctionBegin; 5328 if (!n) PetscFunctionReturn(0); 5329 5330 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5331 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5332 t = a->solve_work; 5333 5334 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5335 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5336 5337 /* forward solve the lower triangular */ 5338 t[0] = b[*r++]; 5339 for (i=1; i<n; i++) { 5340 v = aa + ai[i]; 5341 vi = aj + ai[i]; 5342 nz = diag[i] - ai[i]; 5343 s1 = b[*r++]; 5344 while (nz--) { 5345 s1 -= (*v++)*t[*vi++]; 5346 } 5347 t[i] = s1; 5348 } 5349 /* backward solve the upper triangular */ 5350 for (i=n-1; i>=0; i--) { 5351 v = aa + diag[i] + 1; 5352 vi = aj + diag[i] + 1; 5353 nz = ai[i+1] - diag[i] - 1; 5354 s1 = t[i]; 5355 while (nz--) { 5356 s1 -= (*v++)*t[*vi++]; 5357 } 5358 x[*c--] = t[i] = aa[diag[i]]*s1; 5359 } 5360 5361 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5362 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5363 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5364 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5365 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5366 PetscFunctionReturn(0); 5367 } 5368 5369 #undef __FUNCT__ 5370 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 5371 PetscErrorCode MatSolve_SeqBAIJ_1(Mat 
A,Vec bb,Vec xx) 5372 { 5373 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5374 IS iscol = a->col,isrow = a->row; 5375 PetscErrorCode ierr; 5376 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5377 const PetscInt *rout,*cout,*r,*c; 5378 PetscScalar *x,*tmp,sum; 5379 const PetscScalar *b; 5380 const MatScalar *aa = a->a,*v; 5381 5382 PetscFunctionBegin; 5383 if (!n) PetscFunctionReturn(0); 5384 5385 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5386 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5387 tmp = a->solve_work; 5388 5389 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5390 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5391 5392 /* forward solve the lower triangular */ 5393 tmp[0] = b[r[0]]; 5394 v = aa; 5395 vi = aj; 5396 for (i=1; i<n; i++) { 5397 nz = ai[i+1] - ai[i]; 5398 sum = b[r[i]]; 5399 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5400 tmp[i] = sum; 5401 v += nz; vi += nz; 5402 } 5403 5404 /* backward solve the upper triangular */ 5405 for (i=n-1; i>=0; i--) { 5406 v = aa + adiag[i+1]+1; 5407 vi = aj + adiag[i+1]+1; 5408 nz = adiag[i]-adiag[i+1]-1; 5409 sum = tmp[i]; 5410 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5411 x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5412 } 5413 5414 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5415 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5416 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5417 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5418 ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5419 PetscFunctionReturn(0); 5420 } 5421 5422 /* 5423 Special case where the matrix was ILU(0) factored in the natural 5424 ordering. This eliminates the need for the column and row permutation. 
5425 */ 5426 #undef __FUNCT__ 5427 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 5428 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5429 { 5430 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5431 const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5432 PetscErrorCode ierr; 5433 const MatScalar *aa=a->a,*v; 5434 PetscScalar *x; 5435 const PetscScalar *b; 5436 PetscScalar s1,x1; 5437 PetscInt jdx,idt,idx,nz,i; 5438 5439 PetscFunctionBegin; 5440 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5441 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5442 5443 /* forward solve the lower triangular */ 5444 idx = 0; 5445 x[0] = b[0]; 5446 for (i=1; i<n; i++) { 5447 v = aa + ai[i]; 5448 vi = aj + ai[i]; 5449 nz = diag[i] - ai[i]; 5450 idx += 1; 5451 s1 = b[idx]; 5452 while (nz--) { 5453 jdx = *vi++; 5454 x1 = x[jdx]; 5455 s1 -= v[0]*x1; 5456 v += 1; 5457 } 5458 x[idx] = s1; 5459 } 5460 /* backward solve the upper triangular */ 5461 for (i=n-1; i>=0; i--) { 5462 v = aa + diag[i] + 1; 5463 vi = aj + diag[i] + 1; 5464 nz = ai[i+1] - diag[i] - 1; 5465 idt = i; 5466 s1 = x[idt]; 5467 while (nz--) { 5468 idx = *vi++; 5469 x1 = x[idx]; 5470 s1 -= v[0]*x1; 5471 v += 1; 5472 } 5473 v = aa + diag[i]; 5474 x[idt] = v[0]*s1; 5475 } 5476 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5477 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5478 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5479 PetscFunctionReturn(0); 5480 } 5481 5482 5483 #undef __FUNCT__ 5484 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5485 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5486 { 5487 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5488 PetscErrorCode ierr; 5489 const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5490 PetscScalar *x,sum; 5491 const PetscScalar *b; 5492 const MatScalar *aa = a->a,*v; 5493 PetscInt i,nz; 5494 5495 PetscFunctionBegin; 5496 if (!n) PetscFunctionReturn(0); 5497 5498 ierr = 
VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5499 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5500 5501 /* forward solve the lower triangular */ 5502 x[0] = b[0]; 5503 v = aa; 5504 vi = aj; 5505 for (i=1; i<n; i++) { 5506 nz = ai[i+1] - ai[i]; 5507 sum = b[i]; 5508 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5509 v += nz; 5510 vi += nz; 5511 x[i] = sum; 5512 } 5513 5514 /* backward solve the upper triangular */ 5515 for (i=n-1; i>=0; i--) { 5516 v = aa + adiag[i+1] + 1; 5517 vi = aj + adiag[i+1] + 1; 5518 nz = adiag[i] - adiag[i+1]-1; 5519 sum = x[i]; 5520 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5521 x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5522 } 5523 5524 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 5525 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5526 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5527 PetscFunctionReturn(0); 5528 } 5529 5530 /* ----------------------------------------------------------------*/ 5531 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool); 5532 5533 #undef __FUNCT__ 5534 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5535 /* 5536 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5537 */ 5538 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 5539 { 5540 Mat C =B; 5541 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 5542 PetscErrorCode ierr; 5543 PetscInt i,j,k,ipvt[15]; 5544 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5545 PetscInt nz,nzL,row; 5546 MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5547 const MatScalar *v,*aa=a->a; 5548 PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 5549 PetscInt sol_ver; 5550 5551 PetscFunctionBegin; 5552 ierr = PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 5553 5554 /* generate work space needed by the factorization */ 5555 ierr 
= PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 5556 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5557 5558 for (i=0; i<n; i++) { 5559 /* zero rtmp */ 5560 /* L part */ 5561 nz = bi[i+1] - bi[i]; 5562 bjtmp = bj + bi[i]; 5563 for (j=0; j<nz; j++) { 5564 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5565 } 5566 5567 /* U part */ 5568 nz = bdiag[i] - bdiag[i+1]; 5569 bjtmp = bj + bdiag[i+1]+1; 5570 for (j=0; j<nz; j++) { 5571 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5572 } 5573 5574 /* load in initial (unfactored row) */ 5575 nz = ai[i+1] - ai[i]; 5576 ajtmp = aj + ai[i]; 5577 v = aa + bs2*ai[i]; 5578 for (j=0; j<nz; j++) { 5579 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5580 } 5581 5582 /* elimination */ 5583 bjtmp = bj + bi[i]; 5584 nzL = bi[i+1] - bi[i]; 5585 for (k=0; k < nzL; k++) { 5586 row = bjtmp[k]; 5587 pc = rtmp + bs2*row; 5588 for (flg=0,j=0; j<bs2; j++) { 5589 if (pc[j]!=0.0) { 5590 flg = 1; 5591 break; 5592 } 5593 } 5594 if (flg) { 5595 pv = b->a + bs2*bdiag[row]; 5596 PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); 5597 /*ierr = PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 5598 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5599 pv = b->a + bs2*(bdiag[row+1]+1); 5600 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5601 for (j=0; j<nz; j++) { 5602 vv = rtmp + bs2*pj[j]; 5603 PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 5604 /* ierr = PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 5605 pv += bs2; 5606 } 5607 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5608 } 5609 } 5610 5611 /* finished row so stick it into b->a */ 5612 /* L part */ 5613 pv = b->a + bs2*bi[i]; 5614 pj = b->j + bi[i]; 5615 nz = bi[i+1] - bi[i]; 5616 for (j=0; j<nz; j++) { 5617 ierr = 
PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5618 } 5619 5620 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5621 pv = b->a + bs2*bdiag[i]; 5622 pj = b->j + bdiag[i]; 5623 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5624 /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */ 5625 ierr = PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 5626 5627 /* U part */ 5628 pv = b->a + bs2*(bdiag[i+1]+1); 5629 pj = b->j + bdiag[i+1]+1; 5630 nz = bdiag[i] - bdiag[i+1] - 1; 5631 for (j=0; j<nz; j++) { 5632 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5633 } 5634 } 5635 5636 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5637 5638 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5639 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 5640 C->assembled = PETSC_TRUE; 5641 5642 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5643 PetscFunctionReturn(0); 5644 } 5645 5646 #undef __FUNCT__ 5647 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 5648 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 5649 { 5650 Mat C =B; 5651 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 5652 IS isrow = b->row,isicol = b->icol; 5653 PetscErrorCode ierr; 5654 const PetscInt *r,*ic; 5655 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5656 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5657 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5658 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5659 MatScalar *v_work; 5660 PetscBool col_identity,row_identity,both_identity; 5661 5662 PetscFunctionBegin; 5663 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5664 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5665 5666 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5667 ierr = 
PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5668 5669 /* generate work space needed by dense LU factorization */ 5670 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5671 5672 for (i=0; i<n; i++) { 5673 /* zero rtmp */ 5674 /* L part */ 5675 nz = bi[i+1] - bi[i]; 5676 bjtmp = bj + bi[i]; 5677 for (j=0; j<nz; j++) { 5678 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5679 } 5680 5681 /* U part */ 5682 nz = bdiag[i] - bdiag[i+1]; 5683 bjtmp = bj + bdiag[i+1]+1; 5684 for (j=0; j<nz; j++) { 5685 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5686 } 5687 5688 /* load in initial (unfactored row) */ 5689 nz = ai[r[i]+1] - ai[r[i]]; 5690 ajtmp = aj + ai[r[i]]; 5691 v = aa + bs2*ai[r[i]]; 5692 for (j=0; j<nz; j++) { 5693 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5694 } 5695 5696 /* elimination */ 5697 bjtmp = bj + bi[i]; 5698 nzL = bi[i+1] - bi[i]; 5699 for (k=0; k < nzL; k++) { 5700 row = bjtmp[k]; 5701 pc = rtmp + bs2*row; 5702 for (flg=0,j=0; j<bs2; j++) { 5703 if (pc[j]!=0.0) { 5704 flg = 1; 5705 break; 5706 } 5707 } 5708 if (flg) { 5709 pv = b->a + bs2*bdiag[row]; 5710 PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5711 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5712 pv = b->a + bs2*(bdiag[row+1]+1); 5713 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5714 for (j=0; j<nz; j++) { 5715 PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5716 } 5717 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5718 } 5719 } 5720 5721 /* finished row so stick it into b->a */ 5722 /* L part */ 5723 pv = b->a + bs2*bi[i]; 5724 pj = b->j + bi[i]; 5725 nz = bi[i+1] - bi[i]; 5726 for (j=0; j<nz; j++) { 5727 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 
5728 } 5729 5730 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5731 pv = b->a + bs2*bdiag[i]; 5732 pj = b->j + bdiag[i]; 5733 /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5734 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5735 ierr = PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5736 5737 /* U part */ 5738 pv = b->a + bs2*(bdiag[i+1]+1); 5739 pj = b->j + bdiag[i+1]+1; 5740 nz = bdiag[i] - bdiag[i+1] - 1; 5741 for (j=0; j<nz; j++) { 5742 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5743 } 5744 } 5745 5746 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5747 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5748 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5749 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5750 5751 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5752 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5753 5754 both_identity = (PetscBool) (row_identity && col_identity); 5755 if (both_identity) { 5756 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5757 } else { 5758 C->ops->solve = MatSolve_SeqBAIJ_N; 5759 } 5760 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5761 5762 C->assembled = PETSC_TRUE; 5763 5764 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5765 PetscFunctionReturn(0); 5766 } 5767 5768 /* 5769 ilu(0) with natural ordering under new data structure. 5770 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 5771 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 
5772 */ 5773 5774 #undef __FUNCT__ 5775 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 5776 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5777 { 5778 5779 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5780 PetscErrorCode ierr; 5781 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5782 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5783 5784 PetscFunctionBegin; 5785 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5786 b = (Mat_SeqBAIJ*)(fact)->data; 5787 5788 /* allocate matrix arrays for new data structure */ 5789 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5790 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5791 5792 b->singlemalloc = PETSC_TRUE; 5793 b->free_a = PETSC_TRUE; 5794 b->free_ij = PETSC_TRUE; 5795 fact->preallocated = PETSC_TRUE; 5796 fact->assembled = PETSC_TRUE; 5797 if (!b->diag) { 5798 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5799 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5800 } 5801 bdiag = b->diag; 5802 5803 if (n > 0) { 5804 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5805 } 5806 5807 /* set bi and bj with new data structure */ 5808 bi = b->i; 5809 bj = b->j; 5810 5811 /* L part */ 5812 bi[0] = 0; 5813 for (i=0; i<n; i++) { 5814 nz = adiag[i] - ai[i]; 5815 bi[i+1] = bi[i] + nz; 5816 aj = a->j + ai[i]; 5817 for (j=0; j<nz; j++) { 5818 *bj = aj[j]; bj++; 5819 } 5820 } 5821 5822 /* U part */ 5823 bi_temp = bi[n]; 5824 bdiag[n] = bi[n]-1; 5825 for (i=n-1; i>=0; i--) { 5826 nz = ai[i+1] - adiag[i] - 1; 5827 bi_temp = bi_temp + nz + 1; 5828 aj = a->j + adiag[i] + 1; 5829 for (j=0; j<nz; j++) { 5830 *bj = aj[j]; bj++; 5831 } 5832 /* diag[i] */ 5833 *bj = i; bj++; 5834 bdiag[i] = bi_temp - 1; 5835 } 5836 
PetscFunctionReturn(0); 5837 } 5838 5839 #undef __FUNCT__ 5840 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5841 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5842 { 5843 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5844 IS isicol; 5845 PetscErrorCode ierr; 5846 const PetscInt *r,*ic; 5847 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5848 PetscInt *bi,*cols,nnz,*cols_lvl; 5849 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5850 PetscInt i,levels,diagonal_fill; 5851 PetscBool col_identity,row_identity,both_identity; 5852 PetscReal f; 5853 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5854 PetscBT lnkbt; 5855 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5856 PetscFreeSpaceList free_space =PETSC_NULL,current_space=PETSC_NULL; 5857 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5858 PetscBool missing; 5859 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5860 5861 PetscFunctionBegin; 5862 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5863 if (bs>1) { /* check shifttype */ 5864 if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 5865 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 5866 } 5867 5868 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5869 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5870 5871 f = info->fill; 5872 levels = (PetscInt)info->levels; 5873 diagonal_fill = (PetscInt)info->diagonal_fill; 5874 5875 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5876 5877 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5878 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5879 5880 both_identity = (PetscBool) (row_identity && col_identity); 5881 5882 if (!levels && both_identity) { 5883 /* special case: ilu(0) 
with natural ordering */ 5884 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5885 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5886 5887 fact->factortype = MAT_FACTOR_ILU; 5888 (fact)->info.factor_mallocs = 0; 5889 (fact)->info.fill_ratio_given = info->fill; 5890 (fact)->info.fill_ratio_needed = 1.0; 5891 5892 b = (Mat_SeqBAIJ*)(fact)->data; 5893 b->row = isrow; 5894 b->col = iscol; 5895 b->icol = isicol; 5896 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5897 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5898 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5899 5900 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5901 PetscFunctionReturn(0); 5902 } 5903 5904 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5905 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5906 5907 /* get new row pointers */ 5908 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5909 bi[0] = 0; 5910 /* bdiag is location of diagonal in factor */ 5911 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5912 bdiag[0] = 0; 5913 5914 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5915 5916 /* create a linked list for storing column indices of the active row */ 5917 nlnk = n + 1; 5918 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5919 5920 /* initial FreeSpace size is f*(ai[n]+1) */ 5921 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5922 current_space = free_space; 5923 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5924 current_space_lvl = free_space_lvl; 5925 5926 for (i=0; i<n; i++) { 5927 nzi = 0; 5928 /* copy current row into linked list */ 5929 nnz = ai[r[i]+1] - ai[r[i]]; 5930 if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering 
%D",r[i],i); 5931 cols = aj + ai[r[i]]; 5932 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5933 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5934 nzi += nlnk; 5935 5936 /* make sure diagonal entry is included */ 5937 if (diagonal_fill && lnk[i] == -1) { 5938 fm = n; 5939 while (lnk[fm] < i) fm = lnk[fm]; 5940 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5941 lnk[fm] = i; 5942 lnk_lvl[i] = 0; 5943 nzi++; dcount++; 5944 } 5945 5946 /* add pivot rows into the active row */ 5947 nzbd = 0; 5948 prow = lnk[n]; 5949 while (prow < i) { 5950 nnz = bdiag[prow]; 5951 cols = bj_ptr[prow] + nnz + 1; 5952 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5953 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5954 5955 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5956 nzi += nlnk; 5957 prow = lnk[prow]; 5958 nzbd++; 5959 } 5960 bdiag[i] = nzbd; 5961 bi[i+1] = bi[i] + nzi; 5962 5963 /* if free space is not available, make more free space */ 5964 if (current_space->local_remaining<nzi) { 5965 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5966 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5967 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5968 reallocs++; 5969 } 5970 5971 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5972 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5973 5974 bj_ptr[i] = current_space->array; 5975 bjlvl_ptr[i] = current_space_lvl->array; 5976 5977 /* make sure the active row i has diagonal entry */ 5978 if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5979 5980 current_space->array += nzi; 5981 current_space->local_used += nzi; 5982 current_space->local_remaining -= nzi; 
5983 5984 current_space_lvl->array += nzi; 5985 current_space_lvl->local_used += nzi; 5986 current_space_lvl->local_remaining -= nzi; 5987 } 5988 5989 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5990 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5991 5992 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5993 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5994 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5995 5996 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5997 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5998 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 5999 6000 #if defined(PETSC_USE_INFO) 6001 { 6002 PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 6003 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 6004 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6005 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 6006 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6007 if (diagonal_fill) { 6008 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 6009 } 6010 } 6011 #endif 6012 6013 /* put together the new matrix */ 6014 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6015 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6016 6017 b = (Mat_SeqBAIJ*)(fact)->data; 6018 b->free_a = PETSC_TRUE; 6019 b->free_ij = PETSC_TRUE; 6020 b->singlemalloc = PETSC_FALSE; 6021 6022 ierr = PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6023 6024 b->j = bj; 6025 b->i = bi; 6026 b->diag = bdiag; 6027 b->free_diag = PETSC_TRUE; 6028 b->ilen = 0; 6029 b->imax = 0; 6030 b->row = isrow; 6031 b->col = iscol; 6032 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6033 ierr = 
PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6034 b->icol = isicol; 6035 6036 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6037 /* In b structure: Free imax, ilen, old a, old j. 6038 Allocate bdiag, solve_work, new a, new j */ 6039 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 6040 b->maxnz = b->nz = bdiag[0]+1; 6041 6042 fact->info.factor_mallocs = reallocs; 6043 fact->info.fill_ratio_given = f; 6044 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 6045 6046 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 6047 PetscFunctionReturn(0); 6048 } 6049 6050 /* 6051 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 6052 except that the data structure of Mat_SeqAIJ is slightly different. 6053 Not a good example of code reuse. 6054 */ 6055 #undef __FUNCT__ 6056 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 6057 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 6058 { 6059 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 6060 IS isicol; 6061 PetscErrorCode ierr; 6062 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 6063 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 6064 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 6065 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 6066 PetscBool col_identity,row_identity,both_identity,flg; 6067 PetscReal f; 6068 6069 PetscFunctionBegin; 6070 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 6071 if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 6072 6073 f = info->fill; 6074 levels = (PetscInt)info->levels; 6075 diagonal_fill = (PetscInt)info->diagonal_fill; 6076 6077 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 6078 6079 
ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6080 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 6081 both_identity = (PetscBool) (row_identity && col_identity); 6082 6083 if (!levels && both_identity) { /* special case copy the nonzero structure */ 6084 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 6085 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6086 6087 fact->factortype = MAT_FACTOR_ILU; 6088 b = (Mat_SeqBAIJ*)fact->data; 6089 b->row = isrow; 6090 b->col = iscol; 6091 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6092 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6093 b->icol = isicol; 6094 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6095 6096 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6097 PetscFunctionReturn(0); 6098 } 6099 6100 /* general case perform the symbolic factorization */ 6101 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 6102 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 6103 6104 /* get new row pointers */ 6105 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 6106 ainew[0] = 0; 6107 /* don't know how many column pointers are needed so estimate */ 6108 jmax = (PetscInt)(f*ai[n] + 1); 6109 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 6110 /* ajfill is level of fill for each fill entry */ 6111 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 6112 /* fill is a linked list of nonzeros in active row */ 6113 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 6114 /* im is level for each filled value */ 6115 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 6116 /* dloc is location of diagonal in factor */ 6117 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 6118 dloc[0] = 0; 6119 for (prow=0; prow<n; prow++) { 6120 6121 /* copy prow into linked list */ 6122 nzf = nz 
= ai[r[prow]+1] - ai[r[prow]]; 6123 if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 6124 xi = aj + ai[r[prow]]; 6125 fill[n] = n; 6126 fill[prow] = -1; /* marker for diagonal entry */ 6127 while (nz--) { 6128 fm = n; 6129 idx = ic[*xi++]; 6130 do { 6131 m = fm; 6132 fm = fill[m]; 6133 } while (fm < idx); 6134 fill[m] = idx; 6135 fill[idx] = fm; 6136 im[idx] = 0; 6137 } 6138 6139 /* make sure diagonal entry is included */ 6140 if (diagonal_fill && fill[prow] == -1) { 6141 fm = n; 6142 while (fill[fm] < prow) fm = fill[fm]; 6143 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6144 fill[fm] = prow; 6145 im[prow] = 0; 6146 nzf++; 6147 dcount++; 6148 } 6149 6150 nzi = 0; 6151 row = fill[n]; 6152 while (row < prow) { 6153 incrlev = im[row] + 1; 6154 nz = dloc[row]; 6155 xi = ajnew + ainew[row] + nz + 1; 6156 flev = ajfill + ainew[row] + nz + 1; 6157 nnz = ainew[row+1] - ainew[row] - nz - 1; 6158 fm = row; 6159 while (nnz-- > 0) { 6160 idx = *xi++; 6161 if (*flev + incrlev > levels) { 6162 flev++; 6163 continue; 6164 } 6165 do { 6166 m = fm; 6167 fm = fill[m]; 6168 } while (fm < idx); 6169 if (fm != idx) { 6170 im[idx] = *flev + incrlev; 6171 fill[m] = idx; 6172 fill[idx] = fm; 6173 fm = idx; 6174 nzf++; 6175 } else if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 6176 flev++; 6177 } 6178 row = fill[row]; 6179 nzi++; 6180 } 6181 /* copy new filled row into permanent storage */ 6182 ainew[prow+1] = ainew[prow] + nzf; 6183 if (ainew[prow+1] > jmax) { 6184 6185 /* estimate how much additional space we will need */ 6186 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6187 /* just double the memory each time */ 6188 PetscInt maxadd = jmax; 6189 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 6190 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 6191 jmax += maxadd; 6192 6193 /* allocate a longer ajnew and ajfill */ 6194 ierr = 
PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6195 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6196 ierr = PetscFree(ajnew);CHKERRQ(ierr); 6197 ajnew = xitmp; 6198 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6199 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6200 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6201 ajfill = xitmp; 6202 reallocate++; /* count how many reallocations are needed */ 6203 } 6204 xitmp = ajnew + ainew[prow]; 6205 flev = ajfill + ainew[prow]; 6206 dloc[prow] = nzi; 6207 fm = fill[n]; 6208 while (nzf--) { 6209 *xitmp++ = fm; 6210 *flev++ = im[fm]; 6211 fm = fill[fm]; 6212 } 6213 /* make sure row has diagonal entry */ 6214 if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 6215 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6216 } 6217 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6218 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 6219 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6220 ierr = PetscFree(fill);CHKERRQ(ierr); 6221 ierr = PetscFree(im);CHKERRQ(ierr); 6222 6223 #if defined(PETSC_USE_INFO) 6224 { 6225 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 6226 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6227 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6228 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6229 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6230 if (diagonal_fill) { 6231 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6232 } 6233 } 6234 #endif 6235 6236 /* put together the new matrix */ 6237 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6238 ierr = 
PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6239 b = (Mat_SeqBAIJ*)fact->data; 6240 6241 b->free_a = PETSC_TRUE; 6242 b->free_ij = PETSC_TRUE; 6243 b->singlemalloc = PETSC_FALSE; 6244 6245 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6246 6247 b->j = ajnew; 6248 b->i = ainew; 6249 for (i=0; i<n; i++) dloc[i] += ainew[i]; 6250 b->diag = dloc; 6251 b->free_diag = PETSC_TRUE; 6252 b->ilen = 0; 6253 b->imax = 0; 6254 b->row = isrow; 6255 b->col = iscol; 6256 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6257 6258 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6259 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6260 b->icol = isicol; 6261 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6262 /* In b structure: Free imax, ilen, old a, old j. 6263 Allocate dloc, solve_work, new a, new j */ 6264 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 6265 b->maxnz = b->nz = ainew[n]; 6266 6267 fact->info.factor_mallocs = reallocate; 6268 fact->info.fill_ratio_given = f; 6269 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 6270 6271 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6272 PetscFunctionReturn(0); 6273 } 6274 6275 #undef __FUNCT__ 6276 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6277 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 6278 { 6279 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */ 6280 /* int i,*AJ=a->j,nz=a->nz; */ 6281 6282 PetscFunctionBegin; 6283 /* Undo Column scaling */ 6284 /* while (nz--) { */ 6285 /* AJ[i] = AJ[i]/4; */ 6286 /* } */ 6287 /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 6288 A->ops->setunfactored = PETSC_NULL; 6289 PetscFunctionReturn(0); 6290 } 6291 6292 #undef __FUNCT__ 6293 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6294 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 6295 { 6296 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 6297 PetscInt *AJ=a->j,nz=a->nz; 6298 unsigned short *aj=(unsigned short*)AJ; 6299 6300 PetscFunctionBegin; 6301 /* Is this really necessary? */ 6302 while (nz--) { 6303 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 6304 } 6305 A->ops->setunfactored = PETSC_NULL; 6306 PetscFunctionReturn(0); 6307 } 6308 6309 6310