1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include "../src/mat/impls/baij/seq/baij.h" 7 #include "../src/mat/blockinvert.h" 8 #include "petscbt.h" 9 #include "../src/mat/utils/freespace.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 16 PetscErrorCode ierr; 17 const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 18 PetscInt i,n = a->mbs,j; 19 PetscInt nz; 20 PetscScalar *x,*tmp,s1; 21 const MatScalar *aa = a->a,*v; 22 const PetscScalar *b; 23 24 PetscFunctionBegin; 25 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 26 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27 tmp = a->solve_work; 28 29 30 /* copy the b into temp work space according to permutation */ 31 for (i=0; i<n; i++) tmp[i] = b[i]; 32 33 /* forward solve the U^T */ 34 for (i=0; i<n; i++) { 35 v = aa + adiag[i+1] + 1; 36 vi = aj + adiag[i+1] + 1; 37 nz = adiag[i] - adiag[i+1] - 1; 38 s1 = tmp[i]; 39 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 40 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 41 tmp[i] = s1; 42 } 43 44 /* backward solve the L^T */ 45 for (i=n-1; i>=0; i--){ 46 v = aa + ai[i]; 47 vi = aj + ai[i]; 48 nz = ai[i+1] - ai[i]; 49 s1 = tmp[i]; 50 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 51 } 52 53 /* copy tmp into x according to permutation */ 54 for (i=0; i<n; i++) x[i] = tmp[i]; 55 56 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 57 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 58 59 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 60 PetscFunctionReturn(0); 61 } 62 63 #undef __FUNCT__ 64 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 65 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 66 { 67 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 68 PetscErrorCode ierr; 69 PetscInt i,nz; 70 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 71 const MatScalar *aa=a->a,*v; 72 PetscScalar s1,*x; 73 const PetscScalar *b; 74 75 PetscFunctionBegin; 76 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 77 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 78 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 79 80 /* forward solve the U^T */ 81 for (i=0; i<n; i++) { 82 83 v = aa + diag[i]; 84 /* multiply by the inverse of the block diagonal */ 85 s1 = (*v++)*x[i]; 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 x[*vi++] -= (*v++)*s1; 90 } 91 x[i] = s1; 92 } 93 /* backward solve the L^T */ 94 for (i=n-1; i>=0; i--){ 95 v = aa + diag[i] - 1; 96 vi = aj + diag[i] - 1; 97 nz = diag[i] - ai[i]; 98 s1 = x[i]; 99 while (nz--) { 100 x[*vi--] -= (*v--)*s1; 101 } 102 } 103 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 104 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 105 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 106 PetscFunctionReturn(0); 107 } 108 109 #undef __FUNCT__ 110 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 111 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 112 { 113 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 114 PetscErrorCode ierr; 115 PetscInt i,nz,idx,idt,oidx; 116 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 117 const MatScalar *aa=a->a,*v; 118 PetscScalar s1,s2,x1,x2,*x; 119 const PetscScalar *b; 120 121 PetscFunctionBegin; 122 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 123 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 124 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 125 126 /* forward solve the U^T */ 127 idx = 0; 128 for (i=0; i<n; i++) { 129 130 v = aa + 4*diag[i]; 131 /* multiply by the inverse of the block diagonal */ 132 x1 = x[idx]; x2 = x[1+idx]; 133 s1 = v[0]*x1 + v[1]*x2; 134 s2 = v[2]*x1 + v[3]*x2; 135 v += 4; 136 137 vi = aj + diag[i] + 1; 138 nz = ai[i+1] - diag[i] - 1; 139 while (nz--) { 140 oidx = 2*(*vi++); 141 x[oidx] -= v[0]*s1 + v[1]*s2; 142 x[oidx+1] -= v[2]*s1 + v[3]*s2; 143 v += 4; 144 } 145 x[idx] = s1;x[1+idx] = s2; 146 idx += 2; 147 } 148 /* backward solve the L^T */ 149 for (i=n-1; i>=0; i--){ 150 v = aa + 4*diag[i] - 4; 151 vi = aj + diag[i] - 1; 152 nz = diag[i] - ai[i]; 153 idt = 2*i; 154 s1 = x[idt]; s2 = x[1+idt]; 155 while (nz--) { 156 idx = 2*(*vi--); 157 x[idx] -= v[0]*s1 + v[1]*s2; 158 x[idx+1] -= v[2]*s1 + v[3]*s2; 159 v -= 4; 160 } 161 } 162 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 163 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 164 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 165 PetscFunctionReturn(0); 166 } 167 168 #undef __FUNCT__ 169 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 170 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 171 { 172 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 173 PetscErrorCode ierr; 174 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 175 PetscInt nz,idx,idt,j,i,oidx; 176 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 177 const MatScalar *aa=a->a,*v; 178 PetscScalar s1,s2,x1,x2,*x; 179 const PetscScalar *b; 180 181 PetscFunctionBegin; 182 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 183 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 184 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 185 186 /* forward solve the U^T */ 187 idx = 0; 188 for (i=0; i<n; i++) { 189 v = aa + bs2*diag[i]; 190 /* multiply by the inverse of the block diagonal */ 191 x1 = x[idx]; x2 = x[1+idx]; 192 s1 = v[0]*x1 + v[1]*x2; 193 s2 = v[2]*x1 + v[3]*x2; 194 v -= bs2; 195 196 vi = aj + diag[i] - 1; 197 nz = diag[i] - diag[i+1] - 1; 198 for(j=0;j>-nz;j--){ 199 oidx = bs*vi[j]; 200 x[oidx] -= v[0]*s1 + v[1]*s2; 201 x[oidx+1] -= v[2]*s1 + v[3]*s2; 202 v -= bs2; 203 } 204 x[idx] = s1;x[1+idx] = s2; 205 idx += bs; 206 } 207 /* backward solve the L^T */ 208 for (i=n-1; i>=0; i--){ 209 v = aa + bs2*ai[i]; 210 vi = aj + ai[i]; 211 nz = ai[i+1] - ai[i]; 212 idt = bs*i; 213 s1 = x[idt]; s2 = x[1+idt]; 214 for(j=0;j<nz;j++){ 215 idx = bs*vi[j]; 216 x[idx] -= v[0]*s1 + v[1]*s2; 217 x[idx+1] -= v[2]*s1 + v[3]*s2; 218 v += bs2; 219 } 220 } 221 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 222 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 223 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 224 PetscFunctionReturn(0); 225 } 226 227 #undef __FUNCT__ 228 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 229 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 230 { 231 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 232 PetscErrorCode ierr; 233 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 234 PetscInt i,nz,idx,idt,oidx; 235 const MatScalar *aa=a->a,*v; 236 PetscScalar s1,s2,s3,x1,x2,x3,*x; 237 const PetscScalar *b; 238 239 PetscFunctionBegin; 240 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 241 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 242 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 243 244 /* forward solve the U^T */ 245 idx = 0; 246 for (i=0; i<n; i++) { 247 248 v = aa + 9*diag[i]; 249 /* multiply by the inverse of the block diagonal */ 250 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 251 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 252 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 253 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 254 v += 9; 255 256 vi = aj + diag[i] + 1; 257 nz = ai[i+1] - diag[i] - 1; 258 while (nz--) { 259 oidx = 3*(*vi++); 260 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 261 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 262 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 263 v += 9; 264 } 265 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 266 idx += 3; 267 } 268 /* backward solve the L^T */ 269 for (i=n-1; i>=0; i--){ 270 v = aa + 9*diag[i] - 9; 271 vi = aj + diag[i] - 1; 272 nz = diag[i] - ai[i]; 273 idt = 3*i; 274 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 275 while (nz--) { 276 idx = 3*(*vi--); 277 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 278 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 279 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 280 v -= 9; 281 } 282 } 283 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 284 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 285 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 286 PetscFunctionReturn(0); 287 } 288 289 #undef __FUNCT__ 290 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 291 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 292 { 293 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 294 PetscErrorCode ierr; 295 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 296 PetscInt nz,idx,idt,j,i,oidx; 297 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 298 const MatScalar *aa=a->a,*v; 299 PetscScalar s1,s2,s3,x1,x2,x3,*x; 300 const PetscScalar *b; 301 302 PetscFunctionBegin; 303 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 304 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 305 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 306 307 /* forward solve the U^T */ 308 idx = 0; 309 for (i=0; i<n; i++) { 310 v = aa + bs2*diag[i]; 311 /* multiply by the inverse of the block diagonal */ 312 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 313 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 314 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 315 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 316 v -= bs2; 317 318 vi = aj + diag[i] - 1; 319 nz = diag[i] - diag[i+1] - 1; 320 for(j=0;j>-nz;j--){ 321 oidx = bs*vi[j]; 322 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 323 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 324 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 325 v -= bs2; 326 } 327 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 328 idx += bs; 329 } 330 /* backward solve the L^T */ 331 for (i=n-1; i>=0; i--){ 332 v = aa + bs2*ai[i]; 333 vi = aj + ai[i]; 334 nz = ai[i+1] - ai[i]; 335 idt = bs*i; 336 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 337 for(j=0;j<nz;j++){ 338 idx = bs*vi[j]; 339 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 340 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 341 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 342 v += bs2; 343 } 344 } 345 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 346 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 347 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 348 PetscFunctionReturn(0); 349 } 350 351 #undef __FUNCT__ 352 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 353 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 354 { 355 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 356 PetscErrorCode ierr; 357 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 358 PetscInt i,nz,idx,idt,oidx; 359 const MatScalar *aa=a->a,*v; 360 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 361 const PetscScalar *b; 362 363 PetscFunctionBegin; 364 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 365 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 366 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 367 368 /* forward solve the U^T */ 369 idx = 0; 370 for (i=0; i<n; i++) { 371 372 v = aa + 16*diag[i]; 373 /* multiply by the inverse of the block diagonal */ 374 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 375 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 376 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 377 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 378 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 379 v += 16; 380 381 vi = aj + diag[i] + 1; 382 nz = ai[i+1] - diag[i] - 1; 383 while (nz--) { 384 oidx = 4*(*vi++); 385 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 386 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 387 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 388 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 389 v += 16; 390 } 391 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 392 idx += 4; 393 } 394 /* backward solve the L^T */ 395 for (i=n-1; i>=0; i--){ 396 v = aa + 16*diag[i] - 16; 397 vi = aj + diag[i] - 1; 398 nz = diag[i] - ai[i]; 399 idt = 4*i; 400 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 401 while (nz--) { 402 idx = 4*(*vi--); 403 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 404 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 405 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 406 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 407 v -= 16; 408 } 409 } 410 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 411 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 412 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 413 PetscFunctionReturn(0); 414 } 415 416 #undef __FUNCT__ 417 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 418 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 419 { 420 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 421 PetscErrorCode ierr; 422 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 423 PetscInt nz,idx,idt,j,i,oidx; 424 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 425 const MatScalar *aa=a->a,*v; 426 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 427 const PetscScalar *b; 428 429 PetscFunctionBegin; 430 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 431 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 432 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 433 434 /* forward solve the U^T */ 435 idx = 0; 436 for (i=0; i<n; i++) { 437 v = aa + bs2*diag[i]; 438 /* multiply by the inverse of the block diagonal */ 439 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 440 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 441 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 442 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 443 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 444 v -= bs2; 445 446 vi = aj + diag[i] - 1; 447 nz = diag[i] - diag[i+1] - 1; 448 for(j=0;j>-nz;j--){ 449 oidx = bs*vi[j]; 450 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 451 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 452 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 453 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 454 v -= bs2; 455 } 456 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 457 idx += bs; 458 } 459 /* backward solve the L^T */ 460 for (i=n-1; i>=0; i--){ 461 v = aa + bs2*ai[i]; 462 vi = aj + ai[i]; 463 nz = ai[i+1] - ai[i]; 464 idt = bs*i; 465 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 466 for(j=0;j<nz;j++){ 467 idx = bs*vi[j]; 468 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 469 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 470 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 471 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 472 v += bs2; 473 } 474 } 475 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 476 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 477 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 478 PetscFunctionReturn(0); 479 } 480 481 #undef __FUNCT__ 482 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 483 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 484 { 485 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 486 PetscErrorCode ierr; 487 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 488 PetscInt i,nz,idx,idt,oidx; 489 const MatScalar *aa=a->a,*v; 490 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 491 const PetscScalar *b; 492 493 PetscFunctionBegin; 494 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 495 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 496 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 497 498 /* forward solve the U^T */ 499 idx = 0; 500 for (i=0; i<n; i++) { 501 502 v = aa + 25*diag[i]; 503 /* multiply by the inverse of the block diagonal */ 504 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 505 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 506 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 507 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 508 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 509 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 510 v += 25; 511 512 vi = aj + diag[i] + 1; 513 nz = ai[i+1] - diag[i] - 1; 514 while (nz--) { 515 oidx = 5*(*vi++); 516 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 517 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 518 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 519 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 520 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 521 v += 25; 522 } 523 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 524 idx += 5; 525 } 526 /* backward solve the L^T */ 527 for (i=n-1; i>=0; i--){ 528 v = aa + 25*diag[i] - 25; 529 vi = aj + diag[i] - 1; 530 nz = diag[i] - ai[i]; 531 idt = 5*i; 532 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 533 while (nz--) { 534 idx = 5*(*vi--); 535 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 536 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 537 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 538 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 539 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 540 v -= 25; 541 } 542 } 543 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 544 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 545 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 546 PetscFunctionReturn(0); 547 } 548 549 #undef __FUNCT__ 550 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 551 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 552 { 553 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 554 PetscErrorCode ierr; 555 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 556 PetscInt nz,idx,idt,j,i,oidx; 557 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 558 const MatScalar *aa=a->a,*v; 559 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 560 const PetscScalar *b; 561 562 PetscFunctionBegin; 563 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 564 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 565 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 566 567 /* forward solve the U^T */ 568 idx = 0; 569 for (i=0; i<n; i++) { 570 v = aa + bs2*diag[i]; 571 /* multiply by the inverse of the block diagonal */ 572 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 573 x5 = x[4+idx]; 574 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 575 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 576 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 577 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 578 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 579 v -= bs2; 580 581 vi = aj + diag[i] - 1; 582 nz = diag[i] - diag[i+1] - 1; 583 for(j=0;j>-nz;j--){ 584 oidx = bs*vi[j]; 585 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 586 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 587 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 588 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 589 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 590 v -= bs2; 591 } 592 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 593 idx += bs; 594 } 595 /* backward solve the L^T */ 596 for (i=n-1; i>=0; i--){ 597 v = aa + bs2*ai[i]; 598 vi = aj + ai[i]; 599 nz = ai[i+1] - ai[i]; 600 idt = bs*i; 601 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 602 for(j=0;j<nz;j++){ 603 idx = bs*vi[j]; 604 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 605 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 606 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 607 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 608 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 609 v += bs2; 610 } 611 } 612 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 613 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 614 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 615 PetscFunctionReturn(0); 616 } 617 618 #undef __FUNCT__ 619 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 620 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 621 { 622 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 623 PetscErrorCode ierr; 624 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 625 PetscInt i,nz,idx,idt,oidx; 626 const MatScalar *aa=a->a,*v; 627 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 628 const PetscScalar *b; 629 630 PetscFunctionBegin; 631 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 632 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 633 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 634 635 /* forward solve the U^T */ 636 idx = 0; 637 for (i=0; i<n; i++) { 638 639 v = aa + 36*diag[i]; 640 /* multiply by the inverse of the block diagonal */ 641 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 642 x6 = x[5+idx]; 643 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 644 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 645 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 646 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 647 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 648 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 649 v += 36; 650 651 vi = aj + diag[i] + 1; 652 nz = ai[i+1] - diag[i] - 1; 653 while (nz--) { 654 oidx = 6*(*vi++); 655 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 656 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 657 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 658 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 659 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 660 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 661 v += 36; 662 } 663 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 664 x[5+idx] = s6; 665 idx += 6; 666 } 667 /* backward solve the L^T */ 668 for (i=n-1; i>=0; i--){ 669 v = aa + 36*diag[i] - 36; 670 vi = aj + diag[i] - 1; 671 nz = diag[i] - ai[i]; 672 idt = 6*i; 673 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 674 s6 = x[5+idt]; 675 while (nz--) { 676 idx = 6*(*vi--); 677 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 678 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 679 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 680 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 681 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 682 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 683 v -= 36; 684 } 685 } 686 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 687 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 688 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 689 PetscFunctionReturn(0); 690 } 691 692 #undef __FUNCT__ 693 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 694 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 695 { 696 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 697 PetscErrorCode ierr; 698 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 699 PetscInt nz,idx,idt,j,i,oidx; 700 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 701 const MatScalar *aa=a->a,*v; 702 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 703 const PetscScalar *b; 704 705 PetscFunctionBegin; 706 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 707 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 708 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 709 710 /* forward solve the U^T */ 711 idx = 0; 712 for (i=0; i<n; i++) { 713 v = aa + bs2*diag[i]; 714 /* multiply by the inverse of the block diagonal */ 715 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 716 x5 = x[4+idx]; x6 = x[5+idx]; 717 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 718 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 719 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 720 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 721 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 722 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 723 v -= bs2; 724 725 vi = aj + diag[i] - 1; 726 nz = diag[i] - diag[i+1] - 1; 727 for(j=0;j>-nz;j--){ 728 oidx = bs*vi[j]; 729 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 730 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 731 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 732 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 733 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 734 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 735 v -= bs2; 736 } 737 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 738 x[5+idx] = s6; 739 idx += bs; 740 } 741 /* backward solve the L^T */ 742 for (i=n-1; i>=0; i--){ 743 v = aa + bs2*ai[i]; 744 vi = aj + ai[i]; 745 nz = ai[i+1] - ai[i]; 746 idt = bs*i; 747 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 748 s6 = x[5+idt]; 749 for(j=0;j<nz;j++){ 750 idx = bs*vi[j]; 751 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 752 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 753 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 754 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 755 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 756 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 757 v += bs2; 758 } 759 } 760 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 761 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 762 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 763 PetscFunctionReturn(0); 764 } 765 766 #undef __FUNCT__ 767 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 768 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 769 { 770 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 771 PetscErrorCode ierr; 772 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 773 PetscInt i,nz,idx,idt,oidx; 774 const MatScalar *aa=a->a,*v; 775 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 776 const PetscScalar *b; 777 778 PetscFunctionBegin; 779 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 780 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 781 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 782 783 /* forward solve the U^T */ 784 idx = 0; 785 for (i=0; i<n; i++) { 786 787 v = aa + 49*diag[i]; 788 /* multiply by the inverse of the block diagonal */ 789 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 790 x6 = x[5+idx]; x7 = x[6+idx]; 791 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 792 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 793 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 794 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 795 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 796 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 797 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 798 v += 49; 799 800 vi = aj + diag[i] + 1; 801 nz = ai[i+1] - diag[i] - 1; 802 while (nz--) { 803 oidx = 7*(*vi++); 804 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 805 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 806 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 807 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 808 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 809 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 810 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 811 v += 49; 812 } 813 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 814 x[5+idx] = s6;x[6+idx] = s7; 815 idx += 7; 816 } 817 /* backward solve the L^T */ 818 for (i=n-1; i>=0; i--){ 819 v = aa + 49*diag[i] - 49; 820 vi = aj + diag[i] - 1; 821 nz = diag[i] - ai[i]; 822 idt = 7*i; 823 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 824 s6 = x[5+idt];s7 = x[6+idt]; 825 while (nz--) { 826 idx = 7*(*vi--); 827 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 828 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 829 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 830 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 831 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 832 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 833 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 834 v -= 49; 835 } 836 } 837 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 838 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 839 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 840 PetscFunctionReturn(0); 841 } 842 #undef __FUNCT__ 843 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 844 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 845 { 846 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 847 PetscErrorCode ierr; 848 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 849 PetscInt nz,idx,idt,j,i,oidx; 850 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 851 const MatScalar *aa=a->a,*v; 852 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 853 const PetscScalar *b; 854 855 PetscFunctionBegin; 856 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 857 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 858 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 859 860 /* forward solve the U^T */ 861 idx = 0; 862 for (i=0; i<n; i++) { 863 v = aa + bs2*diag[i]; 864 /* multiply by the inverse of the block diagonal */ 865 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 866 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 867 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 868 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 869 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 870 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 871 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 872 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 873 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 874 v -= bs2; 875 vi = aj + diag[i] - 1; 876 nz = diag[i] - diag[i+1] - 1; 877 for(j=0;j>-nz;j--){ 878 oidx = bs*vi[j]; 879 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 880 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 881 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 882 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 883 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 884 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 885 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 886 v -= bs2; 887 } 888 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 889 x[5+idx] = s6; x[6+idx] = s7; 890 idx += bs; 891 } 892 /* backward solve the L^T */ 893 for (i=n-1; i>=0; i--){ 894 v = aa + bs2*ai[i]; 895 vi = aj + ai[i]; 896 nz = ai[i+1] - ai[i]; 897 idt = bs*i; 898 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 899 s6 = x[5+idt]; s7 = x[6+idt]; 900 for(j=0;j<nz;j++){ 901 idx = bs*vi[j]; 902 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 903 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 904 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 905 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 906 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 907 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 908 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 909 v += bs2; 910 } 911 } 912 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 913 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 914 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 915 PetscFunctionReturn(0); 916 } 917 918 /*---------------------------------------------------------------------------------------------*/ 919 #undef __FUNCT__ 920 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 921 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 922 { 923 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 924 IS iscol = a->col,isrow = a->row; 925 PetscErrorCode ierr; 926 const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 927 PetscInt i,n = a->mbs,j; 928 PetscInt nz; 929 PetscScalar *x,*tmp,s1; 930 const MatScalar *aa = a->a,*v; 931 const PetscScalar *b; 932 933 PetscFunctionBegin; 934 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 935 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 936 tmp = a->solve_work; 937 938 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 939 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 940 941 /* copy the b into temp work space according to permutation */ 942 for (i=0; i<n; i++) tmp[i] = b[c[i]]; 943 944 /* forward solve the U^T */ 945 for (i=0; i<n; i++) { 946 v = aa + adiag[i+1] + 1; 947 vi = aj + adiag[i+1] + 1; 948 nz = adiag[i] - adiag[i+1] - 1; 949 s1 = tmp[i]; 950 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 951 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 952 tmp[i] = s1; 953 } 954 955 /* backward solve the L^T */ 956 for (i=n-1; i>=0; i--){ 957 v = aa + ai[i]; 958 vi = aj + ai[i]; 959 nz = ai[i+1] - ai[i]; 960 s1 = tmp[i]; 961 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 962 } 963 964 /* copy tmp into x according to permutation */ 965 for (i=0; i<n; i++) x[r[i]] = tmp[i]; 966 967 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 968 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 969 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 970 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 971 972 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 973 PetscFunctionReturn(0); 974 } 975 976 #undef __FUNCT__ 977 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 978 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 979 { 980 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 981 IS iscol=a->col,isrow=a->row; 982 PetscErrorCode ierr; 983 const PetscInt *r,*c,*rout,*cout; 984 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 985 PetscInt i,nz; 986 const MatScalar *aa=a->a,*v; 987 PetscScalar s1,*x,*t; 988 const PetscScalar *b; 989 990 PetscFunctionBegin; 991 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 992 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 993 t = a->solve_work; 994 995 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 996 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 997 998 /* copy the b into temp work space according to permutation */ 999 for (i=0; i<n; i++) { 1000 t[i] = b[c[i]]; 1001 } 1002 1003 /* forward solve the U^T */ 1004 for (i=0; i<n; i++) { 1005 1006 v = aa + diag[i]; 1007 /* multiply by the inverse of the block diagonal */ 1008 s1 = (*v++)*t[i]; 1009 vi = aj + diag[i] + 1; 1010 nz = ai[i+1] - diag[i] - 1; 1011 while (nz--) { 1012 t[*vi++] -= (*v++)*s1; 1013 } 1014 t[i] = s1; 1015 } 1016 /* backward solve the L^T */ 1017 for (i=n-1; i>=0; i--){ 1018 v = aa + diag[i] - 1; 1019 vi = aj + diag[i] - 1; 1020 nz = diag[i] - ai[i]; 1021 s1 = t[i]; 1022 while (nz--) { 1023 t[*vi--] -= (*v--)*s1; 1024 } 1025 } 1026 1027 /* copy t into x according to permutation */ 1028 for (i=0; i<n; i++) { 1029 x[r[i]] = t[i]; 1030 } 1031 1032 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1033 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1034 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1035 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1036 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 1037 PetscFunctionReturn(0); 1038 } 1039 1040 #undef __FUNCT__ 1041 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 1042 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1043 { 1044 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1045 IS iscol=a->col,isrow=a->row; 1046 PetscErrorCode ierr; 1047 const PetscInt *r,*c,*rout,*cout; 1048 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1049 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1050 const MatScalar *aa=a->a,*v; 1051 PetscScalar s1,s2,x1,x2,*x,*t; 1052 const PetscScalar *b; 1053 1054 PetscFunctionBegin; 1055 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1056 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1057 t = a->solve_work; 1058 1059 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1060 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1061 1062 /* copy the b into temp work space according to permutation */ 1063 ii = 0; 1064 for (i=0; i<n; i++) { 1065 ic = 2*c[i]; 1066 t[ii] = b[ic]; 1067 t[ii+1] = b[ic+1]; 1068 ii += 2; 1069 } 1070 1071 /* forward solve the U^T */ 1072 idx = 0; 1073 for (i=0; i<n; i++) { 1074 1075 v = aa + 4*diag[i]; 1076 /* multiply by the inverse of the block diagonal */ 1077 x1 = t[idx]; x2 = t[1+idx]; 1078 s1 = v[0]*x1 + v[1]*x2; 1079 s2 = v[2]*x1 + v[3]*x2; 1080 v += 4; 1081 1082 vi = aj + diag[i] + 1; 1083 nz = ai[i+1] - diag[i] - 1; 1084 while (nz--) { 1085 oidx = 2*(*vi++); 1086 t[oidx] -= v[0]*s1 + v[1]*s2; 1087 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1088 v += 4; 1089 } 1090 t[idx] = s1;t[1+idx] = s2; 1091 idx += 2; 1092 } 1093 /* backward solve the L^T */ 1094 for (i=n-1; i>=0; i--){ 1095 v = aa + 4*diag[i] - 4; 1096 vi = aj + diag[i] - 1; 1097 nz = diag[i] - ai[i]; 1098 idt = 2*i; 1099 s1 = t[idt]; s2 = t[1+idt]; 1100 while (nz--) { 1101 idx = 2*(*vi--); 1102 t[idx] -= v[0]*s1 + v[1]*s2; 1103 t[idx+1] -= v[2]*s1 + v[3]*s2; 1104 v -= 4; 1105 } 1106 } 1107 1108 /* copy t into x according to permutation */ 1109 ii = 0; 1110 for (i=0; i<n; i++) { 1111 ir = 2*r[i]; 1112 x[ir] = t[ii]; 1113 x[ir+1] = t[ii+1]; 1114 ii += 2; 1115 } 1116 1117 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1118 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1119 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1120 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1121 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1122 PetscFunctionReturn(0); 1123 } 1124 1125 #undef __FUNCT__ 1126 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 1127 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1128 { 1129 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1130 PetscErrorCode ierr; 1131 IS iscol=a->col,isrow=a->row; 1132 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1133 const PetscInt *r,*c,*rout,*cout; 1134 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1135 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1136 const MatScalar *aa=a->a,*v; 1137 PetscScalar s1,s2,x1,x2,*x,*t; 1138 const PetscScalar *b; 1139 1140 PetscFunctionBegin; 1141 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1142 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1143 t = a->solve_work; 1144 1145 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1146 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1147 1148 /* copy b into temp work space according to permutation */ 1149 for(i=0;i<n;i++){ 1150 ii = bs*i; ic = bs*c[i]; 1151 t[ii] = b[ic]; t[ii+1] = b[ic+1]; 1152 } 1153 1154 /* forward solve the U^T */ 1155 idx = 0; 1156 for (i=0; i<n; i++) { 1157 v = aa + bs2*diag[i]; 1158 /* multiply by the inverse of the block diagonal */ 1159 x1 = t[idx]; x2 = t[1+idx]; 1160 s1 = v[0]*x1 + v[1]*x2; 1161 s2 = v[2]*x1 + v[3]*x2; 1162 v -= bs2; 1163 1164 vi = aj + diag[i] - 1; 1165 nz = diag[i] - diag[i+1] - 1; 1166 for(j=0;j>-nz;j--){ 1167 oidx = bs*vi[j]; 1168 t[oidx] -= v[0]*s1 + v[1]*s2; 1169 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1170 v -= bs2; 1171 } 1172 t[idx] = s1;t[1+idx] = s2; 1173 idx += bs; 1174 } 1175 /* backward solve the L^T */ 1176 for (i=n-1; i>=0; i--){ 1177 v = aa + bs2*ai[i]; 1178 vi = aj + ai[i]; 1179 nz = ai[i+1] - ai[i]; 1180 idt = bs*i; 1181 s1 = t[idt]; s2 = t[1+idt]; 1182 for(j=0;j<nz;j++){ 1183 idx = bs*vi[j]; 1184 t[idx] -= v[0]*s1 + v[1]*s2; 1185 t[idx+1] -= v[2]*s1 + v[3]*s2; 1186 v += bs2; 1187 } 1188 } 1189 1190 /* copy t into x according to permutation */ 1191 for(i=0;i<n;i++){ 1192 ii = bs*i; ir = bs*r[i]; 1193 x[ir] = t[ii]; x[ir+1] = t[ii+1]; 1194 } 1195 1196 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1197 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1198 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1199 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1200 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1201 PetscFunctionReturn(0); 1202 } 1203 1204 #undef __FUNCT__ 1205 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 1206 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1207 { 1208 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1209 IS iscol=a->col,isrow=a->row; 1210 PetscErrorCode ierr; 1211 const PetscInt *r,*c,*rout,*cout; 1212 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1213 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1214 const MatScalar *aa=a->a,*v; 1215 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1216 const PetscScalar *b; 1217 1218 PetscFunctionBegin; 1219 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1220 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1221 t = a->solve_work; 1222 1223 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1224 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1225 1226 /* copy the b into temp work space according to permutation */ 1227 ii = 0; 1228 for (i=0; i<n; i++) { 1229 ic = 3*c[i]; 1230 t[ii] = b[ic]; 1231 t[ii+1] = b[ic+1]; 1232 t[ii+2] = b[ic+2]; 1233 ii += 3; 1234 } 1235 1236 /* forward solve the U^T */ 1237 idx = 0; 1238 for (i=0; i<n; i++) { 1239 1240 v = aa + 9*diag[i]; 1241 /* multiply by the inverse of the block diagonal */ 1242 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1243 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1244 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1245 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1246 v += 9; 1247 1248 vi = aj + diag[i] + 1; 1249 nz = ai[i+1] - diag[i] - 1; 1250 while (nz--) { 1251 oidx = 3*(*vi++); 1252 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1253 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1254 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1255 v += 9; 1256 } 1257 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1258 idx += 3; 1259 } 1260 /* backward solve the L^T */ 1261 for (i=n-1; i>=0; i--){ 1262 v = aa + 9*diag[i] - 9; 1263 vi = aj + diag[i] - 1; 1264 nz = diag[i] - ai[i]; 1265 idt = 3*i; 1266 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1267 while (nz--) { 1268 idx = 3*(*vi--); 1269 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1270 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1271 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1272 v -= 9; 1273 } 1274 } 1275 1276 /* copy t into x according to permutation */ 1277 ii = 0; 1278 for (i=0; i<n; i++) { 1279 ir = 3*r[i]; 1280 x[ir] = t[ii]; 1281 x[ir+1] = t[ii+1]; 1282 x[ir+2] = t[ii+2]; 1283 ii += 3; 1284 } 1285 1286 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1287 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1288 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1289 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1290 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1291 PetscFunctionReturn(0); 1292 } 1293 1294 #undef __FUNCT__ 1295 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1296 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1297 { 1298 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1299 PetscErrorCode ierr; 1300 IS iscol=a->col,isrow=a->row; 1301 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1302 const PetscInt *r,*c,*rout,*cout; 1303 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1304 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1305 const MatScalar *aa=a->a,*v; 1306 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1307 const PetscScalar *b; 1308 1309 PetscFunctionBegin; 1310 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1311 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1312 t = a->solve_work; 1313 1314 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1315 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1316 1317 /* copy b into temp work space according to permutation */ 1318 for(i=0;i<n;i++){ 1319 ii = bs*i; ic = bs*c[i]; 1320 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 1321 } 1322 1323 /* forward solve the U^T */ 1324 idx = 0; 1325 for (i=0; i<n; i++) { 1326 v = aa + bs2*diag[i]; 1327 /* multiply by the inverse of the block diagonal */ 1328 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1329 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1330 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1331 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1332 v -= bs2; 1333 1334 vi = aj + diag[i] - 1; 1335 nz = diag[i] - diag[i+1] - 1; 1336 for(j=0;j>-nz;j--){ 1337 oidx = bs*vi[j]; 1338 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1339 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1340 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1341 v -= bs2; 1342 } 1343 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1344 idx += bs; 1345 } 1346 /* backward solve the L^T */ 1347 for (i=n-1; i>=0; i--){ 1348 v = aa + bs2*ai[i]; 1349 vi = aj + ai[i]; 1350 nz = ai[i+1] - ai[i]; 1351 idt = bs*i; 1352 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1353 for(j=0;j<nz;j++){ 1354 idx = bs*vi[j]; 1355 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1356 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1357 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1358 v += bs2; 1359 } 1360 } 1361 1362 /* copy t into x according to permutation */ 1363 for(i=0;i<n;i++){ 1364 ii = bs*i; ir = bs*r[i]; 1365 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 1366 } 1367 1368 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1369 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1370 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1371 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1372 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1373 PetscFunctionReturn(0); 1374 } 1375 1376 #undef __FUNCT__ 1377 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 1378 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1379 { 1380 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1381 IS iscol=a->col,isrow=a->row; 1382 PetscErrorCode ierr; 1383 const PetscInt *r,*c,*rout,*cout; 1384 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1385 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1386 const MatScalar *aa=a->a,*v; 1387 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1388 const PetscScalar *b; 1389 1390 PetscFunctionBegin; 1391 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1392 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1393 t = a->solve_work; 1394 1395 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1396 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1397 1398 /* copy the b into temp work space according to permutation */ 1399 ii = 0; 1400 for (i=0; i<n; i++) { 1401 ic = 4*c[i]; 1402 t[ii] = b[ic]; 1403 t[ii+1] = b[ic+1]; 1404 t[ii+2] = b[ic+2]; 1405 t[ii+3] = b[ic+3]; 1406 ii += 4; 1407 } 1408 1409 /* forward solve the U^T */ 1410 idx = 0; 1411 for (i=0; i<n; i++) { 1412 1413 v = aa + 16*diag[i]; 1414 /* multiply by the inverse of the block diagonal */ 1415 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1416 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1417 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1418 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1419 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1420 v += 16; 1421 1422 vi = aj + diag[i] + 1; 1423 nz = ai[i+1] - diag[i] - 1; 1424 while (nz--) { 1425 oidx = 4*(*vi++); 1426 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1427 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1428 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1429 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1430 v += 16; 1431 } 1432 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1433 idx += 4; 1434 } 1435 /* backward solve the L^T */ 1436 for (i=n-1; i>=0; i--){ 1437 v = aa + 16*diag[i] - 16; 1438 vi = aj + diag[i] - 1; 1439 nz = diag[i] - ai[i]; 1440 idt = 4*i; 1441 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1442 while (nz--) { 1443 idx = 4*(*vi--); 1444 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1445 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1446 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1447 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1448 v -= 16; 1449 } 1450 } 1451 1452 /* copy t into x according to permutation */ 1453 ii = 0; 1454 for (i=0; i<n; i++) { 1455 ir = 4*r[i]; 1456 x[ir] = t[ii]; 1457 x[ir+1] = t[ii+1]; 1458 x[ir+2] = t[ii+2]; 1459 x[ir+3] = t[ii+3]; 1460 ii += 4; 1461 } 1462 1463 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1464 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1465 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1466 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1467 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1468 PetscFunctionReturn(0); 1469 } 1470 1471 #undef __FUNCT__ 1472 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1473 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1474 { 1475 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1476 PetscErrorCode ierr; 1477 IS iscol=a->col,isrow=a->row; 1478 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1479 const PetscInt *r,*c,*rout,*cout; 1480 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1481 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1482 const MatScalar *aa=a->a,*v; 1483 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1484 const PetscScalar *b; 1485 1486 PetscFunctionBegin; 1487 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1488 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1489 t = a->solve_work; 1490 1491 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1492 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1493 1494 /* copy b into temp work space according to permutation */ 1495 for(i=0;i<n;i++){ 1496 ii = bs*i; ic = bs*c[i]; 1497 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1498 } 1499 1500 /* forward solve the U^T */ 1501 idx = 0; 1502 for (i=0; i<n; i++) { 1503 v = aa + bs2*diag[i]; 1504 /* multiply by the inverse of the block diagonal */ 1505 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1506 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1507 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1508 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1509 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1510 v -= bs2; 1511 1512 vi = aj + diag[i] - 1; 1513 nz = diag[i] - diag[i+1] - 1; 1514 for(j=0;j>-nz;j--){ 1515 oidx = bs*vi[j]; 1516 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1517 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1518 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1519 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1520 v -= bs2; 1521 } 1522 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 1523 idx += bs; 1524 } 1525 /* backward solve the L^T */ 1526 for (i=n-1; i>=0; i--){ 1527 v = aa + bs2*ai[i]; 1528 vi = aj + ai[i]; 1529 nz = ai[i+1] - ai[i]; 1530 idt = bs*i; 1531 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 1532 for(j=0;j<nz;j++){ 1533 idx = bs*vi[j]; 1534 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1535 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1536 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1537 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1538 v += bs2; 1539 } 1540 } 1541 1542 /* copy t into x according to permutation */ 1543 for(i=0;i<n;i++){ 1544 ii = bs*i; ir = bs*r[i]; 1545 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1546 } 1547 1548 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1549 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1550 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1551 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1552 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1553 PetscFunctionReturn(0); 1554 } 1555 1556 #undef __FUNCT__ 1557 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 1558 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1559 { 1560 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1561 IS iscol=a->col,isrow=a->row; 1562 PetscErrorCode ierr; 1563 const PetscInt *r,*c,*rout,*cout; 1564 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1565 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1566 const MatScalar *aa=a->a,*v; 1567 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1568 const PetscScalar *b; 1569 1570 PetscFunctionBegin; 1571 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1572 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1573 t = a->solve_work; 1574 1575 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1576 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1577 1578 /* copy the b into temp work space according to permutation */ 1579 ii = 0; 1580 for (i=0; i<n; i++) { 1581 ic = 5*c[i]; 1582 t[ii] = b[ic]; 1583 t[ii+1] = b[ic+1]; 1584 t[ii+2] = b[ic+2]; 1585 t[ii+3] = b[ic+3]; 1586 t[ii+4] = b[ic+4]; 1587 ii += 5; 1588 } 1589 1590 /* forward solve the U^T */ 1591 idx = 0; 1592 for (i=0; i<n; i++) { 1593 1594 v = aa + 25*diag[i]; 1595 /* multiply by the inverse of the block diagonal */ 1596 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1597 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1598 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1599 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1600 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1601 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1602 v += 25; 1603 1604 vi = aj + diag[i] + 1; 1605 nz = ai[i+1] - diag[i] - 1; 1606 while (nz--) { 1607 oidx = 5*(*vi++); 1608 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1609 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1610 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1611 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1612 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1613 v += 25; 1614 } 1615 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1616 idx += 5; 1617 } 1618 /* backward solve the L^T */ 1619 for (i=n-1; i>=0; i--){ 1620 v = aa + 25*diag[i] - 25; 1621 vi = aj + diag[i] - 1; 1622 nz = diag[i] - ai[i]; 1623 idt = 5*i; 1624 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1625 while (nz--) { 1626 idx = 5*(*vi--); 1627 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1628 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1629 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1630 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1631 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1632 v -= 25; 1633 } 1634 } 1635 1636 /* copy t into x according to permutation */ 1637 ii = 0; 1638 for (i=0; i<n; i++) { 1639 ir = 5*r[i]; 1640 x[ir] = t[ii]; 1641 x[ir+1] = t[ii+1]; 1642 x[ir+2] = t[ii+2]; 1643 x[ir+3] = t[ii+3]; 1644 x[ir+4] = t[ii+4]; 1645 ii += 5; 1646 } 1647 1648 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1649 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1650 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1651 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1652 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1653 PetscFunctionReturn(0); 1654 } 1655 1656 #undef __FUNCT__ 1657 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1658 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1659 { 1660 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1661 PetscErrorCode ierr; 1662 IS iscol=a->col,isrow=a->row; 1663 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1664 const PetscInt *r,*c,*rout,*cout; 1665 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1666 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1667 const MatScalar *aa=a->a,*v; 1668 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1669 const PetscScalar *b; 1670 1671 PetscFunctionBegin; 1672 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1673 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1674 t = a->solve_work; 1675 1676 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1677 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1678 1679 /* copy b into temp work space according to permutation */ 1680 for(i=0;i<n;i++){ 1681 ii = bs*i; ic = bs*c[i]; 1682 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1683 t[ii+4] = b[ic+4]; 1684 } 1685 1686 /* forward solve the U^T */ 1687 idx = 0; 1688 for (i=0; i<n; i++) { 1689 v = aa + bs2*diag[i]; 1690 /* multiply by the inverse of the block diagonal */ 1691 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1692 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1693 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1694 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1695 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1696 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1697 v -= bs2; 1698 1699 vi = aj + diag[i] - 1; 1700 nz = diag[i] - diag[i+1] - 1; 1701 for(j=0;j>-nz;j--){ 1702 oidx = bs*vi[j]; 1703 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1704 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1705 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1706 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1707 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1708 v -= bs2; 1709 } 1710 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1711 idx += bs; 1712 } 1713 /* backward solve the L^T */ 1714 for (i=n-1; i>=0; i--){ 1715 v = aa + bs2*ai[i]; 1716 vi = aj + ai[i]; 1717 nz = ai[i+1] - ai[i]; 1718 idt = bs*i; 1719 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1720 for(j=0;j<nz;j++){ 1721 idx = bs*vi[j]; 1722 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1723 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1724 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1725 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1726 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1727 v += bs2; 1728 } 1729 } 1730 1731 /* copy t into x according to permutation */ 1732 for(i=0;i<n;i++){ 1733 ii = bs*i; ir = bs*r[i]; 1734 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1735 x[ir+4] = t[ii+4]; 1736 } 1737 1738 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1739 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1740 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1741 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1742 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1743 PetscFunctionReturn(0); 1744 } 1745 1746 #undef __FUNCT__ 1747 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 1748 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1749 { 1750 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1751 IS iscol=a->col,isrow=a->row; 1752 PetscErrorCode ierr; 1753 const PetscInt *r,*c,*rout,*cout; 1754 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1755 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1756 const MatScalar *aa=a->a,*v; 1757 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1758 const PetscScalar *b; 1759 1760 PetscFunctionBegin; 1761 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1762 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1763 t = a->solve_work; 1764 1765 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1766 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1767 1768 /* copy the b into temp work space according to permutation */ 1769 ii = 0; 1770 for (i=0; i<n; i++) { 1771 ic = 6*c[i]; 1772 t[ii] = b[ic]; 1773 t[ii+1] = b[ic+1]; 1774 t[ii+2] = b[ic+2]; 1775 t[ii+3] = b[ic+3]; 1776 t[ii+4] = b[ic+4]; 1777 t[ii+5] = b[ic+5]; 1778 ii += 6; 1779 } 1780 1781 /* forward solve the U^T */ 1782 idx = 0; 1783 for (i=0; i<n; i++) { 1784 1785 v = aa + 36*diag[i]; 1786 /* multiply by the inverse of the block diagonal */ 1787 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1788 x6 = t[5+idx]; 1789 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1790 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1791 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1792 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1793 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1794 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1795 v += 36; 1796 1797 vi = aj + diag[i] + 1; 1798 nz = ai[i+1] - diag[i] - 1; 1799 while (nz--) { 1800 oidx = 6*(*vi++); 1801 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1802 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1803 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1804 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1805 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1806 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1807 v += 36; 1808 } 1809 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1810 t[5+idx] = s6; 1811 idx += 6; 1812 } 1813 /* backward solve the L^T */ 1814 for (i=n-1; i>=0; i--){ 1815 v = aa + 36*diag[i] - 36; 1816 vi = aj + diag[i] - 1; 1817 nz = diag[i] - ai[i]; 1818 idt = 6*i; 1819 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1820 s6 = t[5+idt]; 1821 while (nz--) { 1822 idx = 6*(*vi--); 1823 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1824 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1825 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1826 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1827 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1828 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1829 v -= 36; 1830 } 1831 } 1832 1833 /* copy t into x according to permutation */ 1834 ii = 0; 1835 for (i=0; i<n; i++) { 1836 ir = 6*r[i]; 1837 x[ir] = t[ii]; 1838 x[ir+1] = t[ii+1]; 1839 x[ir+2] = t[ii+2]; 1840 x[ir+3] = t[ii+3]; 1841 x[ir+4] = t[ii+4]; 1842 x[ir+5] = t[ii+5]; 1843 ii += 6; 1844 } 1845 1846 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1847 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1848 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1849 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1850 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1851 PetscFunctionReturn(0); 1852 } 1853 1854 #undef __FUNCT__ 1855 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1856 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1857 { 1858 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1859 PetscErrorCode ierr; 1860 IS iscol=a->col,isrow=a->row; 1861 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1862 const PetscInt *r,*c,*rout,*cout; 1863 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1864 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1865 const MatScalar *aa=a->a,*v; 1866 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1867 const PetscScalar *b; 1868 1869 PetscFunctionBegin; 1870 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1871 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1872 t = a->solve_work; 1873 1874 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1875 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1876 1877 /* copy b into temp work space according to permutation */ 1878 for(i=0;i<n;i++){ 1879 ii = bs*i; ic = bs*c[i]; 1880 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1881 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 1882 } 1883 1884 /* forward solve the U^T */ 1885 idx = 0; 1886 for (i=0; i<n; i++) { 1887 v = aa + bs2*diag[i]; 1888 /* multiply by the inverse of the block diagonal */ 1889 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1890 x6 = t[5+idx]; 1891 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1892 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1893 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1894 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1895 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1896 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1897 v -= bs2; 1898 1899 vi = aj + diag[i] - 1; 1900 nz = diag[i] - diag[i+1] - 1; 1901 for(j=0;j>-nz;j--){ 1902 oidx = bs*vi[j]; 1903 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1904 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1905 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1906 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1907 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1908 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1909 v -= bs2; 1910 } 1911 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1912 t[5+idx] = s6; 1913 idx += bs; 1914 } 1915 /* backward solve the L^T */ 1916 for (i=n-1; i>=0; i--){ 1917 v = aa + bs2*ai[i]; 1918 vi = aj + ai[i]; 1919 nz = ai[i+1] - ai[i]; 1920 idt = bs*i; 1921 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1922 s6 = t[5+idt]; 1923 for(j=0;j<nz;j++){ 1924 idx = bs*vi[j]; 1925 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1926 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1927 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1928 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1929 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1930 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1931 v += bs2; 1932 } 1933 } 1934 1935 /* copy t into x according to permutation */ 1936 for(i=0;i<n;i++){ 1937 ii = bs*i; ir = bs*r[i]; 1938 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1939 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 1940 } 1941 1942 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1943 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1944 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1945 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1946 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1947 PetscFunctionReturn(0); 1948 } 1949 1950 #undef __FUNCT__ 1951 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 1952 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1953 { 1954 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1955 IS iscol=a->col,isrow=a->row; 1956 PetscErrorCode ierr; 1957 const PetscInt *r,*c,*rout,*cout; 1958 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1959 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1960 const MatScalar *aa=a->a,*v; 1961 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1962 const PetscScalar *b; 1963 1964 PetscFunctionBegin; 1965 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1966 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1967 t = a->solve_work; 1968 1969 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1970 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1971 1972 /* copy the b into temp work space according to permutation */ 1973 ii = 0; 1974 for (i=0; i<n; i++) { 1975 ic = 7*c[i]; 1976 t[ii] = b[ic]; 1977 t[ii+1] = b[ic+1]; 1978 t[ii+2] = b[ic+2]; 1979 t[ii+3] = b[ic+3]; 1980 t[ii+4] = b[ic+4]; 1981 t[ii+5] = b[ic+5]; 1982 t[ii+6] = b[ic+6]; 1983 ii += 7; 1984 } 1985 1986 /* forward solve the U^T */ 1987 idx = 0; 1988 for (i=0; i<n; i++) { 1989 1990 v = aa + 49*diag[i]; 1991 /* multiply by the inverse of the block diagonal */ 1992 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1993 x6 = t[5+idx]; x7 = t[6+idx]; 1994 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1995 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1996 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1997 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1998 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1999 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 2000 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 2001 v += 49; 2002 2003 vi = aj + diag[i] + 1; 2004 nz = ai[i+1] - diag[i] - 1; 2005 while (nz--) { 2006 oidx = 7*(*vi++); 2007 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2008 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2009 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2010 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2011 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2012 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2013 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2014 v += 49; 2015 } 2016 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2017 t[5+idx] = s6;t[6+idx] = s7; 2018 idx += 7; 2019 } 2020 /* backward solve the L^T */ 2021 for (i=n-1; i>=0; i--){ 2022 v = aa + 49*diag[i] - 49; 2023 vi = aj + diag[i] - 1; 2024 nz = diag[i] - ai[i]; 2025 idt = 7*i; 2026 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2027 s6 = t[5+idt];s7 = t[6+idt]; 2028 while (nz--) { 2029 idx = 7*(*vi--); 2030 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2031 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2032 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2033 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2034 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2035 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2036 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2037 v -= 49; 2038 } 2039 } 2040 2041 /* copy t into x according to permutation */ 2042 ii = 0; 2043 for (i=0; i<n; i++) { 2044 ir = 7*r[i]; 2045 x[ir] = t[ii]; 2046 x[ir+1] = t[ii+1]; 2047 x[ir+2] = t[ii+2]; 2048 x[ir+3] = t[ii+3]; 2049 x[ir+4] = t[ii+4]; 2050 x[ir+5] = t[ii+5]; 2051 x[ir+6] = t[ii+6]; 2052 ii += 7; 2053 } 2054 2055 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2056 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2057 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2058 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2059 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2060 PetscFunctionReturn(0); 2061 } 2062 #undef __FUNCT__ 2063 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 2064 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2065 { 2066 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2067 PetscErrorCode ierr; 2068 IS iscol=a->col,isrow=a->row; 2069 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2070 const PetscInt *r,*c,*rout,*cout; 2071 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 2072 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2073 const MatScalar *aa=a->a,*v; 2074 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2075 const PetscScalar *b; 2076 2077 PetscFunctionBegin; 2078 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2079 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2080 t = a->solve_work; 2081 2082 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2083 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2084 2085 /* copy b into temp work space according to permutation */ 2086 for(i=0;i<n;i++){ 2087 ii = bs*i; ic = bs*c[i]; 2088 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 2089 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 2090 } 2091 2092 /* forward solve the U^T */ 2093 idx = 0; 2094 for (i=0; i<n; i++) { 2095 v = aa + bs2*diag[i]; 2096 /* multiply by the inverse of the block diagonal */ 2097 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2098 x6 = t[5+idx]; x7 = t[6+idx]; 2099 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 2100 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 2101 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 2102 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 2103 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 2104 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 2105 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 2106 v -= bs2; 2107 2108 vi = aj + diag[i] - 1; 2109 nz = diag[i] - diag[i+1] - 1; 2110 for(j=0;j>-nz;j--){ 2111 oidx = bs*vi[j]; 2112 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2113 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2114 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2115 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2116 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2117 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2118 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2119 v -= bs2; 2120 } 2121 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 2122 t[5+idx] = s6; t[6+idx] = s7; 2123 idx += bs; 2124 } 2125 /* backward solve the L^T */ 2126 for (i=n-1; i>=0; i--){ 2127 v = aa + bs2*ai[i]; 2128 vi = aj + ai[i]; 2129 nz = ai[i+1] - ai[i]; 2130 idt = bs*i; 2131 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2132 s6 = t[5+idt]; s7 = t[6+idt]; 2133 for(j=0;j<nz;j++){ 2134 idx = bs*vi[j]; 2135 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2136 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2137 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2138 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2139 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2140 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2141 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2142 v += bs2; 2143 } 2144 } 2145 2146 /* copy t into x according to permutation */ 2147 for(i=0;i<n;i++){ 2148 ii = bs*i; ir = bs*r[i]; 2149 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 2150 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 2151 } 2152 2153 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2154 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2155 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2156 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2157 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2158 PetscFunctionReturn(0); 2159 } 2160 2161 /* ----------------------------------------------------------- */ 2162 #undef __FUNCT__ 2163 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 2164 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2165 { 2166 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2167 IS iscol=a->col,isrow=a->row; 2168 PetscErrorCode ierr; 2169 const PetscInt *r,*c,*rout,*cout; 2170 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2171 PetscInt i,nz; 2172 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2173 const MatScalar *aa=a->a,*v; 2174 PetscScalar *x,*s,*t,*ls; 2175 const PetscScalar *b; 2176 2177 PetscFunctionBegin; 2178 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2179 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2180 t = a->solve_work; 2181 2182 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2183 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2184 2185 /* forward solve the lower triangular */ 2186 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2187 for (i=1; i<n; i++) { 2188 v = aa + bs2*ai[i]; 2189 vi = aj + ai[i]; 2190 nz = a->diag[i] - ai[i]; 2191 s = t + bs*i; 2192 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2193 while (nz--) { 2194 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 2195 v += bs2; 2196 } 2197 } 2198 /* backward solve the upper triangular */ 2199 ls = a->solve_work + A->cmap->n; 2200 for (i=n-1; i>=0; i--){ 2201 v = aa + bs2*(a->diag[i] + 1); 2202 vi = aj + a->diag[i] + 1; 2203 nz = ai[i+1] - a->diag[i] - 1; 2204 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2205 while (nz--) { 2206 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 2207 v += bs2; 2208 } 2209 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2210 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2211 } 2212 2213 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2214 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2215 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2216 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2217 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2218 PetscFunctionReturn(0); 2219 } 2220 2221 /* ----------------------------------------------------------- */ 2222 #undef __FUNCT__ 2223 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 2224 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2225 { 2226 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2227 IS iscol=a->col,isrow=a->row; 2228 PetscErrorCode ierr; 2229 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2230 PetscInt i,nz,j; 2231 const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2; 2232 const MatScalar *aa=a->a,*v; 2233 PetscScalar *x,*t,*ls; 2234 const PetscScalar *b; 2235 PetscFunctionBegin; 2236 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2237 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2238 t = a->solve_work; 2239 2240 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2241 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2242 2243 /* copy the b into temp work space according to permutation */ 2244 for (i=0; i<n; i++) { 2245 for (j=0; j<bs; j++) { 2246 t[i*bs+j] = b[c[i]*bs+j]; 2247 } 2248 } 2249 2250 2251 /* forward solve the upper triangular transpose */ 2252 ls = a->solve_work + A->cmap->n; 2253 for (i=0; i<n; i++){ 2254 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2255 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2256 v = aa + bs2*(a->diag[i] + 1); 2257 vi = aj + a->diag[i] + 1; 2258 nz = ai[i+1] - a->diag[i] - 1; 2259 while (nz--) { 2260 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2261 v += bs2; 2262 } 2263 } 2264 2265 /* backward solve the lower triangular transpose */ 2266 for (i=n-1; i>=0; i--) { 2267 v = aa + bs2*ai[i]; 2268 vi = aj + ai[i]; 2269 nz = a->diag[i] - ai[i]; 2270 while (nz--) { 2271 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2272 v += bs2; 2273 } 2274 } 2275 2276 /* copy t into x according to permutation */ 2277 for (i=0; i<n; i++) { 2278 for (j=0; j<bs; j++) { 2279 x[bs*r[i]+j] = t[bs*i+j]; 2280 } 2281 } 2282 2283 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2284 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2285 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2286 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2287 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2288 PetscFunctionReturn(0); 2289 } 2290 2291 #undef __FUNCT__ 2292 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 2293 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2294 { 2295 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2296 IS iscol=a->col,isrow=a->row; 2297 PetscErrorCode ierr; 2298 const PetscInt *r,*c,*rout,*cout; 2299 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2300 PetscInt i,j,nz; 2301 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2302 const MatScalar *aa=a->a,*v; 2303 PetscScalar *x,*t,*ls; 2304 const PetscScalar *b; 2305 2306 PetscFunctionBegin; 2307 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2308 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2309 t = a->solve_work; 2310 2311 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2312 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2313 2314 /* copy the b into temp work space according to permutation */ 2315 for (i=0; i<n; i++) { 2316 for (j=0; j<bs; j++) { 2317 t[i*bs+j] = b[c[i]*bs+j]; 2318 } 2319 } 2320 2321 2322 /* forward solve the upper triangular transpose */ 2323 ls = a->solve_work + A->cmap->n; 2324 for (i=0; i<n; i++){ 2325 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2326 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 2327 v = aa + bs2*(diag[i] - 1); 2328 vi = aj + diag[i] - 1; 2329 nz = diag[i] - diag[i+1] - 1; 2330 for(j=0;j>-nz;j--){ 2331 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2332 v -= bs2; 2333 } 2334 } 2335 2336 /* backward solve the lower triangular transpose */ 2337 for (i=n-1; i>=0; i--) { 2338 v = aa + bs2*ai[i]; 2339 vi = aj + ai[i]; 2340 nz = ai[i+1] - ai[i]; 2341 for(j=0;j<nz;j++){ 2342 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2343 v += bs2; 2344 } 2345 } 2346 2347 /* copy t into x according to permutation */ 2348 for (i=0; i<n; i++) { 2349 for (j=0; j<bs; j++) { 2350 x[bs*r[i]+j] = t[bs*i+j]; 2351 } 2352 } 2353 2354 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2355 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2356 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2357 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2358 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2359 PetscFunctionReturn(0); 2360 } 2361 2362 /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */ 2363 2364 #undef __FUNCT__ 2365 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2366 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 2367 { 2368 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2369 PetscErrorCode ierr; 2370 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2371 PetscInt i,nz,idx,idt,m; 2372 const MatScalar *aa=a->a,*v; 2373 PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 2374 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 2375 PetscScalar *x; 2376 const PetscScalar *b; 2377 2378 PetscFunctionBegin; 2379 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2380 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2381 2382 /* forward solve the lower triangular */ 2383 idx = 0; 2384 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 2385 x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx]; 2386 x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 2387 2388 for (i=1; i<n; i++) { 2389 v = aa + bs2*ai[i]; 2390 vi = aj + ai[i]; 2391 nz = ai[i+1] - ai[i]; 2392 idt = bs*i; 2393 s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 2394 s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 2395 s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 2396 for(m=0;m<nz;m++){ 2397 idx = bs*vi[m]; 2398 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2399 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2400 x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 2401 2402 2403 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2404 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2405 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2406 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2407 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2408 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2409 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2410 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2411 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2412 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2413 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2414 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2415 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2416 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2417 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2418 2419 v += bs2; 2420 } 2421 x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 2422 x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 2423 x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 2424 2425 } 2426 /* backward solve the upper triangular */ 2427 for (i=n-1; i>=0; i--){ 2428 v = aa + bs2*(adiag[i+1]+1); 2429 vi = aj + adiag[i+1]+1; 2430 nz = adiag[i] - adiag[i+1] - 1; 2431 idt = bs*i; 2432 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 2433 s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 2434 s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 2435 2436 for(m=0;m<nz;m++){ 2437 idx = bs*vi[m]; 2438 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2439 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2440 x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 2441 2442 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2443 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2444 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2445 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2446 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2447 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2448 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2449 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2450 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2451 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2452 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2453 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2454 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2455 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2456 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2457 2458 v += bs2; 2459 } 2460 2461 x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 2462 x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 2463 x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 2464 x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 2465 x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 2466 x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 2467 x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 2468 x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 2469 x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 2470 x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 2471 x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 2472 x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 2473 x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 2474 x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 2475 x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 2476 2477 } 2478 2479 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2480 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2481 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2482 PetscFunctionReturn(0); 2483 } 2484 2485 /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */ 2486 /* Default MatSolve for block size 15 */ 2487 2488 #undef __FUNCT__ 2489 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1" 2490 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx) 2491 { 2492 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2493 PetscErrorCode ierr; 2494 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2495 PetscInt i,k,nz,idx,idt,m; 2496 const MatScalar *aa=a->a,*v; 2497 PetscScalar s[15]; 2498 PetscScalar *x,xv; 2499 const PetscScalar *b; 2500 2501 PetscFunctionBegin; 2502 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2503 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2504 2505 /* forward solve the lower triangular */ 2506 for (i=0; i<n; i++) { 2507 v = aa + bs2*ai[i]; 2508 vi = aj + ai[i]; 2509 nz = ai[i+1] - ai[i]; 2510 idt = bs*i; 2511 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2512 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2513 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt]; 2514 for(m=0;m<nz;m++){ 2515 idx = bs*vi[m]; 2516 for(k=0;k<15;k++){ 2517 xv = x[k + idx]; 2518 x[idt] -= v[0]*xv; 2519 x[1+idt] -= v[1]*xv; 2520 x[2+idt] -= v[2]*xv; 2521 x[3+idt] -= v[3]*xv; 2522 x[4+idt] -= v[4]*xv; 2523 x[5+idt] -= v[5]*xv; 2524 x[6+idt] -= v[6]*xv; 2525 x[7+idt] -= v[7]*xv; 2526 x[8+idt] -= v[8]*xv; 2527 x[9+idt] -= v[9]*xv; 2528 x[10+idt] -= v[10]*xv; 2529 x[11+idt] -= v[11]*xv; 2530 x[12+idt] -= v[12]*xv; 2531 x[13+idt] -= v[13]*xv; 2532 x[14+idt] -= v[14]*xv; 2533 v += 15; 2534 } 2535 } 2536 } 2537 /* backward solve the upper triangular */ 2538 for (i=n-1; i>=0; i--){ 2539 v = aa + bs2*(adiag[i+1]+1); 2540 vi = aj + adiag[i+1]+1; 2541 nz = adiag[i] - adiag[i+1] - 1; 2542 idt = bs*i; 2543 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 2544 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 2545 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt]; 2546 2547 for(m=0;m<nz;m++){ 2548 idx = bs*vi[m]; 2549 for(k=0;k<15;k++){ 2550 xv = x[k + idx]; 2551 s[0] -= v[0]*xv; 2552 s[1] -= v[1]*xv; 2553 s[2] -= v[2]*xv; 2554 s[3] -= v[3]*xv; 2555 s[4] -= v[4]*xv; 2556 s[5] -= v[5]*xv; 2557 s[6] -= v[6]*xv; 2558 s[7] -= v[7]*xv; 2559 s[8] -= v[8]*xv; 2560 s[9] -= v[9]*xv; 2561 s[10] -= v[10]*xv; 2562 s[11] -= v[11]*xv; 2563 s[12] -= v[12]*xv; 2564 s[13] -= v[13]*xv; 2565 s[14] -= v[14]*xv; 2566 v += 15; 2567 } 2568 } 2569 ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr); 2570 for(k=0;k<15;k++){ 2571 x[idt] += v[0]*s[k]; 2572 x[1+idt] += v[1]*s[k]; 2573 x[2+idt] += v[2]*s[k]; 2574 x[3+idt] += v[3]*s[k]; 2575 x[4+idt] += v[4]*s[k]; 2576 x[5+idt] += v[5]*s[k]; 2577 x[6+idt] += v[6]*s[k]; 2578 x[7+idt] += v[7]*s[k]; 2579 x[8+idt] += v[8]*s[k]; 2580 x[9+idt] += v[9]*s[k]; 2581 x[10+idt] += v[10]*s[k]; 2582 x[11+idt] += v[11]*s[k]; 2583 x[12+idt] += v[12]*s[k]; 2584 x[13+idt] += v[13]*s[k]; 2585 x[14+idt] += v[14]*s[k]; 2586 v += 15; 2587 } 2588 } 2589 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2590 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2591 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2592 PetscFunctionReturn(0); 2593 } 2594 2595 2596 #undef __FUNCT__ 2597 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 2598 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 2599 { 2600 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2601 IS iscol=a->col,isrow=a->row; 2602 PetscErrorCode ierr; 2603 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2604 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2605 PetscInt i,nz,idx,idt,idc; 2606 const MatScalar *aa=a->a,*v; 2607 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2608 const PetscScalar *b; 2609 2610 PetscFunctionBegin; 2611 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2612 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2613 t = a->solve_work; 2614 2615 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2616 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2617 2618 /* forward solve the lower triangular */ 2619 idx = 7*(*r++); 2620 t[0] = b[idx]; t[1] = b[1+idx]; 2621 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2622 t[5] = b[5+idx]; t[6] = b[6+idx]; 2623 2624 for (i=1; i<n; i++) { 2625 v = aa + 49*ai[i]; 2626 vi = aj + ai[i]; 2627 nz = diag[i] - ai[i]; 2628 idx = 7*(*r++); 2629 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2630 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2631 while (nz--) { 2632 idx = 7*(*vi++); 2633 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2634 x4 = t[3+idx];x5 = t[4+idx]; 2635 x6 = t[5+idx];x7 = t[6+idx]; 2636 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2637 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2638 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2639 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2640 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2641 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2642 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2643 v += 49; 2644 } 2645 idx = 7*i; 2646 t[idx] = s1;t[1+idx] = s2; 2647 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2648 t[5+idx] = s6;t[6+idx] = s7; 2649 } 2650 /* backward solve the upper triangular */ 2651 for (i=n-1; i>=0; i--){ 2652 v = aa + 49*diag[i] + 49; 2653 vi = aj + diag[i] + 1; 2654 nz = ai[i+1] - diag[i] - 1; 2655 idt = 7*i; 2656 s1 = t[idt]; s2 = t[1+idt]; 2657 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2658 s6 = t[5+idt];s7 = t[6+idt]; 2659 while (nz--) { 2660 idx = 7*(*vi++); 2661 x1 = t[idx]; x2 = t[1+idx]; 2662 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2663 x6 = t[5+idx]; x7 = t[6+idx]; 2664 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2665 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2666 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2667 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2668 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2669 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2670 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2671 v += 49; 2672 } 2673 idc = 7*(*c--); 2674 v = aa + 49*diag[i]; 2675 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2676 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2677 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2678 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2679 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2680 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2681 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2682 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2683 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2684 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2685 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2686 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2687 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2688 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2689 } 2690 2691 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2692 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2693 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2694 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2695 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2696 PetscFunctionReturn(0); 2697 } 2698 2699 #undef __FUNCT__ 2700 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 2701 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2702 { 2703 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2704 IS iscol=a->col,isrow=a->row; 2705 PetscErrorCode ierr; 2706 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2707 const PetscInt n=a->mbs,*rout,*cout,*vi; 2708 PetscInt i,nz,idx,idt,idc,m; 2709 const MatScalar *aa=a->a,*v; 2710 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2711 const PetscScalar *b; 2712 2713 PetscFunctionBegin; 2714 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2715 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2716 t = a->solve_work; 2717 2718 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2719 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2720 2721 /* forward solve the lower triangular */ 2722 idx = 7*r[0]; 2723 t[0] = b[idx]; t[1] = b[1+idx]; 2724 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2725 t[5] = b[5+idx]; t[6] = b[6+idx]; 2726 2727 for (i=1; i<n; i++) { 2728 v = aa + 49*ai[i]; 2729 vi = aj + ai[i]; 2730 nz = ai[i+1] - ai[i]; 2731 idx = 7*r[i]; 2732 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2733 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2734 for(m=0;m<nz;m++){ 2735 idx = 7*vi[m]; 2736 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2737 x4 = t[3+idx];x5 = t[4+idx]; 2738 x6 = t[5+idx];x7 = t[6+idx]; 2739 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2740 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2741 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2742 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2743 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2744 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2745 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2746 v += 49; 2747 } 2748 idx = 7*i; 2749 t[idx] = s1;t[1+idx] = s2; 2750 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2751 t[5+idx] = s6;t[6+idx] = s7; 2752 } 2753 /* backward solve the upper triangular */ 2754 for (i=n-1; i>=0; i--){ 2755 v = aa + 49*(adiag[i+1]+1); 2756 vi = aj + adiag[i+1]+1; 2757 nz = adiag[i] - adiag[i+1] - 1; 2758 idt = 7*i; 2759 s1 = t[idt]; s2 = t[1+idt]; 2760 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2761 s6 = t[5+idt];s7 = t[6+idt]; 2762 for(m=0;m<nz;m++){ 2763 idx = 7*vi[m]; 2764 x1 = t[idx]; x2 = t[1+idx]; 2765 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2766 x6 = t[5+idx]; x7 = t[6+idx]; 2767 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2768 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2769 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2770 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2771 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2772 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2773 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2774 v += 49; 2775 } 2776 idc = 7*c[i]; 2777 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2778 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2779 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2780 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2781 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2782 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2783 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2784 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2785 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2786 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2787 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2788 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2789 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2790 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2791 } 2792 2793 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2794 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2795 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2796 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2797 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2798 PetscFunctionReturn(0); 2799 } 2800 2801 #undef __FUNCT__ 2802 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 2803 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2804 { 2805 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2806 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2807 PetscErrorCode ierr; 2808 PetscInt i,nz,idx,idt,jdx; 2809 const MatScalar *aa=a->a,*v; 2810 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2811 const PetscScalar *b; 2812 2813 PetscFunctionBegin; 2814 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2815 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2816 /* forward solve the lower triangular */ 2817 idx = 0; 2818 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2819 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2820 x[6] = b[6+idx]; 2821 for (i=1; i<n; i++) { 2822 v = aa + 49*ai[i]; 2823 vi = aj + ai[i]; 2824 nz = diag[i] - ai[i]; 2825 idx = 7*i; 2826 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2827 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2828 s7 = b[6+idx]; 2829 while (nz--) { 2830 jdx = 7*(*vi++); 2831 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2832 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2833 x7 = x[6+jdx]; 2834 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2835 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2836 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2837 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2838 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2839 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2840 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2841 v += 49; 2842 } 2843 x[idx] = s1; 2844 x[1+idx] = s2; 2845 x[2+idx] = s3; 2846 x[3+idx] = s4; 2847 x[4+idx] = s5; 2848 x[5+idx] = s6; 2849 x[6+idx] = s7; 2850 } 2851 /* backward solve the upper triangular */ 2852 for (i=n-1; i>=0; i--){ 2853 v = aa + 49*diag[i] + 49; 2854 vi = aj + diag[i] + 1; 2855 nz = ai[i+1] - diag[i] - 1; 2856 idt = 7*i; 2857 s1 = x[idt]; s2 = x[1+idt]; 2858 s3 = x[2+idt]; s4 = x[3+idt]; 2859 s5 = x[4+idt]; s6 = x[5+idt]; 2860 s7 = x[6+idt]; 2861 while (nz--) { 2862 idx = 7*(*vi++); 2863 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2864 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2865 x7 = x[6+idx]; 2866 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2867 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2868 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2869 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2870 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2871 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2872 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2873 v += 49; 2874 } 2875 v = aa + 49*diag[i]; 2876 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2877 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2878 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2879 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2880 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2881 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2882 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2883 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2884 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2885 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2886 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2887 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2888 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2889 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2890 } 2891 2892 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2893 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2894 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2895 PetscFunctionReturn(0); 2896 } 2897 2898 #undef __FUNCT__ 2899 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 2900 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 2901 { 2902 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2903 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 2904 PetscErrorCode ierr; 2905 PetscInt i,k,nz,idx,jdx,idt; 2906 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2907 const MatScalar *aa=a->a,*v; 2908 PetscScalar *x; 2909 const PetscScalar *b; 2910 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2911 2912 PetscFunctionBegin; 2913 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2914 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2915 /* forward solve the lower triangular */ 2916 idx = 0; 2917 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2918 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2919 for (i=1; i<n; i++) { 2920 v = aa + bs2*ai[i]; 2921 vi = aj + ai[i]; 2922 nz = ai[i+1] - ai[i]; 2923 idx = bs*i; 2924 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2925 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2926 for(k=0;k<nz;k++) { 2927 jdx = bs*vi[k]; 2928 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2929 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2930 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2931 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2932 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2933 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2934 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2935 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2936 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2937 v += bs2; 2938 } 2939 2940 x[idx] = s1; 2941 x[1+idx] = s2; 2942 x[2+idx] = s3; 2943 x[3+idx] = s4; 2944 x[4+idx] = s5; 2945 x[5+idx] = s6; 2946 x[6+idx] = s7; 2947 } 2948 2949 /* backward solve the upper triangular */ 2950 for (i=n-1; i>=0; i--){ 2951 v = aa + bs2*(adiag[i+1]+1); 2952 vi = aj + adiag[i+1]+1; 2953 nz = adiag[i] - adiag[i+1]-1; 2954 idt = bs*i; 2955 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2956 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2957 for(k=0;k<nz;k++) { 2958 idx = bs*vi[k]; 2959 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2960 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2961 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2962 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2963 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2964 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2965 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2966 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2967 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2968 v += bs2; 2969 } 2970 /* x = inv_diagonal*x */ 2971 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2972 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2973 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2974 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2975 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2976 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2977 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2978 } 2979 2980 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2981 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2982 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2983 PetscFunctionReturn(0); 2984 } 2985 2986 #undef __FUNCT__ 2987 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 2988 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 2989 { 2990 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2991 IS iscol=a->col,isrow=a->row; 2992 PetscErrorCode ierr; 2993 const PetscInt *r,*c,*rout,*cout; 2994 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2995 PetscInt i,nz,idx,idt,idc; 2996 const MatScalar *aa=a->a,*v; 2997 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2998 const PetscScalar *b; 2999 3000 PetscFunctionBegin; 3001 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3002 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3003 t = a->solve_work; 3004 3005 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3006 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3007 3008 /* forward solve the lower triangular */ 3009 idx = 6*(*r++); 3010 t[0] = b[idx]; t[1] = b[1+idx]; 3011 t[2] = b[2+idx]; t[3] = b[3+idx]; 3012 t[4] = b[4+idx]; t[5] = b[5+idx]; 3013 for (i=1; i<n; i++) { 3014 v = aa + 36*ai[i]; 3015 vi = aj + ai[i]; 3016 nz = diag[i] - ai[i]; 3017 idx = 6*(*r++); 3018 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3019 s5 = b[4+idx]; s6 = b[5+idx]; 3020 while (nz--) { 3021 idx = 6*(*vi++); 3022 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3023 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 3024 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3025 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3026 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3027 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3028 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3029 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3030 v += 36; 3031 } 3032 idx = 6*i; 3033 t[idx] = s1;t[1+idx] = s2; 3034 t[2+idx] = s3;t[3+idx] = s4; 3035 t[4+idx] = s5;t[5+idx] = s6; 3036 } 3037 /* backward solve the upper triangular */ 3038 for (i=n-1; i>=0; i--){ 3039 v = aa + 36*diag[i] + 36; 3040 vi = aj + diag[i] + 1; 3041 nz = ai[i+1] - diag[i] - 1; 3042 idt = 6*i; 3043 s1 = t[idt]; s2 = t[1+idt]; 3044 s3 = t[2+idt];s4 = t[3+idt]; 3045 s5 = t[4+idt];s6 = t[5+idt]; 3046 while (nz--) { 3047 idx = 6*(*vi++); 3048 x1 = t[idx]; x2 = t[1+idx]; 3049 x3 = t[2+idx]; x4 = t[3+idx]; 3050 x5 = t[4+idx]; x6 = t[5+idx]; 3051 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3052 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3053 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3054 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3055 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3056 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3057 v += 36; 3058 } 3059 idc = 6*(*c--); 3060 v = aa + 36*diag[i]; 3061 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3062 v[18]*s4+v[24]*s5+v[30]*s6; 3063 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3064 v[19]*s4+v[25]*s5+v[31]*s6; 3065 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3066 v[20]*s4+v[26]*s5+v[32]*s6; 3067 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3068 v[21]*s4+v[27]*s5+v[33]*s6; 3069 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3070 v[22]*s4+v[28]*s5+v[34]*s6; 3071 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3072 v[23]*s4+v[29]*s5+v[35]*s6; 3073 } 3074 3075 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3076 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3077 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3078 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3079 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3080 PetscFunctionReturn(0); 3081 } 3082 3083 #undef __FUNCT__ 3084 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 3085 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 3086 { 3087 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3088 IS iscol=a->col,isrow=a->row; 3089 PetscErrorCode ierr; 3090 const PetscInt *r,*c,*rout,*cout; 3091 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3092 PetscInt i,nz,idx,idt,idc,m; 3093 const MatScalar *aa=a->a,*v; 3094 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 3095 const PetscScalar *b; 3096 3097 PetscFunctionBegin; 3098 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3099 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3100 t = a->solve_work; 3101 3102 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3103 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3104 3105 /* forward solve the lower triangular */ 3106 idx = 6*r[0]; 3107 t[0] = b[idx]; t[1] = b[1+idx]; 3108 t[2] = b[2+idx]; t[3] = b[3+idx]; 3109 t[4] = b[4+idx]; t[5] = b[5+idx]; 3110 for (i=1; i<n; i++) { 3111 v = aa + 36*ai[i]; 3112 vi = aj + ai[i]; 3113 nz = ai[i+1] - ai[i]; 3114 idx = 6*r[i]; 3115 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3116 s5 = b[4+idx]; s6 = b[5+idx]; 3117 for(m=0;m<nz;m++){ 3118 idx = 6*vi[m]; 3119 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3120 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 3121 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3122 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3123 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3124 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3125 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3126 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3127 v += 36; 3128 } 3129 idx = 6*i; 3130 t[idx] = s1;t[1+idx] = s2; 3131 t[2+idx] = s3;t[3+idx] = s4; 3132 t[4+idx] = s5;t[5+idx] = s6; 3133 } 3134 /* backward solve the upper triangular */ 3135 for (i=n-1; i>=0; i--){ 3136 v = aa + 36*(adiag[i+1]+1); 3137 vi = aj + adiag[i+1]+1; 3138 nz = adiag[i] - adiag[i+1] - 1; 3139 idt = 6*i; 3140 s1 = t[idt]; s2 = t[1+idt]; 3141 s3 = t[2+idt];s4 = t[3+idt]; 3142 s5 = t[4+idt];s6 = t[5+idt]; 3143 for(m=0;m<nz;m++){ 3144 idx = 6*vi[m]; 3145 x1 = t[idx]; x2 = t[1+idx]; 3146 x3 = t[2+idx]; x4 = t[3+idx]; 3147 x5 = t[4+idx]; x6 = t[5+idx]; 3148 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3149 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3150 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3151 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3152 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3153 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3154 v += 36; 3155 } 3156 idc = 6*c[i]; 3157 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3158 v[18]*s4+v[24]*s5+v[30]*s6; 3159 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3160 v[19]*s4+v[25]*s5+v[31]*s6; 3161 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3162 v[20]*s4+v[26]*s5+v[32]*s6; 3163 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3164 v[21]*s4+v[27]*s5+v[33]*s6; 3165 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3166 v[22]*s4+v[28]*s5+v[34]*s6; 3167 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3168 v[23]*s4+v[29]*s5+v[35]*s6; 3169 } 3170 3171 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3172 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3173 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3174 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3175 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3176 PetscFunctionReturn(0); 3177 } 3178 3179 #undef __FUNCT__ 3180 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 3181 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3182 { 3183 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3184 PetscInt i,nz,idx,idt,jdx; 3185 PetscErrorCode ierr; 3186 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 3187 const MatScalar *aa=a->a,*v; 3188 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3189 const PetscScalar *b; 3190 3191 PetscFunctionBegin; 3192 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3193 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3194 /* forward solve the lower triangular */ 3195 idx = 0; 3196 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 3197 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 3198 for (i=1; i<n; i++) { 3199 v = aa + 36*ai[i]; 3200 vi = aj + ai[i]; 3201 nz = diag[i] - ai[i]; 3202 idx = 6*i; 3203 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3204 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 3205 while (nz--) { 3206 jdx = 6*(*vi++); 3207 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 3208 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 3209 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3210 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3211 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3212 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3213 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3214 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3215 v += 36; 3216 } 3217 x[idx] = s1; 3218 x[1+idx] = s2; 3219 x[2+idx] = s3; 3220 x[3+idx] = s4; 3221 x[4+idx] = s5; 3222 x[5+idx] = s6; 3223 } 3224 /* backward solve the upper triangular */ 3225 for (i=n-1; i>=0; i--){ 3226 v = aa + 36*diag[i] + 36; 3227 vi = aj + diag[i] + 1; 3228 nz = ai[i+1] - diag[i] - 1; 3229 idt = 6*i; 3230 s1 = x[idt]; s2 = x[1+idt]; 3231 s3 = x[2+idt]; s4 = x[3+idt]; 3232 s5 = x[4+idt]; s6 = x[5+idt]; 3233 while (nz--) { 3234 idx = 6*(*vi++); 3235 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3236 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3237 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3238 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3239 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3240 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3241 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3242 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3243 v += 36; 3244 } 3245 v = aa + 36*diag[i]; 3246 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3247 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3248 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3249 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3250 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3251 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3252 } 3253 3254 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3255 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3256 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3257 PetscFunctionReturn(0); 3258 } 3259 3260 #undef __FUNCT__ 3261 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 3262 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 3263 { 3264 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3265 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3266 PetscErrorCode ierr; 3267 PetscInt i,k,nz,idx,jdx,idt; 3268 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3269 const MatScalar *aa=a->a,*v; 3270 PetscScalar *x; 3271 const PetscScalar *b; 3272 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3273 3274 PetscFunctionBegin; 3275 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3276 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3277 /* forward solve the lower triangular */ 3278 idx = 0; 3279 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3280 x[4] = b[4+idx];x[5] = b[5+idx]; 3281 for (i=1; i<n; i++) { 3282 v = aa + bs2*ai[i]; 3283 vi = aj + ai[i]; 3284 nz = ai[i+1] - ai[i]; 3285 idx = bs*i; 3286 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3287 s5 = b[4+idx];s6 = b[5+idx]; 3288 for(k=0;k<nz;k++){ 3289 jdx = bs*vi[k]; 3290 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3291 x5 = x[4+jdx]; x6 = x[5+jdx]; 3292 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3293 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3294 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3295 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3296 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3297 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3298 v += bs2; 3299 } 3300 3301 x[idx] = s1; 3302 x[1+idx] = s2; 3303 x[2+idx] = s3; 3304 x[3+idx] = s4; 3305 x[4+idx] = s5; 3306 x[5+idx] = s6; 3307 } 3308 3309 /* backward solve the upper triangular */ 3310 for (i=n-1; i>=0; i--){ 3311 v = aa + bs2*(adiag[i+1]+1); 3312 vi = aj + adiag[i+1]+1; 3313 nz = adiag[i] - adiag[i+1]-1; 3314 idt = bs*i; 3315 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3316 s5 = x[4+idt];s6 = x[5+idt]; 3317 for(k=0;k<nz;k++){ 3318 idx = bs*vi[k]; 3319 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3320 x5 = x[4+idx];x6 = x[5+idx]; 3321 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3322 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3323 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3324 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3325 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3326 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3327 v += bs2; 3328 } 3329 /* x = inv_diagonal*x */ 3330 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3331 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3332 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3333 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3334 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3335 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3336 } 3337 3338 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3339 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3340 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3341 PetscFunctionReturn(0); 3342 } 3343 3344 #undef __FUNCT__ 3345 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 3346 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 3347 { 3348 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3349 IS iscol=a->col,isrow=a->row; 3350 PetscErrorCode ierr; 3351 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3352 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3353 PetscInt i,nz,idx,idt,idc; 3354 const MatScalar *aa=a->a,*v; 3355 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3356 const PetscScalar *b; 3357 3358 PetscFunctionBegin; 3359 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3360 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3361 t = a->solve_work; 3362 3363 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3364 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3365 3366 /* forward solve the lower triangular */ 3367 idx = 5*(*r++); 3368 t[0] = b[idx]; t[1] = b[1+idx]; 3369 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3370 for (i=1; i<n; i++) { 3371 v = aa + 25*ai[i]; 3372 vi = aj + ai[i]; 3373 nz = diag[i] - ai[i]; 3374 idx = 5*(*r++); 3375 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3376 s5 = b[4+idx]; 3377 while (nz--) { 3378 idx = 5*(*vi++); 3379 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3380 x4 = t[3+idx];x5 = t[4+idx]; 3381 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3382 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3383 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3384 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3385 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3386 v += 25; 3387 } 3388 idx = 5*i; 3389 t[idx] = s1;t[1+idx] = s2; 3390 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3391 } 3392 /* backward solve the upper triangular */ 3393 for (i=n-1; i>=0; i--){ 3394 v = aa + 25*diag[i] + 25; 3395 vi = aj + diag[i] + 1; 3396 nz = ai[i+1] - diag[i] - 1; 3397 idt = 5*i; 3398 s1 = t[idt]; s2 = t[1+idt]; 3399 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3400 while (nz--) { 3401 idx = 5*(*vi++); 3402 x1 = t[idx]; x2 = t[1+idx]; 3403 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3404 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3405 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3406 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3407 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3408 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3409 v += 25; 3410 } 3411 idc = 5*(*c--); 3412 v = aa + 25*diag[i]; 3413 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3414 v[15]*s4+v[20]*s5; 3415 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3416 v[16]*s4+v[21]*s5; 3417 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3418 v[17]*s4+v[22]*s5; 3419 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3420 v[18]*s4+v[23]*s5; 3421 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3422 v[19]*s4+v[24]*s5; 3423 } 3424 3425 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3426 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3427 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3428 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3429 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3430 PetscFunctionReturn(0); 3431 } 3432 3433 #undef __FUNCT__ 3434 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 3435 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 3436 { 3437 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3438 IS iscol=a->col,isrow=a->row; 3439 PetscErrorCode ierr; 3440 const PetscInt *r,*c,*rout,*cout; 3441 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3442 PetscInt i,nz,idx,idt,idc,m; 3443 const MatScalar *aa=a->a,*v; 3444 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3445 const PetscScalar *b; 3446 3447 PetscFunctionBegin; 3448 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3449 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3450 t = a->solve_work; 3451 3452 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3453 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3454 3455 /* forward solve the lower triangular */ 3456 idx = 5*r[0]; 3457 t[0] = b[idx]; t[1] = b[1+idx]; 3458 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3459 for (i=1; i<n; i++) { 3460 v = aa + 25*ai[i]; 3461 vi = aj + ai[i]; 3462 nz = ai[i+1] - ai[i]; 3463 idx = 5*r[i]; 3464 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3465 s5 = b[4+idx]; 3466 for(m=0;m<nz;m++){ 3467 idx = 5*vi[m]; 3468 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3469 x4 = t[3+idx];x5 = t[4+idx]; 3470 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3471 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3472 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3473 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3474 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3475 v += 25; 3476 } 3477 idx = 5*i; 3478 t[idx] = s1;t[1+idx] = s2; 3479 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3480 } 3481 /* backward solve the upper triangular */ 3482 for (i=n-1; i>=0; i--){ 3483 v = aa + 25*(adiag[i+1]+1); 3484 vi = aj + adiag[i+1]+1; 3485 nz = adiag[i] - adiag[i+1] - 1; 3486 idt = 5*i; 3487 s1 = t[idt]; s2 = t[1+idt]; 3488 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3489 for(m=0;m<nz;m++){ 3490 idx = 5*vi[m]; 3491 x1 = t[idx]; x2 = t[1+idx]; 3492 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3493 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3494 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3495 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3496 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3497 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3498 v += 25; 3499 } 3500 idc = 5*c[i]; 3501 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3502 v[15]*s4+v[20]*s5; 3503 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3504 v[16]*s4+v[21]*s5; 3505 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3506 v[17]*s4+v[22]*s5; 3507 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3508 v[18]*s4+v[23]*s5; 3509 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3510 v[19]*s4+v[24]*s5; 3511 } 3512 3513 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3514 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3515 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3516 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3517 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3518 PetscFunctionReturn(0); 3519 } 3520 3521 #undef __FUNCT__ 3522 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 3523 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3524 { 3525 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3526 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3527 PetscInt i,nz,idx,idt,jdx; 3528 PetscErrorCode ierr; 3529 const MatScalar *aa=a->a,*v; 3530 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3531 const PetscScalar *b; 3532 3533 PetscFunctionBegin; 3534 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3535 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3536 /* forward solve the lower triangular */ 3537 idx = 0; 3538 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3539 for (i=1; i<n; i++) { 3540 v = aa + 25*ai[i]; 3541 vi = aj + ai[i]; 3542 nz = diag[i] - ai[i]; 3543 idx = 5*i; 3544 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3545 while (nz--) { 3546 jdx = 5*(*vi++); 3547 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3548 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3549 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3550 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3551 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3552 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3553 v += 25; 3554 } 3555 x[idx] = s1; 3556 x[1+idx] = s2; 3557 x[2+idx] = s3; 3558 x[3+idx] = s4; 3559 x[4+idx] = s5; 3560 } 3561 /* backward solve the upper triangular */ 3562 for (i=n-1; i>=0; i--){ 3563 v = aa + 25*diag[i] + 25; 3564 vi = aj + diag[i] + 1; 3565 nz = ai[i+1] - diag[i] - 1; 3566 idt = 5*i; 3567 s1 = x[idt]; s2 = x[1+idt]; 3568 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3569 while (nz--) { 3570 idx = 5*(*vi++); 3571 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3572 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3573 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3574 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3575 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3576 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3577 v += 25; 3578 } 3579 v = aa + 25*diag[i]; 3580 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3581 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3582 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3583 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3584 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3585 } 3586 3587 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3588 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3589 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3590 PetscFunctionReturn(0); 3591 } 3592 3593 #undef __FUNCT__ 3594 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 3595 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 3596 { 3597 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3598 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3599 PetscInt i,k,nz,idx,idt,jdx; 3600 PetscErrorCode ierr; 3601 const MatScalar *aa=a->a,*v; 3602 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3603 const PetscScalar *b; 3604 3605 PetscFunctionBegin; 3606 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3607 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3608 /* forward solve the lower triangular */ 3609 idx = 0; 3610 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3611 for (i=1; i<n; i++) { 3612 v = aa + 25*ai[i]; 3613 vi = aj + ai[i]; 3614 nz = ai[i+1] - ai[i]; 3615 idx = 5*i; 3616 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3617 for(k=0;k<nz;k++) { 3618 jdx = 5*vi[k]; 3619 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3620 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3621 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3622 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3623 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3624 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3625 v += 25; 3626 } 3627 x[idx] = s1; 3628 x[1+idx] = s2; 3629 x[2+idx] = s3; 3630 x[3+idx] = s4; 3631 x[4+idx] = s5; 3632 } 3633 3634 /* backward solve the upper triangular */ 3635 for (i=n-1; i>=0; i--){ 3636 v = aa + 25*(adiag[i+1]+1); 3637 vi = aj + adiag[i+1]+1; 3638 nz = adiag[i] - adiag[i+1]-1; 3639 idt = 5*i; 3640 s1 = x[idt]; s2 = x[1+idt]; 3641 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3642 for(k=0;k<nz;k++){ 3643 idx = 5*vi[k]; 3644 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3645 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3646 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3647 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3648 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3649 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3650 v += 25; 3651 } 3652 /* x = inv_diagonal*x */ 3653 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3654 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3655 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3656 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3657 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3658 } 3659 3660 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3661 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3662 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3663 PetscFunctionReturn(0); 3664 } 3665 3666 #undef __FUNCT__ 3667 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 3668 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 3669 { 3670 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3671 IS iscol=a->col,isrow=a->row; 3672 PetscErrorCode ierr; 3673 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3674 PetscInt i,nz,idx,idt,idc; 3675 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3676 const MatScalar *aa=a->a,*v; 3677 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3678 const PetscScalar *b; 3679 3680 PetscFunctionBegin; 3681 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3682 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3683 t = a->solve_work; 3684 3685 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3686 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3687 3688 /* forward solve the lower triangular */ 3689 idx = 4*(*r++); 3690 t[0] = b[idx]; t[1] = b[1+idx]; 3691 t[2] = b[2+idx]; t[3] = b[3+idx]; 3692 for (i=1; i<n; i++) { 3693 v = aa + 16*ai[i]; 3694 vi = aj + ai[i]; 3695 nz = diag[i] - ai[i]; 3696 idx = 4*(*r++); 3697 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3698 while (nz--) { 3699 idx = 4*(*vi++); 3700 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3701 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3702 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3703 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3704 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3705 v += 16; 3706 } 3707 idx = 4*i; 3708 t[idx] = s1;t[1+idx] = s2; 3709 t[2+idx] = s3;t[3+idx] = s4; 3710 } 3711 /* backward solve the upper triangular */ 3712 for (i=n-1; i>=0; i--){ 3713 v = aa + 16*diag[i] + 16; 3714 vi = aj + diag[i] + 1; 3715 nz = ai[i+1] - diag[i] - 1; 3716 idt = 4*i; 3717 s1 = t[idt]; s2 = t[1+idt]; 3718 s3 = t[2+idt];s4 = t[3+idt]; 3719 while (nz--) { 3720 idx = 4*(*vi++); 3721 x1 = t[idx]; x2 = t[1+idx]; 3722 x3 = t[2+idx]; x4 = t[3+idx]; 3723 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3724 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3725 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3726 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3727 v += 16; 3728 } 3729 idc = 4*(*c--); 3730 v = aa + 16*diag[i]; 3731 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3732 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3733 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3734 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3735 } 3736 3737 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3738 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3739 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3740 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3741 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3742 PetscFunctionReturn(0); 3743 } 3744 3745 #undef __FUNCT__ 3746 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 3747 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 3748 { 3749 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3750 IS iscol=a->col,isrow=a->row; 3751 PetscErrorCode ierr; 3752 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3753 PetscInt i,nz,idx,idt,idc,m; 3754 const PetscInt *r,*c,*rout,*cout; 3755 const MatScalar *aa=a->a,*v; 3756 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3757 const PetscScalar *b; 3758 3759 PetscFunctionBegin; 3760 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3761 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3762 t = a->solve_work; 3763 3764 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3765 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3766 3767 /* forward solve the lower triangular */ 3768 idx = 4*r[0]; 3769 t[0] = b[idx]; t[1] = b[1+idx]; 3770 t[2] = b[2+idx]; t[3] = b[3+idx]; 3771 for (i=1; i<n; i++) { 3772 v = aa + 16*ai[i]; 3773 vi = aj + ai[i]; 3774 nz = ai[i+1] - ai[i]; 3775 idx = 4*r[i]; 3776 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3777 for(m=0;m<nz;m++){ 3778 idx = 4*vi[m]; 3779 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3780 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3781 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3782 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3783 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3784 v += 16; 3785 } 3786 idx = 4*i; 3787 t[idx] = s1;t[1+idx] = s2; 3788 t[2+idx] = s3;t[3+idx] = s4; 3789 } 3790 /* backward solve the upper triangular */ 3791 for (i=n-1; i>=0; i--){ 3792 v = aa + 16*(adiag[i+1]+1); 3793 vi = aj + adiag[i+1]+1; 3794 nz = adiag[i] - adiag[i+1] - 1; 3795 idt = 4*i; 3796 s1 = t[idt]; s2 = t[1+idt]; 3797 s3 = t[2+idt];s4 = t[3+idt]; 3798 for(m=0;m<nz;m++){ 3799 idx = 4*vi[m]; 3800 x1 = t[idx]; x2 = t[1+idx]; 3801 x3 = t[2+idx]; x4 = t[3+idx]; 3802 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3803 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3804 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3805 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3806 v += 16; 3807 } 3808 idc = 4*c[i]; 3809 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3810 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3811 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3812 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3813 } 3814 3815 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3816 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3817 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3818 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3819 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3820 PetscFunctionReturn(0); 3821 } 3822 3823 #undef __FUNCT__ 3824 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3825 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3826 { 3827 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3828 IS iscol=a->col,isrow=a->row; 3829 PetscErrorCode ierr; 3830 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3831 PetscInt i,nz,idx,idt,idc; 3832 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3833 const MatScalar *aa=a->a,*v; 3834 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3835 PetscScalar *x; 3836 const PetscScalar *b; 3837 3838 PetscFunctionBegin; 3839 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3840 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3841 t = (MatScalar *)a->solve_work; 3842 3843 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3844 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3845 3846 /* forward solve the lower triangular */ 3847 idx = 4*(*r++); 3848 t[0] = (MatScalar)b[idx]; 3849 t[1] = (MatScalar)b[1+idx]; 3850 t[2] = (MatScalar)b[2+idx]; 3851 t[3] = (MatScalar)b[3+idx]; 3852 for (i=1; i<n; i++) { 3853 v = aa + 16*ai[i]; 3854 vi = aj + ai[i]; 3855 nz = diag[i] - ai[i]; 3856 idx = 4*(*r++); 3857 s1 = (MatScalar)b[idx]; 3858 s2 = (MatScalar)b[1+idx]; 3859 s3 = (MatScalar)b[2+idx]; 3860 s4 = (MatScalar)b[3+idx]; 3861 while (nz--) { 3862 idx = 4*(*vi++); 3863 x1 = t[idx]; 3864 x2 = t[1+idx]; 3865 x3 = t[2+idx]; 3866 x4 = t[3+idx]; 3867 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3868 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3869 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3870 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3871 v += 16; 3872 } 3873 idx = 4*i; 3874 t[idx] = s1; 3875 t[1+idx] = s2; 3876 t[2+idx] = s3; 3877 t[3+idx] = s4; 3878 } 3879 /* backward solve the upper triangular */ 3880 for (i=n-1; i>=0; i--){ 3881 v = aa + 16*diag[i] + 16; 3882 vi = aj + diag[i] + 1; 3883 nz = ai[i+1] - diag[i] - 1; 3884 idt = 4*i; 3885 s1 = t[idt]; 3886 s2 = t[1+idt]; 3887 s3 = t[2+idt]; 3888 s4 = t[3+idt]; 3889 while (nz--) { 3890 idx = 4*(*vi++); 3891 x1 = t[idx]; 3892 x2 = t[1+idx]; 3893 x3 = t[2+idx]; 3894 x4 = t[3+idx]; 3895 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3896 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3897 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3898 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3899 v += 16; 3900 } 3901 idc = 4*(*c--); 3902 v = aa + 16*diag[i]; 3903 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3904 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3905 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3906 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3907 x[idc] = (PetscScalar)t[idt]; 3908 x[1+idc] = (PetscScalar)t[1+idt]; 3909 x[2+idc] = (PetscScalar)t[2+idt]; 3910 x[3+idc] = (PetscScalar)t[3+idt]; 3911 } 3912 3913 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3914 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3915 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3916 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3917 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3918 PetscFunctionReturn(0); 3919 } 3920 3921 #if defined (PETSC_HAVE_SSE) 3922 3923 #include PETSC_HAVE_SSE 3924 3925 #undef __FUNCT__ 3926 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3927 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3928 { 3929 /* 3930 Note: This code uses demotion of double 3931 to float when performing the mixed-mode computation. 3932 This may not be numerically reasonable for all applications. 3933 */ 3934 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3935 IS iscol=a->col,isrow=a->row; 3936 PetscErrorCode ierr; 3937 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3938 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3939 MatScalar *aa=a->a,*v; 3940 PetscScalar *x,*b,*t; 3941 3942 /* Make space in temp stack for 16 Byte Aligned arrays */ 3943 float ssealignedspace[11],*tmps,*tmpx; 3944 unsigned long offset; 3945 3946 PetscFunctionBegin; 3947 SSE_SCOPE_BEGIN; 3948 3949 offset = (unsigned long)ssealignedspace % 16; 3950 if (offset) offset = (16 - offset)/4; 3951 tmps = &ssealignedspace[offset]; 3952 tmpx = &ssealignedspace[offset+4]; 3953 PREFETCH_NTA(aa+16*ai[1]); 3954 3955 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3956 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3957 t = a->solve_work; 3958 3959 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3960 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3961 3962 /* forward solve the lower triangular */ 3963 idx = 4*(*r++); 3964 t[0] = b[idx]; t[1] = b[1+idx]; 3965 t[2] = b[2+idx]; t[3] = b[3+idx]; 3966 v = aa + 16*ai[1]; 3967 3968 for (i=1; i<n;) { 3969 PREFETCH_NTA(&v[8]); 3970 vi = aj + ai[i]; 3971 nz = diag[i] - ai[i]; 3972 idx = 4*(*r++); 3973 3974 /* Demote sum from double to float */ 3975 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3976 LOAD_PS(tmps,XMM7); 3977 3978 while (nz--) { 3979 PREFETCH_NTA(&v[16]); 3980 idx = 4*(*vi++); 3981 3982 /* Demote solution (so far) from double to float */ 3983 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3984 3985 /* 4x4 Matrix-Vector product with negative accumulation: */ 3986 SSE_INLINE_BEGIN_2(tmpx,v) 3987 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3988 3989 /* First Column */ 3990 SSE_COPY_PS(XMM0,XMM6) 3991 SSE_SHUFFLE(XMM0,XMM0,0x00) 3992 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3993 SSE_SUB_PS(XMM7,XMM0) 3994 3995 /* Second Column */ 3996 SSE_COPY_PS(XMM1,XMM6) 3997 SSE_SHUFFLE(XMM1,XMM1,0x55) 3998 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3999 SSE_SUB_PS(XMM7,XMM1) 4000 4001 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4002 4003 /* Third Column */ 4004 SSE_COPY_PS(XMM2,XMM6) 4005 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4006 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4007 SSE_SUB_PS(XMM7,XMM2) 4008 4009 /* Fourth Column */ 4010 SSE_COPY_PS(XMM3,XMM6) 4011 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4012 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4013 SSE_SUB_PS(XMM7,XMM3) 4014 SSE_INLINE_END_2 4015 4016 v += 16; 4017 } 4018 idx = 4*i; 4019 v = aa + 16*ai[++i]; 4020 PREFETCH_NTA(v); 4021 STORE_PS(tmps,XMM7); 4022 4023 /* Promote result from float to double */ 4024 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 4025 } 4026 /* backward solve the upper triangular */ 4027 idt = 4*(n-1); 4028 ai16 = 16*diag[n-1]; 4029 v = aa + ai16 + 16; 4030 for (i=n-1; i>=0;){ 4031 PREFETCH_NTA(&v[8]); 4032 vi = aj + diag[i] + 1; 4033 nz = ai[i+1] - diag[i] - 1; 4034 4035 /* Demote accumulator from double to float */ 4036 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 4037 LOAD_PS(tmps,XMM7); 4038 4039 while (nz--) { 4040 PREFETCH_NTA(&v[16]); 4041 idx = 4*(*vi++); 4042 4043 /* Demote solution (so far) from double to float */ 4044 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 4045 4046 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4047 SSE_INLINE_BEGIN_2(tmpx,v) 4048 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4049 4050 /* First Column */ 4051 SSE_COPY_PS(XMM0,XMM6) 4052 SSE_SHUFFLE(XMM0,XMM0,0x00) 4053 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4054 SSE_SUB_PS(XMM7,XMM0) 4055 4056 /* Second Column */ 4057 SSE_COPY_PS(XMM1,XMM6) 4058 SSE_SHUFFLE(XMM1,XMM1,0x55) 4059 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4060 SSE_SUB_PS(XMM7,XMM1) 4061 4062 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4063 4064 /* Third Column */ 4065 SSE_COPY_PS(XMM2,XMM6) 4066 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4067 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4068 SSE_SUB_PS(XMM7,XMM2) 4069 4070 /* Fourth Column */ 4071 SSE_COPY_PS(XMM3,XMM6) 4072 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4073 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4074 SSE_SUB_PS(XMM7,XMM3) 4075 SSE_INLINE_END_2 4076 v += 16; 4077 } 4078 v = aa + ai16; 4079 ai16 = 16*diag[--i]; 4080 PREFETCH_NTA(aa+ai16+16); 4081 /* 4082 Scale the result by the diagonal 4x4 block, 4083 which was inverted as part of the factorization 4084 */ 4085 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 4086 /* First Column */ 4087 SSE_COPY_PS(XMM0,XMM7) 4088 SSE_SHUFFLE(XMM0,XMM0,0x00) 4089 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4090 4091 /* Second Column */ 4092 SSE_COPY_PS(XMM1,XMM7) 4093 SSE_SHUFFLE(XMM1,XMM1,0x55) 4094 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4095 SSE_ADD_PS(XMM0,XMM1) 4096 4097 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4098 4099 /* Third Column */ 4100 SSE_COPY_PS(XMM2,XMM7) 4101 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4102 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4103 SSE_ADD_PS(XMM0,XMM2) 4104 4105 /* Fourth Column */ 4106 SSE_COPY_PS(XMM3,XMM7) 4107 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4108 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4109 SSE_ADD_PS(XMM0,XMM3) 4110 4111 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4112 SSE_INLINE_END_3 4113 4114 /* Promote solution from float to double */ 4115 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 4116 4117 /* Apply reordering to t and stream into x. */ 4118 /* This way, x doesn't pollute the cache. */ 4119 /* Be careful with size: 2 doubles = 4 floats! */ 4120 idc = 4*(*c--); 4121 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 4122 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 4123 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 4124 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 4125 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 4126 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 4127 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 4128 SSE_INLINE_END_2 4129 v = aa + ai16 + 16; 4130 idt -= 4; 4131 } 4132 4133 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4134 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4135 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4136 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4137 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4138 SSE_SCOPE_END; 4139 PetscFunctionReturn(0); 4140 } 4141 4142 #endif 4143 4144 4145 /* 4146 Special case where the matrix was ILU(0) factored in the natural 4147 ordering. This eliminates the need for the column and row permutation. 4148 */ 4149 #undef __FUNCT__ 4150 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 4151 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4152 { 4153 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4154 PetscInt n=a->mbs; 4155 const PetscInt *ai=a->i,*aj=a->j; 4156 PetscErrorCode ierr; 4157 const PetscInt *diag = a->diag; 4158 const MatScalar *aa=a->a; 4159 PetscScalar *x; 4160 const PetscScalar *b; 4161 4162 PetscFunctionBegin; 4163 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4164 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4165 4166 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 4167 { 4168 static PetscScalar w[2000]; /* very BAD need to fix */ 4169 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 4170 } 4171 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 4172 { 4173 static PetscScalar w[2000]; /* very BAD need to fix */ 4174 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 4175 } 4176 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 4177 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 4178 #else 4179 { 4180 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4181 const MatScalar *v; 4182 PetscInt jdx,idt,idx,nz,i,ai16; 4183 const PetscInt *vi; 4184 4185 /* forward solve the lower triangular */ 4186 idx = 0; 4187 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 4188 for (i=1; i<n; i++) { 4189 v = aa + 16*ai[i]; 4190 vi = aj + ai[i]; 4191 nz = diag[i] - ai[i]; 4192 idx += 4; 4193 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4194 while (nz--) { 4195 jdx = 4*(*vi++); 4196 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4197 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4198 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4199 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4200 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4201 v += 16; 4202 } 4203 x[idx] = s1; 4204 x[1+idx] = s2; 4205 x[2+idx] = s3; 4206 x[3+idx] = s4; 4207 } 4208 /* backward solve the upper triangular */ 4209 idt = 4*(n-1); 4210 for (i=n-1; i>=0; i--){ 4211 ai16 = 16*diag[i]; 4212 v = aa + ai16 + 16; 4213 vi = aj + diag[i] + 1; 4214 nz = ai[i+1] - diag[i] - 1; 4215 s1 = x[idt]; s2 = x[1+idt]; 4216 s3 = x[2+idt];s4 = x[3+idt]; 4217 while (nz--) { 4218 idx = 4*(*vi++); 4219 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4220 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4221 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4222 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4223 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4224 v += 16; 4225 } 4226 v = aa + ai16; 4227 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4228 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4229 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4230 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4231 idt -= 4; 4232 } 4233 } 4234 #endif 4235 4236 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4237 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4238 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4239 PetscFunctionReturn(0); 4240 } 4241 4242 #undef __FUNCT__ 4243 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 4244 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4245 { 4246 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4247 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4248 PetscInt i,k,nz,idx,jdx,idt; 4249 PetscErrorCode ierr; 4250 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4251 const MatScalar *aa=a->a,*v; 4252 PetscScalar *x; 4253 const PetscScalar *b; 4254 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4255 4256 PetscFunctionBegin; 4257 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4258 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4259 /* forward solve the lower triangular */ 4260 idx = 0; 4261 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4262 for (i=1; i<n; i++) { 4263 v = aa + bs2*ai[i]; 4264 vi = aj + ai[i]; 4265 nz = ai[i+1] - ai[i]; 4266 idx = bs*i; 4267 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4268 for(k=0;k<nz;k++) { 4269 jdx = bs*vi[k]; 4270 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4271 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4272 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4273 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4274 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4275 4276 v += bs2; 4277 } 4278 4279 x[idx] = s1; 4280 x[1+idx] = s2; 4281 x[2+idx] = s3; 4282 x[3+idx] = s4; 4283 } 4284 4285 /* backward solve the upper triangular */ 4286 for (i=n-1; i>=0; i--){ 4287 v = aa + bs2*(adiag[i+1]+1); 4288 vi = aj + adiag[i+1]+1; 4289 nz = adiag[i] - adiag[i+1]-1; 4290 idt = bs*i; 4291 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4292 4293 for(k=0;k<nz;k++){ 4294 idx = bs*vi[k]; 4295 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4296 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4297 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4298 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4299 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4300 4301 v += bs2; 4302 } 4303 /* x = inv_diagonal*x */ 4304 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4305 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4306 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4307 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4308 4309 } 4310 4311 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4312 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4313 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4314 PetscFunctionReturn(0); 4315 } 4316 4317 #undef __FUNCT__ 4318 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4319 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4320 { 4321 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4322 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4323 PetscErrorCode ierr; 4324 const MatScalar *aa=a->a; 4325 const PetscScalar *b; 4326 PetscScalar *x; 4327 4328 PetscFunctionBegin; 4329 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4330 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4331 4332 { 4333 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4334 const MatScalar *v; 4335 MatScalar *t=(MatScalar *)x; 4336 PetscInt jdx,idt,idx,nz,i,ai16; 4337 const PetscInt *vi; 4338 4339 /* forward solve the lower triangular */ 4340 idx = 0; 4341 t[0] = (MatScalar)b[0]; 4342 t[1] = (MatScalar)b[1]; 4343 t[2] = (MatScalar)b[2]; 4344 t[3] = (MatScalar)b[3]; 4345 for (i=1; i<n; i++) { 4346 v = aa + 16*ai[i]; 4347 vi = aj + ai[i]; 4348 nz = diag[i] - ai[i]; 4349 idx += 4; 4350 s1 = (MatScalar)b[idx]; 4351 s2 = (MatScalar)b[1+idx]; 4352 s3 = (MatScalar)b[2+idx]; 4353 s4 = (MatScalar)b[3+idx]; 4354 while (nz--) { 4355 jdx = 4*(*vi++); 4356 x1 = t[jdx]; 4357 x2 = t[1+jdx]; 4358 x3 = t[2+jdx]; 4359 x4 = t[3+jdx]; 4360 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4361 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4362 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4363 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4364 v += 16; 4365 } 4366 t[idx] = s1; 4367 t[1+idx] = s2; 4368 t[2+idx] = s3; 4369 t[3+idx] = s4; 4370 } 4371 /* backward solve the upper triangular */ 4372 idt = 4*(n-1); 4373 for (i=n-1; i>=0; i--){ 4374 ai16 = 16*diag[i]; 4375 v = aa + ai16 + 16; 4376 vi = aj + diag[i] + 1; 4377 nz = ai[i+1] - diag[i] - 1; 4378 s1 = t[idt]; 4379 s2 = t[1+idt]; 4380 s3 = t[2+idt]; 4381 s4 = t[3+idt]; 4382 while (nz--) { 4383 idx = 4*(*vi++); 4384 x1 = (MatScalar)x[idx]; 4385 x2 = (MatScalar)x[1+idx]; 4386 x3 = (MatScalar)x[2+idx]; 4387 x4 = (MatScalar)x[3+idx]; 4388 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4389 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4390 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4391 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4392 v += 16; 4393 } 4394 v = aa + ai16; 4395 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4396 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4397 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4398 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4399 idt -= 4; 4400 } 4401 } 4402 4403 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4404 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4405 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4406 PetscFunctionReturn(0); 4407 } 4408 4409 #if defined (PETSC_HAVE_SSE) 4410 4411 #include PETSC_HAVE_SSE 4412 #undef __FUNCT__ 4413 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4414 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 4415 { 4416 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4417 unsigned short *aj=(unsigned short *)a->j; 4418 PetscErrorCode ierr; 4419 int *ai=a->i,n=a->mbs,*diag = a->diag; 4420 MatScalar *aa=a->a; 4421 PetscScalar *x,*b; 4422 4423 PetscFunctionBegin; 4424 SSE_SCOPE_BEGIN; 4425 /* 4426 Note: This code currently uses demotion of double 4427 to float when performing the mixed-mode computation. 4428 This may not be numerically reasonable for all applications. 4429 */ 4430 PREFETCH_NTA(aa+16*ai[1]); 4431 4432 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4433 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4434 { 4435 /* x will first be computed in single precision then promoted inplace to double */ 4436 MatScalar *v,*t=(MatScalar *)x; 4437 int nz,i,idt,ai16; 4438 unsigned int jdx,idx; 4439 unsigned short *vi; 4440 /* Forward solve the lower triangular factor. */ 4441 4442 /* First block is the identity. */ 4443 idx = 0; 4444 CONVERT_DOUBLE4_FLOAT4(t,b); 4445 v = aa + 16*((unsigned int)ai[1]); 4446 4447 for (i=1; i<n;) { 4448 PREFETCH_NTA(&v[8]); 4449 vi = aj + ai[i]; 4450 nz = diag[i] - ai[i]; 4451 idx += 4; 4452 4453 /* Demote RHS from double to float. */ 4454 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4455 LOAD_PS(&t[idx],XMM7); 4456 4457 while (nz--) { 4458 PREFETCH_NTA(&v[16]); 4459 jdx = 4*((unsigned int)(*vi++)); 4460 4461 /* 4x4 Matrix-Vector product with negative accumulation: */ 4462 SSE_INLINE_BEGIN_2(&t[jdx],v) 4463 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4464 4465 /* First Column */ 4466 SSE_COPY_PS(XMM0,XMM6) 4467 SSE_SHUFFLE(XMM0,XMM0,0x00) 4468 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4469 SSE_SUB_PS(XMM7,XMM0) 4470 4471 /* Second Column */ 4472 SSE_COPY_PS(XMM1,XMM6) 4473 SSE_SHUFFLE(XMM1,XMM1,0x55) 4474 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4475 SSE_SUB_PS(XMM7,XMM1) 4476 4477 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4478 4479 /* Third Column */ 4480 SSE_COPY_PS(XMM2,XMM6) 4481 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4482 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4483 SSE_SUB_PS(XMM7,XMM2) 4484 4485 /* Fourth Column */ 4486 SSE_COPY_PS(XMM3,XMM6) 4487 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4488 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4489 SSE_SUB_PS(XMM7,XMM3) 4490 SSE_INLINE_END_2 4491 4492 v += 16; 4493 } 4494 v = aa + 16*ai[++i]; 4495 PREFETCH_NTA(v); 4496 STORE_PS(&t[idx],XMM7); 4497 } 4498 4499 /* Backward solve the upper triangular factor.*/ 4500 4501 idt = 4*(n-1); 4502 ai16 = 16*diag[n-1]; 4503 v = aa + ai16 + 16; 4504 for (i=n-1; i>=0;){ 4505 PREFETCH_NTA(&v[8]); 4506 vi = aj + diag[i] + 1; 4507 nz = ai[i+1] - diag[i] - 1; 4508 4509 LOAD_PS(&t[idt],XMM7); 4510 4511 while (nz--) { 4512 PREFETCH_NTA(&v[16]); 4513 idx = 4*((unsigned int)(*vi++)); 4514 4515 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4516 SSE_INLINE_BEGIN_2(&t[idx],v) 4517 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4518 4519 /* First Column */ 4520 SSE_COPY_PS(XMM0,XMM6) 4521 SSE_SHUFFLE(XMM0,XMM0,0x00) 4522 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4523 SSE_SUB_PS(XMM7,XMM0) 4524 4525 /* Second Column */ 4526 SSE_COPY_PS(XMM1,XMM6) 4527 SSE_SHUFFLE(XMM1,XMM1,0x55) 4528 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4529 SSE_SUB_PS(XMM7,XMM1) 4530 4531 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4532 4533 /* Third Column */ 4534 SSE_COPY_PS(XMM2,XMM6) 4535 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4536 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4537 SSE_SUB_PS(XMM7,XMM2) 4538 4539 /* Fourth Column */ 4540 SSE_COPY_PS(XMM3,XMM6) 4541 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4542 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4543 SSE_SUB_PS(XMM7,XMM3) 4544 SSE_INLINE_END_2 4545 v += 16; 4546 } 4547 v = aa + ai16; 4548 ai16 = 16*diag[--i]; 4549 PREFETCH_NTA(aa+ai16+16); 4550 /* 4551 Scale the result by the diagonal 4x4 block, 4552 which was inverted as part of the factorization 4553 */ 4554 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4555 /* First Column */ 4556 SSE_COPY_PS(XMM0,XMM7) 4557 SSE_SHUFFLE(XMM0,XMM0,0x00) 4558 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4559 4560 /* Second Column */ 4561 SSE_COPY_PS(XMM1,XMM7) 4562 SSE_SHUFFLE(XMM1,XMM1,0x55) 4563 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4564 SSE_ADD_PS(XMM0,XMM1) 4565 4566 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4567 4568 /* Third Column */ 4569 SSE_COPY_PS(XMM2,XMM7) 4570 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4571 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4572 SSE_ADD_PS(XMM0,XMM2) 4573 4574 /* Fourth Column */ 4575 SSE_COPY_PS(XMM3,XMM7) 4576 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4577 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4578 SSE_ADD_PS(XMM0,XMM3) 4579 4580 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4581 SSE_INLINE_END_3 4582 4583 v = aa + ai16 + 16; 4584 idt -= 4; 4585 } 4586 4587 /* Convert t from single precision back to double precision (inplace)*/ 4588 idt = 4*(n-1); 4589 for (i=n-1;i>=0;i--) { 4590 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4591 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4592 PetscScalar *xtemp=&x[idt]; 4593 MatScalar *ttemp=&t[idt]; 4594 xtemp[3] = (PetscScalar)ttemp[3]; 4595 xtemp[2] = (PetscScalar)ttemp[2]; 4596 xtemp[1] = (PetscScalar)ttemp[1]; 4597 xtemp[0] = (PetscScalar)ttemp[0]; 4598 idt -= 4; 4599 } 4600 4601 } /* End of artificial scope. */ 4602 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4603 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4604 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4605 SSE_SCOPE_END; 4606 PetscFunctionReturn(0); 4607 } 4608 4609 #undef __FUNCT__ 4610 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4611 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 4612 { 4613 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4614 int *aj=a->j; 4615 PetscErrorCode ierr; 4616 int *ai=a->i,n=a->mbs,*diag = a->diag; 4617 MatScalar *aa=a->a; 4618 PetscScalar *x,*b; 4619 4620 PetscFunctionBegin; 4621 SSE_SCOPE_BEGIN; 4622 /* 4623 Note: This code currently uses demotion of double 4624 to float when performing the mixed-mode computation. 4625 This may not be numerically reasonable for all applications. 4626 */ 4627 PREFETCH_NTA(aa+16*ai[1]); 4628 4629 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4630 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4631 { 4632 /* x will first be computed in single precision then promoted inplace to double */ 4633 MatScalar *v,*t=(MatScalar *)x; 4634 int nz,i,idt,ai16; 4635 int jdx,idx; 4636 int *vi; 4637 /* Forward solve the lower triangular factor. */ 4638 4639 /* First block is the identity. */ 4640 idx = 0; 4641 CONVERT_DOUBLE4_FLOAT4(t,b); 4642 v = aa + 16*ai[1]; 4643 4644 for (i=1; i<n;) { 4645 PREFETCH_NTA(&v[8]); 4646 vi = aj + ai[i]; 4647 nz = diag[i] - ai[i]; 4648 idx += 4; 4649 4650 /* Demote RHS from double to float. */ 4651 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4652 LOAD_PS(&t[idx],XMM7); 4653 4654 while (nz--) { 4655 PREFETCH_NTA(&v[16]); 4656 jdx = 4*(*vi++); 4657 /* jdx = *vi++; */ 4658 4659 /* 4x4 Matrix-Vector product with negative accumulation: */ 4660 SSE_INLINE_BEGIN_2(&t[jdx],v) 4661 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4662 4663 /* First Column */ 4664 SSE_COPY_PS(XMM0,XMM6) 4665 SSE_SHUFFLE(XMM0,XMM0,0x00) 4666 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4667 SSE_SUB_PS(XMM7,XMM0) 4668 4669 /* Second Column */ 4670 SSE_COPY_PS(XMM1,XMM6) 4671 SSE_SHUFFLE(XMM1,XMM1,0x55) 4672 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4673 SSE_SUB_PS(XMM7,XMM1) 4674 4675 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4676 4677 /* Third Column */ 4678 SSE_COPY_PS(XMM2,XMM6) 4679 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4680 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4681 SSE_SUB_PS(XMM7,XMM2) 4682 4683 /* Fourth Column */ 4684 SSE_COPY_PS(XMM3,XMM6) 4685 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4686 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4687 SSE_SUB_PS(XMM7,XMM3) 4688 SSE_INLINE_END_2 4689 4690 v += 16; 4691 } 4692 v = aa + 16*ai[++i]; 4693 PREFETCH_NTA(v); 4694 STORE_PS(&t[idx],XMM7); 4695 } 4696 4697 /* Backward solve the upper triangular factor.*/ 4698 4699 idt = 4*(n-1); 4700 ai16 = 16*diag[n-1]; 4701 v = aa + ai16 + 16; 4702 for (i=n-1; i>=0;){ 4703 PREFETCH_NTA(&v[8]); 4704 vi = aj + diag[i] + 1; 4705 nz = ai[i+1] - diag[i] - 1; 4706 4707 LOAD_PS(&t[idt],XMM7); 4708 4709 while (nz--) { 4710 PREFETCH_NTA(&v[16]); 4711 idx = 4*(*vi++); 4712 /* idx = *vi++; */ 4713 4714 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4715 SSE_INLINE_BEGIN_2(&t[idx],v) 4716 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4717 4718 /* First Column */ 4719 SSE_COPY_PS(XMM0,XMM6) 4720 SSE_SHUFFLE(XMM0,XMM0,0x00) 4721 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4722 SSE_SUB_PS(XMM7,XMM0) 4723 4724 /* Second Column */ 4725 SSE_COPY_PS(XMM1,XMM6) 4726 SSE_SHUFFLE(XMM1,XMM1,0x55) 4727 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4728 SSE_SUB_PS(XMM7,XMM1) 4729 4730 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4731 4732 /* Third Column */ 4733 SSE_COPY_PS(XMM2,XMM6) 4734 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4735 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4736 SSE_SUB_PS(XMM7,XMM2) 4737 4738 /* Fourth Column */ 4739 SSE_COPY_PS(XMM3,XMM6) 4740 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4741 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4742 SSE_SUB_PS(XMM7,XMM3) 4743 SSE_INLINE_END_2 4744 v += 16; 4745 } 4746 v = aa + ai16; 4747 ai16 = 16*diag[--i]; 4748 PREFETCH_NTA(aa+ai16+16); 4749 /* 4750 Scale the result by the diagonal 4x4 block, 4751 which was inverted as part of the factorization 4752 */ 4753 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4754 /* First Column */ 4755 SSE_COPY_PS(XMM0,XMM7) 4756 SSE_SHUFFLE(XMM0,XMM0,0x00) 4757 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4758 4759 /* Second Column */ 4760 SSE_COPY_PS(XMM1,XMM7) 4761 SSE_SHUFFLE(XMM1,XMM1,0x55) 4762 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4763 SSE_ADD_PS(XMM0,XMM1) 4764 4765 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4766 4767 /* Third Column */ 4768 SSE_COPY_PS(XMM2,XMM7) 4769 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4770 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4771 SSE_ADD_PS(XMM0,XMM2) 4772 4773 /* Fourth Column */ 4774 SSE_COPY_PS(XMM3,XMM7) 4775 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4776 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4777 SSE_ADD_PS(XMM0,XMM3) 4778 4779 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4780 SSE_INLINE_END_3 4781 4782 v = aa + ai16 + 16; 4783 idt -= 4; 4784 } 4785 4786 /* Convert t from single precision back to double precision (inplace)*/ 4787 idt = 4*(n-1); 4788 for (i=n-1;i>=0;i--) { 4789 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4790 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4791 PetscScalar *xtemp=&x[idt]; 4792 MatScalar *ttemp=&t[idt]; 4793 xtemp[3] = (PetscScalar)ttemp[3]; 4794 xtemp[2] = (PetscScalar)ttemp[2]; 4795 xtemp[1] = (PetscScalar)ttemp[1]; 4796 xtemp[0] = (PetscScalar)ttemp[0]; 4797 idt -= 4; 4798 } 4799 4800 } /* End of artificial scope. */ 4801 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4802 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4803 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4804 SSE_SCOPE_END; 4805 PetscFunctionReturn(0); 4806 } 4807 4808 #endif 4809 4810 #undef __FUNCT__ 4811 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 4812 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 4813 { 4814 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4815 IS iscol=a->col,isrow=a->row; 4816 PetscErrorCode ierr; 4817 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4818 PetscInt i,nz,idx,idt,idc; 4819 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4820 const MatScalar *aa=a->a,*v; 4821 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4822 const PetscScalar *b; 4823 4824 PetscFunctionBegin; 4825 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4826 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4827 t = a->solve_work; 4828 4829 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4830 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4831 4832 /* forward solve the lower triangular */ 4833 idx = 3*(*r++); 4834 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4835 for (i=1; i<n; i++) { 4836 v = aa + 9*ai[i]; 4837 vi = aj + ai[i]; 4838 nz = diag[i] - ai[i]; 4839 idx = 3*(*r++); 4840 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4841 while (nz--) { 4842 idx = 3*(*vi++); 4843 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4844 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4845 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4846 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4847 v += 9; 4848 } 4849 idx = 3*i; 4850 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4851 } 4852 /* backward solve the upper triangular */ 4853 for (i=n-1; i>=0; i--){ 4854 v = aa + 9*diag[i] + 9; 4855 vi = aj + diag[i] + 1; 4856 nz = ai[i+1] - diag[i] - 1; 4857 idt = 3*i; 4858 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4859 while (nz--) { 4860 idx = 3*(*vi++); 4861 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4862 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4863 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4864 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4865 v += 9; 4866 } 4867 idc = 3*(*c--); 4868 v = aa + 9*diag[i]; 4869 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4870 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4871 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4872 } 4873 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4874 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4875 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4876 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4877 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4878 PetscFunctionReturn(0); 4879 } 4880 4881 #undef __FUNCT__ 4882 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4883 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4884 { 4885 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4886 IS iscol=a->col,isrow=a->row; 4887 PetscErrorCode ierr; 4888 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4889 PetscInt i,nz,idx,idt,idc,m; 4890 const PetscInt *r,*c,*rout,*cout; 4891 const MatScalar *aa=a->a,*v; 4892 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4893 const PetscScalar *b; 4894 4895 PetscFunctionBegin; 4896 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4897 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4898 t = a->solve_work; 4899 4900 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4901 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4902 4903 /* forward solve the lower triangular */ 4904 idx = 3*r[0]; 4905 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4906 for (i=1; i<n; i++) { 4907 v = aa + 9*ai[i]; 4908 vi = aj + ai[i]; 4909 nz = ai[i+1] - ai[i]; 4910 idx = 3*r[i]; 4911 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4912 for(m=0;m<nz;m++){ 4913 idx = 3*vi[m]; 4914 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4915 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4916 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4917 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4918 v += 9; 4919 } 4920 idx = 3*i; 4921 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4922 } 4923 /* backward solve the upper triangular */ 4924 for (i=n-1; i>=0; i--){ 4925 v = aa + 9*(adiag[i+1]+1); 4926 vi = aj + adiag[i+1]+1; 4927 nz = adiag[i] - adiag[i+1] - 1; 4928 idt = 3*i; 4929 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4930 for(m=0;m<nz;m++){ 4931 idx = 3*vi[m]; 4932 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4933 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4934 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4935 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4936 v += 9; 4937 } 4938 idc = 3*c[i]; 4939 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4940 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4941 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4942 } 4943 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4944 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4945 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4946 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4947 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4948 PetscFunctionReturn(0); 4949 } 4950 4951 /* 4952 Special case where the matrix was ILU(0) factored in the natural 4953 ordering. This eliminates the need for the column and row permutation. 4954 */ 4955 #undef __FUNCT__ 4956 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 4957 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4958 { 4959 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4960 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4961 PetscErrorCode ierr; 4962 const PetscInt *diag = a->diag,*vi; 4963 const MatScalar *aa=a->a,*v; 4964 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4965 const PetscScalar *b; 4966 PetscInt jdx,idt,idx,nz,i; 4967 4968 PetscFunctionBegin; 4969 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4970 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4971 4972 /* forward solve the lower triangular */ 4973 idx = 0; 4974 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4975 for (i=1; i<n; i++) { 4976 v = aa + 9*ai[i]; 4977 vi = aj + ai[i]; 4978 nz = diag[i] - ai[i]; 4979 idx += 3; 4980 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4981 while (nz--) { 4982 jdx = 3*(*vi++); 4983 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4984 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4985 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4986 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4987 v += 9; 4988 } 4989 x[idx] = s1; 4990 x[1+idx] = s2; 4991 x[2+idx] = s3; 4992 } 4993 /* backward solve the upper triangular */ 4994 for (i=n-1; i>=0; i--){ 4995 v = aa + 9*diag[i] + 9; 4996 vi = aj + diag[i] + 1; 4997 nz = ai[i+1] - diag[i] - 1; 4998 idt = 3*i; 4999 s1 = x[idt]; s2 = x[1+idt]; 5000 s3 = x[2+idt]; 5001 while (nz--) { 5002 idx = 3*(*vi++); 5003 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 5004 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5005 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5006 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5007 v += 9; 5008 } 5009 v = aa + 9*diag[i]; 5010 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5011 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5012 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5013 } 5014 5015 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5016 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5017 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 5018 PetscFunctionReturn(0); 5019 } 5020 5021 #undef __FUNCT__ 5022 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 5023 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 5024 { 5025 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5026 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5027 PetscErrorCode ierr; 5028 PetscInt i,k,nz,idx,jdx,idt; 5029 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 5030 const MatScalar *aa=a->a,*v; 5031 PetscScalar *x; 5032 const PetscScalar *b; 5033 PetscScalar s1,s2,s3,x1,x2,x3; 5034 5035 PetscFunctionBegin; 5036 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5037 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5038 /* forward solve the lower triangular */ 5039 idx = 0; 5040 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 5041 for (i=1; i<n; i++) { 5042 v = aa + bs2*ai[i]; 5043 vi = aj + ai[i]; 5044 nz = ai[i+1] - ai[i]; 5045 idx = bs*i; 5046 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 5047 for(k=0;k<nz;k++){ 5048 jdx = bs*vi[k]; 5049 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 5050 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5051 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5052 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5053 5054 v += bs2; 5055 } 5056 5057 x[idx] = s1; 5058 x[1+idx] = s2; 5059 x[2+idx] = s3; 5060 } 5061 5062 /* backward solve the upper triangular */ 5063 for (i=n-1; i>=0; i--){ 5064 v = aa + bs2*(adiag[i+1]+1); 5065 vi = aj + adiag[i+1]+1; 5066 nz = adiag[i] - adiag[i+1]-1; 5067 idt = bs*i; 5068 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 5069 5070 for(k=0;k<nz;k++){ 5071 idx = bs*vi[k]; 5072 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 5073 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5074 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5075 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5076 5077 v += bs2; 5078 } 5079 /* x = inv_diagonal*x */ 5080 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5081 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5082 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5083 5084 } 5085 5086 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5087 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5088 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5089 PetscFunctionReturn(0); 5090 } 5091 5092 #undef __FUNCT__ 5093 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 5094 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 5095 { 5096 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5097 IS iscol=a->col,isrow=a->row; 5098 PetscErrorCode ierr; 5099 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5100 PetscInt i,nz,idx,idt,idc; 5101 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5102 const MatScalar *aa=a->a,*v; 5103 PetscScalar *x,s1,s2,x1,x2,*t; 5104 const PetscScalar *b; 5105 5106 PetscFunctionBegin; 5107 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5108 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5109 t = a->solve_work; 5110 5111 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5112 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5113 5114 /* forward solve the lower triangular */ 5115 idx = 2*(*r++); 5116 t[0] = b[idx]; t[1] = b[1+idx]; 5117 for (i=1; i<n; i++) { 5118 v = aa + 4*ai[i]; 5119 vi = aj + ai[i]; 5120 nz = diag[i] - ai[i]; 5121 idx = 2*(*r++); 5122 s1 = b[idx]; s2 = b[1+idx]; 5123 while (nz--) { 5124 idx = 2*(*vi++); 5125 x1 = t[idx]; x2 = t[1+idx]; 5126 s1 -= v[0]*x1 + v[2]*x2; 5127 s2 -= v[1]*x1 + v[3]*x2; 5128 v += 4; 5129 } 5130 idx = 2*i; 5131 t[idx] = s1; t[1+idx] = s2; 5132 } 5133 /* backward solve the upper triangular */ 5134 for (i=n-1; i>=0; i--){ 5135 v = aa + 4*diag[i] + 4; 5136 vi = aj + diag[i] + 1; 5137 nz = ai[i+1] - diag[i] - 1; 5138 idt = 2*i; 5139 s1 = t[idt]; s2 = t[1+idt]; 5140 while (nz--) { 5141 idx = 2*(*vi++); 5142 x1 = t[idx]; x2 = t[1+idx]; 5143 s1 -= v[0]*x1 + v[2]*x2; 5144 s2 -= v[1]*x1 + v[3]*x2; 5145 v += 4; 5146 } 5147 idc = 2*(*c--); 5148 v = aa + 4*diag[i]; 5149 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5150 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5151 } 5152 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5153 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5154 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5155 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5156 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5157 PetscFunctionReturn(0); 5158 } 5159 5160 #undef __FUNCT__ 5161 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 5162 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 5163 { 5164 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5165 IS iscol=a->col,isrow=a->row; 5166 PetscErrorCode ierr; 5167 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5168 PetscInt i,nz,idx,jdx,idt,idc,m; 5169 const PetscInt *r,*c,*rout,*cout; 5170 const MatScalar *aa=a->a,*v; 5171 PetscScalar *x,s1,s2,x1,x2,*t; 5172 const PetscScalar *b; 5173 5174 PetscFunctionBegin; 5175 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5176 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5177 t = a->solve_work; 5178 5179 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5180 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5181 5182 /* forward solve the lower triangular */ 5183 idx = 2*r[0]; 5184 t[0] = b[idx]; t[1] = b[1+idx]; 5185 for (i=1; i<n; i++) { 5186 v = aa + 4*ai[i]; 5187 vi = aj + ai[i]; 5188 nz = ai[i+1] - ai[i]; 5189 idx = 2*r[i]; 5190 s1 = b[idx]; s2 = b[1+idx]; 5191 for(m=0;m<nz;m++){ 5192 jdx = 2*vi[m]; 5193 x1 = t[jdx]; x2 = t[1+jdx]; 5194 s1 -= v[0]*x1 + v[2]*x2; 5195 s2 -= v[1]*x1 + v[3]*x2; 5196 v += 4; 5197 } 5198 idx = 2*i; 5199 t[idx] = s1; t[1+idx] = s2; 5200 } 5201 /* backward solve the upper triangular */ 5202 for (i=n-1; i>=0; i--){ 5203 v = aa + 4*(adiag[i+1]+1); 5204 vi = aj + adiag[i+1]+1; 5205 nz = adiag[i] - adiag[i+1] - 1; 5206 idt = 2*i; 5207 s1 = t[idt]; s2 = t[1+idt]; 5208 for(m=0;m<nz;m++){ 5209 idx = 2*vi[m]; 5210 x1 = t[idx]; x2 = t[1+idx]; 5211 s1 -= v[0]*x1 + v[2]*x2; 5212 s2 -= v[1]*x1 + v[3]*x2; 5213 v += 4; 5214 } 5215 idc = 2*c[i]; 5216 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5217 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5218 } 5219 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5220 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5221 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5222 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5223 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5224 PetscFunctionReturn(0); 5225 } 5226 5227 /* 5228 Special case where the matrix was ILU(0) factored in the natural 5229 ordering. This eliminates the need for the column and row permutation. 5230 */ 5231 #undef __FUNCT__ 5232 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 5233 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5234 { 5235 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5236 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5237 PetscErrorCode ierr; 5238 const MatScalar *aa=a->a,*v; 5239 PetscScalar *x,s1,s2,x1,x2; 5240 const PetscScalar *b; 5241 PetscInt jdx,idt,idx,nz,i; 5242 5243 PetscFunctionBegin; 5244 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5245 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5246 5247 /* forward solve the lower triangular */ 5248 idx = 0; 5249 x[0] = b[0]; x[1] = b[1]; 5250 for (i=1; i<n; i++) { 5251 v = aa + 4*ai[i]; 5252 vi = aj + ai[i]; 5253 nz = diag[i] - ai[i]; 5254 idx += 2; 5255 s1 = b[idx];s2 = b[1+idx]; 5256 while (nz--) { 5257 jdx = 2*(*vi++); 5258 x1 = x[jdx];x2 = x[1+jdx]; 5259 s1 -= v[0]*x1 + v[2]*x2; 5260 s2 -= v[1]*x1 + v[3]*x2; 5261 v += 4; 5262 } 5263 x[idx] = s1; 5264 x[1+idx] = s2; 5265 } 5266 /* backward solve the upper triangular */ 5267 for (i=n-1; i>=0; i--){ 5268 v = aa + 4*diag[i] + 4; 5269 vi = aj + diag[i] + 1; 5270 nz = ai[i+1] - diag[i] - 1; 5271 idt = 2*i; 5272 s1 = x[idt]; s2 = x[1+idt]; 5273 while (nz--) { 5274 idx = 2*(*vi++); 5275 x1 = x[idx]; x2 = x[1+idx]; 5276 s1 -= v[0]*x1 + v[2]*x2; 5277 s2 -= v[1]*x1 + v[3]*x2; 5278 v += 4; 5279 } 5280 v = aa + 4*diag[i]; 5281 x[idt] = v[0]*s1 + v[2]*s2; 5282 x[1+idt] = v[1]*s1 + v[3]*s2; 5283 } 5284 5285 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5286 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5287 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5288 PetscFunctionReturn(0); 5289 } 5290 5291 #undef __FUNCT__ 5292 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 5293 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5294 { 5295 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5296 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5297 PetscInt i,k,nz,idx,idt,jdx; 5298 PetscErrorCode ierr; 5299 const MatScalar *aa=a->a,*v; 5300 PetscScalar *x,s1,s2,x1,x2; 5301 const PetscScalar *b; 5302 5303 PetscFunctionBegin; 5304 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5305 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5306 /* forward solve the lower triangular */ 5307 idx = 0; 5308 x[0] = b[idx]; x[1] = b[1+idx]; 5309 for (i=1; i<n; i++) { 5310 v = aa + 4*ai[i]; 5311 vi = aj + ai[i]; 5312 nz = ai[i+1] - ai[i]; 5313 idx = 2*i; 5314 s1 = b[idx];s2 = b[1+idx]; 5315 for(k=0;k<nz;k++){ 5316 jdx = 2*vi[k]; 5317 x1 = x[jdx];x2 = x[1+jdx]; 5318 s1 -= v[0]*x1 + v[2]*x2; 5319 s2 -= v[1]*x1 + v[3]*x2; 5320 v += 4; 5321 } 5322 x[idx] = s1; 5323 x[1+idx] = s2; 5324 } 5325 5326 /* backward solve the upper triangular */ 5327 for (i=n-1; i>=0; i--){ 5328 v = aa + 4*(adiag[i+1]+1); 5329 vi = aj + adiag[i+1]+1; 5330 nz = adiag[i] - adiag[i+1]-1; 5331 idt = 2*i; 5332 s1 = x[idt]; s2 = x[1+idt]; 5333 for(k=0;k<nz;k++){ 5334 idx = 2*vi[k]; 5335 x1 = x[idx]; x2 = x[1+idx]; 5336 s1 -= v[0]*x1 + v[2]*x2; 5337 s2 -= v[1]*x1 + v[3]*x2; 5338 v += 4; 5339 } 5340 /* x = inv_diagonal*x */ 5341 x[idt] = v[0]*s1 + v[2]*s2; 5342 x[1+idt] = v[1]*s1 + v[3]*s2; 5343 } 5344 5345 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5346 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5347 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5348 PetscFunctionReturn(0); 5349 } 5350 5351 #undef __FUNCT__ 5352 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 5353 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 5354 { 5355 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5356 IS iscol=a->col,isrow=a->row; 5357 PetscErrorCode ierr; 5358 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5359 PetscInt i,nz; 5360 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5361 const MatScalar *aa=a->a,*v; 5362 PetscScalar *x,s1,*t; 5363 const PetscScalar *b; 5364 5365 PetscFunctionBegin; 5366 if (!n) PetscFunctionReturn(0); 5367 5368 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5369 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5370 t = a->solve_work; 5371 5372 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5373 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5374 5375 /* forward solve the lower triangular */ 5376 t[0] = b[*r++]; 5377 for (i=1; i<n; i++) { 5378 v = aa + ai[i]; 5379 vi = aj + ai[i]; 5380 nz = diag[i] - ai[i]; 5381 s1 = b[*r++]; 5382 while (nz--) { 5383 s1 -= (*v++)*t[*vi++]; 5384 } 5385 t[i] = s1; 5386 } 5387 /* backward solve the upper triangular */ 5388 for (i=n-1; i>=0; i--){ 5389 v = aa + diag[i] + 1; 5390 vi = aj + diag[i] + 1; 5391 nz = ai[i+1] - diag[i] - 1; 5392 s1 = t[i]; 5393 while (nz--) { 5394 s1 -= (*v++)*t[*vi++]; 5395 } 5396 x[*c--] = t[i] = aa[diag[i]]*s1; 5397 } 5398 5399 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5400 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5401 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5402 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5403 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5404 PetscFunctionReturn(0); 5405 } 5406 5407 #undef __FUNCT__ 5408 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 5409 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 5410 { 5411 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5412 IS iscol = a->col,isrow = a->row; 5413 PetscErrorCode ierr; 5414 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5415 const PetscInt *rout,*cout,*r,*c; 5416 PetscScalar *x,*tmp,sum; 5417 const PetscScalar *b; 5418 const MatScalar *aa = a->a,*v; 5419 5420 PetscFunctionBegin; 5421 if (!n) PetscFunctionReturn(0); 5422 5423 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5424 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5425 tmp = a->solve_work; 5426 5427 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5428 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5429 5430 /* forward solve the lower triangular */ 5431 tmp[0] = b[r[0]]; 5432 v = aa; 5433 vi = aj; 5434 for (i=1; i<n; i++) { 5435 nz = ai[i+1] - ai[i]; 5436 sum = b[r[i]]; 5437 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5438 tmp[i] = sum; 5439 v += nz; vi += nz; 5440 } 5441 5442 /* backward solve the upper triangular */ 5443 for (i=n-1; i>=0; i--){ 5444 v = aa + adiag[i+1]+1; 5445 vi = aj + adiag[i+1]+1; 5446 nz = adiag[i]-adiag[i+1]-1; 5447 sum = tmp[i]; 5448 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5449 x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5450 } 5451 5452 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5453 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5454 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5455 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5456 ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5457 PetscFunctionReturn(0); 5458 } 5459 5460 /* 5461 Special case where the matrix was ILU(0) factored in the natural 5462 ordering. This eliminates the need for the column and row permutation. 5463 */ 5464 #undef __FUNCT__ 5465 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 5466 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5467 { 5468 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5469 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5470 PetscErrorCode ierr; 5471 const MatScalar *aa=a->a,*v; 5472 PetscScalar *x; 5473 const PetscScalar *b; 5474 PetscScalar s1,x1; 5475 PetscInt jdx,idt,idx,nz,i; 5476 5477 PetscFunctionBegin; 5478 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5479 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5480 5481 /* forward solve the lower triangular */ 5482 idx = 0; 5483 x[0] = b[0]; 5484 for (i=1; i<n; i++) { 5485 v = aa + ai[i]; 5486 vi = aj + ai[i]; 5487 nz = diag[i] - ai[i]; 5488 idx += 1; 5489 s1 = b[idx]; 5490 while (nz--) { 5491 jdx = *vi++; 5492 x1 = x[jdx]; 5493 s1 -= v[0]*x1; 5494 v += 1; 5495 } 5496 x[idx] = s1; 5497 } 5498 /* backward solve the upper triangular */ 5499 for (i=n-1; i>=0; i--){ 5500 v = aa + diag[i] + 1; 5501 vi = aj + diag[i] + 1; 5502 nz = ai[i+1] - diag[i] - 1; 5503 idt = i; 5504 s1 = x[idt]; 5505 while (nz--) { 5506 idx = *vi++; 5507 x1 = x[idx]; 5508 s1 -= v[0]*x1; 5509 v += 1; 5510 } 5511 v = aa + diag[i]; 5512 x[idt] = v[0]*s1; 5513 } 5514 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5515 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5516 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5517 PetscFunctionReturn(0); 5518 } 5519 5520 5521 #undef __FUNCT__ 5522 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5523 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5524 { 5525 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5526 PetscErrorCode ierr; 5527 const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5528 PetscScalar *x,sum; 5529 const PetscScalar *b; 5530 const MatScalar *aa = a->a,*v; 5531 PetscInt i,nz; 5532 5533 PetscFunctionBegin; 5534 if (!n) PetscFunctionReturn(0); 5535 5536 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5537 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5538 5539 /* forward solve the lower triangular */ 5540 x[0] = b[0]; 5541 v = aa; 5542 vi = aj; 5543 for (i=1; i<n; i++) { 5544 nz = ai[i+1] - ai[i]; 5545 sum = b[i]; 5546 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5547 v += nz; 5548 vi += nz; 5549 x[i] = sum; 5550 } 5551 5552 /* backward solve the upper triangular */ 5553 for (i=n-1; i>=0; i--){ 5554 v = aa + adiag[i+1] + 1; 5555 vi = aj + adiag[i+1] + 1; 5556 nz = adiag[i] - adiag[i+1]-1; 5557 sum = x[i]; 5558 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5559 x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5560 } 5561 5562 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 5563 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5564 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5565 PetscFunctionReturn(0); 5566 } 5567 5568 /* ----------------------------------------------------------------*/ 5569 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool ); 5570 5571 #undef __FUNCT__ 5572 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5573 /* 5574 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5575 */ 5576 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 5577 { 5578 Mat C=B; 5579 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5580 PetscErrorCode ierr; 5581 PetscInt i,j,k,ipvt[15]; 5582 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5583 PetscInt nz,nzL,row; 5584 MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5585 const MatScalar *v,*aa=a->a; 5586 PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 5587 PetscInt sol_ver; 5588 5589 PetscFunctionBegin; 5590 5591 ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 5592 5593 /* generate work space needed by the factorization */ 5594 ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 5595 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5596 5597 for (i=0; i<n; i++){ 5598 /* zero rtmp */ 5599 /* L part */ 5600 nz = bi[i+1] - bi[i]; 5601 bjtmp = bj + bi[i]; 5602 for (j=0; j<nz; j++){ 5603 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5604 } 5605 5606 /* U part */ 5607 nz = bdiag[i] - bdiag[i+1]; 5608 bjtmp = bj + bdiag[i+1]+1; 5609 for (j=0; j<nz; j++){ 5610 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5611 } 5612 5613 /* load in initial (unfactored row) */ 5614 nz = ai[i+1] - ai[i]; 5615 ajtmp = aj + ai[i]; 5616 v = aa + bs2*ai[i]; 5617 for (j=0; j<nz; j++) { 5618 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5619 } 5620 5621 /* elimination */ 5622 bjtmp = bj + bi[i]; 5623 nzL = bi[i+1] - bi[i]; 5624 for(k=0;k < nzL;k++) { 5625 row = bjtmp[k]; 5626 pc = rtmp + bs2*row; 5627 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5628 if (flg) { 5629 pv = b->a + bs2*bdiag[row]; 5630 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); 5631 /*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 5632 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5633 pv = b->a + bs2*(bdiag[row+1]+1); 5634 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5635 for (j=0; j<nz; j++) { 5636 vv = rtmp + bs2*pj[j]; 5637 Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 5638 /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 5639 pv += bs2; 5640 } 5641 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5642 } 5643 } 5644 5645 /* finished row so stick it into b->a */ 5646 /* L part */ 5647 pv = b->a + bs2*bi[i] ; 5648 pj = b->j + bi[i] ; 5649 nz = bi[i+1] - bi[i]; 5650 for (j=0; j<nz; j++) { 5651 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5652 } 5653 5654 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5655 pv = b->a + bs2*bdiag[i]; 5656 pj = b->j + bdiag[i]; 5657 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5658 /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */ 5659 ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 5660 5661 /* U part */ 5662 pv = b->a + bs2*(bdiag[i+1]+1); 5663 pj = b->j + bdiag[i+1]+1; 5664 nz = bdiag[i] - bdiag[i+1] - 1; 5665 for (j=0; j<nz; j++){ 5666 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5667 } 5668 } 5669 5670 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5671 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5672 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 5673 C->assembled = PETSC_TRUE; 5674 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5675 PetscFunctionReturn(0); 5676 } 5677 5678 #undef __FUNCT__ 5679 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 5680 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 5681 { 5682 Mat C=B; 5683 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5684 IS isrow = b->row,isicol = b->icol; 5685 PetscErrorCode ierr; 5686 const PetscInt *r,*ic; 5687 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5688 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5689 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5690 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5691 MatScalar *v_work; 5692 PetscBool col_identity,row_identity,both_identity; 5693 5694 PetscFunctionBegin; 5695 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5696 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5697 5698 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5699 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5700 5701 /* generate work space needed by dense LU factorization */ 5702 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5703 5704 for (i=0; i<n; i++){ 5705 /* zero rtmp */ 5706 /* L part */ 5707 nz = bi[i+1] - bi[i]; 5708 bjtmp = bj + bi[i]; 5709 for (j=0; j<nz; j++){ 5710 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5711 } 5712 5713 /* U part */ 5714 nz = bdiag[i] - bdiag[i+1]; 5715 bjtmp = bj + bdiag[i+1]+1; 5716 for (j=0; j<nz; j++){ 5717 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5718 } 5719 5720 /* load in initial (unfactored row) */ 5721 nz = ai[r[i]+1] - ai[r[i]]; 5722 ajtmp = aj + ai[r[i]]; 5723 v = aa + bs2*ai[r[i]]; 5724 for (j=0; j<nz; j++) { 5725 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5726 } 5727 5728 /* elimination */ 5729 bjtmp = bj + bi[i]; 5730 nzL = bi[i+1] - bi[i]; 5731 for(k=0;k < nzL;k++) { 5732 row = bjtmp[k]; 5733 pc = rtmp + bs2*row; 5734 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5735 if (flg) { 5736 pv = b->a + bs2*bdiag[row]; 5737 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5738 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5739 pv = b->a + bs2*(bdiag[row+1]+1); 5740 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5741 for (j=0; j<nz; j++) { 5742 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5743 } 5744 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5745 } 5746 } 5747 5748 /* finished row so stick it into b->a */ 5749 /* L part */ 5750 pv = b->a + bs2*bi[i] ; 5751 pj = b->j + bi[i] ; 5752 nz = bi[i+1] - bi[i]; 5753 for (j=0; j<nz; j++) { 5754 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5755 } 5756 5757 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5758 pv = b->a + bs2*bdiag[i]; 5759 pj = b->j + bdiag[i]; 5760 /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5761 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5762 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5763 5764 /* U part */ 5765 pv = b->a + bs2*(bdiag[i+1]+1); 5766 pj = b->j + bdiag[i+1]+1; 5767 nz = bdiag[i] - bdiag[i+1] - 1; 5768 for (j=0; j<nz; j++){ 5769 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5770 } 5771 } 5772 5773 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5774 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5775 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5776 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5777 5778 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5779 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5780 both_identity = (PetscBool) (row_identity && col_identity); 5781 if (both_identity){ 5782 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5783 } else { 5784 C->ops->solve = MatSolve_SeqBAIJ_N; 5785 } 5786 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5787 5788 C->assembled = PETSC_TRUE; 5789 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5790 PetscFunctionReturn(0); 5791 } 5792 5793 /* 5794 ilu(0) with natural ordering under new data structure. 5795 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 5796 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 5797 */ 5798 5799 #undef __FUNCT__ 5800 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 5801 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5802 { 5803 5804 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5805 PetscErrorCode ierr; 5806 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5807 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5808 5809 PetscFunctionBegin; 5810 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5811 b = (Mat_SeqBAIJ*)(fact)->data; 5812 5813 /* allocate matrix arrays for new data structure */ 5814 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5815 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5816 b->singlemalloc = PETSC_TRUE; 5817 if (!b->diag){ 5818 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5819 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5820 } 5821 bdiag = b->diag; 5822 5823 if (n > 0) { 5824 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5825 } 5826 5827 /* set bi and bj with new data structure */ 5828 bi = b->i; 5829 bj = b->j; 5830 5831 /* L part */ 5832 bi[0] = 0; 5833 for (i=0; i<n; i++){ 5834 nz = adiag[i] - ai[i]; 5835 bi[i+1] = bi[i] + nz; 5836 aj = a->j + ai[i]; 5837 for (j=0; j<nz; j++){ 5838 *bj = aj[j]; bj++; 5839 } 5840 } 5841 5842 /* U part */ 5843 bi_temp = bi[n]; 5844 bdiag[n] = bi[n]-1; 5845 for (i=n-1; i>=0; i--){ 5846 nz = ai[i+1] - adiag[i] - 1; 5847 bi_temp = bi_temp + nz + 1; 5848 aj = a->j + adiag[i] + 1; 5849 for (j=0; j<nz; j++){ 5850 *bj = aj[j]; bj++; 5851 } 5852 /* diag[i] */ 5853 *bj = i; bj++; 5854 bdiag[i] = bi_temp - 1; 5855 } 5856 PetscFunctionReturn(0); 5857 } 5858 5859 #undef __FUNCT__ 5860 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5861 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5862 { 5863 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5864 IS isicol; 5865 PetscErrorCode ierr; 5866 const PetscInt *r,*ic; 5867 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5868 PetscInt *bi,*cols,nnz,*cols_lvl; 5869 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5870 PetscInt i,levels,diagonal_fill; 5871 PetscBool col_identity,row_identity,both_identity; 5872 PetscReal f; 5873 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5874 PetscBT lnkbt; 5875 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5876 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5877 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5878 PetscBool missing; 5879 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5880 5881 PetscFunctionBegin; 5882 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5883 if (bs>1){ /* check shifttype */ 5884 if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 5885 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 5886 } 5887 5888 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5889 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5890 5891 f = info->fill; 5892 levels = (PetscInt)info->levels; 5893 diagonal_fill = (PetscInt)info->diagonal_fill; 5894 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5895 5896 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5897 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5898 both_identity = (PetscBool) (row_identity && col_identity); 5899 5900 if (!levels && both_identity) { 5901 /* special case: ilu(0) with natural ordering */ 5902 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5903 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5904 5905 fact->factortype = MAT_FACTOR_ILU; 5906 (fact)->info.factor_mallocs = 0; 5907 (fact)->info.fill_ratio_given = info->fill; 5908 (fact)->info.fill_ratio_needed = 1.0; 5909 b = (Mat_SeqBAIJ*)(fact)->data; 5910 b->row = isrow; 5911 b->col = iscol; 5912 b->icol = isicol; 5913 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5914 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5915 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5916 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5917 PetscFunctionReturn(0); 5918 } 5919 5920 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5921 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5922 5923 /* get new row pointers */ 5924 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5925 bi[0] = 0; 5926 /* bdiag is location of diagonal in factor */ 5927 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5928 bdiag[0] = 0; 5929 5930 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5931 5932 /* create a linked list for storing column indices of the active row */ 5933 nlnk = n + 1; 5934 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5935 5936 /* initial FreeSpace size is f*(ai[n]+1) */ 5937 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5938 current_space = free_space; 5939 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5940 current_space_lvl = free_space_lvl; 5941 5942 for (i=0; i<n; i++) { 5943 nzi = 0; 5944 /* copy current row into linked list */ 5945 nnz = ai[r[i]+1] - ai[r[i]]; 5946 if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5947 cols = aj + ai[r[i]]; 5948 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5949 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5950 nzi += nlnk; 5951 5952 /* make sure diagonal entry is included */ 5953 if (diagonal_fill && lnk[i] == -1) { 5954 fm = n; 5955 while (lnk[fm] < i) fm = lnk[fm]; 5956 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5957 lnk[fm] = i; 5958 lnk_lvl[i] = 0; 5959 nzi++; dcount++; 5960 } 5961 5962 /* add pivot rows into the active row */ 5963 nzbd = 0; 5964 prow = lnk[n]; 5965 while (prow < i) { 5966 nnz = bdiag[prow]; 5967 cols = bj_ptr[prow] + nnz + 1; 5968 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5969 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5970 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5971 nzi += nlnk; 5972 prow = lnk[prow]; 5973 nzbd++; 5974 } 5975 bdiag[i] = nzbd; 5976 bi[i+1] = bi[i] + nzi; 5977 5978 /* if free space is not available, make more free space */ 5979 if (current_space->local_remaining<nzi) { 5980 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5981 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5982 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5983 reallocs++; 5984 } 5985 5986 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5987 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5988 bj_ptr[i] = current_space->array; 5989 bjlvl_ptr[i] = current_space_lvl->array; 5990 5991 /* make sure the active row i has diagonal entry */ 5992 if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5993 5994 current_space->array += nzi; 5995 current_space->local_used += nzi; 5996 current_space->local_remaining -= nzi; 5997 current_space_lvl->array += nzi; 5998 current_space_lvl->local_used += nzi; 5999 current_space_lvl->local_remaining -= nzi; 6000 } 6001 6002 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 6003 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6004 6005 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 6006 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 6007 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 6008 6009 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 6010 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 6011 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 6012 6013 #if defined(PETSC_USE_INFO) 6014 { 6015 PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 6016 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 6017 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6018 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 6019 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6020 if (diagonal_fill) { 6021 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 6022 } 6023 } 6024 #endif 6025 6026 /* put together the new matrix */ 6027 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6028 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6029 b = (Mat_SeqBAIJ*)(fact)->data; 6030 b->free_a = PETSC_TRUE; 6031 b->free_ij = PETSC_TRUE; 6032 b->singlemalloc = PETSC_FALSE; 6033 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6034 b->j = bj; 6035 b->i = bi; 6036 b->diag = bdiag; 6037 b->free_diag = PETSC_TRUE; 6038 b->ilen = 0; 6039 b->imax = 0; 6040 b->row = isrow; 6041 b->col = iscol; 6042 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6043 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6044 b->icol = isicol; 6045 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6046 /* In b structure: Free imax, ilen, old a, old j. 6047 Allocate bdiag, solve_work, new a, new j */ 6048 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 6049 b->maxnz = b->nz = bdiag[0]+1; 6050 fact->info.factor_mallocs = reallocs; 6051 fact->info.fill_ratio_given = f; 6052 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 6053 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 6054 PetscFunctionReturn(0); 6055 } 6056 6057 /* 6058 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 6059 except that the data structure of Mat_SeqAIJ is slightly different. 6060 Not a good example of code reuse. 6061 */ 6062 #undef __FUNCT__ 6063 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 6064 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 6065 { 6066 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 6067 IS isicol; 6068 PetscErrorCode ierr; 6069 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 6070 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 6071 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 6072 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 6073 PetscBool col_identity,row_identity,both_identity,flg; 6074 PetscReal f; 6075 6076 PetscFunctionBegin; 6077 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 6078 if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 6079 6080 f = info->fill; 6081 levels = (PetscInt)info->levels; 6082 diagonal_fill = (PetscInt)info->diagonal_fill; 6083 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 6084 6085 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6086 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 6087 both_identity = (PetscBool) (row_identity && col_identity); 6088 6089 if (!levels && both_identity) { /* special case copy the nonzero structure */ 6090 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 6091 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6092 6093 fact->factortype = MAT_FACTOR_ILU; 6094 b = (Mat_SeqBAIJ*)fact->data; 6095 b->row = isrow; 6096 b->col = iscol; 6097 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6098 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6099 b->icol = isicol; 6100 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6101 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6102 PetscFunctionReturn(0); 6103 } 6104 6105 /* general case perform the symbolic factorization */ 6106 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 6107 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 6108 6109 /* get new row pointers */ 6110 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 6111 ainew[0] = 0; 6112 /* don't know how many column pointers are needed so estimate */ 6113 jmax = (PetscInt)(f*ai[n] + 1); 6114 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 6115 /* ajfill is level of fill for each fill entry */ 6116 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 6117 /* fill is a linked list of nonzeros in active row */ 6118 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 6119 /* im is level for each filled value */ 6120 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 6121 /* dloc is location of diagonal in factor */ 6122 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 6123 dloc[0] = 0; 6124 for (prow=0; prow<n; prow++) { 6125 6126 /* copy prow into linked list */ 6127 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 6128 if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 6129 xi = aj + ai[r[prow]]; 6130 fill[n] = n; 6131 fill[prow] = -1; /* marker for diagonal entry */ 6132 while (nz--) { 6133 fm = n; 6134 idx = ic[*xi++]; 6135 do { 6136 m = fm; 6137 fm = fill[m]; 6138 } while (fm < idx); 6139 fill[m] = idx; 6140 fill[idx] = fm; 6141 im[idx] = 0; 6142 } 6143 6144 /* make sure diagonal entry is included */ 6145 if (diagonal_fill && fill[prow] == -1) { 6146 fm = n; 6147 while (fill[fm] < prow) fm = fill[fm]; 6148 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6149 fill[fm] = prow; 6150 im[prow] = 0; 6151 nzf++; 6152 dcount++; 6153 } 6154 6155 nzi = 0; 6156 row = fill[n]; 6157 while (row < prow) { 6158 incrlev = im[row] + 1; 6159 nz = dloc[row]; 6160 xi = ajnew + ainew[row] + nz + 1; 6161 flev = ajfill + ainew[row] + nz + 1; 6162 nnz = ainew[row+1] - ainew[row] - nz - 1; 6163 fm = row; 6164 while (nnz-- > 0) { 6165 idx = *xi++; 6166 if (*flev + incrlev > levels) { 6167 flev++; 6168 continue; 6169 } 6170 do { 6171 m = fm; 6172 fm = fill[m]; 6173 } while (fm < idx); 6174 if (fm != idx) { 6175 im[idx] = *flev + incrlev; 6176 fill[m] = idx; 6177 fill[idx] = fm; 6178 fm = idx; 6179 nzf++; 6180 } else { 6181 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 6182 } 6183 flev++; 6184 } 6185 row = fill[row]; 6186 nzi++; 6187 } 6188 /* copy new filled row into permanent storage */ 6189 ainew[prow+1] = ainew[prow] + nzf; 6190 if (ainew[prow+1] > jmax) { 6191 6192 /* estimate how much additional space we will need */ 6193 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6194 /* just double the memory each time */ 6195 PetscInt maxadd = jmax; 6196 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 6197 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 6198 jmax += maxadd; 6199 6200 /* allocate a longer ajnew and ajfill */ 6201 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6202 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6203 ierr = PetscFree(ajnew);CHKERRQ(ierr); 6204 ajnew = xitmp; 6205 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6206 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6207 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6208 ajfill = xitmp; 6209 reallocate++; /* count how many reallocations are needed */ 6210 } 6211 xitmp = ajnew + ainew[prow]; 6212 flev = ajfill + ainew[prow]; 6213 dloc[prow] = nzi; 6214 fm = fill[n]; 6215 while (nzf--) { 6216 *xitmp++ = fm; 6217 *flev++ = im[fm]; 6218 fm = fill[fm]; 6219 } 6220 /* make sure row has diagonal entry */ 6221 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 6222 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 6223 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6224 } 6225 } 6226 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6227 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 6228 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6229 ierr = PetscFree(fill);CHKERRQ(ierr); 6230 ierr = PetscFree(im);CHKERRQ(ierr); 6231 6232 #if defined(PETSC_USE_INFO) 6233 { 6234 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 6235 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6236 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6237 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6238 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6239 if (diagonal_fill) { 6240 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6241 } 6242 } 6243 #endif 6244 6245 /* put together the new matrix */ 6246 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6247 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6248 b = (Mat_SeqBAIJ*)fact->data; 6249 b->free_a = PETSC_TRUE; 6250 b->free_ij = PETSC_TRUE; 6251 b->singlemalloc = PETSC_FALSE; 6252 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6253 b->j = ajnew; 6254 b->i = ainew; 6255 for (i=0; i<n; i++) dloc[i] += ainew[i]; 6256 b->diag = dloc; 6257 b->free_diag = PETSC_TRUE; 6258 b->ilen = 0; 6259 b->imax = 0; 6260 b->row = isrow; 6261 b->col = iscol; 6262 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6263 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6264 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6265 b->icol = isicol; 6266 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6267 /* In b structure: Free imax, ilen, old a, old j. 6268 Allocate dloc, solve_work, new a, new j */ 6269 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 6270 b->maxnz = b->nz = ainew[n]; 6271 6272 fact->info.factor_mallocs = reallocate; 6273 fact->info.fill_ratio_given = f; 6274 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 6275 6276 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6277 PetscFunctionReturn(0); 6278 } 6279 6280 #undef __FUNCT__ 6281 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6282 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 6283 { 6284 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 6285 /* int i,*AJ=a->j,nz=a->nz; */ 6286 PetscFunctionBegin; 6287 /* Undo Column scaling */ 6288 /* while (nz--) { */ 6289 /* AJ[i] = AJ[i]/4; */ 6290 /* } */ 6291 /* This should really invoke a push/pop logic, but we don't have that yet. */ 6292 A->ops->setunfactored = PETSC_NULL; 6293 PetscFunctionReturn(0); 6294 } 6295 6296 #undef __FUNCT__ 6297 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6298 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 6299 { 6300 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6301 PetscInt *AJ=a->j,nz=a->nz; 6302 unsigned short *aj=(unsigned short *)AJ; 6303 PetscFunctionBegin; 6304 /* Is this really necessary? */ 6305 while (nz--) { 6306 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 6307 } 6308 A->ops->setunfactored = PETSC_NULL; 6309 PetscFunctionReturn(0); 6310 } 6311 6312 6313