1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include <../src/mat/impls/baij/seq/baij.h> 7 #include <../src/mat/blockinvert.h> 8 #include <petscbt.h> 9 #include <../src/mat/utils/freespace.h> 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 16 PetscErrorCode ierr; 17 const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 18 PetscInt i,n = a->mbs,j; 19 PetscInt nz; 20 PetscScalar *x,*tmp,s1; 21 const MatScalar *aa = a->a,*v; 22 const PetscScalar *b; 23 24 PetscFunctionBegin; 25 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 26 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27 tmp = a->solve_work; 28 29 30 /* copy the b into temp work space according to permutation */ 31 for (i=0; i<n; i++) tmp[i] = b[i]; 32 33 /* forward solve the U^T */ 34 for (i=0; i<n; i++) { 35 v = aa + adiag[i+1] + 1; 36 vi = aj + adiag[i+1] + 1; 37 nz = adiag[i] - adiag[i+1] - 1; 38 s1 = tmp[i]; 39 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 40 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 41 tmp[i] = s1; 42 } 43 44 /* backward solve the L^T */ 45 for (i=n-1; i>=0; i--) { 46 v = aa + ai[i]; 47 vi = aj + ai[i]; 48 nz = ai[i+1] - ai[i]; 49 s1 = tmp[i]; 50 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 51 } 52 53 /* copy tmp into x according to permutation */ 54 for (i=0; i<n; i++) x[i] = tmp[i]; 55 56 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 57 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 58 59 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 60 PetscFunctionReturn(0); 61 } 62 63 #undef __FUNCT__ 64 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 65 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 66 { 67 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 68 PetscErrorCode ierr; 69 PetscInt i,nz; 70 const PetscInt *diag = 
a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 71 const MatScalar *aa=a->a,*v; 72 PetscScalar s1,*x; 73 74 PetscFunctionBegin; 75 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 76 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 77 78 /* forward solve the U^T */ 79 for (i=0; i<n; i++) { 80 81 v = aa + diag[i]; 82 /* multiply by the inverse of the block diagonal */ 83 s1 = (*v++)*x[i]; 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 x[*vi++] -= (*v++)*s1; 88 } 89 x[i] = s1; 90 } 91 /* backward solve the L^T */ 92 for (i=n-1; i>=0; i--) { 93 v = aa + diag[i] - 1; 94 vi = aj + diag[i] - 1; 95 nz = diag[i] - ai[i]; 96 s1 = x[i]; 97 while (nz--) { 98 x[*vi--] -= (*v--)*s1; 99 } 100 } 101 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 102 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 103 PetscFunctionReturn(0); 104 } 105 106 #undef __FUNCT__ 107 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 108 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 109 { 110 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 111 PetscErrorCode ierr; 112 PetscInt i,nz,idx,idt,oidx; 113 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 114 const MatScalar *aa=a->a,*v; 115 PetscScalar s1,s2,x1,x2,*x; 116 117 PetscFunctionBegin; 118 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 119 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 120 121 /* forward solve the U^T */ 122 idx = 0; 123 for (i=0; i<n; i++) { 124 125 v = aa + 4*diag[i]; 126 /* multiply by the inverse of the block diagonal */ 127 x1 = x[idx]; x2 = x[1+idx]; 128 s1 = v[0]*x1 + v[1]*x2; 129 s2 = v[2]*x1 + v[3]*x2; 130 v += 4; 131 132 vi = aj + diag[i] + 1; 133 nz = ai[i+1] - diag[i] - 1; 134 while (nz--) { 135 oidx = 2*(*vi++); 136 x[oidx] -= v[0]*s1 + v[1]*s2; 137 x[oidx+1] -= v[2]*s1 + v[3]*s2; 138 v += 4; 139 } 140 x[idx] = s1;x[1+idx] = s2; 141 idx += 2; 142 } 143 /* backward solve the L^T */ 144 for (i=n-1; i>=0; i--) { 145 v = aa + 4*diag[i] - 4; 146 vi = aj + 
diag[i] - 1; 147 nz = diag[i] - ai[i]; 148 idt = 2*i; 149 s1 = x[idt]; s2 = x[1+idt]; 150 while (nz--) { 151 idx = 2*(*vi--); 152 x[idx] -= v[0]*s1 + v[1]*s2; 153 x[idx+1] -= v[2]*s1 + v[3]*s2; 154 v -= 4; 155 } 156 } 157 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 158 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 159 PetscFunctionReturn(0); 160 } 161 162 #undef __FUNCT__ 163 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 164 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 165 { 166 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 167 PetscErrorCode ierr; 168 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 169 PetscInt nz,idx,idt,j,i,oidx; 170 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 171 const MatScalar *aa=a->a,*v; 172 PetscScalar s1,s2,x1,x2,*x; 173 174 PetscFunctionBegin; 175 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 176 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 177 178 /* forward solve the U^T */ 179 idx = 0; 180 for (i=0; i<n; i++) { 181 v = aa + bs2*diag[i]; 182 /* multiply by the inverse of the block diagonal */ 183 x1 = x[idx]; x2 = x[1+idx]; 184 s1 = v[0]*x1 + v[1]*x2; 185 s2 = v[2]*x1 + v[3]*x2; 186 v -= bs2; 187 188 vi = aj + diag[i] - 1; 189 nz = diag[i] - diag[i+1] - 1; 190 for (j=0;j>-nz;j--) { 191 oidx = bs*vi[j]; 192 x[oidx] -= v[0]*s1 + v[1]*s2; 193 x[oidx+1] -= v[2]*s1 + v[3]*s2; 194 v -= bs2; 195 } 196 x[idx] = s1;x[1+idx] = s2; 197 idx += bs; 198 } 199 /* backward solve the L^T */ 200 for (i=n-1; i>=0; i--) { 201 v = aa + bs2*ai[i]; 202 vi = aj + ai[i]; 203 nz = ai[i+1] - ai[i]; 204 idt = bs*i; 205 s1 = x[idt]; s2 = x[1+idt]; 206 for (j=0;j<nz;j++) { 207 idx = bs*vi[j]; 208 x[idx] -= v[0]*s1 + v[1]*s2; 209 x[idx+1] -= v[2]*s1 + v[3]*s2; 210 v += bs2; 211 } 212 } 213 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 214 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 215 PetscFunctionReturn(0); 216 } 217 218 #undef __FUNCT__ 219 #define __FUNCT__ 
"MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 220 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 221 { 222 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 223 PetscErrorCode ierr; 224 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 225 PetscInt i,nz,idx,idt,oidx; 226 const MatScalar *aa=a->a,*v; 227 PetscScalar s1,s2,s3,x1,x2,x3,*x; 228 229 PetscFunctionBegin; 230 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 231 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 232 233 /* forward solve the U^T */ 234 idx = 0; 235 for (i=0; i<n; i++) { 236 237 v = aa + 9*diag[i]; 238 /* multiply by the inverse of the block diagonal */ 239 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 240 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 241 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 242 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 243 v += 9; 244 245 vi = aj + diag[i] + 1; 246 nz = ai[i+1] - diag[i] - 1; 247 while (nz--) { 248 oidx = 3*(*vi++); 249 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 250 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 251 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 252 v += 9; 253 } 254 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 255 idx += 3; 256 } 257 /* backward solve the L^T */ 258 for (i=n-1; i>=0; i--) { 259 v = aa + 9*diag[i] - 9; 260 vi = aj + diag[i] - 1; 261 nz = diag[i] - ai[i]; 262 idt = 3*i; 263 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 264 while (nz--) { 265 idx = 3*(*vi--); 266 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 267 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 268 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 269 v -= 9; 270 } 271 } 272 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 273 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 274 PetscFunctionReturn(0); 275 } 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 279 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 280 { 281 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 282 PetscErrorCode ierr; 283 const PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 284 PetscInt nz,idx,idt,j,i,oidx; 285 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 286 const MatScalar *aa=a->a,*v; 287 PetscScalar s1,s2,s3,x1,x2,x3,*x; 288 289 PetscFunctionBegin; 290 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 291 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 292 293 /* forward solve the U^T */ 294 idx = 0; 295 for (i=0; i<n; i++) { 296 v = aa + bs2*diag[i]; 297 /* multiply by the inverse of the block diagonal */ 298 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 299 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 300 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 301 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 302 v -= bs2; 303 304 vi = aj + diag[i] - 1; 305 nz = diag[i] - diag[i+1] - 1; 306 for (j=0;j>-nz;j--) { 307 oidx = bs*vi[j]; 308 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 309 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 310 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 311 v -= bs2; 312 } 313 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 314 idx += bs; 315 } 316 /* backward solve the L^T */ 317 for (i=n-1; i>=0; i--) { 318 v = aa + bs2*ai[i]; 319 vi = aj + ai[i]; 320 nz = ai[i+1] - ai[i]; 321 idt = bs*i; 322 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 323 for (j=0;j<nz;j++) { 324 idx = bs*vi[j]; 325 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 326 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 327 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 328 v += bs2; 329 } 330 } 331 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 332 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 333 PetscFunctionReturn(0); 334 } 335 336 #undef __FUNCT__ 337 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 338 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 339 { 340 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 341 PetscErrorCode ierr; 342 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 343 PetscInt i,nz,idx,idt,oidx; 344 const MatScalar *aa=a->a,*v; 345 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 346 347 
PetscFunctionBegin; 348 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 349 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 350 351 /* forward solve the U^T */ 352 idx = 0; 353 for (i=0; i<n; i++) { 354 355 v = aa + 16*diag[i]; 356 /* multiply by the inverse of the block diagonal */ 357 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 358 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 359 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 360 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 361 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 362 v += 16; 363 364 vi = aj + diag[i] + 1; 365 nz = ai[i+1] - diag[i] - 1; 366 while (nz--) { 367 oidx = 4*(*vi++); 368 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 369 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 370 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 371 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 372 v += 16; 373 } 374 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 375 idx += 4; 376 } 377 /* backward solve the L^T */ 378 for (i=n-1; i>=0; i--) { 379 v = aa + 16*diag[i] - 16; 380 vi = aj + diag[i] - 1; 381 nz = diag[i] - ai[i]; 382 idt = 4*i; 383 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 384 while (nz--) { 385 idx = 4*(*vi--); 386 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 387 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 388 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 389 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 390 v -= 16; 391 } 392 } 393 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 394 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 395 PetscFunctionReturn(0); 396 } 397 398 #undef __FUNCT__ 399 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 400 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 401 { 402 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 403 PetscErrorCode ierr; 404 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 405 PetscInt nz,idx,idt,j,i,oidx; 406 const 
PetscInt bs=A->rmap->bs,bs2=a->bs2; 407 const MatScalar *aa=a->a,*v; 408 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 409 410 PetscFunctionBegin; 411 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 412 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 413 414 /* forward solve the U^T */ 415 idx = 0; 416 for (i=0; i<n; i++) { 417 v = aa + bs2*diag[i]; 418 /* multiply by the inverse of the block diagonal */ 419 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 420 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 421 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 422 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 423 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 424 v -= bs2; 425 426 vi = aj + diag[i] - 1; 427 nz = diag[i] - diag[i+1] - 1; 428 for (j=0;j>-nz;j--) { 429 oidx = bs*vi[j]; 430 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 431 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 432 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 433 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 434 v -= bs2; 435 } 436 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 437 idx += bs; 438 } 439 /* backward solve the L^T */ 440 for (i=n-1; i>=0; i--) { 441 v = aa + bs2*ai[i]; 442 vi = aj + ai[i]; 443 nz = ai[i+1] - ai[i]; 444 idt = bs*i; 445 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 446 for (j=0;j<nz;j++) { 447 idx = bs*vi[j]; 448 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 449 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 450 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 451 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 452 v += bs2; 453 } 454 } 455 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 456 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 457 PetscFunctionReturn(0); 458 } 459 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 462 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 463 { 464 Mat_SeqBAIJ *a=(Mat_SeqBAIJ 
*)A->data; 465 PetscErrorCode ierr; 466 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 467 PetscInt i,nz,idx,idt,oidx; 468 const MatScalar *aa=a->a,*v; 469 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 470 471 PetscFunctionBegin; 472 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 473 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 474 475 /* forward solve the U^T */ 476 idx = 0; 477 for (i=0; i<n; i++) { 478 479 v = aa + 25*diag[i]; 480 /* multiply by the inverse of the block diagonal */ 481 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 482 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 483 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 484 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 485 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 486 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 487 v += 25; 488 489 vi = aj + diag[i] + 1; 490 nz = ai[i+1] - diag[i] - 1; 491 while (nz--) { 492 oidx = 5*(*vi++); 493 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 494 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 495 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 496 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 497 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 498 v += 25; 499 } 500 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 501 idx += 5; 502 } 503 /* backward solve the L^T */ 504 for (i=n-1; i>=0; i--) { 505 v = aa + 25*diag[i] - 25; 506 vi = aj + diag[i] - 1; 507 nz = diag[i] - ai[i]; 508 idt = 5*i; 509 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 510 while (nz--) { 511 idx = 5*(*vi--); 512 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 513 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 514 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 515 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 516 
x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 517 v -= 25; 518 } 519 } 520 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 521 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 522 PetscFunctionReturn(0); 523 } 524 525 #undef __FUNCT__ 526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 527 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 528 { 529 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 530 PetscErrorCode ierr; 531 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 532 PetscInt nz,idx,idt,j,i,oidx; 533 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 534 const MatScalar *aa=a->a,*v; 535 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 536 537 PetscFunctionBegin; 538 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 539 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 540 541 /* forward solve the U^T */ 542 idx = 0; 543 for (i=0; i<n; i++) { 544 v = aa + bs2*diag[i]; 545 /* multiply by the inverse of the block diagonal */ 546 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 547 x5 = x[4+idx]; 548 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 549 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 550 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 551 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 552 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 553 v -= bs2; 554 555 vi = aj + diag[i] - 1; 556 nz = diag[i] - diag[i+1] - 1; 557 for (j=0;j>-nz;j--) { 558 oidx = bs*vi[j]; 559 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 560 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 561 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 562 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 563 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 564 v -= bs2; 565 } 566 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 567 idx += bs; 568 } 569 /* backward solve the L^T 
*/ 570 for (i=n-1; i>=0; i--) { 571 v = aa + bs2*ai[i]; 572 vi = aj + ai[i]; 573 nz = ai[i+1] - ai[i]; 574 idt = bs*i; 575 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 576 for (j=0;j<nz;j++) { 577 idx = bs*vi[j]; 578 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 579 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 580 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 581 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 582 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 583 v += bs2; 584 } 585 } 586 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 587 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 588 PetscFunctionReturn(0); 589 } 590 591 #undef __FUNCT__ 592 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 593 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 594 { 595 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 596 PetscErrorCode ierr; 597 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 598 PetscInt i,nz,idx,idt,oidx; 599 const MatScalar *aa=a->a,*v; 600 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 601 602 PetscFunctionBegin; 603 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 604 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 605 606 /* forward solve the U^T */ 607 idx = 0; 608 for (i=0; i<n; i++) { 609 610 v = aa + 36*diag[i]; 611 /* multiply by the inverse of the block diagonal */ 612 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 613 x6 = x[5+idx]; 614 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 615 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 616 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 617 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 618 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 619 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + 
v[34]*x5 + v[35]*x6; 620 v += 36; 621 622 vi = aj + diag[i] + 1; 623 nz = ai[i+1] - diag[i] - 1; 624 while (nz--) { 625 oidx = 6*(*vi++); 626 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632 v += 36; 633 } 634 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 635 x[5+idx] = s6; 636 idx += 6; 637 } 638 /* backward solve the L^T */ 639 for (i=n-1; i>=0; i--) { 640 v = aa + 36*diag[i] - 36; 641 vi = aj + diag[i] - 1; 642 nz = diag[i] - ai[i]; 643 idt = 6*i; 644 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 645 s6 = x[5+idt]; 646 while (nz--) { 647 idx = 6*(*vi--); 648 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 649 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 650 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 651 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 652 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 653 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 654 v -= 36; 655 } 656 } 657 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 658 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 659 PetscFunctionReturn(0); 660 } 661 662 #undef __FUNCT__ 663 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 664 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 665 { 666 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 667 PetscErrorCode ierr; 668 const PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 669 PetscInt nz,idx,idt,j,i,oidx; 670 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 671 const MatScalar *aa=a->a,*v; 672 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 673 674 PetscFunctionBegin; 675 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 676 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 677 678 /* forward solve the U^T */ 679 idx = 0; 680 for (i=0; i<n; i++) { 681 v = aa + bs2*diag[i]; 682 /* multiply by the inverse of the block diagonal */ 683 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 684 x5 = x[4+idx]; x6 = x[5+idx]; 685 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 686 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 687 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 688 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 689 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 690 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 691 v -= bs2; 692 693 vi = aj + diag[i] - 1; 694 nz = diag[i] - diag[i+1] - 1; 695 for (j=0;j>-nz;j--) { 696 oidx = bs*vi[j]; 697 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 698 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 699 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 700 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 701 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 702 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 703 v -= bs2; 704 } 705 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 706 x[5+idx] = s6; 707 idx += bs; 708 } 709 /* backward solve the L^T */ 710 for (i=n-1; i>=0; i--) { 711 v = aa + bs2*ai[i]; 712 vi = aj + ai[i]; 713 nz = ai[i+1] - ai[i]; 714 idt = bs*i; 715 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 716 s6 = 
x[5+idt]; 717 for (j=0;j<nz;j++) { 718 idx = bs*vi[j]; 719 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 720 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 721 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 722 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 723 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 724 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 725 v += bs2; 726 } 727 } 728 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 729 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 730 PetscFunctionReturn(0); 731 } 732 733 #undef __FUNCT__ 734 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 735 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 736 { 737 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 738 PetscErrorCode ierr; 739 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 740 PetscInt i,nz,idx,idt,oidx; 741 const MatScalar *aa=a->a,*v; 742 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 743 744 PetscFunctionBegin; 745 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 746 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 747 748 /* forward solve the U^T */ 749 idx = 0; 750 for (i=0; i<n; i++) { 751 752 v = aa + 49*diag[i]; 753 /* multiply by the inverse of the block diagonal */ 754 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 755 x6 = x[5+idx]; x7 = x[6+idx]; 756 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 757 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 758 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 759 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 760 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 761 s6 = v[35]*x1 + v[36]*x2 
+ v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 762 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 763 v += 49; 764 765 vi = aj + diag[i] + 1; 766 nz = ai[i+1] - diag[i] - 1; 767 while (nz--) { 768 oidx = 7*(*vi++); 769 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 770 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 771 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 772 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 773 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 774 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 775 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 776 v += 49; 777 } 778 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 779 x[5+idx] = s6;x[6+idx] = s7; 780 idx += 7; 781 } 782 /* backward solve the L^T */ 783 for (i=n-1; i>=0; i--) { 784 v = aa + 49*diag[i] - 49; 785 vi = aj + diag[i] - 1; 786 nz = diag[i] - ai[i]; 787 idt = 7*i; 788 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 789 s6 = x[5+idt];s7 = x[6+idt]; 790 while (nz--) { 791 idx = 7*(*vi--); 792 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 793 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 794 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 795 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 796 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 797 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 798 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 799 v -= 49; 
800 } 801 } 802 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 803 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 804 PetscFunctionReturn(0); 805 } 806 #undef __FUNCT__ 807 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 808 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 809 { 810 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 811 PetscErrorCode ierr; 812 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 813 PetscInt nz,idx,idt,j,i,oidx; 814 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 815 const MatScalar *aa=a->a,*v; 816 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 817 818 PetscFunctionBegin; 819 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 820 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 821 822 /* forward solve the U^T */ 823 idx = 0; 824 for (i=0; i<n; i++) { 825 v = aa + bs2*diag[i]; 826 /* multiply by the inverse of the block diagonal */ 827 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 828 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 829 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 830 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 831 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 832 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 833 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 834 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 835 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 836 v -= bs2; 837 vi = aj + diag[i] - 1; 838 nz = diag[i] - diag[i+1] - 1; 839 for (j=0;j>-nz;j--) { 840 oidx = bs*vi[j]; 841 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 842 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 843 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 
+ v[20]*s7; 844 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 845 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 846 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 847 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 848 v -= bs2; 849 } 850 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 851 x[5+idx] = s6; x[6+idx] = s7; 852 idx += bs; 853 } 854 /* backward solve the L^T */ 855 for (i=n-1; i>=0; i--) { 856 v = aa + bs2*ai[i]; 857 vi = aj + ai[i]; 858 nz = ai[i+1] - ai[i]; 859 idt = bs*i; 860 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 861 s6 = x[5+idt]; s7 = x[6+idt]; 862 for (j=0;j<nz;j++) { 863 idx = bs*vi[j]; 864 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 865 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 866 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 867 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 868 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 869 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 870 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 871 v += bs2; 872 } 873 } 874 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 875 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 876 PetscFunctionReturn(0); 877 } 878 879 /*---------------------------------------------------------------------------------------------*/ 880 #undef __FUNCT__ 881 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 882 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 883 { 884 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 885 IS iscol = a->col,isrow = a->row; 886 
PetscErrorCode ierr; 887 const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 888 PetscInt i,n = a->mbs,j; 889 PetscInt nz; 890 PetscScalar *x,*tmp,s1; 891 const MatScalar *aa = a->a,*v; 892 const PetscScalar *b; 893 894 PetscFunctionBegin; 895 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 896 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 897 tmp = a->solve_work; 898 899 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 900 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 901 902 /* copy the b into temp work space according to permutation */ 903 for (i=0; i<n; i++) tmp[i] = b[c[i]]; 904 905 /* forward solve the U^T */ 906 for (i=0; i<n; i++) { 907 v = aa + adiag[i+1] + 1; 908 vi = aj + adiag[i+1] + 1; 909 nz = adiag[i] - adiag[i+1] - 1; 910 s1 = tmp[i]; 911 s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 912 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 913 tmp[i] = s1; 914 } 915 916 /* backward solve the L^T */ 917 for (i=n-1; i>=0; i--) { 918 v = aa + ai[i]; 919 vi = aj + ai[i]; 920 nz = ai[i+1] - ai[i]; 921 s1 = tmp[i]; 922 for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 923 } 924 925 /* copy tmp into x according to permutation */ 926 for (i=0; i<n; i++) x[r[i]] = tmp[i]; 927 928 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 929 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 930 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 931 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 932 933 ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 934 PetscFunctionReturn(0); 935 } 936 937 #undef __FUNCT__ 938 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 939 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 940 { 941 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 942 IS iscol=a->col,isrow=a->row; 943 PetscErrorCode ierr; 944 const PetscInt *r,*c,*rout,*cout; 945 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 946 PetscInt i,nz; 947 const MatScalar *aa=a->a,*v; 948 PetscScalar s1,*x,*t; 
949 const PetscScalar *b; 950 951 PetscFunctionBegin; 952 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 953 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 954 t = a->solve_work; 955 956 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 957 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 958 959 /* copy the b into temp work space according to permutation */ 960 for (i=0; i<n; i++) { 961 t[i] = b[c[i]]; 962 } 963 964 /* forward solve the U^T */ 965 for (i=0; i<n; i++) { 966 967 v = aa + diag[i]; 968 /* multiply by the inverse of the block diagonal */ 969 s1 = (*v++)*t[i]; 970 vi = aj + diag[i] + 1; 971 nz = ai[i+1] - diag[i] - 1; 972 while (nz--) { 973 t[*vi++] -= (*v++)*s1; 974 } 975 t[i] = s1; 976 } 977 /* backward solve the L^T */ 978 for (i=n-1; i>=0; i--) { 979 v = aa + diag[i] - 1; 980 vi = aj + diag[i] - 1; 981 nz = diag[i] - ai[i]; 982 s1 = t[i]; 983 while (nz--) { 984 t[*vi--] -= (*v--)*s1; 985 } 986 } 987 988 /* copy t into x according to permutation */ 989 for (i=0; i<n; i++) { 990 x[r[i]] = t[i]; 991 } 992 993 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 994 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 995 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 996 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 997 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 998 PetscFunctionReturn(0); 999 } 1000 1001 #undef __FUNCT__ 1002 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 1003 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1004 { 1005 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1006 IS iscol=a->col,isrow=a->row; 1007 PetscErrorCode ierr; 1008 const PetscInt *r,*c,*rout,*cout; 1009 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1010 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1011 const MatScalar *aa=a->a,*v; 1012 PetscScalar s1,s2,x1,x2,*x,*t; 1013 const PetscScalar *b; 1014 1015 PetscFunctionBegin; 1016 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1017 ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 1018 t = a->solve_work; 1019 1020 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1021 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1022 1023 /* copy the b into temp work space according to permutation */ 1024 ii = 0; 1025 for (i=0; i<n; i++) { 1026 ic = 2*c[i]; 1027 t[ii] = b[ic]; 1028 t[ii+1] = b[ic+1]; 1029 ii += 2; 1030 } 1031 1032 /* forward solve the U^T */ 1033 idx = 0; 1034 for (i=0; i<n; i++) { 1035 1036 v = aa + 4*diag[i]; 1037 /* multiply by the inverse of the block diagonal */ 1038 x1 = t[idx]; x2 = t[1+idx]; 1039 s1 = v[0]*x1 + v[1]*x2; 1040 s2 = v[2]*x1 + v[3]*x2; 1041 v += 4; 1042 1043 vi = aj + diag[i] + 1; 1044 nz = ai[i+1] - diag[i] - 1; 1045 while (nz--) { 1046 oidx = 2*(*vi++); 1047 t[oidx] -= v[0]*s1 + v[1]*s2; 1048 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1049 v += 4; 1050 } 1051 t[idx] = s1;t[1+idx] = s2; 1052 idx += 2; 1053 } 1054 /* backward solve the L^T */ 1055 for (i=n-1; i>=0; i--) { 1056 v = aa + 4*diag[i] - 4; 1057 vi = aj + diag[i] - 1; 1058 nz = diag[i] - ai[i]; 1059 idt = 2*i; 1060 s1 = t[idt]; s2 = t[1+idt]; 1061 while (nz--) { 1062 idx = 2*(*vi--); 1063 t[idx] -= v[0]*s1 + v[1]*s2; 1064 t[idx+1] -= v[2]*s1 + v[3]*s2; 1065 v -= 4; 1066 } 1067 } 1068 1069 /* copy t into x according to permutation */ 1070 ii = 0; 1071 for (i=0; i<n; i++) { 1072 ir = 2*r[i]; 1073 x[ir] = t[ii]; 1074 x[ir+1] = t[ii+1]; 1075 ii += 2; 1076 } 1077 1078 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1079 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1080 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1081 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1082 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1083 PetscFunctionReturn(0); 1084 } 1085 1086 #undef __FUNCT__ 1087 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 1088 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1089 { 1090 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1091 PetscErrorCode ierr; 1092 IS 
iscol=a->col,isrow=a->row; 1093 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1094 const PetscInt *r,*c,*rout,*cout; 1095 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1096 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1097 const MatScalar *aa=a->a,*v; 1098 PetscScalar s1,s2,x1,x2,*x,*t; 1099 const PetscScalar *b; 1100 1101 PetscFunctionBegin; 1102 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1103 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1104 t = a->solve_work; 1105 1106 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1107 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1108 1109 /* copy b into temp work space according to permutation */ 1110 for (i=0;i<n;i++) { 1111 ii = bs*i; ic = bs*c[i]; 1112 t[ii] = b[ic]; t[ii+1] = b[ic+1]; 1113 } 1114 1115 /* forward solve the U^T */ 1116 idx = 0; 1117 for (i=0; i<n; i++) { 1118 v = aa + bs2*diag[i]; 1119 /* multiply by the inverse of the block diagonal */ 1120 x1 = t[idx]; x2 = t[1+idx]; 1121 s1 = v[0]*x1 + v[1]*x2; 1122 s2 = v[2]*x1 + v[3]*x2; 1123 v -= bs2; 1124 1125 vi = aj + diag[i] - 1; 1126 nz = diag[i] - diag[i+1] - 1; 1127 for (j=0;j>-nz;j--) { 1128 oidx = bs*vi[j]; 1129 t[oidx] -= v[0]*s1 + v[1]*s2; 1130 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1131 v -= bs2; 1132 } 1133 t[idx] = s1;t[1+idx] = s2; 1134 idx += bs; 1135 } 1136 /* backward solve the L^T */ 1137 for (i=n-1; i>=0; i--) { 1138 v = aa + bs2*ai[i]; 1139 vi = aj + ai[i]; 1140 nz = ai[i+1] - ai[i]; 1141 idt = bs*i; 1142 s1 = t[idt]; s2 = t[1+idt]; 1143 for (j=0;j<nz;j++) { 1144 idx = bs*vi[j]; 1145 t[idx] -= v[0]*s1 + v[1]*s2; 1146 t[idx+1] -= v[2]*s1 + v[3]*s2; 1147 v += bs2; 1148 } 1149 } 1150 1151 /* copy t into x according to permutation */ 1152 for (i=0;i<n;i++) { 1153 ii = bs*i; ir = bs*r[i]; 1154 x[ir] = t[ii]; x[ir+1] = t[ii+1]; 1155 } 1156 1157 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1158 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1159 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1160 ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 1161 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1162 PetscFunctionReturn(0); 1163 } 1164 1165 #undef __FUNCT__ 1166 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 1167 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1168 { 1169 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1170 IS iscol=a->col,isrow=a->row; 1171 PetscErrorCode ierr; 1172 const PetscInt *r,*c,*rout,*cout; 1173 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1174 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1175 const MatScalar *aa=a->a,*v; 1176 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1177 const PetscScalar *b; 1178 1179 PetscFunctionBegin; 1180 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1181 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1182 t = a->solve_work; 1183 1184 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1185 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1186 1187 /* copy the b into temp work space according to permutation */ 1188 ii = 0; 1189 for (i=0; i<n; i++) { 1190 ic = 3*c[i]; 1191 t[ii] = b[ic]; 1192 t[ii+1] = b[ic+1]; 1193 t[ii+2] = b[ic+2]; 1194 ii += 3; 1195 } 1196 1197 /* forward solve the U^T */ 1198 idx = 0; 1199 for (i=0; i<n; i++) { 1200 1201 v = aa + 9*diag[i]; 1202 /* multiply by the inverse of the block diagonal */ 1203 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1204 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1205 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1206 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1207 v += 9; 1208 1209 vi = aj + diag[i] + 1; 1210 nz = ai[i+1] - diag[i] - 1; 1211 while (nz--) { 1212 oidx = 3*(*vi++); 1213 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1214 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1215 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1216 v += 9; 1217 } 1218 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1219 idx += 3; 1220 } 1221 /* backward solve the L^T */ 1222 for (i=n-1; i>=0; i--) { 1223 v = aa + 9*diag[i] - 9; 1224 vi = aj + diag[i] - 1; 1225 nz = diag[i] - 
ai[i]; 1226 idt = 3*i; 1227 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1228 while (nz--) { 1229 idx = 3*(*vi--); 1230 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1231 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1232 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1233 v -= 9; 1234 } 1235 } 1236 1237 /* copy t into x according to permutation */ 1238 ii = 0; 1239 for (i=0; i<n; i++) { 1240 ir = 3*r[i]; 1241 x[ir] = t[ii]; 1242 x[ir+1] = t[ii+1]; 1243 x[ir+2] = t[ii+2]; 1244 ii += 3; 1245 } 1246 1247 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1248 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1249 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1250 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1251 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1252 PetscFunctionReturn(0); 1253 } 1254 1255 #undef __FUNCT__ 1256 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1257 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1258 { 1259 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1260 PetscErrorCode ierr; 1261 IS iscol=a->col,isrow=a->row; 1262 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1263 const PetscInt *r,*c,*rout,*cout; 1264 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1265 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1266 const MatScalar *aa=a->a,*v; 1267 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1268 const PetscScalar *b; 1269 1270 PetscFunctionBegin; 1271 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1272 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1273 t = a->solve_work; 1274 1275 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1276 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1277 1278 /* copy b into temp work space according to permutation */ 1279 for (i=0;i<n;i++) { 1280 ii = bs*i; ic = bs*c[i]; 1281 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 1282 } 1283 1284 /* forward solve the U^T */ 1285 idx = 0; 1286 for (i=0; i<n; i++) { 1287 v = aa + bs2*diag[i]; 1288 /* multiply by the inverse of the block diagonal 
*/ 1289 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1290 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1291 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1292 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1293 v -= bs2; 1294 1295 vi = aj + diag[i] - 1; 1296 nz = diag[i] - diag[i+1] - 1; 1297 for (j=0;j>-nz;j--) { 1298 oidx = bs*vi[j]; 1299 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1300 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1301 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1302 v -= bs2; 1303 } 1304 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1305 idx += bs; 1306 } 1307 /* backward solve the L^T */ 1308 for (i=n-1; i>=0; i--) { 1309 v = aa + bs2*ai[i]; 1310 vi = aj + ai[i]; 1311 nz = ai[i+1] - ai[i]; 1312 idt = bs*i; 1313 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1314 for (j=0;j<nz;j++) { 1315 idx = bs*vi[j]; 1316 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1317 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1318 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1319 v += bs2; 1320 } 1321 } 1322 1323 /* copy t into x according to permutation */ 1324 for (i=0;i<n;i++) { 1325 ii = bs*i; ir = bs*r[i]; 1326 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 1327 } 1328 1329 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1330 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1331 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1332 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1333 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1334 PetscFunctionReturn(0); 1335 } 1336 1337 #undef __FUNCT__ 1338 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 1339 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1340 { 1341 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1342 IS iscol=a->col,isrow=a->row; 1343 PetscErrorCode ierr; 1344 const PetscInt *r,*c,*rout,*cout; 1345 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1346 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1347 const MatScalar *aa=a->a,*v; 1348 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1349 const PetscScalar *b; 1350 1351 
PetscFunctionBegin; 1352 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1353 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1354 t = a->solve_work; 1355 1356 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1357 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1358 1359 /* copy the b into temp work space according to permutation */ 1360 ii = 0; 1361 for (i=0; i<n; i++) { 1362 ic = 4*c[i]; 1363 t[ii] = b[ic]; 1364 t[ii+1] = b[ic+1]; 1365 t[ii+2] = b[ic+2]; 1366 t[ii+3] = b[ic+3]; 1367 ii += 4; 1368 } 1369 1370 /* forward solve the U^T */ 1371 idx = 0; 1372 for (i=0; i<n; i++) { 1373 1374 v = aa + 16*diag[i]; 1375 /* multiply by the inverse of the block diagonal */ 1376 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1377 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1378 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1379 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1380 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1381 v += 16; 1382 1383 vi = aj + diag[i] + 1; 1384 nz = ai[i+1] - diag[i] - 1; 1385 while (nz--) { 1386 oidx = 4*(*vi++); 1387 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1388 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1389 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1390 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1391 v += 16; 1392 } 1393 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1394 idx += 4; 1395 } 1396 /* backward solve the L^T */ 1397 for (i=n-1; i>=0; i--) { 1398 v = aa + 16*diag[i] - 16; 1399 vi = aj + diag[i] - 1; 1400 nz = diag[i] - ai[i]; 1401 idt = 4*i; 1402 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1403 while (nz--) { 1404 idx = 4*(*vi--); 1405 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1406 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1407 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1408 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1409 v -= 16; 1410 } 1411 } 1412 1413 /* copy t into x according to permutation */ 1414 ii = 0; 
1415 for (i=0; i<n; i++) { 1416 ir = 4*r[i]; 1417 x[ir] = t[ii]; 1418 x[ir+1] = t[ii+1]; 1419 x[ir+2] = t[ii+2]; 1420 x[ir+3] = t[ii+3]; 1421 ii += 4; 1422 } 1423 1424 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1425 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1426 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1427 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1428 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1429 PetscFunctionReturn(0); 1430 } 1431 1432 #undef __FUNCT__ 1433 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1434 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1435 { 1436 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1437 PetscErrorCode ierr; 1438 IS iscol=a->col,isrow=a->row; 1439 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1440 const PetscInt *r,*c,*rout,*cout; 1441 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1442 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1443 const MatScalar *aa=a->a,*v; 1444 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1445 const PetscScalar *b; 1446 1447 PetscFunctionBegin; 1448 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1449 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1450 t = a->solve_work; 1451 1452 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1453 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1454 1455 /* copy b into temp work space according to permutation */ 1456 for (i=0;i<n;i++) { 1457 ii = bs*i; ic = bs*c[i]; 1458 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1459 } 1460 1461 /* forward solve the U^T */ 1462 idx = 0; 1463 for (i=0; i<n; i++) { 1464 v = aa + bs2*diag[i]; 1465 /* multiply by the inverse of the block diagonal */ 1466 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1467 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1468 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1469 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1470 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1471 v -= bs2; 1472 1473 
vi = aj + diag[i] - 1; 1474 nz = diag[i] - diag[i+1] - 1; 1475 for (j=0;j>-nz;j--) { 1476 oidx = bs*vi[j]; 1477 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1478 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1479 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1480 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1481 v -= bs2; 1482 } 1483 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 1484 idx += bs; 1485 } 1486 /* backward solve the L^T */ 1487 for (i=n-1; i>=0; i--) { 1488 v = aa + bs2*ai[i]; 1489 vi = aj + ai[i]; 1490 nz = ai[i+1] - ai[i]; 1491 idt = bs*i; 1492 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 1493 for (j=0;j<nz;j++) { 1494 idx = bs*vi[j]; 1495 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1496 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1497 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1498 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1499 v += bs2; 1500 } 1501 } 1502 1503 /* copy t into x according to permutation */ 1504 for (i=0;i<n;i++) { 1505 ii = bs*i; ir = bs*r[i]; 1506 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1507 } 1508 1509 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1510 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1511 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1512 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1513 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1514 PetscFunctionReturn(0); 1515 } 1516 1517 #undef __FUNCT__ 1518 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 1519 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1520 { 1521 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1522 IS iscol=a->col,isrow=a->row; 1523 PetscErrorCode ierr; 1524 const PetscInt *r,*c,*rout,*cout; 1525 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1526 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1527 const MatScalar *aa=a->a,*v; 1528 PetscScalar 
s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1529 const PetscScalar *b; 1530 1531 PetscFunctionBegin; 1532 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1533 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1534 t = a->solve_work; 1535 1536 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1537 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1538 1539 /* copy the b into temp work space according to permutation */ 1540 ii = 0; 1541 for (i=0; i<n; i++) { 1542 ic = 5*c[i]; 1543 t[ii] = b[ic]; 1544 t[ii+1] = b[ic+1]; 1545 t[ii+2] = b[ic+2]; 1546 t[ii+3] = b[ic+3]; 1547 t[ii+4] = b[ic+4]; 1548 ii += 5; 1549 } 1550 1551 /* forward solve the U^T */ 1552 idx = 0; 1553 for (i=0; i<n; i++) { 1554 1555 v = aa + 25*diag[i]; 1556 /* multiply by the inverse of the block diagonal */ 1557 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1558 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1559 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1560 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1561 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1562 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1563 v += 25; 1564 1565 vi = aj + diag[i] + 1; 1566 nz = ai[i+1] - diag[i] - 1; 1567 while (nz--) { 1568 oidx = 5*(*vi++); 1569 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1570 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1571 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1572 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1573 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1574 v += 25; 1575 } 1576 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1577 idx += 5; 1578 } 1579 /* backward solve the L^T */ 1580 for (i=n-1; i>=0; i--) { 1581 v = aa + 25*diag[i] - 25; 1582 vi = aj + diag[i] - 1; 1583 nz = diag[i] - ai[i]; 1584 idt = 5*i; 1585 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = 
t[4+idt]; 1586 while (nz--) { 1587 idx = 5*(*vi--); 1588 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1589 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1590 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1591 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1592 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1593 v -= 25; 1594 } 1595 } 1596 1597 /* copy t into x according to permutation */ 1598 ii = 0; 1599 for (i=0; i<n; i++) { 1600 ir = 5*r[i]; 1601 x[ir] = t[ii]; 1602 x[ir+1] = t[ii+1]; 1603 x[ir+2] = t[ii+2]; 1604 x[ir+3] = t[ii+3]; 1605 x[ir+4] = t[ii+4]; 1606 ii += 5; 1607 } 1608 1609 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1610 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1611 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1612 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1613 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1614 PetscFunctionReturn(0); 1615 } 1616 1617 #undef __FUNCT__ 1618 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1619 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1620 { 1621 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1622 PetscErrorCode ierr; 1623 IS iscol=a->col,isrow=a->row; 1624 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1625 const PetscInt *r,*c,*rout,*cout; 1626 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1627 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1628 const MatScalar *aa=a->a,*v; 1629 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1630 const PetscScalar *b; 1631 1632 PetscFunctionBegin; 1633 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1634 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1635 t = a->solve_work; 1636 1637 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1638 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1639 1640 /* copy b into temp work space according to permutation */ 1641 for (i=0;i<n;i++) { 1642 ii = bs*i; ic = bs*c[i]; 1643 t[ii] = 
b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1644 t[ii+4] = b[ic+4]; 1645 } 1646 1647 /* forward solve the U^T */ 1648 idx = 0; 1649 for (i=0; i<n; i++) { 1650 v = aa + bs2*diag[i]; 1651 /* multiply by the inverse of the block diagonal */ 1652 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1653 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1654 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1655 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1656 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1657 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1658 v -= bs2; 1659 1660 vi = aj + diag[i] - 1; 1661 nz = diag[i] - diag[i+1] - 1; 1662 for (j=0;j>-nz;j--) { 1663 oidx = bs*vi[j]; 1664 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1665 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1666 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1667 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1668 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1669 v -= bs2; 1670 } 1671 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1672 idx += bs; 1673 } 1674 /* backward solve the L^T */ 1675 for (i=n-1; i>=0; i--) { 1676 v = aa + bs2*ai[i]; 1677 vi = aj + ai[i]; 1678 nz = ai[i+1] - ai[i]; 1679 idt = bs*i; 1680 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1681 for (j=0;j<nz;j++) { 1682 idx = bs*vi[j]; 1683 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1684 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1685 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1686 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1687 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1688 v += bs2; 1689 } 1690 } 1691 1692 /* copy t into x according to permutation */ 1693 for (i=0;i<n;i++) { 1694 ii = bs*i; ir = 
bs*r[i]; 1695 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1696 x[ir+4] = t[ii+4]; 1697 } 1698 1699 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1700 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1701 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1703 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1704 PetscFunctionReturn(0); 1705 } 1706 1707 #undef __FUNCT__ 1708 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 1709 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1710 { 1711 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1712 IS iscol=a->col,isrow=a->row; 1713 PetscErrorCode ierr; 1714 const PetscInt *r,*c,*rout,*cout; 1715 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1716 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1717 const MatScalar *aa=a->a,*v; 1718 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1719 const PetscScalar *b; 1720 1721 PetscFunctionBegin; 1722 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1723 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1724 t = a->solve_work; 1725 1726 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1727 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1728 1729 /* copy the b into temp work space according to permutation */ 1730 ii = 0; 1731 for (i=0; i<n; i++) { 1732 ic = 6*c[i]; 1733 t[ii] = b[ic]; 1734 t[ii+1] = b[ic+1]; 1735 t[ii+2] = b[ic+2]; 1736 t[ii+3] = b[ic+3]; 1737 t[ii+4] = b[ic+4]; 1738 t[ii+5] = b[ic+5]; 1739 ii += 6; 1740 } 1741 1742 /* forward solve the U^T */ 1743 idx = 0; 1744 for (i=0; i<n; i++) { 1745 1746 v = aa + 36*diag[i]; 1747 /* multiply by the inverse of the block diagonal */ 1748 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1749 x6 = t[5+idx]; 1750 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1751 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1752 s3 = v[12]*x1 + 
v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1753 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1754 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1755 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1756 v += 36; 1757 1758 vi = aj + diag[i] + 1; 1759 nz = ai[i+1] - diag[i] - 1; 1760 while (nz--) { 1761 oidx = 6*(*vi++); 1762 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1763 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1764 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1765 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1766 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1767 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1768 v += 36; 1769 } 1770 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1771 t[5+idx] = s6; 1772 idx += 6; 1773 } 1774 /* backward solve the L^T */ 1775 for (i=n-1; i>=0; i--) { 1776 v = aa + 36*diag[i] - 36; 1777 vi = aj + diag[i] - 1; 1778 nz = diag[i] - ai[i]; 1779 idt = 6*i; 1780 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1781 s6 = t[5+idt]; 1782 while (nz--) { 1783 idx = 6*(*vi--); 1784 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1785 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1786 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1787 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1788 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1789 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1790 v -= 36; 1791 } 1792 } 1793 1794 /* copy t into x according to permutation */ 1795 ii = 0; 1796 for (i=0; i<n; i++) { 1797 ir = 6*r[i]; 1798 x[ir] = t[ii]; 1799 x[ir+1] = t[ii+1]; 
1800 x[ir+2] = t[ii+2]; 1801 x[ir+3] = t[ii+3]; 1802 x[ir+4] = t[ii+4]; 1803 x[ir+5] = t[ii+5]; 1804 ii += 6; 1805 } 1806 1807 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1808 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1809 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1810 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1811 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1812 PetscFunctionReturn(0); 1813 } 1814 1815 #undef __FUNCT__ 1816 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1817 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1818 { 1819 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1820 PetscErrorCode ierr; 1821 IS iscol=a->col,isrow=a->row; 1822 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1823 const PetscInt *r,*c,*rout,*cout; 1824 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1825 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1826 const MatScalar *aa=a->a,*v; 1827 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1828 const PetscScalar *b; 1829 1830 PetscFunctionBegin; 1831 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1832 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1833 t = a->solve_work; 1834 1835 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1836 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1837 1838 /* copy b into temp work space according to permutation */ 1839 for (i=0;i<n;i++) { 1840 ii = bs*i; ic = bs*c[i]; 1841 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1842 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 1843 } 1844 1845 /* forward solve the U^T */ 1846 idx = 0; 1847 for (i=0; i<n; i++) { 1848 v = aa + bs2*diag[i]; 1849 /* multiply by the inverse of the block diagonal */ 1850 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1851 x6 = t[5+idx]; 1852 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1853 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1854 s3 = v[12]*x1 + v[13]*x2 + 
v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1855 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1856 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1857 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1858 v -= bs2; 1859 1860 vi = aj + diag[i] - 1; 1861 nz = diag[i] - diag[i+1] - 1; 1862 for (j=0;j>-nz;j--) { 1863 oidx = bs*vi[j]; 1864 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1865 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1866 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1867 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1868 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1869 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1870 v -= bs2; 1871 } 1872 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1873 t[5+idx] = s6; 1874 idx += bs; 1875 } 1876 /* backward solve the L^T */ 1877 for (i=n-1; i>=0; i--) { 1878 v = aa + bs2*ai[i]; 1879 vi = aj + ai[i]; 1880 nz = ai[i+1] - ai[i]; 1881 idt = bs*i; 1882 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1883 s6 = t[5+idt]; 1884 for (j=0;j<nz;j++) { 1885 idx = bs*vi[j]; 1886 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1887 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1888 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1889 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1890 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1891 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1892 v += bs2; 1893 } 1894 } 1895 1896 /* copy t into x according to permutation */ 1897 for (i=0;i<n;i++) { 1898 ii = bs*i; ir = bs*r[i]; 1899 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = 
t[ii+2]; x[ir+3] = t[ii+3]; 1900 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 1901 } 1902 1903 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1904 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1905 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1906 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1907 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1908 PetscFunctionReturn(0); 1909 } 1910 1911 #undef __FUNCT__ 1912 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 1913 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1914 { 1915 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1916 IS iscol=a->col,isrow=a->row; 1917 PetscErrorCode ierr; 1918 const PetscInt *r,*c,*rout,*cout; 1919 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1920 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1921 const MatScalar *aa=a->a,*v; 1922 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1923 const PetscScalar *b; 1924 1925 PetscFunctionBegin; 1926 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1927 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1928 t = a->solve_work; 1929 1930 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1931 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1932 1933 /* copy the b into temp work space according to permutation */ 1934 ii = 0; 1935 for (i=0; i<n; i++) { 1936 ic = 7*c[i]; 1937 t[ii] = b[ic]; 1938 t[ii+1] = b[ic+1]; 1939 t[ii+2] = b[ic+2]; 1940 t[ii+3] = b[ic+3]; 1941 t[ii+4] = b[ic+4]; 1942 t[ii+5] = b[ic+5]; 1943 t[ii+6] = b[ic+6]; 1944 ii += 7; 1945 } 1946 1947 /* forward solve the U^T */ 1948 idx = 0; 1949 for (i=0; i<n; i++) { 1950 1951 v = aa + 49*diag[i]; 1952 /* multiply by the inverse of the block diagonal */ 1953 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1954 x6 = t[5+idx]; x7 = t[6+idx]; 1955 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1956 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + 
v[13]*x7; 1957 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1958 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1959 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1960 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1961 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1962 v += 49; 1963 1964 vi = aj + diag[i] + 1; 1965 nz = ai[i+1] - diag[i] - 1; 1966 while (nz--) { 1967 oidx = 7*(*vi++); 1968 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1969 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1970 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1971 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1972 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1973 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1974 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1975 v += 49; 1976 } 1977 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1978 t[5+idx] = s6;t[6+idx] = s7; 1979 idx += 7; 1980 } 1981 /* backward solve the L^T */ 1982 for (i=n-1; i>=0; i--) { 1983 v = aa + 49*diag[i] - 49; 1984 vi = aj + diag[i] - 1; 1985 nz = diag[i] - ai[i]; 1986 idt = 7*i; 1987 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1988 s6 = t[5+idt];s7 = t[6+idt]; 1989 while (nz--) { 1990 idx = 7*(*vi--); 1991 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1992 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1993 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1994 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + 
v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1995 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1996 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1997 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1998 v -= 49; 1999 } 2000 } 2001 2002 /* copy t into x according to permutation */ 2003 ii = 0; 2004 for (i=0; i<n; i++) { 2005 ir = 7*r[i]; 2006 x[ir] = t[ii]; 2007 x[ir+1] = t[ii+1]; 2008 x[ir+2] = t[ii+2]; 2009 x[ir+3] = t[ii+3]; 2010 x[ir+4] = t[ii+4]; 2011 x[ir+5] = t[ii+5]; 2012 x[ir+6] = t[ii+6]; 2013 ii += 7; 2014 } 2015 2016 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2017 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2018 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2019 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2020 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2021 PetscFunctionReturn(0); 2022 } 2023 #undef __FUNCT__ 2024 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 2025 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2026 { 2027 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2028 PetscErrorCode ierr; 2029 IS iscol=a->col,isrow=a->row; 2030 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2031 const PetscInt *r,*c,*rout,*cout; 2032 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 2033 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2034 const MatScalar *aa=a->a,*v; 2035 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2036 const PetscScalar *b; 2037 2038 PetscFunctionBegin; 2039 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2040 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2041 t = a->solve_work; 2042 2043 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2044 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2045 2046 /* copy b into temp work space according to permutation */ 2047 for (i=0;i<n;i++) { 2048 ii = bs*i; ic = bs*c[i]; 2049 t[ii] = b[ic]; t[ii+1] = 
b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 2050 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 2051 } 2052 2053 /* forward solve the U^T */ 2054 idx = 0; 2055 for (i=0; i<n; i++) { 2056 v = aa + bs2*diag[i]; 2057 /* multiply by the inverse of the block diagonal */ 2058 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2059 x6 = t[5+idx]; x7 = t[6+idx]; 2060 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 2061 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 2062 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 2063 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 2064 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 2065 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 2066 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 2067 v -= bs2; 2068 2069 vi = aj + diag[i] - 1; 2070 nz = diag[i] - diag[i+1] - 1; 2071 for (j=0;j>-nz;j--) { 2072 oidx = bs*vi[j]; 2073 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2074 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2075 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2076 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2077 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2078 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2079 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2080 v -= bs2; 2081 } 2082 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 2083 t[5+idx] = s6; t[6+idx] = s7; 2084 idx += bs; 2085 } 2086 /* backward solve the L^T */ 2087 for (i=n-1; i>=0; i--) { 2088 v = aa + 
bs2*ai[i]; 2089 vi = aj + ai[i]; 2090 nz = ai[i+1] - ai[i]; 2091 idt = bs*i; 2092 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2093 s6 = t[5+idt]; s7 = t[6+idt]; 2094 for (j=0;j<nz;j++) { 2095 idx = bs*vi[j]; 2096 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2097 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2098 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2099 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2100 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2101 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2102 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2103 v += bs2; 2104 } 2105 } 2106 2107 /* copy t into x according to permutation */ 2108 for (i=0;i<n;i++) { 2109 ii = bs*i; ir = bs*r[i]; 2110 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 2111 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 2112 } 2113 2114 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2115 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2116 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2117 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2118 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2119 PetscFunctionReturn(0); 2120 } 2121

/* ----------------------------------------------------------- */
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
/*
   MatSolve_SeqBAIJ_N_inplace - forward and backward triangular solves for an
   arbitrary block size bs = A->rmap->bs (each block is bs2 = a->bs2 entries of a->a).

   Factor layout used here: a->diag[i] is the position of the diagonal block inside
   block row i; positions ai[i] .. diag[i]-1 are used in the lower-triangular pass and
   positions diag[i]+1 .. ai[i+1]-1 in the upper-triangular pass.

   Input:
     A  - the factored matrix
     bb - right-hand side, read block-wise through the index set a->row
   Output:
     xx - solution, written block-wise through the index set a->col

   a->solve_work is the intermediate vector t; the bs entries starting at
   a->solve_work + A->cmap->n serve as a one-block scratch area.
*/
PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  IS                iscol = a->col,isrow = a->row;
  PetscErrorCode    ierr;
  const PetscInt    *rowperm,*colperm;
  const PetscInt    *ai = a->i,*aj = a->j,*diag = a->diag;
  const PetscInt    n = a->mbs,bs = A->rmap->bs,bs2 = a->bs2;
  PetscInt          i,k,first,nz;
  const MatScalar   *aa = a->a,*blk;
  PetscScalar       *x,*t,*ti,*ls;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rowperm);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&colperm);CHKERRQ(ierr);

  /* forward solve the lower triangular; block row 0 is a plain copy */
  ierr = PetscMemcpy(t,b+bs*rowperm[0],bs*sizeof(PetscScalar));CHKERRQ(ierr);
  for (i=1; i<n; i++) {
    ti   = t + bs*i;
    ierr = PetscMemcpy(ti,b+bs*rowperm[i],bs*sizeof(PetscScalar));CHKERRQ(ierr);

    /* subtract block*t[col] for every block stored before the diagonal */
    first = ai[i];
    nz    = diag[i] - first;
    for (k=0; k<nz; k++) {
      blk = aa + bs2*(first + k);
      PetscKernel_v_gets_v_minus_A_times_w(bs,ti,blk,t+bs*aj[first+k]);
    }
  }

  /* backward solve the upper triangular */
  ls = a->solve_work + A->cmap->n;
  for (i=n-1; i>=0; i--) {
    ti   = t + bs*i;
    ierr = PetscMemcpy(ls,ti,bs*sizeof(PetscScalar));CHKERRQ(ierr);

    /* accumulate in the scratch block so t[i] stays intact until the final product */
    first = diag[i] + 1;
    nz    = ai[i+1] - first;
    for (k=0; k<nz; k++) {
      blk = aa + bs2*(first + k);
      PetscKernel_v_gets_v_minus_A_times_w(bs,ls,blk,t+bs*aj[first+k]);
    }

    /* t[i] = (diagonal block) * ls; the block is multiplied in, so presumably it holds the inverse */
    PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*diag[i],ti);
    ierr = PetscMemcpy(x+bs*colperm[i],ti,bs*sizeof(PetscScalar));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(isrow,&rowperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&colperm);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2181 2182 /* ----------------------------------------------------------- */ 2183 #undef __FUNCT__ 2184 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 2185 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2186 { 2187 Mat_SeqBAIJ *a=(Mat_SeqBAIJ 
*)A->data; 2188 IS iscol=a->col,isrow=a->row; 2189 PetscErrorCode ierr; 2190 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2191 PetscInt i,nz,j; 2192 const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2; 2193 const MatScalar *aa=a->a,*v; 2194 PetscScalar *x,*t,*ls; 2195 const PetscScalar *b; 2196 2197 PetscFunctionBegin; 2198 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2199 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2200 t = a->solve_work; 2201 2202 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2203 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2204 2205 /* copy the b into temp work space according to permutation */ 2206 for (i=0; i<n; i++) { 2207 for (j=0; j<bs; j++) { 2208 t[i*bs+j] = b[c[i]*bs+j]; 2209 } 2210 } 2211 2212 2213 /* forward solve the upper triangular transpose */ 2214 ls = a->solve_work + A->cmap->n; 2215 for (i=0; i<n; i++) { 2216 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2217 PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2218 v = aa + bs2*(a->diag[i] + 1); 2219 vi = aj + a->diag[i] + 1; 2220 nz = ai[i+1] - a->diag[i] - 1; 2221 while (nz--) { 2222 PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2223 v += bs2; 2224 } 2225 } 2226 2227 /* backward solve the lower triangular transpose */ 2228 for (i=n-1; i>=0; i--) { 2229 v = aa + bs2*ai[i]; 2230 vi = aj + ai[i]; 2231 nz = a->diag[i] - ai[i]; 2232 while (nz--) { 2233 PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2234 v += bs2; 2235 } 2236 } 2237 2238 /* copy t into x according to permutation */ 2239 for (i=0; i<n; i++) { 2240 for (j=0; j<bs; j++) { 2241 x[bs*r[i]+j] = t[bs*i+j]; 2242 } 2243 } 2244 2245 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2246 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2247 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2248 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2249 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - 
A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2250 PetscFunctionReturn(0); 2251 } 2252 2253 #undef __FUNCT__ 2254 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 2255 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2256 { 2257 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2258 IS iscol=a->col,isrow=a->row; 2259 PetscErrorCode ierr; 2260 const PetscInt *r,*c,*rout,*cout; 2261 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2262 PetscInt i,j,nz; 2263 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2264 const MatScalar *aa=a->a,*v; 2265 PetscScalar *x,*t,*ls; 2266 const PetscScalar *b; 2267 2268 PetscFunctionBegin; 2269 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2270 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2271 t = a->solve_work; 2272 2273 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2274 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2275 2276 /* copy the b into temp work space according to permutation */ 2277 for (i=0; i<n; i++) { 2278 for (j=0; j<bs; j++) { 2279 t[i*bs+j] = b[c[i]*bs+j]; 2280 } 2281 } 2282 2283 2284 /* forward solve the upper triangular transpose */ 2285 ls = a->solve_work + A->cmap->n; 2286 for (i=0; i<n; i++) { 2287 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2288 PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 2289 v = aa + bs2*(diag[i] - 1); 2290 vi = aj + diag[i] - 1; 2291 nz = diag[i] - diag[i+1] - 1; 2292 for (j=0;j>-nz;j--) { 2293 PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2294 v -= bs2; 2295 } 2296 } 2297 2298 /* backward solve the lower triangular transpose */ 2299 for (i=n-1; i>=0; i--) { 2300 v = aa + bs2*ai[i]; 2301 vi = aj + ai[i]; 2302 nz = ai[i+1] - ai[i]; 2303 for (j=0;j<nz;j++) { 2304 PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2305 v += bs2; 2306 } 2307 } 2308 2309 /* copy t into x according to permutation */ 2310 for (i=0; i<n; i++) { 2311 for (j=0; j<bs; j++) { 2312 x[bs*r[i]+j] = t[bs*i+j]; 2313 } 2314 
} 2315 2316 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2317 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2318 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2319 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2320 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2321 PetscFunctionReturn(0); 2322 } 2323 2324 /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */ 2325 2326 #undef __FUNCT__ 2327 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2328 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 2329 { 2330 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2331 PetscErrorCode ierr; 2332 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2333 PetscInt i,nz,idx,idt,m; 2334 const MatScalar *aa=a->a,*v; 2335 PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 2336 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 2337 PetscScalar *x; 2338 const PetscScalar *b; 2339 2340 PetscFunctionBegin; 2341 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2342 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2343 2344 /* forward solve the lower triangular */ 2345 idx = 0; 2346 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 2347 x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx]; 2348 x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 2349 2350 for (i=1; i<n; i++) { 2351 v = aa + bs2*ai[i]; 2352 vi = aj + ai[i]; 2353 nz = ai[i+1] - ai[i]; 2354 idt = bs*i; 2355 s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 2356 s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 2357 s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 2358 for (m=0;m<nz;m++) { 2359 idx = bs*vi[m]; 2360 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = 
x[3+idx]; x5 = x[4+idx]; 2361 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2362 x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 2363 2364 2365 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2366 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2367 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2368 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2369 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2370 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2371 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2372 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2373 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2374 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + 
v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2375 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2376 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2377 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2378 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2379 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2380 2381 v += bs2; 2382 } 2383 x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 2384 x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 2385 x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 2386 2387 } 2388 /* backward solve the upper triangular */ 2389 for (i=n-1; i>=0; i--) { 2390 v = aa + bs2*(adiag[i+1]+1); 2391 vi = aj + adiag[i+1]+1; 2392 nz = adiag[i] - adiag[i+1] - 1; 2393 idt = bs*i; 2394 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 2395 s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 2396 s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 2397 2398 for (m=0;m<nz;m++) { 2399 idx = bs*vi[m]; 2400 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2401 x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 2402 x11 = 
x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 2403 2404 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2405 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2406 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2407 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2408 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2409 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2410 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2411 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2412 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2413 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2414 s11 -= v[10]*x1 + v[25]*x2 + 
v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2415 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2416 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2417 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2418 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2419 2420 v += bs2; 2421 } 2422 2423 x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 2424 x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 2425 x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 2426 x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 2427 x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 2428 x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + 
v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 2429 x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 2430 x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 2431 x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 2432 x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 2433 x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 2434 x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 2435 x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 2436 x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 2437 x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 2438 2439 } 2440 2441 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2442 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2443 ierr = 
PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2444 PetscFunctionReturn(0); 2445 } 2446 2447 /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at at time */ 2448 /* Default MatSolve for block size 15 */ 2449 2450 #undef __FUNCT__ 2451 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1" 2452 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx) 2453 { 2454 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2455 PetscErrorCode ierr; 2456 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2457 PetscInt i,k,nz,idx,idt,m; 2458 const MatScalar *aa=a->a,*v; 2459 PetscScalar s[15]; 2460 PetscScalar *x,xv; 2461 const PetscScalar *b; 2462 2463 PetscFunctionBegin; 2464 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2465 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2466 2467 /* forward solve the lower triangular */ 2468 for (i=0; i<n; i++) { 2469 v = aa + bs2*ai[i]; 2470 vi = aj + ai[i]; 2471 nz = ai[i+1] - ai[i]; 2472 idt = bs*i; 2473 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2474 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2475 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt]; 2476 for (m=0;m<nz;m++) { 2477 idx = bs*vi[m]; 2478 for (k=0;k<15;k++) { 2479 xv = x[k + idx]; 2480 x[idt] -= v[0]*xv; 2481 x[1+idt] -= v[1]*xv; 2482 x[2+idt] -= v[2]*xv; 2483 x[3+idt] -= v[3]*xv; 2484 x[4+idt] -= v[4]*xv; 2485 x[5+idt] -= v[5]*xv; 2486 x[6+idt] -= v[6]*xv; 2487 x[7+idt] -= v[7]*xv; 2488 x[8+idt] -= v[8]*xv; 2489 x[9+idt] -= v[9]*xv; 2490 x[10+idt] -= v[10]*xv; 2491 x[11+idt] -= v[11]*xv; 2492 x[12+idt] -= v[12]*xv; 2493 x[13+idt] -= v[13]*xv; 2494 x[14+idt] -= v[14]*xv; 2495 v += 15; 2496 } 2497 } 2498 } 2499 /* backward solve the upper triangular */ 2500 for (i=n-1; i>=0; i--) { 2501 v = aa + bs2*(adiag[i+1]+1); 
2502 vi = aj + adiag[i+1]+1; 2503 nz = adiag[i] - adiag[i+1] - 1; 2504 idt = bs*i; 2505 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 2506 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 2507 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt]; 2508 2509 for (m=0;m<nz;m++) { 2510 idx = bs*vi[m]; 2511 for (k=0;k<15;k++) { 2512 xv = x[k + idx]; 2513 s[0] -= v[0]*xv; 2514 s[1] -= v[1]*xv; 2515 s[2] -= v[2]*xv; 2516 s[3] -= v[3]*xv; 2517 s[4] -= v[4]*xv; 2518 s[5] -= v[5]*xv; 2519 s[6] -= v[6]*xv; 2520 s[7] -= v[7]*xv; 2521 s[8] -= v[8]*xv; 2522 s[9] -= v[9]*xv; 2523 s[10] -= v[10]*xv; 2524 s[11] -= v[11]*xv; 2525 s[12] -= v[12]*xv; 2526 s[13] -= v[13]*xv; 2527 s[14] -= v[14]*xv; 2528 v += 15; 2529 } 2530 } 2531 ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr); 2532 for (k=0;k<15;k++) { 2533 x[idt] += v[0]*s[k]; 2534 x[1+idt] += v[1]*s[k]; 2535 x[2+idt] += v[2]*s[k]; 2536 x[3+idt] += v[3]*s[k]; 2537 x[4+idt] += v[4]*s[k]; 2538 x[5+idt] += v[5]*s[k]; 2539 x[6+idt] += v[6]*s[k]; 2540 x[7+idt] += v[7]*s[k]; 2541 x[8+idt] += v[8]*s[k]; 2542 x[9+idt] += v[9]*s[k]; 2543 x[10+idt] += v[10]*s[k]; 2544 x[11+idt] += v[11]*s[k]; 2545 x[12+idt] += v[12]*s[k]; 2546 x[13+idt] += v[13]*s[k]; 2547 x[14+idt] += v[14]*s[k]; 2548 v += 15; 2549 } 2550 } 2551 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2552 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2553 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2554 PetscFunctionReturn(0); 2555 } 2556 2557 2558 #undef __FUNCT__ 2559 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 2560 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 2561 { 2562 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2563 IS iscol=a->col,isrow=a->row; 2564 PetscErrorCode ierr; 2565 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2566 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2567 PetscInt i,nz,idx,idt,idc; 
2568 const MatScalar *aa=a->a,*v; 2569 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2570 const PetscScalar *b; 2571 2572 PetscFunctionBegin; 2573 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2574 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2575 t = a->solve_work; 2576 2577 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2578 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2579 2580 /* forward solve the lower triangular */ 2581 idx = 7*(*r++); 2582 t[0] = b[idx]; t[1] = b[1+idx]; 2583 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2584 t[5] = b[5+idx]; t[6] = b[6+idx]; 2585 2586 for (i=1; i<n; i++) { 2587 v = aa + 49*ai[i]; 2588 vi = aj + ai[i]; 2589 nz = diag[i] - ai[i]; 2590 idx = 7*(*r++); 2591 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2592 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2593 while (nz--) { 2594 idx = 7*(*vi++); 2595 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2596 x4 = t[3+idx];x5 = t[4+idx]; 2597 x6 = t[5+idx];x7 = t[6+idx]; 2598 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2599 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2600 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2601 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2602 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2603 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2604 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2605 v += 49; 2606 } 2607 idx = 7*i; 2608 t[idx] = s1;t[1+idx] = s2; 2609 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2610 t[5+idx] = s6;t[6+idx] = s7; 2611 } 2612 /* backward solve the upper triangular */ 2613 for (i=n-1; i>=0; i--) { 2614 v = aa + 49*diag[i] + 49; 2615 vi = aj + diag[i] + 1; 2616 nz = ai[i+1] - diag[i] - 1; 2617 idt = 7*i; 2618 s1 = t[idt]; s2 = t[1+idt]; 2619 s3 = 
t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2620 s6 = t[5+idt];s7 = t[6+idt]; 2621 while (nz--) { 2622 idx = 7*(*vi++); 2623 x1 = t[idx]; x2 = t[1+idx]; 2624 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2625 x6 = t[5+idx]; x7 = t[6+idx]; 2626 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2627 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2628 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2629 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2630 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2631 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2632 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2633 v += 49; 2634 } 2635 idc = 7*(*c--); 2636 v = aa + 49*diag[i]; 2637 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2638 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2639 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2640 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2641 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2642 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2643 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2644 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2645 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2646 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2647 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2648 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2649 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2650 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2651 } 2652 2653 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2654 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2655 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2656 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2657 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2658 PetscFunctionReturn(0); 2659 } 2660 2661 #undef __FUNCT__ 2662 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 2663 PetscErrorCode 
MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2664 { 2665 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2666 IS iscol=a->col,isrow=a->row; 2667 PetscErrorCode ierr; 2668 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2669 const PetscInt n=a->mbs,*rout,*cout,*vi; 2670 PetscInt i,nz,idx,idt,idc,m; 2671 const MatScalar *aa=a->a,*v; 2672 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2673 const PetscScalar *b; 2674 2675 PetscFunctionBegin; 2676 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2677 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2678 t = a->solve_work; 2679 2680 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2681 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2682 2683 /* forward solve the lower triangular */ 2684 idx = 7*r[0]; 2685 t[0] = b[idx]; t[1] = b[1+idx]; 2686 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2687 t[5] = b[5+idx]; t[6] = b[6+idx]; 2688 2689 for (i=1; i<n; i++) { 2690 v = aa + 49*ai[i]; 2691 vi = aj + ai[i]; 2692 nz = ai[i+1] - ai[i]; 2693 idx = 7*r[i]; 2694 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2695 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2696 for (m=0;m<nz;m++) { 2697 idx = 7*vi[m]; 2698 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2699 x4 = t[3+idx];x5 = t[4+idx]; 2700 x6 = t[5+idx];x7 = t[6+idx]; 2701 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2702 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2703 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2704 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2705 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2706 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2707 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2708 v += 49; 2709 } 2710 idx = 7*i; 2711 t[idx] = s1;t[1+idx] = s2; 2712 t[2+idx] = s3;t[3+idx] = s4; 
t[4+idx] = s5; 2713 t[5+idx] = s6;t[6+idx] = s7; 2714 } 2715 /* backward solve the upper triangular */ 2716 for (i=n-1; i>=0; i--) { 2717 v = aa + 49*(adiag[i+1]+1); 2718 vi = aj + adiag[i+1]+1; 2719 nz = adiag[i] - adiag[i+1] - 1; 2720 idt = 7*i; 2721 s1 = t[idt]; s2 = t[1+idt]; 2722 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2723 s6 = t[5+idt];s7 = t[6+idt]; 2724 for (m=0;m<nz;m++) { 2725 idx = 7*vi[m]; 2726 x1 = t[idx]; x2 = t[1+idx]; 2727 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2728 x6 = t[5+idx]; x7 = t[6+idx]; 2729 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2730 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2731 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2732 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2733 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2734 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2735 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2736 v += 49; 2737 } 2738 idc = 7*c[i]; 2739 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2740 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2741 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2742 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2743 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2744 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2745 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2746 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2747 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2748 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2749 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2750 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2751 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2752 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2753 } 2754 2755 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2756 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2757 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2758 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2759 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2760 PetscFunctionReturn(0); 2761 } 2762 2763 #undef __FUNCT__ 2764 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 2765 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2766 { 2767 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2768 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2769 PetscErrorCode ierr; 2770 PetscInt i,nz,idx,idt,jdx; 2771 const MatScalar *aa=a->a,*v; 2772 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2773 const PetscScalar *b; 2774 2775 PetscFunctionBegin; 2776 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2777 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2778 /* forward solve the lower triangular */ 2779 idx = 0; 2780 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2781 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2782 x[6] = b[6+idx]; 2783 for (i=1; i<n; i++) { 2784 v = aa + 49*ai[i]; 2785 vi = aj + ai[i]; 2786 nz = diag[i] - ai[i]; 2787 idx = 7*i; 2788 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2789 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2790 s7 = b[6+idx]; 2791 while (nz--) { 2792 jdx = 7*(*vi++); 2793 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2794 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2795 x7 = x[6+jdx]; 2796 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2797 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2798 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2799 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2800 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2801 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2802 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2803 v 
+= 49; 2804 } 2805 x[idx] = s1; 2806 x[1+idx] = s2; 2807 x[2+idx] = s3; 2808 x[3+idx] = s4; 2809 x[4+idx] = s5; 2810 x[5+idx] = s6; 2811 x[6+idx] = s7; 2812 } 2813 /* backward solve the upper triangular */ 2814 for (i=n-1; i>=0; i--) { 2815 v = aa + 49*diag[i] + 49; 2816 vi = aj + diag[i] + 1; 2817 nz = ai[i+1] - diag[i] - 1; 2818 idt = 7*i; 2819 s1 = x[idt]; s2 = x[1+idt]; 2820 s3 = x[2+idt]; s4 = x[3+idt]; 2821 s5 = x[4+idt]; s6 = x[5+idt]; 2822 s7 = x[6+idt]; 2823 while (nz--) { 2824 idx = 7*(*vi++); 2825 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2826 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2827 x7 = x[6+idx]; 2828 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2829 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2830 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2831 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2832 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2833 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2834 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2835 v += 49; 2836 } 2837 v = aa + 49*diag[i]; 2838 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2839 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2840 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2841 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2842 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2843 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2844 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2845 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2846 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2847 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2848 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2849 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2850 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2851 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2852 } 2853 2854 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2855 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2856 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2857 PetscFunctionReturn(0); 2858 } 2859 2860 #undef __FUNCT__ 2861 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 2862 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 2863 { 2864 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2865 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 2866 PetscErrorCode ierr; 2867 PetscInt i,k,nz,idx,jdx,idt; 2868 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2869 const MatScalar *aa=a->a,*v; 2870 PetscScalar *x; 2871 const PetscScalar *b; 2872 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2873 2874 PetscFunctionBegin; 2875 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2876 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2877 /* forward solve the lower triangular */ 2878 idx = 0; 2879 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2880 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2881 for (i=1; i<n; i++) { 2882 v = aa + bs2*ai[i]; 2883 vi = aj + ai[i]; 2884 nz = ai[i+1] - ai[i]; 2885 idx = bs*i; 2886 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2887 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2888 for (k=0;k<nz;k++) { 2889 jdx = bs*vi[k]; 2890 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2891 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2892 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2893 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2894 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2895 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2896 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2897 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2898 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + 
v[34]*x5 + v[41]*x6 + v[48]*x7; 2899 v += bs2; 2900 } 2901 2902 x[idx] = s1; 2903 x[1+idx] = s2; 2904 x[2+idx] = s3; 2905 x[3+idx] = s4; 2906 x[4+idx] = s5; 2907 x[5+idx] = s6; 2908 x[6+idx] = s7; 2909 } 2910 2911 /* backward solve the upper triangular */ 2912 for (i=n-1; i>=0; i--) { 2913 v = aa + bs2*(adiag[i+1]+1); 2914 vi = aj + adiag[i+1]+1; 2915 nz = adiag[i] - adiag[i+1]-1; 2916 idt = bs*i; 2917 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2918 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2919 for (k=0;k<nz;k++) { 2920 idx = bs*vi[k]; 2921 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2922 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2923 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2924 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2925 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2926 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2927 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2928 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2929 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2930 v += bs2; 2931 } 2932 /* x = inv_diagonal*x */ 2933 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2934 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2935 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2936 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2937 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2938 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2939 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2940 } 2941 2942 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2943 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2944 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2945 PetscFunctionReturn(0); 2946 } 2947 2948 #undef __FUNCT__ 2949 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 2950 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 2951 { 2952 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2953 IS iscol=a->col,isrow=a->row; 2954 PetscErrorCode ierr; 2955 const PetscInt *r,*c,*rout,*cout; 2956 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2957 PetscInt i,nz,idx,idt,idc; 2958 const MatScalar *aa=a->a,*v; 2959 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2960 const PetscScalar *b; 2961 2962 PetscFunctionBegin; 2963 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2964 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2965 t = a->solve_work; 2966 2967 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2968 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2969 2970 /* forward solve the lower triangular */ 2971 idx = 6*(*r++); 2972 t[0] = b[idx]; t[1] = b[1+idx]; 2973 t[2] = b[2+idx]; t[3] = b[3+idx]; 2974 t[4] = b[4+idx]; t[5] = b[5+idx]; 2975 for (i=1; i<n; i++) { 2976 v = aa + 36*ai[i]; 2977 vi = aj + ai[i]; 2978 nz = diag[i] - ai[i]; 2979 idx = 6*(*r++); 2980 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2981 s5 = b[4+idx]; s6 = b[5+idx]; 2982 while (nz--) { 2983 idx = 6*(*vi++); 2984 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2985 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2986 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2987 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2988 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2989 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2990 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2991 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 
2992 v += 36; 2993 } 2994 idx = 6*i; 2995 t[idx] = s1;t[1+idx] = s2; 2996 t[2+idx] = s3;t[3+idx] = s4; 2997 t[4+idx] = s5;t[5+idx] = s6; 2998 } 2999 /* backward solve the upper triangular */ 3000 for (i=n-1; i>=0; i--) { 3001 v = aa + 36*diag[i] + 36; 3002 vi = aj + diag[i] + 1; 3003 nz = ai[i+1] - diag[i] - 1; 3004 idt = 6*i; 3005 s1 = t[idt]; s2 = t[1+idt]; 3006 s3 = t[2+idt];s4 = t[3+idt]; 3007 s5 = t[4+idt];s6 = t[5+idt]; 3008 while (nz--) { 3009 idx = 6*(*vi++); 3010 x1 = t[idx]; x2 = t[1+idx]; 3011 x3 = t[2+idx]; x4 = t[3+idx]; 3012 x5 = t[4+idx]; x6 = t[5+idx]; 3013 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3014 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3015 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3016 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3017 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3018 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3019 v += 36; 3020 } 3021 idc = 6*(*c--); 3022 v = aa + 36*diag[i]; 3023 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3024 v[18]*s4+v[24]*s5+v[30]*s6; 3025 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3026 v[19]*s4+v[25]*s5+v[31]*s6; 3027 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3028 v[20]*s4+v[26]*s5+v[32]*s6; 3029 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3030 v[21]*s4+v[27]*s5+v[33]*s6; 3031 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3032 v[22]*s4+v[28]*s5+v[34]*s6; 3033 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3034 v[23]*s4+v[29]*s5+v[35]*s6; 3035 } 3036 3037 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3038 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3039 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3040 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3041 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3042 PetscFunctionReturn(0); 3043 } 3044 3045 #undef __FUNCT__ 3046 #define __FUNCT__ 
"MatSolve_SeqBAIJ_6" 3047 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 3048 { 3049 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3050 IS iscol=a->col,isrow=a->row; 3051 PetscErrorCode ierr; 3052 const PetscInt *r,*c,*rout,*cout; 3053 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3054 PetscInt i,nz,idx,idt,idc,m; 3055 const MatScalar *aa=a->a,*v; 3056 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 3057 const PetscScalar *b; 3058 3059 PetscFunctionBegin; 3060 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3061 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3062 t = a->solve_work; 3063 3064 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3065 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3066 3067 /* forward solve the lower triangular */ 3068 idx = 6*r[0]; 3069 t[0] = b[idx]; t[1] = b[1+idx]; 3070 t[2] = b[2+idx]; t[3] = b[3+idx]; 3071 t[4] = b[4+idx]; t[5] = b[5+idx]; 3072 for (i=1; i<n; i++) { 3073 v = aa + 36*ai[i]; 3074 vi = aj + ai[i]; 3075 nz = ai[i+1] - ai[i]; 3076 idx = 6*r[i]; 3077 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3078 s5 = b[4+idx]; s6 = b[5+idx]; 3079 for (m=0;m<nz;m++) { 3080 idx = 6*vi[m]; 3081 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3082 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 3083 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3084 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3085 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3086 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3087 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3088 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3089 v += 36; 3090 } 3091 idx = 6*i; 3092 t[idx] = s1;t[1+idx] = s2; 3093 t[2+idx] = s3;t[3+idx] = s4; 3094 t[4+idx] = s5;t[5+idx] = s6; 3095 } 3096 /* backward solve the upper triangular */ 3097 for (i=n-1; i>=0; i--) { 3098 v = aa + 36*(adiag[i+1]+1); 3099 vi = aj + 
adiag[i+1]+1; 3100 nz = adiag[i] - adiag[i+1] - 1; 3101 idt = 6*i; 3102 s1 = t[idt]; s2 = t[1+idt]; 3103 s3 = t[2+idt];s4 = t[3+idt]; 3104 s5 = t[4+idt];s6 = t[5+idt]; 3105 for (m=0;m<nz;m++) { 3106 idx = 6*vi[m]; 3107 x1 = t[idx]; x2 = t[1+idx]; 3108 x3 = t[2+idx]; x4 = t[3+idx]; 3109 x5 = t[4+idx]; x6 = t[5+idx]; 3110 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3111 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3112 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3113 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3114 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3115 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3116 v += 36; 3117 } 3118 idc = 6*c[i]; 3119 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3120 v[18]*s4+v[24]*s5+v[30]*s6; 3121 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3122 v[19]*s4+v[25]*s5+v[31]*s6; 3123 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3124 v[20]*s4+v[26]*s5+v[32]*s6; 3125 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3126 v[21]*s4+v[27]*s5+v[33]*s6; 3127 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3128 v[22]*s4+v[28]*s5+v[34]*s6; 3129 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3130 v[23]*s4+v[29]*s5+v[35]*s6; 3131 } 3132 3133 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3134 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3135 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3136 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3137 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3138 PetscFunctionReturn(0); 3139 } 3140 3141 #undef __FUNCT__ 3142 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 3143 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3144 { 3145 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3146 PetscInt i,nz,idx,idt,jdx; 3147 PetscErrorCode ierr; 3148 const PetscInt *diag = 
a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 3149 const MatScalar *aa=a->a,*v; 3150 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3151 const PetscScalar *b; 3152 3153 PetscFunctionBegin; 3154 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3155 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3156 /* forward solve the lower triangular */ 3157 idx = 0; 3158 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 3159 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 3160 for (i=1; i<n; i++) { 3161 v = aa + 36*ai[i]; 3162 vi = aj + ai[i]; 3163 nz = diag[i] - ai[i]; 3164 idx = 6*i; 3165 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3166 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 3167 while (nz--) { 3168 jdx = 6*(*vi++); 3169 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 3170 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 3171 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3172 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3173 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3174 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3175 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3176 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3177 v += 36; 3178 } 3179 x[idx] = s1; 3180 x[1+idx] = s2; 3181 x[2+idx] = s3; 3182 x[3+idx] = s4; 3183 x[4+idx] = s5; 3184 x[5+idx] = s6; 3185 } 3186 /* backward solve the upper triangular */ 3187 for (i=n-1; i>=0; i--) { 3188 v = aa + 36*diag[i] + 36; 3189 vi = aj + diag[i] + 1; 3190 nz = ai[i+1] - diag[i] - 1; 3191 idt = 6*i; 3192 s1 = x[idt]; s2 = x[1+idt]; 3193 s3 = x[2+idt]; s4 = x[3+idt]; 3194 s5 = x[4+idt]; s6 = x[5+idt]; 3195 while (nz--) { 3196 idx = 6*(*vi++); 3197 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3198 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3199 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3200 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3201 s3 -= v[2]*x1 + 
v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3202 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3203 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3204 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3205 v += 36; 3206 } 3207 v = aa + 36*diag[i]; 3208 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3209 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3210 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3211 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3212 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3213 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3214 } 3215 3216 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3217 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3218 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3219 PetscFunctionReturn(0); 3220 } 3221 3222 #undef __FUNCT__ 3223 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 3224 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 3225 { 3226 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3227 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3228 PetscErrorCode ierr; 3229 PetscInt i,k,nz,idx,jdx,idt; 3230 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3231 const MatScalar *aa=a->a,*v; 3232 PetscScalar *x; 3233 const PetscScalar *b; 3234 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3235 3236 PetscFunctionBegin; 3237 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3238 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3239 /* forward solve the lower triangular */ 3240 idx = 0; 3241 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3242 x[4] = b[4+idx];x[5] = b[5+idx]; 3243 for (i=1; i<n; i++) { 3244 v = aa + bs2*ai[i]; 3245 vi = aj + ai[i]; 3246 nz = ai[i+1] - ai[i]; 3247 idx = bs*i; 3248 s1 = b[idx];s2 = b[1+idx];s3 
= b[2+idx];s4 = b[3+idx]; 3249 s5 = b[4+idx];s6 = b[5+idx]; 3250 for (k=0;k<nz;k++) { 3251 jdx = bs*vi[k]; 3252 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3253 x5 = x[4+jdx]; x6 = x[5+jdx]; 3254 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3255 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3256 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3257 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3258 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3259 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3260 v += bs2; 3261 } 3262 3263 x[idx] = s1; 3264 x[1+idx] = s2; 3265 x[2+idx] = s3; 3266 x[3+idx] = s4; 3267 x[4+idx] = s5; 3268 x[5+idx] = s6; 3269 } 3270 3271 /* backward solve the upper triangular */ 3272 for (i=n-1; i>=0; i--) { 3273 v = aa + bs2*(adiag[i+1]+1); 3274 vi = aj + adiag[i+1]+1; 3275 nz = adiag[i] - adiag[i+1]-1; 3276 idt = bs*i; 3277 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3278 s5 = x[4+idt];s6 = x[5+idt]; 3279 for (k=0;k<nz;k++) { 3280 idx = bs*vi[k]; 3281 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3282 x5 = x[4+idx];x6 = x[5+idx]; 3283 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3284 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3285 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3286 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3287 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3288 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3289 v += bs2; 3290 } 3291 /* x = inv_diagonal*x */ 3292 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3293 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3294 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3295 x[3+idt] = v[3]*s1 + 
v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3296 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3297 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3298 } 3299 3300 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3301 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3302 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3303 PetscFunctionReturn(0); 3304 } 3305 3306 #undef __FUNCT__ 3307 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 3308 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 3309 { 3310 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3311 IS iscol=a->col,isrow=a->row; 3312 PetscErrorCode ierr; 3313 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3314 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3315 PetscInt i,nz,idx,idt,idc; 3316 const MatScalar *aa=a->a,*v; 3317 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3318 const PetscScalar *b; 3319 3320 PetscFunctionBegin; 3321 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3322 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3323 t = a->solve_work; 3324 3325 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3326 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3327 3328 /* forward solve the lower triangular */ 3329 idx = 5*(*r++); 3330 t[0] = b[idx]; t[1] = b[1+idx]; 3331 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3332 for (i=1; i<n; i++) { 3333 v = aa + 25*ai[i]; 3334 vi = aj + ai[i]; 3335 nz = diag[i] - ai[i]; 3336 idx = 5*(*r++); 3337 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3338 s5 = b[4+idx]; 3339 while (nz--) { 3340 idx = 5*(*vi++); 3341 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3342 x4 = t[3+idx];x5 = t[4+idx]; 3343 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3344 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3345 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3346 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3347 s5 -= 
v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3348 v += 25; 3349 } 3350 idx = 5*i; 3351 t[idx] = s1;t[1+idx] = s2; 3352 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3353 } 3354 /* backward solve the upper triangular */ 3355 for (i=n-1; i>=0; i--) { 3356 v = aa + 25*diag[i] + 25; 3357 vi = aj + diag[i] + 1; 3358 nz = ai[i+1] - diag[i] - 1; 3359 idt = 5*i; 3360 s1 = t[idt]; s2 = t[1+idt]; 3361 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3362 while (nz--) { 3363 idx = 5*(*vi++); 3364 x1 = t[idx]; x2 = t[1+idx]; 3365 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3366 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3367 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3368 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3369 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3370 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3371 v += 25; 3372 } 3373 idc = 5*(*c--); 3374 v = aa + 25*diag[i]; 3375 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3376 v[15]*s4+v[20]*s5; 3377 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3378 v[16]*s4+v[21]*s5; 3379 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3380 v[17]*s4+v[22]*s5; 3381 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3382 v[18]*s4+v[23]*s5; 3383 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3384 v[19]*s4+v[24]*s5; 3385 } 3386 3387 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3388 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3389 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3390 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3391 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3392 PetscFunctionReturn(0); 3393 } 3394 3395 #undef __FUNCT__ 3396 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 3397 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 3398 { 3399 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3400 IS iscol=a->col,isrow=a->row; 3401 PetscErrorCode ierr; 3402 const PetscInt *r,*c,*rout,*cout; 3403 const PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3404 PetscInt i,nz,idx,idt,idc,m; 3405 const MatScalar *aa=a->a,*v; 3406 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3407 const PetscScalar *b; 3408 3409 PetscFunctionBegin; 3410 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3411 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3412 t = a->solve_work; 3413 3414 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3415 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3416 3417 /* forward solve the lower triangular */ 3418 idx = 5*r[0]; 3419 t[0] = b[idx]; t[1] = b[1+idx]; 3420 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3421 for (i=1; i<n; i++) { 3422 v = aa + 25*ai[i]; 3423 vi = aj + ai[i]; 3424 nz = ai[i+1] - ai[i]; 3425 idx = 5*r[i]; 3426 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3427 s5 = b[4+idx]; 3428 for (m=0;m<nz;m++) { 3429 idx = 5*vi[m]; 3430 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3431 x4 = t[3+idx];x5 = t[4+idx]; 3432 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3433 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3434 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3435 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3436 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3437 v += 25; 3438 } 3439 idx = 5*i; 3440 t[idx] = s1;t[1+idx] = s2; 3441 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3442 } 3443 /* backward solve the upper triangular */ 3444 for (i=n-1; i>=0; i--) { 3445 v = aa + 25*(adiag[i+1]+1); 3446 vi = aj + adiag[i+1]+1; 3447 nz = adiag[i] - adiag[i+1] - 1; 3448 idt = 5*i; 3449 s1 = t[idt]; s2 = t[1+idt]; 3450 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3451 for (m=0;m<nz;m++) { 3452 idx = 5*vi[m]; 3453 x1 = t[idx]; x2 = t[1+idx]; 3454 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3455 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3456 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3457 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 
3458 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3459 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3460 v += 25; 3461 } 3462 idc = 5*c[i]; 3463 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3464 v[15]*s4+v[20]*s5; 3465 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3466 v[16]*s4+v[21]*s5; 3467 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3468 v[17]*s4+v[22]*s5; 3469 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3470 v[18]*s4+v[23]*s5; 3471 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3472 v[19]*s4+v[24]*s5; 3473 } 3474 3475 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3476 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3477 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3478 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3479 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3480 PetscFunctionReturn(0); 3481 } 3482 3483 #undef __FUNCT__ 3484 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 3485 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3486 { 3487 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3488 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3489 PetscInt i,nz,idx,idt,jdx; 3490 PetscErrorCode ierr; 3491 const MatScalar *aa=a->a,*v; 3492 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3493 const PetscScalar *b; 3494 3495 PetscFunctionBegin; 3496 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3497 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3498 /* forward solve the lower triangular */ 3499 idx = 0; 3500 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3501 for (i=1; i<n; i++) { 3502 v = aa + 25*ai[i]; 3503 vi = aj + ai[i]; 3504 nz = diag[i] - ai[i]; 3505 idx = 5*i; 3506 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3507 while (nz--) { 3508 jdx = 5*(*vi++); 3509 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3510 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 
3511 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3512 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3513 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3514 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3515 v += 25; 3516 } 3517 x[idx] = s1; 3518 x[1+idx] = s2; 3519 x[2+idx] = s3; 3520 x[3+idx] = s4; 3521 x[4+idx] = s5; 3522 } 3523 /* backward solve the upper triangular */ 3524 for (i=n-1; i>=0; i--) { 3525 v = aa + 25*diag[i] + 25; 3526 vi = aj + diag[i] + 1; 3527 nz = ai[i+1] - diag[i] - 1; 3528 idt = 5*i; 3529 s1 = x[idt]; s2 = x[1+idt]; 3530 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3531 while (nz--) { 3532 idx = 5*(*vi++); 3533 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3534 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3535 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3536 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3537 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3538 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3539 v += 25; 3540 } 3541 v = aa + 25*diag[i]; 3542 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3543 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3544 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3545 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3546 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3547 } 3548 3549 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3550 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3551 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3552 PetscFunctionReturn(0); 3553 } 3554 3555 #undef __FUNCT__ 3556 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 3557 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 3558 { 3559 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3560 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3561 PetscInt 
i,k,nz,idx,idt,jdx; 3562 PetscErrorCode ierr; 3563 const MatScalar *aa=a->a,*v; 3564 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3565 const PetscScalar *b; 3566 3567 PetscFunctionBegin; 3568 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3569 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3570 /* forward solve the lower triangular */ 3571 idx = 0; 3572 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3573 for (i=1; i<n; i++) { 3574 v = aa + 25*ai[i]; 3575 vi = aj + ai[i]; 3576 nz = ai[i+1] - ai[i]; 3577 idx = 5*i; 3578 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3579 for (k=0;k<nz;k++) { 3580 jdx = 5*vi[k]; 3581 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3582 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3583 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3584 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3585 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3586 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3587 v += 25; 3588 } 3589 x[idx] = s1; 3590 x[1+idx] = s2; 3591 x[2+idx] = s3; 3592 x[3+idx] = s4; 3593 x[4+idx] = s5; 3594 } 3595 3596 /* backward solve the upper triangular */ 3597 for (i=n-1; i>=0; i--) { 3598 v = aa + 25*(adiag[i+1]+1); 3599 vi = aj + adiag[i+1]+1; 3600 nz = adiag[i] - adiag[i+1]-1; 3601 idt = 5*i; 3602 s1 = x[idt]; s2 = x[1+idt]; 3603 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3604 for (k=0;k<nz;k++) { 3605 idx = 5*vi[k]; 3606 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3607 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3608 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3609 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3610 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3611 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3612 v += 25; 3613 } 3614 /* x = inv_diagonal*x */ 3615 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + 
v[15]*s4 + v[20]*s5; 3616 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3617 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3618 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3619 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3620 } 3621 3622 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3623 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3624 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3625 PetscFunctionReturn(0); 3626 } 3627 3628 #undef __FUNCT__ 3629 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 3630 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 3631 { 3632 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3633 IS iscol=a->col,isrow=a->row; 3634 PetscErrorCode ierr; 3635 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3636 PetscInt i,nz,idx,idt,idc; 3637 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3638 const MatScalar *aa=a->a,*v; 3639 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3640 const PetscScalar *b; 3641 3642 PetscFunctionBegin; 3643 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3644 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3645 t = a->solve_work; 3646 3647 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3648 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3649 3650 /* forward solve the lower triangular */ 3651 idx = 4*(*r++); 3652 t[0] = b[idx]; t[1] = b[1+idx]; 3653 t[2] = b[2+idx]; t[3] = b[3+idx]; 3654 for (i=1; i<n; i++) { 3655 v = aa + 16*ai[i]; 3656 vi = aj + ai[i]; 3657 nz = diag[i] - ai[i]; 3658 idx = 4*(*r++); 3659 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3660 while (nz--) { 3661 idx = 4*(*vi++); 3662 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3663 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3664 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3665 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3666 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3667 v += 16; 3668 } 3669 idx = 4*i; 3670 
t[idx] = s1;t[1+idx] = s2; 3671 t[2+idx] = s3;t[3+idx] = s4; 3672 } 3673 /* backward solve the upper triangular */ 3674 for (i=n-1; i>=0; i--) { 3675 v = aa + 16*diag[i] + 16; 3676 vi = aj + diag[i] + 1; 3677 nz = ai[i+1] - diag[i] - 1; 3678 idt = 4*i; 3679 s1 = t[idt]; s2 = t[1+idt]; 3680 s3 = t[2+idt];s4 = t[3+idt]; 3681 while (nz--) { 3682 idx = 4*(*vi++); 3683 x1 = t[idx]; x2 = t[1+idx]; 3684 x3 = t[2+idx]; x4 = t[3+idx]; 3685 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3686 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3687 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3688 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3689 v += 16; 3690 } 3691 idc = 4*(*c--); 3692 v = aa + 16*diag[i]; 3693 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3694 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3695 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3696 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3697 } 3698 3699 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3700 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3701 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3703 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3704 PetscFunctionReturn(0); 3705 } 3706 3707 #undef __FUNCT__ 3708 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 3709 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 3710 { 3711 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3712 IS iscol=a->col,isrow=a->row; 3713 PetscErrorCode ierr; 3714 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3715 PetscInt i,nz,idx,idt,idc,m; 3716 const PetscInt *r,*c,*rout,*cout; 3717 const MatScalar *aa=a->a,*v; 3718 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3719 const PetscScalar *b; 3720 3721 PetscFunctionBegin; 3722 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3723 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3724 t = a->solve_work; 3725 3726 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 
3727 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3728 3729 /* forward solve the lower triangular */ 3730 idx = 4*r[0]; 3731 t[0] = b[idx]; t[1] = b[1+idx]; 3732 t[2] = b[2+idx]; t[3] = b[3+idx]; 3733 for (i=1; i<n; i++) { 3734 v = aa + 16*ai[i]; 3735 vi = aj + ai[i]; 3736 nz = ai[i+1] - ai[i]; 3737 idx = 4*r[i]; 3738 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3739 for (m=0;m<nz;m++) { 3740 idx = 4*vi[m]; 3741 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3742 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3743 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3744 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3745 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3746 v += 16; 3747 } 3748 idx = 4*i; 3749 t[idx] = s1;t[1+idx] = s2; 3750 t[2+idx] = s3;t[3+idx] = s4; 3751 } 3752 /* backward solve the upper triangular */ 3753 for (i=n-1; i>=0; i--) { 3754 v = aa + 16*(adiag[i+1]+1); 3755 vi = aj + adiag[i+1]+1; 3756 nz = adiag[i] - adiag[i+1] - 1; 3757 idt = 4*i; 3758 s1 = t[idt]; s2 = t[1+idt]; 3759 s3 = t[2+idt];s4 = t[3+idt]; 3760 for (m=0;m<nz;m++) { 3761 idx = 4*vi[m]; 3762 x1 = t[idx]; x2 = t[1+idx]; 3763 x3 = t[2+idx]; x4 = t[3+idx]; 3764 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3765 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3766 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3767 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3768 v += 16; 3769 } 3770 idc = 4*c[i]; 3771 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3772 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3773 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3774 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3775 } 3776 3777 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3778 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3779 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3780 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3781 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3782 
PetscFunctionReturn(0); 3783 } 3784 3785 #undef __FUNCT__ 3786 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3787 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3788 { 3789 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3790 IS iscol=a->col,isrow=a->row; 3791 PetscErrorCode ierr; 3792 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3793 PetscInt i,nz,idx,idt,idc; 3794 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3795 const MatScalar *aa=a->a,*v; 3796 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3797 PetscScalar *x; 3798 const PetscScalar *b; 3799 3800 PetscFunctionBegin; 3801 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3802 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3803 t = (MatScalar *)a->solve_work; 3804 3805 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3806 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3807 3808 /* forward solve the lower triangular */ 3809 idx = 4*(*r++); 3810 t[0] = (MatScalar)b[idx]; 3811 t[1] = (MatScalar)b[1+idx]; 3812 t[2] = (MatScalar)b[2+idx]; 3813 t[3] = (MatScalar)b[3+idx]; 3814 for (i=1; i<n; i++) { 3815 v = aa + 16*ai[i]; 3816 vi = aj + ai[i]; 3817 nz = diag[i] - ai[i]; 3818 idx = 4*(*r++); 3819 s1 = (MatScalar)b[idx]; 3820 s2 = (MatScalar)b[1+idx]; 3821 s3 = (MatScalar)b[2+idx]; 3822 s4 = (MatScalar)b[3+idx]; 3823 while (nz--) { 3824 idx = 4*(*vi++); 3825 x1 = t[idx]; 3826 x2 = t[1+idx]; 3827 x3 = t[2+idx]; 3828 x4 = t[3+idx]; 3829 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3830 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3831 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3832 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3833 v += 16; 3834 } 3835 idx = 4*i; 3836 t[idx] = s1; 3837 t[1+idx] = s2; 3838 t[2+idx] = s3; 3839 t[3+idx] = s4; 3840 } 3841 /* backward solve the upper triangular */ 3842 for (i=n-1; i>=0; i--) { 3843 v = aa + 16*diag[i] + 16; 3844 vi = aj + diag[i] + 1; 3845 nz = ai[i+1] - diag[i] - 1; 3846 idt = 4*i; 3847 s1 = t[idt]; 3848 s2 = t[1+idt]; 3849 s3 = t[2+idt]; 
3850 s4 = t[3+idt]; 3851 while (nz--) { 3852 idx = 4*(*vi++); 3853 x1 = t[idx]; 3854 x2 = t[1+idx]; 3855 x3 = t[2+idx]; 3856 x4 = t[3+idx]; 3857 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3858 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3859 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3860 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3861 v += 16; 3862 } 3863 idc = 4*(*c--); 3864 v = aa + 16*diag[i]; 3865 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3866 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3867 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3868 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3869 x[idc] = (PetscScalar)t[idt]; 3870 x[1+idc] = (PetscScalar)t[1+idt]; 3871 x[2+idc] = (PetscScalar)t[2+idt]; 3872 x[3+idc] = (PetscScalar)t[3+idt]; 3873 } 3874 3875 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3876 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3877 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3878 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3879 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3880 PetscFunctionReturn(0); 3881 } 3882 3883 #if defined (PETSC_HAVE_SSE) 3884 3885 #include PETSC_HAVE_SSE 3886 3887 #undef __FUNCT__ 3888 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3889 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3890 { 3891 /* 3892 Note: This code uses demotion of double 3893 to float when performing the mixed-mode computation. 3894 This may not be numerically reasonable for all applications. 
3895 */ 3896 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3897 IS iscol=a->col,isrow=a->row; 3898 PetscErrorCode ierr; 3899 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3900 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3901 MatScalar *aa=a->a,*v; 3902 PetscScalar *x,*b,*t; 3903 3904 /* Make space in temp stack for 16 Byte Aligned arrays */ 3905 float ssealignedspace[11],*tmps,*tmpx; 3906 unsigned long offset; 3907 3908 PetscFunctionBegin; 3909 SSE_SCOPE_BEGIN; 3910 3911 offset = (unsigned long)ssealignedspace % 16; 3912 if (offset) offset = (16 - offset)/4; 3913 tmps = &ssealignedspace[offset]; 3914 tmpx = &ssealignedspace[offset+4]; 3915 PREFETCH_NTA(aa+16*ai[1]); 3916 3917 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3918 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3919 t = a->solve_work; 3920 3921 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3922 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3923 3924 /* forward solve the lower triangular */ 3925 idx = 4*(*r++); 3926 t[0] = b[idx]; t[1] = b[1+idx]; 3927 t[2] = b[2+idx]; t[3] = b[3+idx]; 3928 v = aa + 16*ai[1]; 3929 3930 for (i=1; i<n;) { 3931 PREFETCH_NTA(&v[8]); 3932 vi = aj + ai[i]; 3933 nz = diag[i] - ai[i]; 3934 idx = 4*(*r++); 3935 3936 /* Demote sum from double to float */ 3937 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3938 LOAD_PS(tmps,XMM7); 3939 3940 while (nz--) { 3941 PREFETCH_NTA(&v[16]); 3942 idx = 4*(*vi++); 3943 3944 /* Demote solution (so far) from double to float */ 3945 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3946 3947 /* 4x4 Matrix-Vector product with negative accumulation: */ 3948 SSE_INLINE_BEGIN_2(tmpx,v) 3949 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3950 3951 /* First Column */ 3952 SSE_COPY_PS(XMM0,XMM6) 3953 SSE_SHUFFLE(XMM0,XMM0,0x00) 3954 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3955 SSE_SUB_PS(XMM7,XMM0) 3956 3957 /* Second Column */ 3958 SSE_COPY_PS(XMM1,XMM6) 3959 SSE_SHUFFLE(XMM1,XMM1,0x55) 3960 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3961 
SSE_SUB_PS(XMM7,XMM1) 3962 3963 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3964 3965 /* Third Column */ 3966 SSE_COPY_PS(XMM2,XMM6) 3967 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3968 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3969 SSE_SUB_PS(XMM7,XMM2) 3970 3971 /* Fourth Column */ 3972 SSE_COPY_PS(XMM3,XMM6) 3973 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3974 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3975 SSE_SUB_PS(XMM7,XMM3) 3976 SSE_INLINE_END_2 3977 3978 v += 16; 3979 } 3980 idx = 4*i; 3981 v = aa + 16*ai[++i]; 3982 PREFETCH_NTA(v); 3983 STORE_PS(tmps,XMM7); 3984 3985 /* Promote result from float to double */ 3986 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3987 } 3988 /* backward solve the upper triangular */ 3989 idt = 4*(n-1); 3990 ai16 = 16*diag[n-1]; 3991 v = aa + ai16 + 16; 3992 for (i=n-1; i>=0;) { 3993 PREFETCH_NTA(&v[8]); 3994 vi = aj + diag[i] + 1; 3995 nz = ai[i+1] - diag[i] - 1; 3996 3997 /* Demote accumulator from double to float */ 3998 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3999 LOAD_PS(tmps,XMM7); 4000 4001 while (nz--) { 4002 PREFETCH_NTA(&v[16]); 4003 idx = 4*(*vi++); 4004 4005 /* Demote solution (so far) from double to float */ 4006 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 4007 4008 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4009 SSE_INLINE_BEGIN_2(tmpx,v) 4010 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4011 4012 /* First Column */ 4013 SSE_COPY_PS(XMM0,XMM6) 4014 SSE_SHUFFLE(XMM0,XMM0,0x00) 4015 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4016 SSE_SUB_PS(XMM7,XMM0) 4017 4018 /* Second Column */ 4019 SSE_COPY_PS(XMM1,XMM6) 4020 SSE_SHUFFLE(XMM1,XMM1,0x55) 4021 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4022 SSE_SUB_PS(XMM7,XMM1) 4023 4024 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4025 4026 /* Third Column */ 4027 SSE_COPY_PS(XMM2,XMM6) 4028 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4029 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4030 SSE_SUB_PS(XMM7,XMM2) 4031 4032 /* Fourth Column */ 4033 SSE_COPY_PS(XMM3,XMM6) 4034 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4035 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4036 SSE_SUB_PS(XMM7,XMM3) 
4037 SSE_INLINE_END_2 4038 v += 16; 4039 } 4040 v = aa + ai16; 4041 ai16 = 16*diag[--i]; 4042 PREFETCH_NTA(aa+ai16+16); 4043 /* 4044 Scale the result by the diagonal 4x4 block, 4045 which was inverted as part of the factorization 4046 */ 4047 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 4048 /* First Column */ 4049 SSE_COPY_PS(XMM0,XMM7) 4050 SSE_SHUFFLE(XMM0,XMM0,0x00) 4051 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4052 4053 /* Second Column */ 4054 SSE_COPY_PS(XMM1,XMM7) 4055 SSE_SHUFFLE(XMM1,XMM1,0x55) 4056 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4057 SSE_ADD_PS(XMM0,XMM1) 4058 4059 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4060 4061 /* Third Column */ 4062 SSE_COPY_PS(XMM2,XMM7) 4063 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4064 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4065 SSE_ADD_PS(XMM0,XMM2) 4066 4067 /* Fourth Column */ 4068 SSE_COPY_PS(XMM3,XMM7) 4069 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4070 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4071 SSE_ADD_PS(XMM0,XMM3) 4072 4073 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4074 SSE_INLINE_END_3 4075 4076 /* Promote solution from float to double */ 4077 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 4078 4079 /* Apply reordering to t and stream into x. */ 4080 /* This way, x doesn't pollute the cache. */ 4081 /* Be careful with size: 2 doubles = 4 floats! 
*/ 4082 idc = 4*(*c--); 4083 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 4084 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 4085 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 4086 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 4087 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 4088 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 4089 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 4090 SSE_INLINE_END_2 4091 v = aa + ai16 + 16; 4092 idt -= 4; 4093 } 4094 4095 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4096 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4097 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4098 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4099 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4100 SSE_SCOPE_END; 4101 PetscFunctionReturn(0); 4102 } 4103 4104 #endif 4105 4106 4107 /* 4108 Special case where the matrix was ILU(0) factored in the natural 4109 ordering. This eliminates the need for the column and row permutation. 4110 */ 4111 #undef __FUNCT__ 4112 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 4113 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4114 { 4115 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4116 PetscInt n=a->mbs; 4117 const PetscInt *ai=a->i,*aj=a->j; 4118 PetscErrorCode ierr; 4119 const PetscInt *diag = a->diag; 4120 const MatScalar *aa=a->a; 4121 PetscScalar *x; 4122 const PetscScalar *b; 4123 4124 PetscFunctionBegin; 4125 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4126 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4127 4128 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 4129 { 4130 static PetscScalar w[2000]; /* very BAD need to fix */ 4131 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 4132 } 4133 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 4134 { 4135 static PetscScalar w[2000]; /* very BAD need to fix */ 4136 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 4137 } 4138 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 4139 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 
4140 #else 4141 { 4142 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4143 const MatScalar *v; 4144 PetscInt jdx,idt,idx,nz,i,ai16; 4145 const PetscInt *vi; 4146 4147 /* forward solve the lower triangular */ 4148 idx = 0; 4149 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 4150 for (i=1; i<n; i++) { 4151 v = aa + 16*ai[i]; 4152 vi = aj + ai[i]; 4153 nz = diag[i] - ai[i]; 4154 idx += 4; 4155 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4156 while (nz--) { 4157 jdx = 4*(*vi++); 4158 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4159 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4160 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4161 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4162 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4163 v += 16; 4164 } 4165 x[idx] = s1; 4166 x[1+idx] = s2; 4167 x[2+idx] = s3; 4168 x[3+idx] = s4; 4169 } 4170 /* backward solve the upper triangular */ 4171 idt = 4*(n-1); 4172 for (i=n-1; i>=0; i--) { 4173 ai16 = 16*diag[i]; 4174 v = aa + ai16 + 16; 4175 vi = aj + diag[i] + 1; 4176 nz = ai[i+1] - diag[i] - 1; 4177 s1 = x[idt]; s2 = x[1+idt]; 4178 s3 = x[2+idt];s4 = x[3+idt]; 4179 while (nz--) { 4180 idx = 4*(*vi++); 4181 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4182 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4183 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4184 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4185 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4186 v += 16; 4187 } 4188 v = aa + ai16; 4189 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4190 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4191 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4192 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4193 idt -= 4; 4194 } 4195 } 4196 #endif 4197 4198 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4199 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4200 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4201 PetscFunctionReturn(0); 4202 } 4203 4204 #undef __FUNCT__ 4205 
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 4206 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4207 { 4208 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4209 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4210 PetscInt i,k,nz,idx,jdx,idt; 4211 PetscErrorCode ierr; 4212 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4213 const MatScalar *aa=a->a,*v; 4214 PetscScalar *x; 4215 const PetscScalar *b; 4216 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4217 4218 PetscFunctionBegin; 4219 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4220 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4221 /* forward solve the lower triangular */ 4222 idx = 0; 4223 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4224 for (i=1; i<n; i++) { 4225 v = aa + bs2*ai[i]; 4226 vi = aj + ai[i]; 4227 nz = ai[i+1] - ai[i]; 4228 idx = bs*i; 4229 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4230 for (k=0;k<nz;k++) { 4231 jdx = bs*vi[k]; 4232 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4233 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4234 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4235 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4236 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4237 4238 v += bs2; 4239 } 4240 4241 x[idx] = s1; 4242 x[1+idx] = s2; 4243 x[2+idx] = s3; 4244 x[3+idx] = s4; 4245 } 4246 4247 /* backward solve the upper triangular */ 4248 for (i=n-1; i>=0; i--) { 4249 v = aa + bs2*(adiag[i+1]+1); 4250 vi = aj + adiag[i+1]+1; 4251 nz = adiag[i] - adiag[i+1]-1; 4252 idt = bs*i; 4253 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4254 4255 for (k=0;k<nz;k++) { 4256 idx = bs*vi[k]; 4257 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4258 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4259 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4260 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4261 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4262 4263 v += bs2; 4264 } 4265 /* x = inv_diagonal*x */ 4266 x[idt] = 
v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4267 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4268 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4269 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4270 4271 } 4272 4273 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4274 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4275 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4276 PetscFunctionReturn(0); 4277 } 4278 4279 #undef __FUNCT__ 4280 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4281 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4282 { 4283 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4284 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4285 PetscErrorCode ierr; 4286 const MatScalar *aa=a->a; 4287 const PetscScalar *b; 4288 PetscScalar *x; 4289 4290 PetscFunctionBegin; 4291 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4292 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4293 4294 { 4295 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4296 const MatScalar *v; 4297 MatScalar *t=(MatScalar *)x; 4298 PetscInt jdx,idt,idx,nz,i,ai16; 4299 const PetscInt *vi; 4300 4301 /* forward solve the lower triangular */ 4302 idx = 0; 4303 t[0] = (MatScalar)b[0]; 4304 t[1] = (MatScalar)b[1]; 4305 t[2] = (MatScalar)b[2]; 4306 t[3] = (MatScalar)b[3]; 4307 for (i=1; i<n; i++) { 4308 v = aa + 16*ai[i]; 4309 vi = aj + ai[i]; 4310 nz = diag[i] - ai[i]; 4311 idx += 4; 4312 s1 = (MatScalar)b[idx]; 4313 s2 = (MatScalar)b[1+idx]; 4314 s3 = (MatScalar)b[2+idx]; 4315 s4 = (MatScalar)b[3+idx]; 4316 while (nz--) { 4317 jdx = 4*(*vi++); 4318 x1 = t[jdx]; 4319 x2 = t[1+jdx]; 4320 x3 = t[2+jdx]; 4321 x4 = t[3+jdx]; 4322 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4323 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4324 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4325 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4326 v += 16; 4327 } 4328 t[idx] = s1; 4329 t[1+idx] = s2; 4330 t[2+idx] = s3; 4331 t[3+idx] = s4; 4332 } 4333 /* 
backward solve the upper triangular */ 4334 idt = 4*(n-1); 4335 for (i=n-1; i>=0; i--) { 4336 ai16 = 16*diag[i]; 4337 v = aa + ai16 + 16; 4338 vi = aj + diag[i] + 1; 4339 nz = ai[i+1] - diag[i] - 1; 4340 s1 = t[idt]; 4341 s2 = t[1+idt]; 4342 s3 = t[2+idt]; 4343 s4 = t[3+idt]; 4344 while (nz--) { 4345 idx = 4*(*vi++); 4346 x1 = (MatScalar)x[idx]; 4347 x2 = (MatScalar)x[1+idx]; 4348 x3 = (MatScalar)x[2+idx]; 4349 x4 = (MatScalar)x[3+idx]; 4350 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4351 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4352 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4353 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4354 v += 16; 4355 } 4356 v = aa + ai16; 4357 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4358 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4359 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4360 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4361 idt -= 4; 4362 } 4363 } 4364 4365 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4366 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4367 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4368 PetscFunctionReturn(0); 4369 } 4370 4371 #if defined (PETSC_HAVE_SSE) 4372 4373 #include PETSC_HAVE_SSE 4374 #undef __FUNCT__ 4375 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4376 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 4377 { 4378 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4379 unsigned short *aj=(unsigned short *)a->j; 4380 PetscErrorCode ierr; 4381 int *ai=a->i,n=a->mbs,*diag = a->diag; 4382 MatScalar *aa=a->a; 4383 PetscScalar *x,*b; 4384 4385 PetscFunctionBegin; 4386 SSE_SCOPE_BEGIN; 4387 /* 4388 Note: This code currently uses demotion of double 4389 to float when performing the mixed-mode computation. 4390 This may not be numerically reasonable for all applications. 
4391 */ 4392 PREFETCH_NTA(aa+16*ai[1]); 4393 4394 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4395 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4396 { 4397 /* x will first be computed in single precision then promoted inplace to double */ 4398 MatScalar *v,*t=(MatScalar *)x; 4399 int nz,i,idt,ai16; 4400 unsigned int jdx,idx; 4401 unsigned short *vi; 4402 /* Forward solve the lower triangular factor. */ 4403 4404 /* First block is the identity. */ 4405 idx = 0; 4406 CONVERT_DOUBLE4_FLOAT4(t,b); 4407 v = aa + 16*((unsigned int)ai[1]); 4408 4409 for (i=1; i<n;) { 4410 PREFETCH_NTA(&v[8]); 4411 vi = aj + ai[i]; 4412 nz = diag[i] - ai[i]; 4413 idx += 4; 4414 4415 /* Demote RHS from double to float. */ 4416 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4417 LOAD_PS(&t[idx],XMM7); 4418 4419 while (nz--) { 4420 PREFETCH_NTA(&v[16]); 4421 jdx = 4*((unsigned int)(*vi++)); 4422 4423 /* 4x4 Matrix-Vector product with negative accumulation: */ 4424 SSE_INLINE_BEGIN_2(&t[jdx],v) 4425 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4426 4427 /* First Column */ 4428 SSE_COPY_PS(XMM0,XMM6) 4429 SSE_SHUFFLE(XMM0,XMM0,0x00) 4430 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4431 SSE_SUB_PS(XMM7,XMM0) 4432 4433 /* Second Column */ 4434 SSE_COPY_PS(XMM1,XMM6) 4435 SSE_SHUFFLE(XMM1,XMM1,0x55) 4436 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4437 SSE_SUB_PS(XMM7,XMM1) 4438 4439 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4440 4441 /* Third Column */ 4442 SSE_COPY_PS(XMM2,XMM6) 4443 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4444 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4445 SSE_SUB_PS(XMM7,XMM2) 4446 4447 /* Fourth Column */ 4448 SSE_COPY_PS(XMM3,XMM6) 4449 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4450 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4451 SSE_SUB_PS(XMM7,XMM3) 4452 SSE_INLINE_END_2 4453 4454 v += 16; 4455 } 4456 v = aa + 16*ai[++i]; 4457 PREFETCH_NTA(v); 4458 STORE_PS(&t[idx],XMM7); 4459 } 4460 4461 /* Backward solve the upper triangular factor.*/ 4462 4463 idt = 4*(n-1); 4464 ai16 = 16*diag[n-1]; 4465 v = aa + ai16 + 16; 4466 for (i=n-1; i>=0;) { 4467 
PREFETCH_NTA(&v[8]); 4468 vi = aj + diag[i] + 1; 4469 nz = ai[i+1] - diag[i] - 1; 4470 4471 LOAD_PS(&t[idt],XMM7); 4472 4473 while (nz--) { 4474 PREFETCH_NTA(&v[16]); 4475 idx = 4*((unsigned int)(*vi++)); 4476 4477 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4478 SSE_INLINE_BEGIN_2(&t[idx],v) 4479 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4480 4481 /* First Column */ 4482 SSE_COPY_PS(XMM0,XMM6) 4483 SSE_SHUFFLE(XMM0,XMM0,0x00) 4484 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4485 SSE_SUB_PS(XMM7,XMM0) 4486 4487 /* Second Column */ 4488 SSE_COPY_PS(XMM1,XMM6) 4489 SSE_SHUFFLE(XMM1,XMM1,0x55) 4490 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4491 SSE_SUB_PS(XMM7,XMM1) 4492 4493 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4494 4495 /* Third Column */ 4496 SSE_COPY_PS(XMM2,XMM6) 4497 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4498 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4499 SSE_SUB_PS(XMM7,XMM2) 4500 4501 /* Fourth Column */ 4502 SSE_COPY_PS(XMM3,XMM6) 4503 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4504 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4505 SSE_SUB_PS(XMM7,XMM3) 4506 SSE_INLINE_END_2 4507 v += 16; 4508 } 4509 v = aa + ai16; 4510 ai16 = 16*diag[--i]; 4511 PREFETCH_NTA(aa+ai16+16); 4512 /* 4513 Scale the result by the diagonal 4x4 block, 4514 which was inverted as part of the factorization 4515 */ 4516 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4517 /* First Column */ 4518 SSE_COPY_PS(XMM0,XMM7) 4519 SSE_SHUFFLE(XMM0,XMM0,0x00) 4520 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4521 4522 /* Second Column */ 4523 SSE_COPY_PS(XMM1,XMM7) 4524 SSE_SHUFFLE(XMM1,XMM1,0x55) 4525 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4526 SSE_ADD_PS(XMM0,XMM1) 4527 4528 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4529 4530 /* Third Column */ 4531 SSE_COPY_PS(XMM2,XMM7) 4532 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4533 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4534 SSE_ADD_PS(XMM0,XMM2) 4535 4536 /* Fourth Column */ 4537 SSE_COPY_PS(XMM3,XMM7) 4538 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4539 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4540 SSE_ADD_PS(XMM0,XMM3) 4541 4542 
SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4543 SSE_INLINE_END_3 4544 4545 v = aa + ai16 + 16; 4546 idt -= 4; 4547 } 4548 4549 /* Convert t from single precision back to double precision (inplace)*/ 4550 idt = 4*(n-1); 4551 for (i=n-1;i>=0;i--) { 4552 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4553 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4554 PetscScalar *xtemp=&x[idt]; 4555 MatScalar *ttemp=&t[idt]; 4556 xtemp[3] = (PetscScalar)ttemp[3]; 4557 xtemp[2] = (PetscScalar)ttemp[2]; 4558 xtemp[1] = (PetscScalar)ttemp[1]; 4559 xtemp[0] = (PetscScalar)ttemp[0]; 4560 idt -= 4; 4561 } 4562 4563 } /* End of artificial scope. */ 4564 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4565 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4566 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4567 SSE_SCOPE_END; 4568 PetscFunctionReturn(0); 4569 } 4570 4571 #undef __FUNCT__ 4572 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4573 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 4574 { 4575 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4576 int *aj=a->j; 4577 PetscErrorCode ierr; 4578 int *ai=a->i,n=a->mbs,*diag = a->diag; 4579 MatScalar *aa=a->a; 4580 PetscScalar *x,*b; 4581 4582 PetscFunctionBegin; 4583 SSE_SCOPE_BEGIN; 4584 /* 4585 Note: This code currently uses demotion of double 4586 to float when performing the mixed-mode computation. 4587 This may not be numerically reasonable for all applications. 4588 */ 4589 PREFETCH_NTA(aa+16*ai[1]); 4590 4591 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4592 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4593 { 4594 /* x will first be computed in single precision then promoted inplace to double */ 4595 MatScalar *v,*t=(MatScalar *)x; 4596 int nz,i,idt,ai16; 4597 int jdx,idx; 4598 int *vi; 4599 /* Forward solve the lower triangular factor. */ 4600 4601 /* First block is the identity. 
*/ 4602 idx = 0; 4603 CONVERT_DOUBLE4_FLOAT4(t,b); 4604 v = aa + 16*ai[1]; 4605 4606 for (i=1; i<n;) { 4607 PREFETCH_NTA(&v[8]); 4608 vi = aj + ai[i]; 4609 nz = diag[i] - ai[i]; 4610 idx += 4; 4611 4612 /* Demote RHS from double to float. */ 4613 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4614 LOAD_PS(&t[idx],XMM7); 4615 4616 while (nz--) { 4617 PREFETCH_NTA(&v[16]); 4618 jdx = 4*(*vi++); 4619 /* jdx = *vi++; */ 4620 4621 /* 4x4 Matrix-Vector product with negative accumulation: */ 4622 SSE_INLINE_BEGIN_2(&t[jdx],v) 4623 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4624 4625 /* First Column */ 4626 SSE_COPY_PS(XMM0,XMM6) 4627 SSE_SHUFFLE(XMM0,XMM0,0x00) 4628 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4629 SSE_SUB_PS(XMM7,XMM0) 4630 4631 /* Second Column */ 4632 SSE_COPY_PS(XMM1,XMM6) 4633 SSE_SHUFFLE(XMM1,XMM1,0x55) 4634 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4635 SSE_SUB_PS(XMM7,XMM1) 4636 4637 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4638 4639 /* Third Column */ 4640 SSE_COPY_PS(XMM2,XMM6) 4641 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4642 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4643 SSE_SUB_PS(XMM7,XMM2) 4644 4645 /* Fourth Column */ 4646 SSE_COPY_PS(XMM3,XMM6) 4647 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4648 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4649 SSE_SUB_PS(XMM7,XMM3) 4650 SSE_INLINE_END_2 4651 4652 v += 16; 4653 } 4654 v = aa + 16*ai[++i]; 4655 PREFETCH_NTA(v); 4656 STORE_PS(&t[idx],XMM7); 4657 } 4658 4659 /* Backward solve the upper triangular factor.*/ 4660 4661 idt = 4*(n-1); 4662 ai16 = 16*diag[n-1]; 4663 v = aa + ai16 + 16; 4664 for (i=n-1; i>=0;) { 4665 PREFETCH_NTA(&v[8]); 4666 vi = aj + diag[i] + 1; 4667 nz = ai[i+1] - diag[i] - 1; 4668 4669 LOAD_PS(&t[idt],XMM7); 4670 4671 while (nz--) { 4672 PREFETCH_NTA(&v[16]); 4673 idx = 4*(*vi++); 4674 /* idx = *vi++; */ 4675 4676 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4677 SSE_INLINE_BEGIN_2(&t[idx],v) 4678 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4679 4680 /* First Column */ 4681 SSE_COPY_PS(XMM0,XMM6) 4682 SSE_SHUFFLE(XMM0,XMM0,0x00) 
4683 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4684 SSE_SUB_PS(XMM7,XMM0) 4685 4686 /* Second Column */ 4687 SSE_COPY_PS(XMM1,XMM6) 4688 SSE_SHUFFLE(XMM1,XMM1,0x55) 4689 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4690 SSE_SUB_PS(XMM7,XMM1) 4691 4692 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4693 4694 /* Third Column */ 4695 SSE_COPY_PS(XMM2,XMM6) 4696 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4697 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4698 SSE_SUB_PS(XMM7,XMM2) 4699 4700 /* Fourth Column */ 4701 SSE_COPY_PS(XMM3,XMM6) 4702 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4703 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4704 SSE_SUB_PS(XMM7,XMM3) 4705 SSE_INLINE_END_2 4706 v += 16; 4707 } 4708 v = aa + ai16; 4709 ai16 = 16*diag[--i]; 4710 PREFETCH_NTA(aa+ai16+16); 4711 /* 4712 Scale the result by the diagonal 4x4 block, 4713 which was inverted as part of the factorization 4714 */ 4715 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4716 /* First Column */ 4717 SSE_COPY_PS(XMM0,XMM7) 4718 SSE_SHUFFLE(XMM0,XMM0,0x00) 4719 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4720 4721 /* Second Column */ 4722 SSE_COPY_PS(XMM1,XMM7) 4723 SSE_SHUFFLE(XMM1,XMM1,0x55) 4724 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4725 SSE_ADD_PS(XMM0,XMM1) 4726 4727 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4728 4729 /* Third Column */ 4730 SSE_COPY_PS(XMM2,XMM7) 4731 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4732 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4733 SSE_ADD_PS(XMM0,XMM2) 4734 4735 /* Fourth Column */ 4736 SSE_COPY_PS(XMM3,XMM7) 4737 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4738 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4739 SSE_ADD_PS(XMM0,XMM3) 4740 4741 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4742 SSE_INLINE_END_3 4743 4744 v = aa + ai16 + 16; 4745 idt -= 4; 4746 } 4747 4748 /* Convert t from single precision back to double precision (inplace)*/ 4749 idt = 4*(n-1); 4750 for (i=n-1;i>=0;i--) { 4751 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4752 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 4753 PetscScalar *xtemp=&x[idt]; 4754 MatScalar *ttemp=&t[idt]; 4755 xtemp[3] = (PetscScalar)ttemp[3]; 4756 xtemp[2] = (PetscScalar)ttemp[2]; 4757 xtemp[1] = (PetscScalar)ttemp[1]; 4758 xtemp[0] = (PetscScalar)ttemp[0]; 4759 idt -= 4; 4760 } 4761 4762 } /* End of artificial scope. */ 4763 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4764 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4765 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4766 SSE_SCOPE_END; 4767 PetscFunctionReturn(0); 4768 } 4769 4770 #endif 4771 4772 #undef __FUNCT__ 4773 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 4774 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 4775 { 4776 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4777 IS iscol=a->col,isrow=a->row; 4778 PetscErrorCode ierr; 4779 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4780 PetscInt i,nz,idx,idt,idc; 4781 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4782 const MatScalar *aa=a->a,*v; 4783 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4784 const PetscScalar *b; 4785 4786 PetscFunctionBegin; 4787 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4788 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4789 t = a->solve_work; 4790 4791 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4792 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4793 4794 /* forward solve the lower triangular */ 4795 idx = 3*(*r++); 4796 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4797 for (i=1; i<n; i++) { 4798 v = aa + 9*ai[i]; 4799 vi = aj + ai[i]; 4800 nz = diag[i] - ai[i]; 4801 idx = 3*(*r++); 4802 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4803 while (nz--) { 4804 idx = 3*(*vi++); 4805 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4806 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4807 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4808 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4809 v += 9; 4810 } 4811 idx = 3*i; 4812 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4813 } 4814 /* backward solve the upper triangular */ 4815 for (i=n-1; i>=0; i--) { 
4816 v = aa + 9*diag[i] + 9; 4817 vi = aj + diag[i] + 1; 4818 nz = ai[i+1] - diag[i] - 1; 4819 idt = 3*i; 4820 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4821 while (nz--) { 4822 idx = 3*(*vi++); 4823 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4824 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4825 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4826 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4827 v += 9; 4828 } 4829 idc = 3*(*c--); 4830 v = aa + 9*diag[i]; 4831 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4832 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4833 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4834 } 4835 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4836 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4837 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4838 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4839 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4840 PetscFunctionReturn(0); 4841 } 4842 4843 #undef __FUNCT__ 4844 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4845 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4846 { 4847 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4848 IS iscol=a->col,isrow=a->row; 4849 PetscErrorCode ierr; 4850 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4851 PetscInt i,nz,idx,idt,idc,m; 4852 const PetscInt *r,*c,*rout,*cout; 4853 const MatScalar *aa=a->a,*v; 4854 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4855 const PetscScalar *b; 4856 4857 PetscFunctionBegin; 4858 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4859 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4860 t = a->solve_work; 4861 4862 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4863 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4864 4865 /* forward solve the lower triangular */ 4866 idx = 3*r[0]; 4867 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4868 for (i=1; i<n; i++) { 4869 v = aa + 9*ai[i]; 4870 vi = aj + ai[i]; 4871 nz = ai[i+1] - ai[i]; 4872 idx = 3*r[i]; 4873 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4874 for 
(m=0;m<nz;m++) { 4875 idx = 3*vi[m]; 4876 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4877 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4878 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4879 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4880 v += 9; 4881 } 4882 idx = 3*i; 4883 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4884 } 4885 /* backward solve the upper triangular */ 4886 for (i=n-1; i>=0; i--) { 4887 v = aa + 9*(adiag[i+1]+1); 4888 vi = aj + adiag[i+1]+1; 4889 nz = adiag[i] - adiag[i+1] - 1; 4890 idt = 3*i; 4891 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4892 for (m=0;m<nz;m++) { 4893 idx = 3*vi[m]; 4894 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4895 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4896 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4897 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4898 v += 9; 4899 } 4900 idc = 3*c[i]; 4901 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4902 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4903 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4904 } 4905 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4906 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4907 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4908 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4909 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4910 PetscFunctionReturn(0); 4911 } 4912 4913 /* 4914 Special case where the matrix was ILU(0) factored in the natural 4915 ordering. This eliminates the need for the column and row permutation. 
4916 */ 4917 #undef __FUNCT__ 4918 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 4919 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4920 { 4921 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4922 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4923 PetscErrorCode ierr; 4924 const PetscInt *diag = a->diag,*vi; 4925 const MatScalar *aa=a->a,*v; 4926 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4927 const PetscScalar *b; 4928 PetscInt jdx,idt,idx,nz,i; 4929 4930 PetscFunctionBegin; 4931 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4932 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4933 4934 /* forward solve the lower triangular */ 4935 idx = 0; 4936 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4937 for (i=1; i<n; i++) { 4938 v = aa + 9*ai[i]; 4939 vi = aj + ai[i]; 4940 nz = diag[i] - ai[i]; 4941 idx += 3; 4942 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4943 while (nz--) { 4944 jdx = 3*(*vi++); 4945 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4946 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4947 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4948 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4949 v += 9; 4950 } 4951 x[idx] = s1; 4952 x[1+idx] = s2; 4953 x[2+idx] = s3; 4954 } 4955 /* backward solve the upper triangular */ 4956 for (i=n-1; i>=0; i--) { 4957 v = aa + 9*diag[i] + 9; 4958 vi = aj + diag[i] + 1; 4959 nz = ai[i+1] - diag[i] - 1; 4960 idt = 3*i; 4961 s1 = x[idt]; s2 = x[1+idt]; 4962 s3 = x[2+idt]; 4963 while (nz--) { 4964 idx = 3*(*vi++); 4965 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4966 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4967 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4968 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4969 v += 9; 4970 } 4971 v = aa + 9*diag[i]; 4972 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4973 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4974 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4975 } 4976 4977 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4978 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4979 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4980 
PetscFunctionReturn(0); 4981 } 4982 4983 #undef __FUNCT__ 4984 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4985 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4986 { 4987 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4988 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4989 PetscErrorCode ierr; 4990 PetscInt i,k,nz,idx,jdx,idt; 4991 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4992 const MatScalar *aa=a->a,*v; 4993 PetscScalar *x; 4994 const PetscScalar *b; 4995 PetscScalar s1,s2,s3,x1,x2,x3; 4996 4997 PetscFunctionBegin; 4998 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4999 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5000 /* forward solve the lower triangular */ 5001 idx = 0; 5002 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 5003 for (i=1; i<n; i++) { 5004 v = aa + bs2*ai[i]; 5005 vi = aj + ai[i]; 5006 nz = ai[i+1] - ai[i]; 5007 idx = bs*i; 5008 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 5009 for (k=0;k<nz;k++) { 5010 jdx = bs*vi[k]; 5011 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 5012 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5013 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5014 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5015 5016 v += bs2; 5017 } 5018 5019 x[idx] = s1; 5020 x[1+idx] = s2; 5021 x[2+idx] = s3; 5022 } 5023 5024 /* backward solve the upper triangular */ 5025 for (i=n-1; i>=0; i--) { 5026 v = aa + bs2*(adiag[i+1]+1); 5027 vi = aj + adiag[i+1]+1; 5028 nz = adiag[i] - adiag[i+1]-1; 5029 idt = bs*i; 5030 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 5031 5032 for (k=0;k<nz;k++) { 5033 idx = bs*vi[k]; 5034 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 5035 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5036 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5037 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5038 5039 v += bs2; 5040 } 5041 /* x = inv_diagonal*x */ 5042 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5043 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5044 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5045 5046 } 5047 5048 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5049 ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 5050 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5051 PetscFunctionReturn(0); 5052 } 5053 5054 #undef __FUNCT__ 5055 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 5056 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 5057 { 5058 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5059 IS iscol=a->col,isrow=a->row; 5060 PetscErrorCode ierr; 5061 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5062 PetscInt i,nz,idx,idt,idc; 5063 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5064 const MatScalar *aa=a->a,*v; 5065 PetscScalar *x,s1,s2,x1,x2,*t; 5066 const PetscScalar *b; 5067 5068 PetscFunctionBegin; 5069 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5070 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5071 t = a->solve_work; 5072 5073 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5074 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5075 5076 /* forward solve the lower triangular */ 5077 idx = 2*(*r++); 5078 t[0] = b[idx]; t[1] = b[1+idx]; 5079 for (i=1; i<n; i++) { 5080 v = aa + 4*ai[i]; 5081 vi = aj + ai[i]; 5082 nz = diag[i] - ai[i]; 5083 idx = 2*(*r++); 5084 s1 = b[idx]; s2 = b[1+idx]; 5085 while (nz--) { 5086 idx = 2*(*vi++); 5087 x1 = t[idx]; x2 = t[1+idx]; 5088 s1 -= v[0]*x1 + v[2]*x2; 5089 s2 -= v[1]*x1 + v[3]*x2; 5090 v += 4; 5091 } 5092 idx = 2*i; 5093 t[idx] = s1; t[1+idx] = s2; 5094 } 5095 /* backward solve the upper triangular */ 5096 for (i=n-1; i>=0; i--) { 5097 v = aa + 4*diag[i] + 4; 5098 vi = aj + diag[i] + 1; 5099 nz = ai[i+1] - diag[i] - 1; 5100 idt = 2*i; 5101 s1 = t[idt]; s2 = t[1+idt]; 5102 while (nz--) { 5103 idx = 2*(*vi++); 5104 x1 = t[idx]; x2 = t[1+idx]; 5105 s1 -= v[0]*x1 + v[2]*x2; 5106 s2 -= v[1]*x1 + v[3]*x2; 5107 v += 4; 5108 } 5109 idc = 2*(*c--); 5110 v = aa + 4*diag[i]; 5111 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5112 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5113 } 5114 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5115 ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5116 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5117 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5118 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5119 PetscFunctionReturn(0); 5120 } 5121 5122 #undef __FUNCT__ 5123 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 5124 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 5125 { 5126 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5127 IS iscol=a->col,isrow=a->row; 5128 PetscErrorCode ierr; 5129 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5130 PetscInt i,nz,idx,jdx,idt,idc,m; 5131 const PetscInt *r,*c,*rout,*cout; 5132 const MatScalar *aa=a->a,*v; 5133 PetscScalar *x,s1,s2,x1,x2,*t; 5134 const PetscScalar *b; 5135 5136 PetscFunctionBegin; 5137 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5138 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5139 t = a->solve_work; 5140 5141 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5142 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5143 5144 /* forward solve the lower triangular */ 5145 idx = 2*r[0]; 5146 t[0] = b[idx]; t[1] = b[1+idx]; 5147 for (i=1; i<n; i++) { 5148 v = aa + 4*ai[i]; 5149 vi = aj + ai[i]; 5150 nz = ai[i+1] - ai[i]; 5151 idx = 2*r[i]; 5152 s1 = b[idx]; s2 = b[1+idx]; 5153 for (m=0;m<nz;m++) { 5154 jdx = 2*vi[m]; 5155 x1 = t[jdx]; x2 = t[1+jdx]; 5156 s1 -= v[0]*x1 + v[2]*x2; 5157 s2 -= v[1]*x1 + v[3]*x2; 5158 v += 4; 5159 } 5160 idx = 2*i; 5161 t[idx] = s1; t[1+idx] = s2; 5162 } 5163 /* backward solve the upper triangular */ 5164 for (i=n-1; i>=0; i--) { 5165 v = aa + 4*(adiag[i+1]+1); 5166 vi = aj + adiag[i+1]+1; 5167 nz = adiag[i] - adiag[i+1] - 1; 5168 idt = 2*i; 5169 s1 = t[idt]; s2 = t[1+idt]; 5170 for (m=0;m<nz;m++) { 5171 idx = 2*vi[m]; 5172 x1 = t[idx]; x2 = t[1+idx]; 5173 s1 -= v[0]*x1 + v[2]*x2; 5174 s2 -= v[1]*x1 + v[3]*x2; 5175 v += 4; 5176 } 5177 idc = 2*c[i]; 5178 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5179 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5180 } 5181 
ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5182 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5183 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5184 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5185 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5186 PetscFunctionReturn(0); 5187 } 5188 5189 /* 5190 Special case where the matrix was ILU(0) factored in the natural 5191 ordering. This eliminates the need for the column and row permutation. 5192 */ 5193 #undef __FUNCT__ 5194 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 5195 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5196 { 5197 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5198 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5199 PetscErrorCode ierr; 5200 const MatScalar *aa=a->a,*v; 5201 PetscScalar *x,s1,s2,x1,x2; 5202 const PetscScalar *b; 5203 PetscInt jdx,idt,idx,nz,i; 5204 5205 PetscFunctionBegin; 5206 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5207 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5208 5209 /* forward solve the lower triangular */ 5210 idx = 0; 5211 x[0] = b[0]; x[1] = b[1]; 5212 for (i=1; i<n; i++) { 5213 v = aa + 4*ai[i]; 5214 vi = aj + ai[i]; 5215 nz = diag[i] - ai[i]; 5216 idx += 2; 5217 s1 = b[idx];s2 = b[1+idx]; 5218 while (nz--) { 5219 jdx = 2*(*vi++); 5220 x1 = x[jdx];x2 = x[1+jdx]; 5221 s1 -= v[0]*x1 + v[2]*x2; 5222 s2 -= v[1]*x1 + v[3]*x2; 5223 v += 4; 5224 } 5225 x[idx] = s1; 5226 x[1+idx] = s2; 5227 } 5228 /* backward solve the upper triangular */ 5229 for (i=n-1; i>=0; i--) { 5230 v = aa + 4*diag[i] + 4; 5231 vi = aj + diag[i] + 1; 5232 nz = ai[i+1] - diag[i] - 1; 5233 idt = 2*i; 5234 s1 = x[idt]; s2 = x[1+idt]; 5235 while (nz--) { 5236 idx = 2*(*vi++); 5237 x1 = x[idx]; x2 = x[1+idx]; 5238 s1 -= v[0]*x1 + v[2]*x2; 5239 s2 -= v[1]*x1 + v[3]*x2; 5240 v += 4; 5241 } 5242 v = aa + 4*diag[i]; 5243 x[idt] = v[0]*s1 + v[2]*s2; 5244 x[1+idt] = v[1]*s1 + v[3]*s2; 5245 } 5246 5247 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5248 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5249 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5250 PetscFunctionReturn(0); 5251 } 5252 5253 #undef __FUNCT__ 5254 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 5255 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5256 { 5257 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5258 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5259 PetscInt i,k,nz,idx,idt,jdx; 5260 PetscErrorCode ierr; 5261 const MatScalar *aa=a->a,*v; 5262 PetscScalar *x,s1,s2,x1,x2; 5263 const PetscScalar *b; 5264 5265 PetscFunctionBegin; 5266 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5267 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5268 /* forward solve the lower triangular */ 5269 idx = 0; 5270 x[0] = b[idx]; x[1] = b[1+idx]; 5271 for (i=1; i<n; i++) { 5272 v = aa + 4*ai[i]; 5273 vi = aj + ai[i]; 5274 nz = ai[i+1] - ai[i]; 5275 idx = 2*i; 5276 s1 = b[idx];s2 = b[1+idx]; 5277 PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 5278 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5279 for (k=0;k<nz;k++) { 5280 jdx = 2*vi[k]; 5281 x1 = x[jdx];x2 = x[1+jdx]; 5282 s1 -= v[0]*x1 + v[2]*x2; 5283 s2 -= v[1]*x1 + v[3]*x2; 5284 v += 4; 5285 } 5286 x[idx] = s1; 5287 x[1+idx] = s2; 5288 } 5289 5290 /* backward solve the upper triangular */ 5291 for (i=n-1; i>=0; i--) { 5292 v = aa + 4*(adiag[i+1]+1); 5293 vi = aj + adiag[i+1]+1; 5294 nz = adiag[i] - adiag[i+1]-1; 5295 idt = 2*i; 5296 s1 = x[idt]; s2 = x[1+idt]; 5297 PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 5298 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5299 for (k=0;k<nz;k++) { 5300 idx = 2*vi[k]; 5301 x1 = x[idx]; x2 = x[1+idx]; 5302 s1 -= v[0]*x1 + v[2]*x2; 5303 s2 -= v[1]*x1 + v[3]*x2; 5304 v += 4; 5305 } 5306 /* x = inv_diagonal*x */ 5307 x[idt] = v[0]*s1 + v[2]*s2; 5308 x[1+idt] = v[1]*s1 + v[3]*s2; 5309 } 5310 5311 ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5312 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5313 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5314 PetscFunctionReturn(0); 5315 } 5316 5317 #undef __FUNCT__ 5318 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 5319 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 5320 { 5321 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5322 IS iscol=a->col,isrow=a->row; 5323 PetscErrorCode ierr; 5324 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5325 PetscInt i,nz; 5326 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5327 const MatScalar *aa=a->a,*v; 5328 PetscScalar *x,s1,*t; 5329 const PetscScalar *b; 5330 5331 PetscFunctionBegin; 5332 if (!n) PetscFunctionReturn(0); 5333 5334 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5335 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5336 t = a->solve_work; 5337 5338 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5339 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5340 5341 /* forward solve the lower triangular */ 5342 t[0] = b[*r++]; 5343 for (i=1; i<n; i++) { 5344 v = aa + ai[i]; 5345 vi = aj + ai[i]; 5346 nz = diag[i] - ai[i]; 5347 s1 = b[*r++]; 5348 while (nz--) { 5349 s1 -= (*v++)*t[*vi++]; 5350 } 5351 t[i] = s1; 5352 } 5353 /* backward solve the upper triangular */ 5354 for (i=n-1; i>=0; i--) { 5355 v = aa + diag[i] + 1; 5356 vi = aj + diag[i] + 1; 5357 nz = ai[i+1] - diag[i] - 1; 5358 s1 = t[i]; 5359 while (nz--) { 5360 s1 -= (*v++)*t[*vi++]; 5361 } 5362 x[*c--] = t[i] = aa[diag[i]]*s1; 5363 } 5364 5365 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5366 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5367 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5369 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5370 PetscFunctionReturn(0); 5371 } 5372 5373 #undef __FUNCT__ 5374 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 5375 PetscErrorCode MatSolve_SeqBAIJ_1(Mat 
A,Vec bb,Vec xx) 5376 { 5377 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5378 IS iscol = a->col,isrow = a->row; 5379 PetscErrorCode ierr; 5380 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5381 const PetscInt *rout,*cout,*r,*c; 5382 PetscScalar *x,*tmp,sum; 5383 const PetscScalar *b; 5384 const MatScalar *aa = a->a,*v; 5385 5386 PetscFunctionBegin; 5387 if (!n) PetscFunctionReturn(0); 5388 5389 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5390 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5391 tmp = a->solve_work; 5392 5393 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5394 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5395 5396 /* forward solve the lower triangular */ 5397 tmp[0] = b[r[0]]; 5398 v = aa; 5399 vi = aj; 5400 for (i=1; i<n; i++) { 5401 nz = ai[i+1] - ai[i]; 5402 sum = b[r[i]]; 5403 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5404 tmp[i] = sum; 5405 v += nz; vi += nz; 5406 } 5407 5408 /* backward solve the upper triangular */ 5409 for (i=n-1; i>=0; i--) { 5410 v = aa + adiag[i+1]+1; 5411 vi = aj + adiag[i+1]+1; 5412 nz = adiag[i]-adiag[i+1]-1; 5413 sum = tmp[i]; 5414 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5415 x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5416 } 5417 5418 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5419 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5420 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5421 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5422 ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5423 PetscFunctionReturn(0); 5424 } 5425 5426 /* 5427 Special case where the matrix was ILU(0) factored in the natural 5428 ordering. This eliminates the need for the column and row permutation. 
5429 */ 5430 #undef __FUNCT__ 5431 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 5432 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5433 { 5434 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5435 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5436 PetscErrorCode ierr; 5437 const MatScalar *aa=a->a,*v; 5438 PetscScalar *x; 5439 const PetscScalar *b; 5440 PetscScalar s1,x1; 5441 PetscInt jdx,idt,idx,nz,i; 5442 5443 PetscFunctionBegin; 5444 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5445 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5446 5447 /* forward solve the lower triangular */ 5448 idx = 0; 5449 x[0] = b[0]; 5450 for (i=1; i<n; i++) { 5451 v = aa + ai[i]; 5452 vi = aj + ai[i]; 5453 nz = diag[i] - ai[i]; 5454 idx += 1; 5455 s1 = b[idx]; 5456 while (nz--) { 5457 jdx = *vi++; 5458 x1 = x[jdx]; 5459 s1 -= v[0]*x1; 5460 v += 1; 5461 } 5462 x[idx] = s1; 5463 } 5464 /* backward solve the upper triangular */ 5465 for (i=n-1; i>=0; i--) { 5466 v = aa + diag[i] + 1; 5467 vi = aj + diag[i] + 1; 5468 nz = ai[i+1] - diag[i] - 1; 5469 idt = i; 5470 s1 = x[idt]; 5471 while (nz--) { 5472 idx = *vi++; 5473 x1 = x[idx]; 5474 s1 -= v[0]*x1; 5475 v += 1; 5476 } 5477 v = aa + diag[i]; 5478 x[idt] = v[0]*s1; 5479 } 5480 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5481 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5482 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5483 PetscFunctionReturn(0); 5484 } 5485 5486 5487 #undef __FUNCT__ 5488 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5489 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5490 { 5491 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5492 PetscErrorCode ierr; 5493 const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5494 PetscScalar *x,sum; 5495 const PetscScalar *b; 5496 const MatScalar *aa = a->a,*v; 5497 PetscInt i,nz; 5498 5499 PetscFunctionBegin; 5500 if (!n) PetscFunctionReturn(0); 5501 5502 ierr = 
VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5503 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5504 5505 /* forward solve the lower triangular */ 5506 x[0] = b[0]; 5507 v = aa; 5508 vi = aj; 5509 for (i=1; i<n; i++) { 5510 nz = ai[i+1] - ai[i]; 5511 sum = b[i]; 5512 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5513 v += nz; 5514 vi += nz; 5515 x[i] = sum; 5516 } 5517 5518 /* backward solve the upper triangular */ 5519 for (i=n-1; i>=0; i--) { 5520 v = aa + adiag[i+1] + 1; 5521 vi = aj + adiag[i+1] + 1; 5522 nz = adiag[i] - adiag[i+1]-1; 5523 sum = x[i]; 5524 PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5525 x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5526 } 5527 5528 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 5529 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5530 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5531 PetscFunctionReturn(0); 5532 } 5533 5534 /* ----------------------------------------------------------------*/ 5535 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool); 5536 5537 #undef __FUNCT__ 5538 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5539 /* 5540 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5541 */ 5542 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 5543 { 5544 Mat C=B; 5545 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5546 PetscErrorCode ierr; 5547 PetscInt i,j,k,ipvt[15]; 5548 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5549 PetscInt nz,nzL,row; 5550 MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5551 const MatScalar *v,*aa=a->a; 5552 PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 5553 PetscInt sol_ver; 5554 5555 PetscFunctionBegin; 5556 ierr = PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 5557 5558 /* generate work space needed by the factorization */ 5559 ierr 
= PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 5560 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5561 5562 for (i=0; i<n; i++) { 5563 /* zero rtmp */ 5564 /* L part */ 5565 nz = bi[i+1] - bi[i]; 5566 bjtmp = bj + bi[i]; 5567 for (j=0; j<nz; j++) { 5568 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5569 } 5570 5571 /* U part */ 5572 nz = bdiag[i] - bdiag[i+1]; 5573 bjtmp = bj + bdiag[i+1]+1; 5574 for (j=0; j<nz; j++) { 5575 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5576 } 5577 5578 /* load in initial (unfactored row) */ 5579 nz = ai[i+1] - ai[i]; 5580 ajtmp = aj + ai[i]; 5581 v = aa + bs2*ai[i]; 5582 for (j=0; j<nz; j++) { 5583 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5584 } 5585 5586 /* elimination */ 5587 bjtmp = bj + bi[i]; 5588 nzL = bi[i+1] - bi[i]; 5589 for (k=0;k < nzL;k++) { 5590 row = bjtmp[k]; 5591 pc = rtmp + bs2*row; 5592 for (flg=0,j=0; j<bs2; j++) { 5593 if (pc[j]!=0.0) { 5594 flg = 1; 5595 break; 5596 } 5597 } 5598 if (flg) { 5599 pv = b->a + bs2*bdiag[row]; 5600 PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); 5601 /*ierr = PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 5602 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5603 pv = b->a + bs2*(bdiag[row+1]+1); 5604 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5605 for (j=0; j<nz; j++) { 5606 vv = rtmp + bs2*pj[j]; 5607 PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 5608 /* ierr = PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 5609 pv += bs2; 5610 } 5611 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5612 } 5613 } 5614 5615 /* finished row so stick it into b->a */ 5616 /* L part */ 5617 pv = b->a + bs2*bi[i] ; 5618 pj = b->j + bi[i] ; 5619 nz = bi[i+1] - bi[i]; 5620 for (j=0; j<nz; j++) { 5621 ierr = 
PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5622 } 5623 5624 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5625 pv = b->a + bs2*bdiag[i]; 5626 pj = b->j + bdiag[i]; 5627 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5628 /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */ 5629 ierr = PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 5630 5631 /* U part */ 5632 pv = b->a + bs2*(bdiag[i+1]+1); 5633 pj = b->j + bdiag[i+1]+1; 5634 nz = bdiag[i] - bdiag[i+1] - 1; 5635 for (j=0; j<nz; j++) { 5636 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5637 } 5638 } 5639 5640 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5641 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5642 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 5643 C->assembled = PETSC_TRUE; 5644 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5645 PetscFunctionReturn(0); 5646 } 5647 5648 #undef __FUNCT__ 5649 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 5650 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 5651 { 5652 Mat C=B; 5653 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5654 IS isrow = b->row,isicol = b->icol; 5655 PetscErrorCode ierr; 5656 const PetscInt *r,*ic; 5657 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5658 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5659 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5660 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5661 MatScalar *v_work; 5662 PetscBool col_identity,row_identity,both_identity; 5663 5664 PetscFunctionBegin; 5665 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5666 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5667 5668 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5669 ierr = 
PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5670 5671 /* generate work space needed by dense LU factorization */ 5672 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5673 5674 for (i=0; i<n; i++) { 5675 /* zero rtmp */ 5676 /* L part */ 5677 nz = bi[i+1] - bi[i]; 5678 bjtmp = bj + bi[i]; 5679 for (j=0; j<nz; j++) { 5680 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5681 } 5682 5683 /* U part */ 5684 nz = bdiag[i] - bdiag[i+1]; 5685 bjtmp = bj + bdiag[i+1]+1; 5686 for (j=0; j<nz; j++) { 5687 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5688 } 5689 5690 /* load in initial (unfactored row) */ 5691 nz = ai[r[i]+1] - ai[r[i]]; 5692 ajtmp = aj + ai[r[i]]; 5693 v = aa + bs2*ai[r[i]]; 5694 for (j=0; j<nz; j++) { 5695 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5696 } 5697 5698 /* elimination */ 5699 bjtmp = bj + bi[i]; 5700 nzL = bi[i+1] - bi[i]; 5701 for (k=0;k < nzL;k++) { 5702 row = bjtmp[k]; 5703 pc = rtmp + bs2*row; 5704 for (flg=0,j=0; j<bs2; j++) { 5705 if (pc[j]!=0.0) { 5706 flg = 1; 5707 break; 5708 } 5709 } 5710 if (flg) { 5711 pv = b->a + bs2*bdiag[row]; 5712 PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5713 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5714 pv = b->a + bs2*(bdiag[row+1]+1); 5715 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5716 for (j=0; j<nz; j++) { 5717 PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5718 } 5719 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5720 } 5721 } 5722 5723 /* finished row so stick it into b->a */ 5724 /* L part */ 5725 pv = b->a + bs2*bi[i] ; 5726 pj = b->j + bi[i] ; 5727 nz = bi[i+1] - bi[i]; 5728 for (j=0; j<nz; j++) { 5729 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 
5730 } 5731 5732 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5733 pv = b->a + bs2*bdiag[i]; 5734 pj = b->j + bdiag[i]; 5735 /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5736 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5737 ierr = PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5738 5739 /* U part */ 5740 pv = b->a + bs2*(bdiag[i+1]+1); 5741 pj = b->j + bdiag[i+1]+1; 5742 nz = bdiag[i] - bdiag[i+1] - 1; 5743 for (j=0; j<nz; j++) { 5744 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5745 } 5746 } 5747 5748 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5749 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5750 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5751 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5752 5753 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5754 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5755 both_identity = (PetscBool) (row_identity && col_identity); 5756 if (both_identity) { 5757 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5758 } else { 5759 C->ops->solve = MatSolve_SeqBAIJ_N; 5760 } 5761 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5762 5763 C->assembled = PETSC_TRUE; 5764 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5765 PetscFunctionReturn(0); 5766 } 5767 5768 /* 5769 ilu(0) with natural ordering under new data structure. 5770 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 5771 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 
5772 */ 5773 5774 #undef __FUNCT__ 5775 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 5776 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5777 { 5778 5779 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5780 PetscErrorCode ierr; 5781 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5782 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5783 5784 PetscFunctionBegin; 5785 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5786 b = (Mat_SeqBAIJ*)(fact)->data; 5787 5788 /* allocate matrix arrays for new data structure */ 5789 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5790 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5791 b->singlemalloc = PETSC_TRUE; 5792 b->free_a = PETSC_TRUE; 5793 b->free_ij = PETSC_TRUE; 5794 fact->preallocated = PETSC_TRUE; 5795 fact->assembled = PETSC_TRUE; 5796 if (!b->diag) { 5797 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5798 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5799 } 5800 bdiag = b->diag; 5801 5802 if (n > 0) { 5803 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5804 } 5805 5806 /* set bi and bj with new data structure */ 5807 bi = b->i; 5808 bj = b->j; 5809 5810 /* L part */ 5811 bi[0] = 0; 5812 for (i=0; i<n; i++) { 5813 nz = adiag[i] - ai[i]; 5814 bi[i+1] = bi[i] + nz; 5815 aj = a->j + ai[i]; 5816 for (j=0; j<nz; j++) { 5817 *bj = aj[j]; bj++; 5818 } 5819 } 5820 5821 /* U part */ 5822 bi_temp = bi[n]; 5823 bdiag[n] = bi[n]-1; 5824 for (i=n-1; i>=0; i--) { 5825 nz = ai[i+1] - adiag[i] - 1; 5826 bi_temp = bi_temp + nz + 1; 5827 aj = a->j + adiag[i] + 1; 5828 for (j=0; j<nz; j++) { 5829 *bj = aj[j]; bj++; 5830 } 5831 /* diag[i] */ 5832 *bj = i; bj++; 5833 bdiag[i] = bi_temp - 1; 5834 } 5835 PetscFunctionReturn(0); 
5836 } 5837 5838 #undef __FUNCT__ 5839 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5840 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5841 { 5842 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5843 IS isicol; 5844 PetscErrorCode ierr; 5845 const PetscInt *r,*ic; 5846 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5847 PetscInt *bi,*cols,nnz,*cols_lvl; 5848 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5849 PetscInt i,levels,diagonal_fill; 5850 PetscBool col_identity,row_identity,both_identity; 5851 PetscReal f; 5852 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5853 PetscBT lnkbt; 5854 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5855 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5856 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5857 PetscBool missing; 5858 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5859 5860 PetscFunctionBegin; 5861 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5862 if (bs>1) { /* check shifttype */ 5863 if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 5864 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 5865 } 5866 5867 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5868 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5869 5870 f = info->fill; 5871 levels = (PetscInt)info->levels; 5872 diagonal_fill = (PetscInt)info->diagonal_fill; 5873 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5874 5875 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5876 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5877 both_identity = (PetscBool) (row_identity && col_identity); 5878 5879 if (!levels && both_identity) { 5880 /* special case: ilu(0) with natural ordering */ 5881 ierr 
= MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5882 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5883 5884 fact->factortype = MAT_FACTOR_ILU; 5885 (fact)->info.factor_mallocs = 0; 5886 (fact)->info.fill_ratio_given = info->fill; 5887 (fact)->info.fill_ratio_needed = 1.0; 5888 b = (Mat_SeqBAIJ*)(fact)->data; 5889 b->row = isrow; 5890 b->col = iscol; 5891 b->icol = isicol; 5892 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5893 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5894 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5895 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5896 PetscFunctionReturn(0); 5897 } 5898 5899 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5900 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5901 5902 /* get new row pointers */ 5903 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5904 bi[0] = 0; 5905 /* bdiag is location of diagonal in factor */ 5906 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5907 bdiag[0] = 0; 5908 5909 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5910 5911 /* create a linked list for storing column indices of the active row */ 5912 nlnk = n + 1; 5913 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5914 5915 /* initial FreeSpace size is f*(ai[n]+1) */ 5916 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5917 current_space = free_space; 5918 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5919 current_space_lvl = free_space_lvl; 5920 5921 for (i=0; i<n; i++) { 5922 nzi = 0; 5923 /* copy current row into linked list */ 5924 nnz = ai[r[i]+1] - ai[r[i]]; 5925 if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5926 cols = aj + ai[r[i]]; 5927 
lnk[i] = -1; /* marker to indicate if diagonal exists */ 5928 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5929 nzi += nlnk; 5930 5931 /* make sure diagonal entry is included */ 5932 if (diagonal_fill && lnk[i] == -1) { 5933 fm = n; 5934 while (lnk[fm] < i) fm = lnk[fm]; 5935 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5936 lnk[fm] = i; 5937 lnk_lvl[i] = 0; 5938 nzi++; dcount++; 5939 } 5940 5941 /* add pivot rows into the active row */ 5942 nzbd = 0; 5943 prow = lnk[n]; 5944 while (prow < i) { 5945 nnz = bdiag[prow]; 5946 cols = bj_ptr[prow] + nnz + 1; 5947 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5948 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5949 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5950 nzi += nlnk; 5951 prow = lnk[prow]; 5952 nzbd++; 5953 } 5954 bdiag[i] = nzbd; 5955 bi[i+1] = bi[i] + nzi; 5956 5957 /* if free space is not available, make more free space */ 5958 if (current_space->local_remaining<nzi) { 5959 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5960 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5961 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5962 reallocs++; 5963 } 5964 5965 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5966 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5967 bj_ptr[i] = current_space->array; 5968 bjlvl_ptr[i] = current_space_lvl->array; 5969 5970 /* make sure the active row i has diagonal entry */ 5971 if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5972 5973 current_space->array += nzi; 5974 current_space->local_used += nzi; 5975 current_space->local_remaining -= nzi; 5976 current_space_lvl->array += nzi; 5977 
current_space_lvl->local_used += nzi; 5978 current_space_lvl->local_remaining -= nzi; 5979 } 5980 5981 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5982 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5983 5984 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5985 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5986 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5987 5988 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5989 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5990 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 5991 5992 #if defined(PETSC_USE_INFO) 5993 { 5994 PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5995 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5996 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5997 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5998 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5999 if (diagonal_fill) { 6000 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 6001 } 6002 } 6003 #endif 6004 6005 /* put together the new matrix */ 6006 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6007 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6008 b = (Mat_SeqBAIJ*)(fact)->data; 6009 b->free_a = PETSC_TRUE; 6010 b->free_ij = PETSC_TRUE; 6011 b->singlemalloc = PETSC_FALSE; 6012 ierr = PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6013 b->j = bj; 6014 b->i = bi; 6015 b->diag = bdiag; 6016 b->free_diag = PETSC_TRUE; 6017 b->ilen = 0; 6018 b->imax = 0; 6019 b->row = isrow; 6020 b->col = iscol; 6021 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6022 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6023 b->icol = isicol; 6024 ierr = 
PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6025 /* In b structure: Free imax, ilen, old a, old j. 6026 Allocate bdiag, solve_work, new a, new j */ 6027 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 6028 b->maxnz = b->nz = bdiag[0]+1; 6029 fact->info.factor_mallocs = reallocs; 6030 fact->info.fill_ratio_given = f; 6031 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 6032 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 6033 PetscFunctionReturn(0); 6034 } 6035 6036 /* 6037 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 6038 except that the data structure of Mat_SeqAIJ is slightly different. 6039 Not a good example of code reuse. 6040 */ 6041 #undef __FUNCT__ 6042 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 6043 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 6044 { 6045 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 6046 IS isicol; 6047 PetscErrorCode ierr; 6048 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 6049 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 6050 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 6051 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 6052 PetscBool col_identity,row_identity,both_identity,flg; 6053 PetscReal f; 6054 6055 PetscFunctionBegin; 6056 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 6057 if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 6058 6059 f = info->fill; 6060 levels = (PetscInt)info->levels; 6061 diagonal_fill = (PetscInt)info->diagonal_fill; 6062 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 6063 6064 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6065 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 
6066 both_identity = (PetscBool) (row_identity && col_identity); 6067 6068 if (!levels && both_identity) { /* special case copy the nonzero structure */ 6069 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 6070 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6071 6072 fact->factortype = MAT_FACTOR_ILU; 6073 b = (Mat_SeqBAIJ*)fact->data; 6074 b->row = isrow; 6075 b->col = iscol; 6076 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6077 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6078 b->icol = isicol; 6079 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6080 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6081 PetscFunctionReturn(0); 6082 } 6083 6084 /* general case perform the symbolic factorization */ 6085 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 6086 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 6087 6088 /* get new row pointers */ 6089 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 6090 ainew[0] = 0; 6091 /* don't know how many column pointers are needed so estimate */ 6092 jmax = (PetscInt)(f*ai[n] + 1); 6093 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 6094 /* ajfill is level of fill for each fill entry */ 6095 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 6096 /* fill is a linked list of nonzeros in active row */ 6097 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 6098 /* im is level for each filled value */ 6099 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 6100 /* dloc is location of diagonal in factor */ 6101 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 6102 dloc[0] = 0; 6103 for (prow=0; prow<n; prow++) { 6104 6105 /* copy prow into linked list */ 6106 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 6107 if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: 
row in original ordering %D in permuted ordering %D",r[prow],prow); 6108 xi = aj + ai[r[prow]]; 6109 fill[n] = n; 6110 fill[prow] = -1; /* marker for diagonal entry */ 6111 while (nz--) { 6112 fm = n; 6113 idx = ic[*xi++]; 6114 do { 6115 m = fm; 6116 fm = fill[m]; 6117 } while (fm < idx); 6118 fill[m] = idx; 6119 fill[idx] = fm; 6120 im[idx] = 0; 6121 } 6122 6123 /* make sure diagonal entry is included */ 6124 if (diagonal_fill && fill[prow] == -1) { 6125 fm = n; 6126 while (fill[fm] < prow) fm = fill[fm]; 6127 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6128 fill[fm] = prow; 6129 im[prow] = 0; 6130 nzf++; 6131 dcount++; 6132 } 6133 6134 nzi = 0; 6135 row = fill[n]; 6136 while (row < prow) { 6137 incrlev = im[row] + 1; 6138 nz = dloc[row]; 6139 xi = ajnew + ainew[row] + nz + 1; 6140 flev = ajfill + ainew[row] + nz + 1; 6141 nnz = ainew[row+1] - ainew[row] - nz - 1; 6142 fm = row; 6143 while (nnz-- > 0) { 6144 idx = *xi++; 6145 if (*flev + incrlev > levels) { 6146 flev++; 6147 continue; 6148 } 6149 do { 6150 m = fm; 6151 fm = fill[m]; 6152 } while (fm < idx); 6153 if (fm != idx) { 6154 im[idx] = *flev + incrlev; 6155 fill[m] = idx; 6156 fill[idx] = fm; 6157 fm = idx; 6158 nzf++; 6159 } else { 6160 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 6161 } 6162 flev++; 6163 } 6164 row = fill[row]; 6165 nzi++; 6166 } 6167 /* copy new filled row into permanent storage */ 6168 ainew[prow+1] = ainew[prow] + nzf; 6169 if (ainew[prow+1] > jmax) { 6170 6171 /* estimate how much additional space we will need */ 6172 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6173 /* just double the memory each time */ 6174 PetscInt maxadd = jmax; 6175 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 6176 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 6177 jmax += maxadd; 6178 6179 /* allocate a longer ajnew and ajfill */ 6180 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6181 ierr = 
PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6182 ierr = PetscFree(ajnew);CHKERRQ(ierr); 6183 ajnew = xitmp; 6184 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6185 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6186 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6187 ajfill = xitmp; 6188 reallocate++; /* count how many reallocations are needed */ 6189 } 6190 xitmp = ajnew + ainew[prow]; 6191 flev = ajfill + ainew[prow]; 6192 dloc[prow] = nzi; 6193 fm = fill[n]; 6194 while (nzf--) { 6195 *xitmp++ = fm; 6196 *flev++ = im[fm]; 6197 fm = fill[fm]; 6198 } 6199 /* make sure row has diagonal entry */ 6200 if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 6201 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6202 } 6203 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6204 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 6205 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6206 ierr = PetscFree(fill);CHKERRQ(ierr); 6207 ierr = PetscFree(im);CHKERRQ(ierr); 6208 6209 #if defined(PETSC_USE_INFO) 6210 { 6211 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 6212 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6213 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6214 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6215 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6216 if (diagonal_fill) { 6217 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6218 } 6219 } 6220 #endif 6221 6222 /* put together the new matrix */ 6223 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6224 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6225 b = (Mat_SeqBAIJ*)fact->data; 6226 b->free_a = 
PETSC_TRUE; 6227 b->free_ij = PETSC_TRUE; 6228 b->singlemalloc = PETSC_FALSE; 6229 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6230 b->j = ajnew; 6231 b->i = ainew; 6232 for (i=0; i<n; i++) dloc[i] += ainew[i]; 6233 b->diag = dloc; 6234 b->free_diag = PETSC_TRUE; 6235 b->ilen = 0; 6236 b->imax = 0; 6237 b->row = isrow; 6238 b->col = iscol; 6239 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6240 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6241 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6242 b->icol = isicol; 6243 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6244 /* In b structure: Free imax, ilen, old a, old j. 6245 Allocate dloc, solve_work, new a, new j */ 6246 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 6247 b->maxnz = b->nz = ainew[n]; 6248 6249 fact->info.factor_mallocs = reallocate; 6250 fact->info.fill_ratio_given = f; 6251 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 6252 6253 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6254 PetscFunctionReturn(0); 6255 } 6256 6257 #undef __FUNCT__ 6258 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6259 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 6260 { 6261 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 6262 /* int i,*AJ=a->j,nz=a->nz; */ 6263 6264 PetscFunctionBegin; 6265 /* Undo Column scaling */ 6266 /* while (nz--) { */ 6267 /* AJ[i] = AJ[i]/4; */ 6268 /* } */ 6269 /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 6270 A->ops->setunfactored = PETSC_NULL; 6271 PetscFunctionReturn(0); 6272 } 6273 6274 #undef __FUNCT__ 6275 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6276 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 6277 { 6278 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6279 PetscInt *AJ=a->j,nz=a->nz; 6280 unsigned short *aj=(unsigned short *)AJ; 6281 6282 PetscFunctionBegin; 6283 /* Is this really necessary? */ 6284 while (nz--) { 6285 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 6286 } 6287 A->ops->setunfactored = PETSC_NULL; 6288 PetscFunctionReturn(0); 6289 } 6290 6291 6292