1 /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/ 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include "src/mat/impls/baij/seq/baij.h" 7 #include "src/vec/vecimpl.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 17 int *diag = a->diag; 18 MatScalar *aa=a->a,*v; 19 PetscScalar s1,*x,*b; 20 21 PetscFunctionBegin; 22 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 23 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25 26 /* forward solve the U^T */ 27 for (i=0; i<n; i++) { 28 29 v = aa + diag[i]; 30 /* multiply by the inverse of the block diagonal */ 31 s1 = (*v++)*x[i]; 32 vi = aj + diag[i] + 1; 33 nz = ai[i+1] - diag[i] - 1; 34 while (nz--) { 35 x[*vi++] -= (*v++)*s1; 36 } 37 x[i] = s1; 38 } 39 /* backward solve the L^T */ 40 for (i=n-1; i>=0; i--){ 41 v = aa + diag[i] - 1; 42 vi = aj + diag[i] - 1; 43 nz = diag[i] - ai[i]; 44 s1 = x[i]; 45 while (nz--) { 46 x[*vi--] -= (*v--)*s1; 47 } 48 } 49 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51 PetscLogFlops(2*(a->nz) - A->n); 52 PetscFunctionReturn(0); 53 } 54 55 #undef __FUNCT__ 56 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 57 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 58 { 59 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 60 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61 int *diag = a->diag,oidx; 62 MatScalar *aa=a->a,*v; 63 PetscScalar s1,s2,x1,x2; 64 PetscScalar *x,*b; 65 66 PetscFunctionBegin; 67 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 68 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 69 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 70 71 /* forward solve the U^T */ 72 idx = 0; 73 for (i=0; i<n; i++) { 74 75 v = aa + 4*diag[i]; 76 /* multiply by the inverse of the block diagonal */ 77 x1 = x[idx]; x2 = x[1+idx]; 78 s1 = v[0]*x1 + v[1]*x2; 79 s2 = v[2]*x1 + v[3]*x2; 80 v += 4; 81 82 vi = aj + diag[i] + 1; 83 nz = ai[i+1] - diag[i] - 1; 84 while (nz--) { 85 oidx = 2*(*vi++); 86 x[oidx] -= v[0]*s1 + v[1]*s2; 87 x[oidx+1] -= v[2]*s1 + v[3]*s2; 88 v += 4; 89 } 90 x[idx] = s1;x[1+idx] = s2; 91 idx += 2; 92 } 93 /* backward solve the L^T */ 94 for (i=n-1; i>=0; i--){ 95 v = aa + 4*diag[i] - 4; 96 vi = aj + diag[i] - 1; 97 nz = diag[i] - ai[i]; 98 idt = 2*i; 99 s1 = x[idt]; s2 = x[1+idt]; 100 while (nz--) { 101 idx = 2*(*vi--); 102 x[idx] -= v[0]*s1 + v[1]*s2; 103 x[idx+1] -= v[2]*s1 + v[3]*s2; 104 v -= 4; 105 } 106 } 107 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109 PetscLogFlops(2*4*(a->nz) - 2*A->n); 110 PetscFunctionReturn(0); 111 } 112 113 #undef __FUNCT__ 114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 115 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 116 { 117 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 118 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 119 int *diag = a->diag,oidx; 120 MatScalar *aa=a->a,*v; 121 PetscScalar s1,s2,s3,x1,x2,x3; 122 PetscScalar *x,*b; 123 124 PetscFunctionBegin; 125 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 126 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 127 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 128 129 /* forward solve the U^T */ 130 idx = 0; 131 for (i=0; i<n; i++) { 132 133 v = aa + 9*diag[i]; 134 /* multiply by the inverse of the block diagonal */ 135 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 136 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 137 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 138 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 139 v += 9; 140 141 vi = aj + diag[i] + 1; 142 nz = ai[i+1] - diag[i] - 1; 143 while (nz--) { 144 oidx = 3*(*vi++); 145 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 146 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 147 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 148 v += 9; 149 } 150 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 151 idx += 3; 152 } 153 /* backward solve the L^T */ 154 for (i=n-1; i>=0; i--){ 155 v = aa + 9*diag[i] - 9; 156 vi = aj + diag[i] - 1; 157 nz = diag[i] - ai[i]; 158 idt = 3*i; 159 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 160 while (nz--) { 161 idx = 3*(*vi--); 162 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 163 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 164 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 165 v -= 9; 166 } 167 } 168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 170 PetscLogFlops(2*9*(a->nz) - 3*A->n); 171 PetscFunctionReturn(0); 172 } 173 174 #undef __FUNCT__ 175 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 176 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 177 { 178 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 179 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 180 int *diag = a->diag,oidx; 181 MatScalar *aa=a->a,*v; 182 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 183 PetscScalar *x,*b; 184 185 PetscFunctionBegin; 186 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 187 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 188 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 189 190 /* forward solve the U^T */ 191 idx = 0; 192 for (i=0; i<n; i++) { 193 194 v = aa + 16*diag[i]; 195 /* multiply by the inverse of the block diagonal */ 196 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 197 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 198 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 199 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 200 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 201 v += 16; 202 203 vi = aj + diag[i] + 1; 204 nz = ai[i+1] - diag[i] - 1; 205 while (nz--) { 206 oidx = 4*(*vi++); 207 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 208 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 209 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 210 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 211 v += 16; 212 } 213 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 214 idx += 4; 215 } 216 /* backward solve the L^T */ 217 for (i=n-1; i>=0; i--){ 218 v = aa + 16*diag[i] - 16; 219 vi = aj + diag[i] - 1; 220 nz = diag[i] - ai[i]; 221 idt = 4*i; 222 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 223 while (nz--) { 224 idx = 4*(*vi--); 225 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 226 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 227 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 228 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 229 v -= 16; 230 } 231 } 232 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234 PetscLogFlops(2*16*(a->nz) - 4*A->n); 235 PetscFunctionReturn(0); 236 } 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 240 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 241 { 242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 244 int *diag = a->diag,oidx; 245 MatScalar *aa=a->a,*v; 246 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 247 PetscScalar *x,*b; 248 249 PetscFunctionBegin; 250 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 251 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 252 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 253 254 /* forward solve the U^T */ 255 idx = 0; 256 for (i=0; i<n; i++) { 257 258 v = aa + 25*diag[i]; 259 /* multiply by the inverse of the block diagonal */ 260 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 261 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 262 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 263 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 264 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 265 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 266 v += 25; 267 268 vi = aj + diag[i] + 1; 269 nz = ai[i+1] - diag[i] - 1; 270 while (nz--) { 271 oidx = 5*(*vi++); 272 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 273 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 274 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 275 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 276 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 277 v += 25; 278 } 279 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 280 idx += 5; 281 } 282 /* backward solve the L^T */ 283 for (i=n-1; i>=0; i--){ 284 v = aa + 25*diag[i] - 25; 285 vi = aj + diag[i] - 1; 286 nz = diag[i] - ai[i]; 287 idt = 5*i; 288 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 289 while (nz--) { 290 idx = 5*(*vi--); 291 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 292 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 293 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 294 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 295 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 296 v -= 25; 297 } 298 } 299 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 300 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 301 PetscLogFlops(2*25*(a->nz) - 5*A->n); 302 PetscFunctionReturn(0); 303 } 304 305 #undef __FUNCT__ 306 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 307 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 308 { 309 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 310 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 311 int *diag = a->diag,oidx; 312 MatScalar *aa=a->a,*v; 313 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 314 PetscScalar *x,*b; 315 316 PetscFunctionBegin; 317 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 318 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 319 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 320 321 /* forward solve the U^T */ 322 idx = 0; 323 for (i=0; i<n; i++) { 324 325 v = aa + 36*diag[i]; 326 /* multiply by the inverse of the block diagonal */ 327 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 328 x6 = x[5+idx]; 329 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 330 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 331 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 332 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 333 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 334 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 335 v += 36; 336 337 vi = aj + diag[i] + 1; 338 nz = ai[i+1] - diag[i] - 1; 339 while (nz--) { 340 oidx = 6*(*vi++); 341 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 342 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 343 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 344 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 345 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 346 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 347 v += 36; 348 } 349 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 350 x[5+idx] = s6; 351 idx += 6; 352 } 353 /* backward solve the L^T */ 354 for (i=n-1; i>=0; i--){ 355 v = aa + 36*diag[i] - 36; 356 vi = aj + diag[i] - 1; 357 nz = diag[i] - ai[i]; 358 idt = 6*i; 359 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 360 s6 = x[5+idt]; 361 while (nz--) { 362 idx = 6*(*vi--); 363 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 364 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 365 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 366 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 367 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 368 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 369 v -= 36; 370 } 371 } 372 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 373 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 374 PetscLogFlops(2*36*(a->nz) - 6*A->n); 375 PetscFunctionReturn(0); 376 } 377 378 #undef __FUNCT__ 379 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 380 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 381 { 382 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 383 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 384 int *diag = a->diag,oidx; 385 MatScalar *aa=a->a,*v; 386 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 387 PetscScalar *x,*b; 388 389 PetscFunctionBegin; 390 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 391 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 392 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 393 394 /* forward solve the U^T */ 395 idx = 0; 396 for (i=0; i<n; i++) { 397 398 v = aa + 49*diag[i]; 399 /* multiply by the inverse of the block diagonal */ 400 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 401 x6 = x[5+idx]; x7 = x[6+idx]; 402 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 403 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 404 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 405 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 406 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 407 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 408 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 409 v += 49; 410 411 vi = aj + diag[i] + 1; 412 nz = ai[i+1] - diag[i] - 1; 413 while (nz--) { 414 oidx = 7*(*vi++); 415 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 416 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 417 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 418 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 419 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 420 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 421 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 422 v += 49; 423 } 424 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 425 x[5+idx] = s6;x[6+idx] = s7; 426 idx += 7; 427 } 428 /* backward solve the L^T */ 429 for (i=n-1; i>=0; i--){ 430 v = aa + 49*diag[i] - 49; 431 vi = aj + diag[i] - 1; 432 nz = diag[i] - ai[i]; 433 idt = 7*i; 434 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 435 s6 = x[5+idt];s7 = x[6+idt]; 436 while (nz--) { 437 idx = 7*(*vi--); 438 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 439 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 440 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 441 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 442 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 443 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 444 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 445 v -= 49; 446 } 447 } 448 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 449 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 450 PetscLogFlops(2*49*(a->nz) - 7*A->n); 451 PetscFunctionReturn(0); 452 } 453 454 /*---------------------------------------------------------------------------------------------*/ 455 #undef __FUNCT__ 456 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 457 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 458 { 459 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 460 IS iscol=a->col,isrow=a->row; 461 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 462 int *diag = a->diag; 463 MatScalar *aa=a->a,*v; 464 PetscScalar s1,*x,*b,*t; 465 466 PetscFunctionBegin; 467 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 468 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 469 t = a->solve_work; 470 471 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 472 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 473 474 /* copy the b into temp work space according to permutation */ 475 for (i=0; i<n; i++) { 476 t[i] = b[c[i]]; 477 } 478 479 /* forward solve the U^T */ 480 for (i=0; i<n; i++) { 481 482 v = aa + diag[i]; 483 /* multiply by the inverse of the block diagonal */ 484 s1 = (*v++)*t[i]; 485 vi = aj + diag[i] + 1; 486 nz = ai[i+1] - diag[i] - 1; 487 while (nz--) { 488 t[*vi++] -= (*v++)*s1; 489 } 490 t[i] = s1; 491 } 492 /* backward solve the L^T */ 493 for (i=n-1; i>=0; i--){ 494 v = aa + diag[i] - 1; 495 vi = aj + diag[i] - 1; 496 nz = diag[i] - ai[i]; 497 s1 = t[i]; 498 while (nz--) { 499 t[*vi--] -= (*v--)*s1; 500 } 501 } 502 503 /* copy t into x according to permutation */ 504 for (i=0; i<n; i++) { 505 x[r[i]] = t[i]; 506 } 507 508 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 509 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 510 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 511 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 512 PetscLogFlops(2*(a->nz) - A->n); 513 PetscFunctionReturn(0); 514 } 515 516 #undef __FUNCT__ 517 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 518 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 519 { 520 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 521 IS iscol=a->col,isrow=a->row; 522 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 523 int *diag = a->diag,ii,ic,ir,oidx; 524 MatScalar *aa=a->a,*v; 525 PetscScalar s1,s2,x1,x2; 526 PetscScalar *x,*b,*t; 527 528 PetscFunctionBegin; 529 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 530 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 531 t = a->solve_work; 532 533 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 534 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 535 536 /* copy the b into temp work space according to permutation */ 537 ii = 0; 538 for (i=0; i<n; i++) { 539 ic = 2*c[i]; 540 t[ii] = b[ic]; 541 t[ii+1] = b[ic+1]; 542 ii += 2; 543 } 544 545 /* forward solve the U^T */ 546 idx = 0; 547 for (i=0; i<n; i++) { 548 549 v = aa + 4*diag[i]; 550 /* multiply by the inverse of the block diagonal */ 551 x1 = t[idx]; x2 = t[1+idx]; 552 s1 = v[0]*x1 + v[1]*x2; 553 s2 = v[2]*x1 + v[3]*x2; 554 v += 4; 555 556 vi = aj + diag[i] + 1; 557 nz = ai[i+1] - diag[i] - 1; 558 while (nz--) { 559 oidx = 2*(*vi++); 560 t[oidx] -= v[0]*s1 + v[1]*s2; 561 t[oidx+1] -= v[2]*s1 + v[3]*s2; 562 v += 4; 563 } 564 t[idx] = s1;t[1+idx] = s2; 565 idx += 2; 566 } 567 /* backward solve the L^T */ 568 for (i=n-1; i>=0; i--){ 569 v = aa + 4*diag[i] - 4; 570 vi = aj + diag[i] - 1; 571 nz = diag[i] - ai[i]; 572 idt = 2*i; 573 s1 = t[idt]; s2 = t[1+idt]; 574 while (nz--) { 575 idx = 2*(*vi--); 576 t[idx] -= v[0]*s1 + v[1]*s2; 577 t[idx+1] -= v[2]*s1 + v[3]*s2; 578 v -= 4; 579 } 580 } 581 582 /* copy t into x according to permutation */ 583 ii = 0; 584 for (i=0; i<n; i++) { 585 ir = 2*r[i]; 586 x[ir] = t[ii]; 587 x[ir+1] = t[ii+1]; 588 ii += 2; 589 } 590 591 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 592 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 593 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 594 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 595 PetscLogFlops(2*4*(a->nz) - 2*A->n); 596 PetscFunctionReturn(0); 597 } 598 599 #undef __FUNCT__ 600 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 601 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 602 { 603 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 604 IS iscol=a->col,isrow=a->row; 605 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 606 int *diag = a->diag,ii,ic,ir,oidx; 607 MatScalar *aa=a->a,*v; 608 PetscScalar s1,s2,s3,x1,x2,x3; 609 PetscScalar *x,*b,*t; 610 611 PetscFunctionBegin; 612 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 613 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 614 t = a->solve_work; 615 616 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 617 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 618 619 /* copy the b into temp work space according to permutation */ 620 ii = 0; 621 for (i=0; i<n; i++) { 622 ic = 3*c[i]; 623 t[ii] = b[ic]; 624 t[ii+1] = b[ic+1]; 625 t[ii+2] = b[ic+2]; 626 ii += 3; 627 } 628 629 /* forward solve the U^T */ 630 idx = 0; 631 for (i=0; i<n; i++) { 632 633 v = aa + 9*diag[i]; 634 /* multiply by the inverse of the block diagonal */ 635 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 636 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 637 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 638 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 639 v += 9; 640 641 vi = aj + diag[i] + 1; 642 nz = ai[i+1] - diag[i] - 1; 643 while (nz--) { 644 oidx = 3*(*vi++); 645 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 646 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 647 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 648 v += 9; 649 } 650 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 651 idx += 3; 652 } 653 /* backward solve the L^T */ 654 for (i=n-1; i>=0; i--){ 655 v = aa + 9*diag[i] - 9; 656 vi = aj + diag[i] - 1; 657 nz = diag[i] - ai[i]; 658 idt = 3*i; 659 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 660 while (nz--) { 661 idx = 3*(*vi--); 662 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 663 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 664 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 665 v -= 9; 666 } 667 } 668 669 /* copy t into x according to permutation */ 670 ii = 0; 671 for (i=0; i<n; i++) { 672 ir = 3*r[i]; 673 x[ir] = t[ii]; 674 x[ir+1] = t[ii+1]; 675 x[ir+2] = t[ii+2]; 676 ii += 3; 677 } 678 679 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 680 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 681 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 682 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 683 PetscLogFlops(2*9*(a->nz) - 3*A->n); 684 PetscFunctionReturn(0); 685 } 686 687 #undef __FUNCT__ 688 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 689 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 690 { 691 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 692 IS iscol=a->col,isrow=a->row; 693 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 694 int *diag = a->diag,ii,ic,ir,oidx; 695 MatScalar *aa=a->a,*v; 696 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 697 PetscScalar *x,*b,*t; 698 699 PetscFunctionBegin; 700 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 701 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 702 t = a->solve_work; 703 704 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 705 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 706 707 /* copy the b into temp work space according to permutation */ 708 ii = 0; 709 for (i=0; i<n; i++) { 710 ic = 4*c[i]; 711 t[ii] = b[ic]; 712 t[ii+1] = b[ic+1]; 713 t[ii+2] = b[ic+2]; 714 t[ii+3] = b[ic+3]; 715 ii += 4; 716 } 717 718 /* forward solve the U^T */ 719 idx = 0; 720 for (i=0; i<n; i++) { 721 722 v = aa + 16*diag[i]; 723 /* multiply by the inverse of the block diagonal */ 724 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 725 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 726 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 727 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 728 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 729 v += 16; 730 731 vi = aj + diag[i] + 1; 732 nz = ai[i+1] - diag[i] - 1; 733 while (nz--) { 734 oidx = 4*(*vi++); 735 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 736 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 737 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 738 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 739 v += 16; 740 } 741 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 742 idx += 4; 743 } 744 /* backward solve the L^T */ 745 for (i=n-1; i>=0; i--){ 746 v = aa + 16*diag[i] - 16; 747 vi = aj + diag[i] - 1; 748 nz = diag[i] - ai[i]; 749 idt = 4*i; 750 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 751 while (nz--) { 752 idx = 4*(*vi--); 753 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 754 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 755 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 756 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 757 v -= 16; 758 } 759 } 760 761 /* copy t into x according to permutation */ 762 ii = 0; 763 for (i=0; i<n; i++) { 764 ir = 4*r[i]; 765 x[ir] = t[ii]; 766 x[ir+1] = t[ii+1]; 767 x[ir+2] = t[ii+2]; 768 x[ir+3] = t[ii+3]; 769 ii += 4; 770 } 771 772 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 773 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 774 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 775 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 776 PetscLogFlops(2*16*(a->nz) - 4*A->n); 777 PetscFunctionReturn(0); 778 } 779 780 #undef __FUNCT__ 781 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 782 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 783 { 784 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 785 IS iscol=a->col,isrow=a->row; 786 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 787 int *diag = a->diag,ii,ic,ir,oidx; 788 MatScalar *aa=a->a,*v; 789 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 790 PetscScalar *x,*b,*t; 791 792 PetscFunctionBegin; 793 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 794 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 795 t = a->solve_work; 796 797 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 798 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 799 800 /* copy the b into temp work space according to permutation */ 801 ii = 0; 802 for (i=0; i<n; i++) { 803 ic = 5*c[i]; 804 t[ii] = b[ic]; 805 t[ii+1] = b[ic+1]; 806 t[ii+2] = b[ic+2]; 807 t[ii+3] = b[ic+3]; 808 t[ii+4] = b[ic+4]; 809 ii += 5; 810 } 811 812 /* forward solve the U^T */ 813 idx = 0; 814 for (i=0; i<n; i++) { 815 816 v = aa + 25*diag[i]; 817 /* multiply by the inverse of the block diagonal */ 818 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 819 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 820 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 821 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 822 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 823 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 824 v += 25; 825 826 vi = aj + diag[i] + 1; 827 nz = ai[i+1] - diag[i] - 1; 828 while (nz--) { 829 oidx = 5*(*vi++); 830 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 831 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 832 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 833 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 834 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 835 v += 25; 836 } 837 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 838 idx += 5; 839 } 840 /* backward solve the L^T */ 841 for (i=n-1; i>=0; i--){ 842 v = aa + 25*diag[i] - 25; 843 vi = aj + diag[i] - 1; 844 nz = diag[i] - ai[i]; 845 idt = 5*i; 846 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 847 while (nz--) { 848 idx = 5*(*vi--); 849 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v -= 25; 855 } 856 } 857 858 /* copy t into x according to permutation */ 859 ii = 0; 860 for (i=0; i<n; i++) { 861 ir = 5*r[i]; 862 x[ir] = t[ii]; 863 x[ir+1] = t[ii+1]; 864 x[ir+2] = t[ii+2]; 865 x[ir+3] = t[ii+3]; 866 x[ir+4] = t[ii+4]; 867 ii += 5; 868 } 869 870 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 871 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 872 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 873 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 874 PetscLogFlops(2*25*(a->nz) - 5*A->n); 875 PetscFunctionReturn(0); 876 } 877 878 #undef __FUNCT__ 879 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 880 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 881 { 882 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 883 IS iscol=a->col,isrow=a->row; 884 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 885 int *diag = a->diag,ii,ic,ir,oidx; 886 MatScalar *aa=a->a,*v; 887 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 888 PetscScalar *x,*b,*t; 889 890 PetscFunctionBegin; 891 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 893 t = a->solve_work; 894 895 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 896 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 897 898 /* copy the b into temp work space according to permutation */ 899 ii = 0; 900 for (i=0; i<n; i++) { 901 ic = 6*c[i]; 902 t[ii] = b[ic]; 903 t[ii+1] = b[ic+1]; 904 t[ii+2] = b[ic+2]; 905 t[ii+3] = b[ic+3]; 906 t[ii+4] = b[ic+4]; 907 t[ii+5] = b[ic+5]; 908 ii += 6; 909 } 910 911 /* forward solve the U^T */ 912 idx = 0; 913 for (i=0; i<n; i++) { 914 915 v = aa + 36*diag[i]; 916 /* multiply by the inverse of the block diagonal */ 917 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 918 x6 = t[5+idx]; 919 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 920 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 921 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 922 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 923 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 924 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 925 v += 36; 926 927 vi = aj + diag[i] + 1; 928 nz = ai[i+1] - diag[i] - 1; 929 while (nz--) { 930 oidx = 6*(*vi++); 931 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 932 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 933 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 934 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 935 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 936 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 937 v += 36; 938 } 939 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 940 t[5+idx] = s6; 941 idx += 6; 942 } 943 /* backward solve the L^T */ 944 for (i=n-1; i>=0; i--){ 945 v = aa + 36*diag[i] - 36; 946 vi = aj + diag[i] - 1; 947 nz = diag[i] - ai[i]; 948 idt = 6*i; 949 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 950 s6 = t[5+idt]; 951 while (nz--) { 952 idx = 6*(*vi--); 953 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 954 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 955 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 956 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 957 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 958 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 959 v -= 36; 960 } 961 } 962 963 /* copy t into x according to permutation */ 964 ii = 0; 965 for (i=0; i<n; i++) { 966 ir = 6*r[i]; 967 x[ir] = t[ii]; 968 x[ir+1] = t[ii+1]; 969 x[ir+2] = t[ii+2]; 970 x[ir+3] = t[ii+3]; 971 x[ir+4] = t[ii+4]; 972 x[ir+5] = t[ii+5]; 973 ii += 6; 974 } 975 976 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 977 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 978 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 979 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 980 PetscLogFlops(2*36*(a->nz) - 6*A->n); 981 PetscFunctionReturn(0); 982 } 983 984 #undef __FUNCT__ 985 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 986 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 987 { 988 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 989 IS iscol=a->col,isrow=a->row; 990 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 991 int *diag = a->diag,ii,ic,ir,oidx; 992 MatScalar *aa=a->a,*v; 993 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 994 PetscScalar *x,*b,*t; 995 996 PetscFunctionBegin; 997 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 998 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 999 t = a->solve_work; 1000 1001 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1002 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1003 1004 /* copy the b into temp work space according to permutation */ 1005 ii = 0; 1006 for (i=0; i<n; i++) { 1007 ic = 7*c[i]; 1008 t[ii] = b[ic]; 1009 t[ii+1] = b[ic+1]; 1010 t[ii+2] = b[ic+2]; 1011 t[ii+3] = b[ic+3]; 1012 t[ii+4] = b[ic+4]; 1013 t[ii+5] = b[ic+5]; 1014 t[ii+6] = b[ic+6]; 1015 ii += 7; 1016 } 1017 1018 /* forward solve the U^T */ 1019 idx = 0; 1020 for (i=0; i<n; i++) { 1021 1022 v = aa + 49*diag[i]; 1023 /* multiply by the inverse of the block diagonal */ 1024 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1025 x6 = t[5+idx]; x7 = t[6+idx]; 1026 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1027 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1028 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1029 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1030 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1031 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1032 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1033 v += 49; 1034 1035 vi = aj + diag[i] + 1; 1036 nz = ai[i+1] - diag[i] - 1; 1037 while (nz--) { 1038 oidx = 7*(*vi++); 1039 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1040 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1041 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1042 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1043 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1044 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1045 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1046 v += 49; 1047 } 1048 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1049 t[5+idx] = s6;t[6+idx] = s7; 1050 idx += 7; 1051 } 1052 /* backward solve the L^T */ 1053 for (i=n-1; i>=0; i--){ 1054 v = aa + 49*diag[i] - 49; 1055 vi = aj + diag[i] - 1; 1056 nz = diag[i] - ai[i]; 1057 idt = 7*i; 1058 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1059 s6 = t[5+idt];s7 = t[6+idt]; 1060 while (nz--) { 1061 idx = 7*(*vi--); 1062 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v -= 49; 1070 } 1071 } 1072 1073 /* copy t into x according to permutation */ 1074 ii = 0; 1075 for (i=0; i<n; i++) { 1076 ir = 7*r[i]; 1077 x[ir] = t[ii]; 1078 x[ir+1] = t[ii+1]; 1079 x[ir+2] = t[ii+2]; 1080 x[ir+3] = t[ii+3]; 1081 x[ir+4] = t[ii+4]; 1082 x[ir+5] = t[ii+5]; 1083 x[ir+6] = t[ii+6]; 1084 ii += 7; 1085 } 1086 1087 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1088 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1089 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1090 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1091 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1092 PetscFunctionReturn(0); 1093 } 1094 1095 /* ----------------------------------------------------------- */ 1096 #undef __FUNCT__ 1097 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1098 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1099 { 1100 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1101 IS iscol=a->col,isrow=a->row; 1102 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1103 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1104 MatScalar *aa=a->a,*v; 1105 PetscScalar *x,*b,*s,*t,*ls; 1106 1107 PetscFunctionBegin; 1108 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1109 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1110 t = a->solve_work; 1111 1112 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1113 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1114 1115 /* forward solve the lower triangular */ 1116 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1117 for (i=1; i<n; i++) { 1118 v = aa + bs2*ai[i]; 1119 vi = aj + ai[i]; 1120 nz = a->diag[i] - ai[i]; 1121 s = t + bs*i; 1122 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1123 while (nz--) { 1124 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1125 v += bs2; 1126 } 1127 } 1128 /* backward solve the upper triangular */ 1129 ls = a->solve_work + A->n; 1130 for (i=n-1; i>=0; i--){ 1131 v = aa + bs2*(a->diag[i] + 1); 1132 vi = aj + a->diag[i] + 1; 1133 nz = ai[i+1] - a->diag[i] - 1; 1134 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1135 while (nz--) { 1136 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1137 v += bs2; 1138 } 1139 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1140 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1141 } 1142 1143 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1144 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1145 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1146 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1147 PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 1148 PetscFunctionReturn(0); 1149 } 1150 1151 #undef __FUNCT__ 1152 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1153 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1154 { 1155 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1156 IS iscol=a->col,isrow=a->row; 1157 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1158 int *diag = a->diag; 1159 MatScalar *aa=a->a,*v; 1160 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1161 PetscScalar *x,*b,*t; 1162 1163 PetscFunctionBegin; 1164 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1165 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1166 t = a->solve_work; 1167 1168 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1169 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1170 1171 /* forward solve the lower triangular */ 1172 idx = 7*(*r++); 1173 t[0] = b[idx]; t[1] = b[1+idx]; 1174 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1175 t[5] = b[5+idx]; t[6] = b[6+idx]; 1176 1177 for (i=1; i<n; i++) { 1178 v = aa + 49*ai[i]; 1179 vi = aj + ai[i]; 1180 nz = diag[i] - ai[i]; 1181 idx = 7*(*r++); 1182 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1183 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1184 while (nz--) { 1185 idx = 7*(*vi++); 1186 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1187 x4 = t[3+idx];x5 = t[4+idx]; 1188 x6 = t[5+idx];x7 = t[6+idx]; 1189 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1190 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1191 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1192 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1193 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1194 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1195 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1196 v += 49; 1197 } 1198 idx = 7*i; 1199 t[idx] = s1;t[1+idx] = s2; 1200 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1201 t[5+idx] = s6;t[6+idx] = s7; 1202 } 1203 /* backward solve the upper triangular */ 1204 for (i=n-1; i>=0; i--){ 1205 v = aa + 49*diag[i] + 49; 1206 vi = aj + diag[i] + 1; 1207 nz = ai[i+1] - diag[i] - 1; 1208 idt = 7*i; 1209 s1 = t[idt]; s2 = t[1+idt]; 1210 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1211 s6 = t[5+idt];s7 = t[6+idt]; 1212 while (nz--) { 1213 idx = 7*(*vi++); 1214 x1 = t[idx]; x2 = t[1+idx]; 1215 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1216 x6 = t[5+idx]; x7 = t[6+idx]; 1217 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1218 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1219 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1220 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1221 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1222 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1223 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1224 v += 49; 1225 } 1226 idc = 7*(*c--); 1227 v = aa + 49*diag[i]; 1228 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1229 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1230 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1231 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1232 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1233 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1234 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1235 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1236 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1237 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1238 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1239 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1240 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1241 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1242 } 1243 1244 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1245 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1246 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1247 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1248 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1249 PetscFunctionReturn(0); 1250 } 1251 1252 #undef __FUNCT__ 1253 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1254 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1255 { 1256 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1257 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1258 int ierr,*diag = a->diag,jdx; 1259 MatScalar *aa=a->a,*v; 1260 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1261 1262 PetscFunctionBegin; 1263 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1264 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1265 /* forward solve the lower triangular */ 1266 idx = 0; 1267 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1268 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1269 x[6] = b[6+idx]; 1270 for (i=1; i<n; i++) { 1271 v = aa + 49*ai[i]; 1272 vi = aj + ai[i]; 1273 nz = diag[i] - ai[i]; 1274 idx = 7*i; 1275 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1276 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1277 s7 = b[6+idx]; 1278 while (nz--) { 1279 jdx = 7*(*vi++); 1280 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1281 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1282 x7 = x[6+jdx]; 1283 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1284 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1285 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1286 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1287 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1288 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1289 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1290 v += 49; 1291 } 1292 x[idx] = s1; 1293 x[1+idx] = s2; 1294 x[2+idx] = s3; 1295 x[3+idx] = s4; 1296 x[4+idx] = s5; 1297 x[5+idx] = s6; 1298 x[6+idx] = s7; 1299 } 1300 /* backward solve the upper triangular */ 1301 for (i=n-1; i>=0; i--){ 1302 v = aa + 49*diag[i] + 49; 1303 vi = aj + diag[i] + 1; 1304 nz = ai[i+1] - diag[i] - 1; 1305 idt = 7*i; 1306 s1 = x[idt]; s2 = x[1+idt]; 1307 s3 = x[2+idt]; s4 = x[3+idt]; 1308 s5 = x[4+idt]; s6 = x[5+idt]; 1309 s7 = x[6+idt]; 1310 while (nz--) { 1311 idx = 7*(*vi++); 1312 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1313 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1314 x7 = x[6+idx]; 1315 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1316 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1317 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1318 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1319 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1320 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1321 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1322 v += 49; 1323 } 1324 v = aa + 49*diag[i]; 1325 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1326 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1327 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1328 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1329 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1330 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1331 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1332 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1333 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1334 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1335 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1336 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1337 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1338 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1339 } 1340 1341 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1342 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1343 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1344 PetscFunctionReturn(0); 1345 } 1346 1347 #undef __FUNCT__ 1348 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1349 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1350 { 1351 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1352 IS iscol=a->col,isrow=a->row; 1353 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1354 int *diag = a->diag; 1355 MatScalar *aa=a->a,*v; 1356 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1357 1358 PetscFunctionBegin; 1359 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1360 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1361 t = a->solve_work; 1362 1363 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1364 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1365 1366 /* forward solve the lower triangular */ 1367 idx = 6*(*r++); 1368 t[0] = b[idx]; t[1] = b[1+idx]; 1369 t[2] = b[2+idx]; t[3] = b[3+idx]; 1370 t[4] = b[4+idx]; t[5] = b[5+idx]; 1371 for (i=1; i<n; i++) { 1372 v = aa + 36*ai[i]; 1373 vi = aj + ai[i]; 1374 nz = diag[i] - ai[i]; 1375 idx = 6*(*r++); 1376 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1377 s5 = b[4+idx]; s6 = b[5+idx]; 1378 while (nz--) { 1379 idx = 6*(*vi++); 1380 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1381 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1382 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1383 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1384 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1385 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1386 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1387 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1388 v += 36; 1389 } 1390 idx = 6*i; 1391 t[idx] = s1;t[1+idx] = s2; 1392 t[2+idx] = s3;t[3+idx] = s4; 1393 t[4+idx] = s5;t[5+idx] = s6; 1394 } 1395 /* backward solve the upper triangular */ 1396 for (i=n-1; i>=0; i--){ 1397 v = aa + 36*diag[i] + 36; 1398 vi = aj + diag[i] + 1; 1399 nz = ai[i+1] - diag[i] - 1; 1400 idt = 6*i; 1401 s1 = t[idt]; s2 = t[1+idt]; 1402 s3 = t[2+idt];s4 = t[3+idt]; 1403 s5 = t[4+idt];s6 = t[5+idt]; 1404 while (nz--) { 1405 idx = 6*(*vi++); 1406 x1 = t[idx]; x2 = t[1+idx]; 1407 x3 = t[2+idx]; x4 = t[3+idx]; 1408 x5 = t[4+idx]; x6 = t[5+idx]; 1409 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1410 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1411 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1412 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1413 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1414 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1415 v += 36; 1416 } 1417 idc = 6*(*c--); 1418 v = aa + 36*diag[i]; 1419 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1420 v[18]*s4+v[24]*s5+v[30]*s6; 1421 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1422 v[19]*s4+v[25]*s5+v[31]*s6; 1423 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1424 v[20]*s4+v[26]*s5+v[32]*s6; 1425 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1426 v[21]*s4+v[27]*s5+v[33]*s6; 1427 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1428 v[22]*s4+v[28]*s5+v[34]*s6; 1429 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1430 v[23]*s4+v[29]*s5+v[35]*s6; 1431 } 1432 1433 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1434 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1435 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1436 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1437 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1438 PetscFunctionReturn(0); 1439 } 1440 1441 #undef __FUNCT__ 1442 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1443 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1444 { 1445 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1446 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1447 int ierr,*diag = a->diag,jdx; 1448 MatScalar *aa=a->a,*v; 1449 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1450 1451 PetscFunctionBegin; 1452 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1453 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1454 /* forward solve the lower triangular */ 1455 idx = 0; 1456 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1457 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1458 for (i=1; i<n; i++) { 1459 v = aa + 36*ai[i]; 1460 vi = aj + ai[i]; 1461 nz = diag[i] - ai[i]; 1462 idx = 6*i; 1463 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1464 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1465 while (nz--) { 1466 jdx = 6*(*vi++); 1467 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1468 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1469 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1470 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1471 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1472 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1473 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1474 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1475 v += 36; 1476 } 1477 x[idx] = s1; 1478 x[1+idx] = s2; 1479 x[2+idx] = s3; 1480 x[3+idx] = s4; 1481 x[4+idx] = s5; 1482 x[5+idx] = s6; 1483 } 1484 /* backward solve the upper triangular */ 1485 for (i=n-1; i>=0; i--){ 1486 v = aa + 36*diag[i] + 36; 1487 vi = aj + diag[i] + 1; 1488 nz = ai[i+1] - diag[i] - 1; 1489 idt = 6*i; 1490 s1 = x[idt]; s2 = x[1+idt]; 1491 s3 = x[2+idt]; s4 = x[3+idt]; 1492 s5 = x[4+idt]; s6 = x[5+idt]; 1493 while (nz--) { 1494 idx = 6*(*vi++); 1495 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1496 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1497 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1498 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1499 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1500 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1501 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1502 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1503 v += 36; 1504 } 1505 v = aa + 36*diag[i]; 1506 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1507 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1508 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1509 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1510 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1511 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1512 } 1513 1514 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1515 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1516 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1517 PetscFunctionReturn(0); 1518 } 1519 1520 #undef __FUNCT__ 1521 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1522 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1523 { 1524 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1525 IS iscol=a->col,isrow=a->row; 1526 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1527 int *diag = a->diag; 1528 MatScalar *aa=a->a,*v; 1529 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1530 1531 PetscFunctionBegin; 1532 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1533 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1534 t = a->solve_work; 1535 1536 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1537 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1538 1539 /* forward solve the lower triangular */ 1540 idx = 5*(*r++); 1541 t[0] = b[idx]; t[1] = b[1+idx]; 1542 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1543 for (i=1; i<n; i++) { 1544 v = aa + 25*ai[i]; 1545 vi = aj + ai[i]; 1546 nz = diag[i] - ai[i]; 1547 idx = 5*(*r++); 1548 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1549 s5 = b[4+idx]; 1550 while (nz--) { 1551 idx = 5*(*vi++); 1552 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1553 x4 = t[3+idx];x5 = t[4+idx]; 1554 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1555 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1556 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1557 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1558 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1559 v += 25; 1560 } 1561 idx = 5*i; 1562 t[idx] = s1;t[1+idx] = s2; 1563 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1564 } 1565 /* backward solve the upper triangular */ 1566 for (i=n-1; i>=0; i--){ 1567 v = aa + 25*diag[i] + 25; 1568 vi = aj + diag[i] + 1; 1569 nz = ai[i+1] - diag[i] - 1; 1570 idt = 5*i; 1571 s1 = t[idt]; s2 = t[1+idt]; 1572 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1573 while (nz--) { 1574 idx = 5*(*vi++); 1575 x1 = t[idx]; x2 = t[1+idx]; 1576 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1577 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1578 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1579 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1580 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1581 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1582 v += 25; 1583 } 1584 idc = 5*(*c--); 1585 v = aa + 25*diag[i]; 1586 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1587 v[15]*s4+v[20]*s5; 1588 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1589 v[16]*s4+v[21]*s5; 1590 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1591 v[17]*s4+v[22]*s5; 1592 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1593 v[18]*s4+v[23]*s5; 1594 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1595 v[19]*s4+v[24]*s5; 1596 } 1597 1598 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1599 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1600 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1601 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1602 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1603 PetscFunctionReturn(0); 1604 } 1605 1606 #undef __FUNCT__ 1607 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1608 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1609 { 1610 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1611 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1612 int ierr,*diag = a->diag,jdx; 1613 MatScalar *aa=a->a,*v; 1614 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1615 1616 PetscFunctionBegin; 1617 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1618 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1619 /* forward solve the lower triangular */ 1620 idx = 0; 1621 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1622 for (i=1; i<n; i++) { 1623 v = aa + 25*ai[i]; 1624 vi = aj + ai[i]; 1625 nz = diag[i] - ai[i]; 1626 idx = 5*i; 1627 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1628 while (nz--) { 1629 jdx = 5*(*vi++); 1630 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1631 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1632 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1633 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1634 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1635 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1636 v += 25; 1637 } 1638 x[idx] = s1; 1639 x[1+idx] = s2; 1640 x[2+idx] = s3; 1641 x[3+idx] = s4; 1642 x[4+idx] = s5; 1643 } 1644 /* backward solve the upper triangular */ 1645 for (i=n-1; i>=0; i--){ 1646 v = aa + 25*diag[i] + 25; 1647 vi = aj + diag[i] + 1; 1648 nz = ai[i+1] - diag[i] - 1; 1649 idt = 5*i; 1650 s1 = x[idt]; s2 = x[1+idt]; 1651 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1652 while (nz--) { 1653 idx = 5*(*vi++); 1654 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1655 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1656 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1657 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1658 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1659 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1660 v += 25; 1661 } 1662 v = aa + 25*diag[i]; 1663 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1664 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1665 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1666 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1667 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1668 } 1669 1670 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1671 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1672 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1673 PetscFunctionReturn(0); 1674 } 1675 1676 #undef __FUNCT__ 1677 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1678 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1679 { 1680 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1681 IS iscol=a->col,isrow=a->row; 1682 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1683 int *diag = a->diag; 1684 MatScalar *aa=a->a,*v; 1685 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1686 1687 PetscFunctionBegin; 1688 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1689 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1690 t = a->solve_work; 1691 1692 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1693 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1694 1695 /* forward solve the lower triangular */ 1696 idx = 4*(*r++); 1697 t[0] = b[idx]; t[1] = b[1+idx]; 1698 t[2] = b[2+idx]; t[3] = b[3+idx]; 1699 for (i=1; i<n; i++) { 1700 v = aa + 16*ai[i]; 1701 vi = aj + ai[i]; 1702 nz = diag[i] - ai[i]; 1703 idx = 4*(*r++); 1704 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1705 while (nz--) { 1706 idx = 4*(*vi++); 1707 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1708 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1709 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1710 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1711 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1712 v += 16; 1713 } 1714 idx = 4*i; 1715 t[idx] = s1;t[1+idx] = s2; 1716 t[2+idx] = s3;t[3+idx] = s4; 1717 } 1718 /* backward solve the upper triangular */ 1719 for (i=n-1; i>=0; i--){ 1720 v = aa + 16*diag[i] + 16; 1721 vi = aj + diag[i] + 1; 1722 nz = ai[i+1] - diag[i] - 1; 1723 idt = 4*i; 1724 s1 = t[idt]; s2 = t[1+idt]; 1725 s3 = t[2+idt];s4 = t[3+idt]; 1726 while (nz--) { 1727 idx = 4*(*vi++); 1728 x1 = t[idx]; x2 = t[1+idx]; 1729 x3 = t[2+idx]; x4 = t[3+idx]; 1730 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1731 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1732 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1733 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1734 v += 16; 1735 } 1736 idc = 4*(*c--); 1737 v = aa + 16*diag[i]; 1738 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1739 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1740 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1741 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1742 } 1743 1744 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1745 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1746 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1747 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1748 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1749 PetscFunctionReturn(0); 1750 } 1751 1752 #undef __FUNCT__ 1753 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1754 int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1755 { 1756 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1757 IS iscol=a->col,isrow=a->row; 1758 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1759 int *diag = a->diag; 1760 MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1761 PetscScalar *x,*b; 1762 1763 PetscFunctionBegin; 1764 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1765 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1766 t = (MatScalar *)a->solve_work; 1767 1768 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1769 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1770 1771 /* forward solve the lower triangular */ 1772 idx = 4*(*r++); 1773 t[0] = (MatScalar)b[idx]; 1774 t[1] = (MatScalar)b[1+idx]; 1775 t[2] = (MatScalar)b[2+idx]; 1776 t[3] = (MatScalar)b[3+idx]; 1777 for (i=1; i<n; i++) { 1778 v = aa + 16*ai[i]; 1779 vi = aj + ai[i]; 1780 nz = diag[i] - ai[i]; 1781 idx = 4*(*r++); 1782 s1 = (MatScalar)b[idx]; 1783 s2 = (MatScalar)b[1+idx]; 1784 s3 = (MatScalar)b[2+idx]; 1785 s4 = (MatScalar)b[3+idx]; 1786 while (nz--) { 1787 idx = 4*(*vi++); 1788 x1 = t[idx]; 1789 x2 = t[1+idx]; 1790 x3 = t[2+idx]; 1791 x4 = t[3+idx]; 1792 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1793 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1794 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1795 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1796 v += 16; 1797 } 1798 idx = 4*i; 1799 t[idx] = s1; 1800 t[1+idx] = s2; 1801 t[2+idx] = s3; 1802 t[3+idx] = s4; 1803 } 1804 /* backward solve the upper triangular */ 1805 for (i=n-1; i>=0; i--){ 1806 v = aa + 16*diag[i] + 16; 1807 vi = aj + diag[i] + 1; 1808 nz = ai[i+1] - diag[i] - 1; 1809 idt = 4*i; 1810 s1 = t[idt]; 1811 s2 = t[1+idt]; 1812 s3 = t[2+idt]; 1813 s4 = t[3+idt]; 1814 while (nz--) { 1815 idx = 4*(*vi++); 1816 x1 = t[idx]; 1817 x2 = t[1+idx]; 1818 x3 = t[2+idx]; 1819 x4 = t[3+idx]; 1820 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1821 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1822 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1823 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1824 v += 16; 1825 } 1826 idc = 4*(*c--); 1827 v = aa + 16*diag[i]; 1828 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1829 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1830 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1831 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1832 x[idc] = (PetscScalar)t[idt]; 1833 x[1+idc] = (PetscScalar)t[1+idt]; 1834 x[2+idc] = (PetscScalar)t[2+idt]; 1835 x[3+idc] = (PetscScalar)t[3+idt]; 1836 } 1837 1838 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1839 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1840 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1841 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1842 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1843 PetscFunctionReturn(0); 1844 } 1845 1846 #if defined (PETSC_HAVE_SSE) 1847 1848 #include PETSC_HAVE_SSE 1849 1850 #undef __FUNCT__ 1851 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1852 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1853 { 1854 /* 1855 Note: This code uses demotion of double 1856 to float when performing the mixed-mode computation. 1857 This may not be numerically reasonable for all applications. 1858 */ 1859 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1860 IS iscol=a->col,isrow=a->row; 1861 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1862 int *diag = a->diag,ai16; 1863 MatScalar *aa=a->a,*v; 1864 PetscScalar *x,*b,*t; 1865 1866 /* Make space in temp stack for 16 Byte Aligned arrays */ 1867 float ssealignedspace[11],*tmps,*tmpx; 1868 unsigned long offset; 1869 1870 PetscFunctionBegin; 1871 SSE_SCOPE_BEGIN; 1872 1873 offset = (unsigned long)ssealignedspace % 16; 1874 if (offset) offset = (16 - offset)/4; 1875 tmps = &ssealignedspace[offset]; 1876 tmpx = &ssealignedspace[offset+4]; 1877 PREFETCH_NTA(aa+16*ai[1]); 1878 1879 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1880 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1881 t = a->solve_work; 1882 1883 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1884 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1885 1886 /* forward solve the lower triangular */ 1887 idx = 4*(*r++); 1888 t[0] = b[idx]; t[1] = b[1+idx]; 1889 t[2] = b[2+idx]; t[3] = b[3+idx]; 1890 v = aa + 16*ai[1]; 1891 1892 for (i=1; i<n;) { 1893 PREFETCH_NTA(&v[8]); 1894 vi = aj + ai[i]; 1895 nz = diag[i] - ai[i]; 1896 idx = 4*(*r++); 1897 1898 /* Demote sum from double to float */ 1899 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1900 LOAD_PS(tmps,XMM7); 1901 1902 while (nz--) { 1903 PREFETCH_NTA(&v[16]); 1904 idx = 4*(*vi++); 1905 1906 /* Demote solution (so far) from double to float */ 1907 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1908 1909 /* 4x4 Matrix-Vector product with negative accumulation: */ 1910 SSE_INLINE_BEGIN_2(tmpx,v) 1911 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1912 1913 /* First Column */ 1914 SSE_COPY_PS(XMM0,XMM6) 1915 SSE_SHUFFLE(XMM0,XMM0,0x00) 1916 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1917 SSE_SUB_PS(XMM7,XMM0) 1918 1919 /* Second Column */ 1920 SSE_COPY_PS(XMM1,XMM6) 1921 SSE_SHUFFLE(XMM1,XMM1,0x55) 1922 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1923 SSE_SUB_PS(XMM7,XMM1) 1924 1925 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1926 1927 /* Third Column */ 1928 SSE_COPY_PS(XMM2,XMM6) 1929 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1930 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1931 SSE_SUB_PS(XMM7,XMM2) 1932 1933 /* Fourth Column */ 1934 SSE_COPY_PS(XMM3,XMM6) 1935 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1936 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1937 SSE_SUB_PS(XMM7,XMM3) 1938 SSE_INLINE_END_2 1939 1940 v += 16; 1941 } 1942 idx = 4*i; 1943 v = aa + 16*ai[++i]; 1944 PREFETCH_NTA(v); 1945 STORE_PS(tmps,XMM7); 1946 1947 /* Promote result from float to double */ 1948 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1949 } 1950 /* backward solve the upper triangular */ 1951 idt = 4*(n-1); 1952 ai16 = 16*diag[n-1]; 1953 v = aa + ai16 + 16; 1954 for (i=n-1; i>=0;){ 1955 PREFETCH_NTA(&v[8]); 1956 vi = aj + diag[i] + 1; 1957 nz = ai[i+1] - diag[i] - 1; 1958 1959 /* Demote accumulator from double to float */ 1960 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1961 LOAD_PS(tmps,XMM7); 1962 1963 while (nz--) { 1964 PREFETCH_NTA(&v[16]); 1965 idx = 4*(*vi++); 1966 1967 /* Demote solution (so far) from double to float */ 1968 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1969 1970 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1971 SSE_INLINE_BEGIN_2(tmpx,v) 1972 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1973 1974 /* First Column */ 1975 SSE_COPY_PS(XMM0,XMM6) 1976 SSE_SHUFFLE(XMM0,XMM0,0x00) 1977 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1978 SSE_SUB_PS(XMM7,XMM0) 1979 1980 /* Second Column */ 1981 SSE_COPY_PS(XMM1,XMM6) 1982 SSE_SHUFFLE(XMM1,XMM1,0x55) 1983 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1984 SSE_SUB_PS(XMM7,XMM1) 1985 1986 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1987 1988 /* Third Column */ 1989 SSE_COPY_PS(XMM2,XMM6) 1990 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1991 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1992 SSE_SUB_PS(XMM7,XMM2) 1993 1994 /* Fourth Column */ 1995 SSE_COPY_PS(XMM3,XMM6) 1996 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1997 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1998 SSE_SUB_PS(XMM7,XMM3) 1999 SSE_INLINE_END_2 2000 v += 16; 2001 } 2002 v = aa + ai16; 2003 ai16 = 16*diag[--i]; 2004 PREFETCH_NTA(aa+ai16+16); 2005 /* 2006 Scale the result by the diagonal 4x4 block, 2007 which was inverted as part of the factorization 2008 */ 2009 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2010 /* First Column */ 2011 SSE_COPY_PS(XMM0,XMM7) 2012 SSE_SHUFFLE(XMM0,XMM0,0x00) 2013 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2014 2015 /* Second Column */ 2016 SSE_COPY_PS(XMM1,XMM7) 2017 SSE_SHUFFLE(XMM1,XMM1,0x55) 2018 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2019 SSE_ADD_PS(XMM0,XMM1) 2020 2021 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2022 2023 /* Third Column */ 2024 SSE_COPY_PS(XMM2,XMM7) 2025 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2026 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2027 SSE_ADD_PS(XMM0,XMM2) 2028 2029 /* Fourth Column */ 2030 SSE_COPY_PS(XMM3,XMM7) 2031 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2032 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2033 SSE_ADD_PS(XMM0,XMM3) 2034 2035 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2036 SSE_INLINE_END_3 2037 2038 /* Promote solution from float to double */ 2039 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2040 2041 /* Apply reordering to t and stream into x. */ 2042 /* This way, x doesn't pollute the cache. */ 2043 /* Be careful with size: 2 doubles = 4 floats! */ 2044 idc = 4*(*c--); 2045 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2046 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2047 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2048 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2049 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2050 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2051 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2052 SSE_INLINE_END_2 2053 v = aa + ai16 + 16; 2054 idt -= 4; 2055 } 2056 2057 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2058 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2059 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2060 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2061 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2062 SSE_SCOPE_END; 2063 PetscFunctionReturn(0); 2064 } 2065 2066 #endif 2067 2068 2069 /* 2070 Special case where the matrix was ILU(0) factored in the natural 2071 ordering. This eliminates the need for the column and row permutation. 2072 */ 2073 #undef __FUNCT__ 2074 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2075 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2076 { 2077 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2078 int n=a->mbs,*ai=a->i,*aj=a->j; 2079 int ierr,*diag = a->diag; 2080 MatScalar *aa=a->a; 2081 PetscScalar *x,*b; 2082 2083 PetscFunctionBegin; 2084 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2085 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2086 2087 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2088 { 2089 static PetscScalar w[2000]; /* very BAD need to fix */ 2090 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2091 } 2092 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2093 { 2094 static PetscScalar w[2000]; /* very BAD need to fix */ 2095 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2096 } 2097 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2098 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2099 #else 2100 { 2101 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2102 MatScalar *v; 2103 int jdx,idt,idx,nz,*vi,i,ai16; 2104 2105 /* forward solve the lower triangular */ 2106 idx = 0; 2107 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2108 for (i=1; i<n; i++) { 2109 v = aa + 16*ai[i]; 2110 vi = aj + ai[i]; 2111 nz = diag[i] - ai[i]; 2112 idx += 4; 2113 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2114 while (nz--) { 2115 jdx = 4*(*vi++); 2116 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2117 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2118 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2119 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2120 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2121 v += 16; 2122 } 2123 x[idx] = s1; 2124 x[1+idx] = s2; 2125 x[2+idx] = s3; 2126 x[3+idx] = s4; 2127 } 2128 /* backward solve the upper triangular */ 2129 idt = 4*(n-1); 2130 for (i=n-1; i>=0; i--){ 2131 ai16 = 16*diag[i]; 2132 v = aa + ai16 + 16; 2133 vi = aj + diag[i] + 1; 2134 nz = ai[i+1] - diag[i] - 1; 2135 s1 = x[idt]; s2 = x[1+idt]; 2136 s3 = x[2+idt];s4 = x[3+idt]; 2137 while (nz--) { 2138 idx = 4*(*vi++); 2139 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2140 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2141 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2142 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2143 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2144 v += 16; 2145 } 2146 v = aa + ai16; 2147 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2148 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2149 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2150 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2151 idt -= 4; 2152 } 2153 } 2154 #endif 2155 2156 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2157 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2158 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2159 PetscFunctionReturn(0); 2160 } 2161 2162 #undef __FUNCT__ 2163 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2164 int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2165 { 2166 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2167 int n=a->mbs,*ai=a->i,*aj=a->j; 2168 int ierr,*diag = a->diag; 2169 MatScalar *aa=a->a; 2170 PetscScalar *x,*b; 2171 2172 PetscFunctionBegin; 2173 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2174 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2175 2176 { 2177 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2178 MatScalar *v,*t=(MatScalar *)x; 2179 int jdx,idt,idx,nz,*vi,i,ai16; 2180 2181 /* forward solve the lower triangular */ 2182 idx = 0; 2183 t[0] = (MatScalar)b[0]; 2184 t[1] = (MatScalar)b[1]; 2185 t[2] = (MatScalar)b[2]; 2186 t[3] = (MatScalar)b[3]; 2187 for (i=1; i<n; i++) { 2188 v = aa + 16*ai[i]; 2189 vi = aj + ai[i]; 2190 nz = diag[i] - ai[i]; 2191 idx += 4; 2192 s1 = (MatScalar)b[idx]; 2193 s2 = (MatScalar)b[1+idx]; 2194 s3 = (MatScalar)b[2+idx]; 2195 s4 = (MatScalar)b[3+idx]; 2196 while (nz--) { 2197 jdx = 4*(*vi++); 2198 x1 = t[jdx]; 2199 x2 = t[1+jdx]; 2200 x3 = t[2+jdx]; 2201 x4 = t[3+jdx]; 2202 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2203 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2204 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2205 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2206 v += 16; 2207 } 2208 t[idx] = s1; 2209 t[1+idx] = s2; 2210 t[2+idx] = s3; 2211 t[3+idx] = s4; 2212 } 2213 /* backward solve the upper triangular */ 2214 idt = 4*(n-1); 2215 for (i=n-1; i>=0; i--){ 2216 ai16 = 16*diag[i]; 2217 v = aa + ai16 + 16; 2218 vi = aj + diag[i] + 1; 2219 nz = ai[i+1] - diag[i] - 1; 2220 s1 = t[idt]; 2221 s2 = t[1+idt]; 2222 s3 = t[2+idt]; 2223 s4 = t[3+idt]; 2224 while (nz--) { 2225 idx = 4*(*vi++); 2226 x1 = (MatScalar)x[idx]; 2227 x2 = (MatScalar)x[1+idx]; 2228 x3 = (MatScalar)x[2+idx]; 2229 x4 = (MatScalar)x[3+idx]; 2230 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2231 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2232 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2233 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2234 v += 16; 2235 } 2236 v = aa + ai16; 2237 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2238 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2239 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2240 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2241 idt -= 4; 2242 } 2243 } 2244 2245 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2246 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2247 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2248 PetscFunctionReturn(0); 2249 } 2250 2251 #if defined (PETSC_HAVE_SSE) 2252 2253 #include PETSC_HAVE_SSE 2254 2255 #undef __FUNCT__ 2256 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2257 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2258 { 2259 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2260 int n=a->mbs,*ai=a->i,*aj=a->j; 2261 int ierr,*diag = a->diag; 2262 MatScalar *aa=a->a; 2263 PetscScalar *x,*b; 2264 2265 PetscFunctionBegin; 2266 SSE_SCOPE_BEGIN; 2267 /* 2268 Note: This code currently uses demotion of double 2269 to float when performing the mixed-mode computation. 2270 This may not be numerically reasonable for all applications. 2271 */ 2272 PREFETCH_NTA(aa+16*ai[1]); 2273 2274 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2275 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2276 { 2277 MatScalar *v; 2278 int jdx,idt,idx,nz,*vi,i,ai16; 2279 2280 /* Make space in temp stack for 16 Byte Aligned arrays */ 2281 float ssealignedspace[11],*tmps,*tmpx; 2282 unsigned long offset = (unsigned long)ssealignedspace % 16; 2283 if (offset) offset = (16 - offset)/4; 2284 tmps = &ssealignedspace[offset]; 2285 tmpx = &ssealignedspace[offset+4]; 2286 2287 /* forward solve the lower triangular */ 2288 idx = 0; 2289 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2290 v = aa + 16*ai[1]; 2291 2292 for (i=1; i<n;) { 2293 PREFETCH_NTA(&v[8]); 2294 vi = aj + ai[i]; 2295 nz = diag[i] - ai[i]; 2296 idx += 4; 2297 2298 /* Demote sum from double to float */ 2299 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2300 LOAD_PS(tmps,XMM7); 2301 2302 while (nz--) { 2303 PREFETCH_NTA(&v[16]); 2304 jdx = 4*(*vi++); 2305 2306 /* Demote solution (so far) from double to float */ 2307 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]); 2308 2309 /* 4x4 Matrix-Vector product with negative accumulation: */ 2310 SSE_INLINE_BEGIN_2(tmpx,v) 2311 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2312 2313 /* First Column */ 2314 SSE_COPY_PS(XMM0,XMM6) 2315 SSE_SHUFFLE(XMM0,XMM0,0x00) 2316 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2317 SSE_SUB_PS(XMM7,XMM0) 2318 2319 /* Second Column */ 2320 SSE_COPY_PS(XMM1,XMM6) 2321 SSE_SHUFFLE(XMM1,XMM1,0x55) 2322 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2323 SSE_SUB_PS(XMM7,XMM1) 2324 2325 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2326 2327 /* Third Column */ 2328 SSE_COPY_PS(XMM2,XMM6) 2329 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2330 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2331 SSE_SUB_PS(XMM7,XMM2) 2332 2333 /* Fourth Column */ 2334 SSE_COPY_PS(XMM3,XMM6) 2335 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2336 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2337 SSE_SUB_PS(XMM7,XMM3) 2338 SSE_INLINE_END_2 2339 2340 v += 16; 2341 } 2342 v = aa + 16*ai[++i]; 2343 PREFETCH_NTA(v); 2344 STORE_PS(tmps,XMM7); 2345 2346 /* Promote result from float to double */ 2347 CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps); 2348 } 2349 /* backward solve the upper triangular */ 2350 idt = 4*(n-1); 2351 ai16 = 16*diag[n-1]; 2352 v = aa + ai16 + 16; 2353 for (i=n-1; i>=0;){ 2354 PREFETCH_NTA(&v[8]); 2355 vi = aj + diag[i] + 1; 2356 nz = ai[i+1] - diag[i] - 1; 2357 2358 /* Demote accumulator from double to float */ 2359 CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]); 2360 LOAD_PS(tmps,XMM7); 2361 2362 while (nz--) { 2363 PREFETCH_NTA(&v[16]); 2364 idx = 4*(*vi++); 2365 2366 /* Demote solution (so far) from double to float */ 2367 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2368 2369 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2370 SSE_INLINE_BEGIN_2(tmpx,v) 2371 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2372 2373 /* First Column */ 2374 SSE_COPY_PS(XMM0,XMM6) 2375 SSE_SHUFFLE(XMM0,XMM0,0x00) 2376 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2377 SSE_SUB_PS(XMM7,XMM0) 2378 2379 /* Second Column */ 2380 SSE_COPY_PS(XMM1,XMM6) 2381 SSE_SHUFFLE(XMM1,XMM1,0x55) 2382 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2383 SSE_SUB_PS(XMM7,XMM1) 2384 2385 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2386 2387 /* Third Column */ 2388 SSE_COPY_PS(XMM2,XMM6) 2389 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2390 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2391 SSE_SUB_PS(XMM7,XMM2) 2392 2393 /* Fourth Column */ 2394 SSE_COPY_PS(XMM3,XMM6) 2395 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2396 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2397 SSE_SUB_PS(XMM7,XMM3) 2398 SSE_INLINE_END_2 2399 v += 16; 2400 } 2401 v = aa + ai16; 2402 ai16 = 16*diag[--i]; 2403 PREFETCH_NTA(aa+ai16+16); 2404 /* 2405 Scale the result by the diagonal 4x4 block, 2406 which was inverted as part of the factorization 2407 */ 2408 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2409 /* First Column */ 2410 SSE_COPY_PS(XMM0,XMM7) 2411 SSE_SHUFFLE(XMM0,XMM0,0x00) 2412 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2413 2414 /* Second Column */ 2415 SSE_COPY_PS(XMM1,XMM7) 2416 SSE_SHUFFLE(XMM1,XMM1,0x55) 2417 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2418 SSE_ADD_PS(XMM0,XMM1) 2419 2420 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2421 2422 /* Third Column */ 2423 SSE_COPY_PS(XMM2,XMM7) 2424 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2425 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2426 SSE_ADD_PS(XMM0,XMM2) 2427 2428 /* Fourth Column */ 2429 SSE_COPY_PS(XMM3,XMM7) 2430 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2431 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2432 SSE_ADD_PS(XMM0,XMM3) 2433 2434 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2435 SSE_INLINE_END_3 2436 2437 /* Promote solution from float to double */ 2438 CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps); 2439 2440 v = aa + ai16 + 16; 2441 idt -= 4; 2442 } 2443 } 2444 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2445 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2446 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2447 SSE_SCOPE_END; 2448 PetscFunctionReturn(0); 2449 } 2450 2451 #endif 2452 #undef __FUNCT__ 2453 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2454 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2455 { 2456 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2457 IS iscol=a->col,isrow=a->row; 2458 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2459 int *diag = a->diag; 2460 MatScalar *aa=a->a,*v; 2461 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2462 2463 PetscFunctionBegin; 2464 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2465 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2466 t = a->solve_work; 2467 2468 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2469 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2470 2471 /* forward solve the lower triangular */ 2472 idx = 3*(*r++); 2473 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2474 for (i=1; i<n; i++) { 2475 v = aa + 9*ai[i]; 2476 vi = aj + ai[i]; 2477 nz = diag[i] - ai[i]; 2478 idx = 3*(*r++); 2479 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2480 while (nz--) { 2481 idx = 3*(*vi++); 2482 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2483 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2484 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2485 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2486 v += 9; 2487 } 2488 idx = 3*i; 2489 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2490 } 2491 /* backward solve the upper triangular */ 2492 for (i=n-1; i>=0; i--){ 2493 v = aa + 9*diag[i] + 9; 2494 vi = aj + diag[i] + 1; 2495 nz = ai[i+1] - diag[i] - 1; 2496 idt = 3*i; 2497 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2498 while (nz--) { 2499 idx = 3*(*vi++); 2500 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2501 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2502 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2503 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2504 v += 9; 2505 } 2506 idc = 3*(*c--); 2507 v = aa + 9*diag[i]; 2508 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2509 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2510 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2511 } 2512 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2513 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2514 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2515 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2516 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2517 PetscFunctionReturn(0); 2518 } 2519 2520 /* 2521 Special case where the matrix was ILU(0) factored in the natural 2522 ordering. This eliminates the need for the column and row permutation. 2523 */ 2524 #undef __FUNCT__ 2525 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2526 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2527 { 2528 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2529 int n=a->mbs,*ai=a->i,*aj=a->j; 2530 int ierr,*diag = a->diag; 2531 MatScalar *aa=a->a,*v; 2532 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2533 int jdx,idt,idx,nz,*vi,i; 2534 2535 PetscFunctionBegin; 2536 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2537 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2538 2539 2540 /* forward solve the lower triangular */ 2541 idx = 0; 2542 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2543 for (i=1; i<n; i++) { 2544 v = aa + 9*ai[i]; 2545 vi = aj + ai[i]; 2546 nz = diag[i] - ai[i]; 2547 idx += 3; 2548 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2549 while (nz--) { 2550 jdx = 3*(*vi++); 2551 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2552 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2553 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2554 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2555 v += 9; 2556 } 2557 x[idx] = s1; 2558 x[1+idx] = s2; 2559 x[2+idx] = s3; 2560 } 2561 /* backward solve the upper triangular */ 2562 for (i=n-1; i>=0; i--){ 2563 v = aa + 9*diag[i] + 9; 2564 vi = aj + diag[i] + 1; 2565 nz = ai[i+1] - diag[i] - 1; 2566 idt = 3*i; 2567 s1 = x[idt]; s2 = x[1+idt]; 2568 s3 = x[2+idt]; 2569 while (nz--) { 2570 idx = 3*(*vi++); 2571 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2572 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2573 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2574 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2575 v += 9; 2576 } 2577 v = aa + 9*diag[i]; 2578 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2579 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2580 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2581 } 2582 2583 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2584 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2585 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2586 PetscFunctionReturn(0); 2587 } 2588 2589 #undef __FUNCT__ 2590 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2591 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2592 { 2593 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2594 IS iscol=a->col,isrow=a->row; 2595 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2596 int *diag = a->diag; 2597 MatScalar *aa=a->a,*v; 2598 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2599 2600 PetscFunctionBegin; 2601 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2602 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2603 t = a->solve_work; 2604 2605 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2606 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2607 2608 /* forward solve the lower triangular */ 2609 idx = 2*(*r++); 2610 t[0] = b[idx]; t[1] = b[1+idx]; 2611 for (i=1; i<n; i++) { 2612 v = aa + 4*ai[i]; 2613 vi = aj + ai[i]; 2614 nz = diag[i] - ai[i]; 2615 idx = 2*(*r++); 2616 s1 = b[idx]; s2 = b[1+idx]; 2617 while (nz--) { 2618 idx = 2*(*vi++); 2619 x1 = t[idx]; x2 = t[1+idx]; 2620 s1 -= v[0]*x1 + v[2]*x2; 2621 s2 -= v[1]*x1 + v[3]*x2; 2622 v += 4; 2623 } 2624 idx = 2*i; 2625 t[idx] = s1; t[1+idx] = s2; 2626 } 2627 /* backward solve the upper triangular */ 2628 for (i=n-1; i>=0; i--){ 2629 v = aa + 4*diag[i] + 4; 2630 vi = aj + diag[i] + 1; 2631 nz = ai[i+1] - diag[i] - 1; 2632 idt = 2*i; 2633 s1 = t[idt]; s2 = t[1+idt]; 2634 while (nz--) { 2635 idx = 2*(*vi++); 2636 x1 = t[idx]; x2 = t[1+idx]; 2637 s1 -= v[0]*x1 + v[2]*x2; 2638 s2 -= v[1]*x1 + v[3]*x2; 2639 v += 4; 2640 } 2641 idc = 2*(*c--); 2642 v = aa + 4*diag[i]; 2643 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2644 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2645 } 2646 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2647 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2648 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2649 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2650 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2651 PetscFunctionReturn(0); 2652 } 2653 2654 /* 2655 Special case where the matrix was ILU(0) factored in the natural 2656 ordering. This eliminates the need for the column and row permutation. 2657 */ 2658 #undef __FUNCT__ 2659 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2660 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2661 { 2662 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2663 int n=a->mbs,*ai=a->i,*aj=a->j; 2664 int ierr,*diag = a->diag; 2665 MatScalar *aa=a->a,*v; 2666 PetscScalar *x,*b,s1,s2,x1,x2; 2667 int jdx,idt,idx,nz,*vi,i; 2668 2669 PetscFunctionBegin; 2670 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2671 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2672 2673 /* forward solve the lower triangular */ 2674 idx = 0; 2675 x[0] = b[0]; x[1] = b[1]; 2676 for (i=1; i<n; i++) { 2677 v = aa + 4*ai[i]; 2678 vi = aj + ai[i]; 2679 nz = diag[i] - ai[i]; 2680 idx += 2; 2681 s1 = b[idx];s2 = b[1+idx]; 2682 while (nz--) { 2683 jdx = 2*(*vi++); 2684 x1 = x[jdx];x2 = x[1+jdx]; 2685 s1 -= v[0]*x1 + v[2]*x2; 2686 s2 -= v[1]*x1 + v[3]*x2; 2687 v += 4; 2688 } 2689 x[idx] = s1; 2690 x[1+idx] = s2; 2691 } 2692 /* backward solve the upper triangular */ 2693 for (i=n-1; i>=0; i--){ 2694 v = aa + 4*diag[i] + 4; 2695 vi = aj + diag[i] + 1; 2696 nz = ai[i+1] - diag[i] - 1; 2697 idt = 2*i; 2698 s1 = x[idt]; s2 = x[1+idt]; 2699 while (nz--) { 2700 idx = 2*(*vi++); 2701 x1 = x[idx]; x2 = x[1+idx]; 2702 s1 -= v[0]*x1 + v[2]*x2; 2703 s2 -= v[1]*x1 + v[3]*x2; 2704 v += 4; 2705 } 2706 v = aa + 4*diag[i]; 2707 x[idt] = v[0]*s1 + v[2]*s2; 2708 x[1+idt] = v[1]*s1 + v[3]*s2; 2709 } 2710 2711 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2712 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2713 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2714 PetscFunctionReturn(0); 2715 } 2716 2717 #undef __FUNCT__ 2718 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2719 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2720 { 2721 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2722 IS iscol=a->col,isrow=a->row; 2723 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2724 int *diag = a->diag; 2725 MatScalar *aa=a->a,*v; 2726 PetscScalar *x,*b,s1,*t; 2727 2728 PetscFunctionBegin; 2729 if (!n) PetscFunctionReturn(0); 2730 2731 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2732 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2733 t = a->solve_work; 2734 2735 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2736 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2737 2738 /* forward solve the lower triangular */ 2739 t[0] = b[*r++]; 2740 for (i=1; i<n; i++) { 2741 v = aa + ai[i]; 2742 vi = aj + ai[i]; 2743 nz = diag[i] - ai[i]; 2744 s1 = b[*r++]; 2745 while (nz--) { 2746 s1 -= (*v++)*t[*vi++]; 2747 } 2748 t[i] = s1; 2749 } 2750 /* backward solve the upper triangular */ 2751 for (i=n-1; i>=0; i--){ 2752 v = aa + diag[i] + 1; 2753 vi = aj + diag[i] + 1; 2754 nz = ai[i+1] - diag[i] - 1; 2755 s1 = t[i]; 2756 while (nz--) { 2757 s1 -= (*v++)*t[*vi++]; 2758 } 2759 x[*c--] = t[i] = aa[diag[i]]*s1; 2760 } 2761 2762 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2763 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2764 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2765 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2766 PetscLogFlops(2*1*(a->nz) - A->n); 2767 PetscFunctionReturn(0); 2768 } 2769 /* 2770 Special case where the matrix was ILU(0) factored in the natural 2771 ordering. This eliminates the need for the column and row permutation. 2772 */ 2773 #undef __FUNCT__ 2774 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 2775 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2776 { 2777 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2778 int n=a->mbs,*ai=a->i,*aj=a->j; 2779 int ierr,*diag = a->diag; 2780 MatScalar *aa=a->a; 2781 PetscScalar *x,*b; 2782 PetscScalar s1,x1; 2783 MatScalar *v; 2784 int jdx,idt,idx,nz,*vi,i; 2785 2786 PetscFunctionBegin; 2787 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2788 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2789 2790 /* forward solve the lower triangular */ 2791 idx = 0; 2792 x[0] = b[0]; 2793 for (i=1; i<n; i++) { 2794 v = aa + ai[i]; 2795 vi = aj + ai[i]; 2796 nz = diag[i] - ai[i]; 2797 idx += 1; 2798 s1 = b[idx]; 2799 while (nz--) { 2800 jdx = *vi++; 2801 x1 = x[jdx]; 2802 s1 -= v[0]*x1; 2803 v += 1; 2804 } 2805 x[idx] = s1; 2806 } 2807 /* backward solve the upper triangular */ 2808 for (i=n-1; i>=0; i--){ 2809 v = aa + diag[i] + 1; 2810 vi = aj + diag[i] + 1; 2811 nz = ai[i+1] - diag[i] - 1; 2812 idt = i; 2813 s1 = x[idt]; 2814 while (nz--) { 2815 idx = *vi++; 2816 x1 = x[idx]; 2817 s1 -= v[0]*x1; 2818 v += 1; 2819 } 2820 v = aa + diag[i]; 2821 x[idt] = v[0]*s1; 2822 } 2823 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2824 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2825 PetscLogFlops(2*(a->nz) - A->n); 2826 PetscFunctionReturn(0); 2827 } 2828 2829 /* ----------------------------------------------------------------*/ 2830 /* 2831 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 2832 except that the data structure of Mat_SeqAIJ is slightly different. 2833 Not a good example of code reuse. 2834 */ 2835 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 2836 2837 #undef __FUNCT__ 2838 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 2839 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact) 2840 { 2841 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 2842 IS isicol; 2843 int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 2844 int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 2845 int *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0; 2846 int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 2847 PetscTruth col_identity,row_identity; 2848 PetscReal f; 2849 2850 PetscFunctionBegin; 2851 f = info->fill; 2852 levels = (int)info->levels; 2853 diagonal_fill = (int)info->diagonal_fill; 2854 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 2855 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 2856 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 2857 2858 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 2859 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 2860 (*fact)->factor = FACTOR_LU; 2861 b = (Mat_SeqBAIJ*)(*fact)->data; 2862 if (!b->diag) { 2863 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2864 } 2865 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2866 b->row = isrow; 2867 b->col = iscol; 2868 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2869 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2870 b->icol = isicol; 2871 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 2872 ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2873 } else { /* general case perform the symbolic factorization */ 2874 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 2875 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 2876 2877 /* get new row pointers */ 2878 ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 2879 ainew[0] = 0; 2880 /* don't know how many column pointers are needed so estimate */ 2881 jmax = (int)(f*ai[n] + 1); 2882 ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 2883 /* ajfill is level of fill for each fill entry */ 2884 ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 2885 /* fill is a linked list of nonzeros in active row */ 2886 ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 2887 /* im is level for each filled value */ 2888 ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 2889 /* dloc is location of diagonal in factor */ 2890 ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 2891 dloc[0] = 0; 2892 for (prow=0; prow<n; prow++) { 2893 2894 /* copy prow into linked list */ 2895 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 2896 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 2897 xi = aj + ai[r[prow]]; 2898 fill[n] = n; 2899 fill[prow] = -1; /* marker for diagonal entry */ 2900 while (nz--) { 2901 fm = n; 2902 idx = ic[*xi++]; 2903 do { 2904 m = fm; 2905 fm = fill[m]; 2906 } while (fm < idx); 2907 fill[m] = idx; 2908 fill[idx] = fm; 2909 im[idx] = 0; 2910 } 2911 2912 /* make sure diagonal entry is included */ 2913 if (diagonal_fill && fill[prow] == -1) { 2914 fm = n; 2915 while (fill[fm] < prow) fm = fill[fm]; 2916 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 2917 fill[fm] = prow; 2918 im[prow] = 0; 2919 nzf++; 2920 dcount++; 2921 } 2922 2923 nzi = 0; 2924 row = fill[n]; 2925 while (row < prow) { 2926 incrlev = im[row] + 1; 2927 nz = dloc[row]; 2928 xi = ajnew + ainew[row] + nz + 1; 2929 flev = ajfill + ainew[row] + nz + 1; 2930 nnz = ainew[row+1] - ainew[row] - nz - 1; 2931 fm = row; 2932 while (nnz-- > 0) { 2933 idx = *xi++; 2934 if (*flev + incrlev > levels) { 2935 flev++; 2936 continue; 2937 } 2938 do { 2939 m = fm; 2940 fm = fill[m]; 2941 } while (fm < idx); 2942 if (fm != idx) { 2943 im[idx] = *flev + incrlev; 2944 fill[m] = idx; 2945 fill[idx] = fm; 2946 fm = idx; 2947 nzf++; 2948 } else { 2949 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 2950 } 2951 flev++; 2952 } 2953 row = fill[row]; 2954 nzi++; 2955 } 2956 /* copy new filled row into permanent storage */ 2957 ainew[prow+1] = ainew[prow] + nzf; 2958 if (ainew[prow+1] > jmax) { 2959 2960 /* estimate how much additional space we will need */ 2961 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2962 /* just double the memory each time */ 2963 int maxadd = jmax; 2964 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 2965 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 2966 jmax += maxadd; 2967 2968 /* allocate a longer ajnew and ajfill */ 2969 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2970 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2971 ierr = PetscFree(ajnew);CHKERRQ(ierr); 2972 ajnew = xi; 2973 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2974 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2975 ierr = PetscFree(ajfill);CHKERRQ(ierr); 2976 ajfill = xi; 2977 realloc++; /* count how many reallocations are needed */ 2978 } 2979 xi = ajnew + ainew[prow]; 2980 flev = ajfill + ainew[prow]; 2981 dloc[prow] = nzi; 2982 fm = fill[n]; 2983 while (nzf--) { 2984 *xi++ = fm; 2985 *flev++ = im[fm]; 2986 fm = fill[fm]; 2987 } 2988 /* make sure row has diagonal entry */ 2989 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 2990 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 2991 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 2992 } 2993 } 2994 ierr = PetscFree(ajfill);CHKERRQ(ierr); 2995 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2996 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2997 ierr = PetscFree(fill);CHKERRQ(ierr); 2998 ierr = PetscFree(im);CHKERRQ(ierr); 2999 3000 { 3001 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3002 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 3003 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3004 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3005 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3006 if (diagonal_fill) { 3007 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount); 3008 } 3009 } 3010 3011 /* put together the new matrix */ 3012 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 3013 PetscLogObjectParent(*fact,isicol); 3014 b = (Mat_SeqBAIJ*)(*fact)->data; 3015 ierr = PetscFree(b->imax);CHKERRQ(ierr); 3016 b->singlemalloc = PETSC_FALSE; 3017 len = bs2*ainew[n]*sizeof(MatScalar); 3018 /* the next line frees the default space generated by the Create() */ 3019 ierr = PetscFree(b->a);CHKERRQ(ierr); 3020 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 3021 ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 3022 b->j = ajnew; 3023 b->i = ainew; 3024 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3025 b->diag = dloc; 3026 b->ilen = 0; 3027 b->imax = 0; 3028 b->row = isrow; 3029 b->col = iscol; 3030 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3031 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3032 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3033 b->icol = isicol; 3034 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3035 /* In b structure: Free imax, ilen, old a, old j. 3036 Allocate dloc, solve_work, new a, new j */ 3037 PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 3038 b->maxnz = b->nz = ainew[n]; 3039 (*fact)->factor = FACTOR_LU; 3040 3041 (*fact)->info.factor_mallocs = realloc; 3042 (*fact)->info.fill_ratio_given = f; 3043 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3044 } 3045 3046 if (row_identity && col_identity) { 3047 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3048 } 3049 PetscFunctionReturn(0); 3050 } 3051 3052 #undef __FUNCT__ 3053 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3054 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3055 { 3056 /* 3057 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3058 with natural ordering 3059 */ 3060 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3061 PetscTruth sse_enabled_local,sse_enabled_global; 3062 int ierr; 3063 3064 PetscFunctionBegin; 3065 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3066 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3067 switch (a->bs) { 3068 case 1: 3069 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3070 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3071 break; 3072 case 2: 3073 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3074 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3075 break; 3076 case 3: 3077 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3078 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3079 break; 3080 case 4: 3081 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr); 3082 if (sse_enabled_local) { 3083 #if defined(PETSC_HAVE_SSE) 3084 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3085 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n"); 3086 #else 3087 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3088 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3089 #endif 3090 } else { 3091 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3092 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3093 } 3094 break; 3095 case 5: 3096 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3097 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3098 break; 3099 case 6: 3100 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3101 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 3102 break; 3103 case 7: 3104 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3105 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3106 break; 3107 } 3108 PetscFunctionReturn(0); 3109 } 3110 3111 #undef __FUNCT__ 3112 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3113 int MatSeqBAIJ_UpdateSolvers(Mat A) 3114 { 3115 /* 3116 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3117 with natural ordering 3118 */ 3119 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3120 IS row = a->row, col = a->col; 3121 PetscTruth row_identity, col_identity; 3122 PetscTruth use_single, use_natural; 3123 int ierr; 3124 3125 PetscFunctionBegin; 3126 3127 use_single = PETSC_FALSE; 3128 use_natural = PETSC_FALSE; 3129 3130 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3131 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3132 3133 if (row_identity && col_identity) { 3134 use_natural = PETSC_TRUE; 3135 } else { 3136 use_natural = PETSC_FALSE; 3137 } 3138 switch (a->bs) { 3139 case 1: 3140 if (use_natural) { 3141 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3142 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3143 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3144 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3145 } else { 3146 A->ops->solve = MatSolve_SeqBAIJ_1; 3147 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3148 } 3149 break; 3150 case 2: 3151 if (use_natural) { 3152 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3153 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3154 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3155 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3156 } else { 3157 A->ops->solve = MatSolve_SeqBAIJ_2; 3158 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3159 } 3160 break; 3161 case 3: 3162 if (use_natural) { 3163 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3164 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3165 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3166 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3167 } else { 3168 A->ops->solve = MatSolve_SeqBAIJ_3; 3169 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3170 } 3171 break; 3172 case 4: 3173 { 3174 PetscTruth sse_enabled_local, sse_enabled_global; 3175 PetscTruth single_prec, flg; 3176 single_prec = flg = PETSC_FALSE; 3177 3178 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr); 3179 ierr = PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);CHKERRQ(ierr); 3180 if (flg) { 3181 a->single_precision_solves = single_prec; 3182 } 3183 if (a->single_precision_solves) { 3184 use_single = PETSC_TRUE; 3185 } 3186 if (use_natural) { 3187 if (use_single) { 3188 if (sse_enabled_local) { /* Natural + Single + SSE */ 3189 #if defined(PETSC_HAVE_SSE) 3190 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3191 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4\n"); 3192 #else 3193 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3194 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3195 #endif 3196 } else { /* Natural + Single */ 3197 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3198 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place natural ordering solve BS=4\n"); 3199 } 3200 } else { /* Natural */ 3201 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3202 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3203 } /* Only one version of SolveTranspose for Natural Ordering */ 3204 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3205 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3206 } else { /* Arbitrary ordering */ 3207 if (use_single) { 3208 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3209 #if defined(PETSC_HAVE_SSE) 3210 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3211 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3212 #else 3213 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3214 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3215 #endif 3216 } else { /* Arbitrary + Single */ 3217 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3218 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3219 } 3220 } else { /* Arbitrary */ 3221 A->ops->solve = MatSolve_SeqBAIJ_4; 3222 } /* Only one version of SolveTranspose for Natural Ordering */ 3223 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3224 } 3225 } 3226 break; 3227 case 5: 3228 if (use_natural) { 3229 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3230 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3231 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3232 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3233 } else { 3234 A->ops->solve = MatSolve_SeqBAIJ_5; 3235 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3236 } 3237 break; 3238 case 6: 3239 if (use_natural) { 3240 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3241 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3242 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3243 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3244 } else { 3245 A->ops->solve = MatSolve_SeqBAIJ_6; 3246 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3247 } 3248 break; 3249 case 7: 3250 if (use_natural) { 3251 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3252 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3253 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3254 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3255 } else { 3256 A->ops->solve = MatSolve_SeqBAIJ_7; 3257 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3258 } 3259 break; 3260 default: 3261 A->ops->solve = MatSolve_SeqBAIJ_N; 3262 break; 3263 } 3264 PetscFunctionReturn(0); 3265 } 3266 3267 #undef __FUNCT__ 3268 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3269 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3270 int ierr; 3271 3272 PetscFunctionBegin; 3273 ierr = MatSeqBAIJ_UpdateSolvers(A); 3274 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3275 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3276 } else { 3277 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3278 } 3279 PetscFunctionReturn(0); 3280 } 3281 3282 #undef __FUNCT__ 3283 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3284 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3285 int ierr; 3286 3287 PetscFunctionBegin; 3288 ierr = MatSeqBAIJ_UpdateSolvers(A); 3289 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3290 PetscFunctionReturn(0); 3291 } 3292 3293 3294