1 /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/ 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include "src/mat/impls/baij/seq/baij.h" 7 #include "src/inline/ilu.h" 8 #include "src/inline/dot.h" 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 12 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 13 { 14 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 16 int *diag = a->diag; 17 MatScalar *aa=a->a,*v; 18 PetscScalar s1,*x,*b; 19 20 PetscFunctionBegin; 21 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 22 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24 25 /* forward solve the U^T */ 26 for (i=0; i<n; i++) { 27 28 v = aa + diag[i]; 29 /* multiply by the inverse of the block diagonal */ 30 s1 = (*v++)*x[i]; 31 vi = aj + diag[i] + 1; 32 nz = ai[i+1] - diag[i] - 1; 33 while (nz--) { 34 x[*vi++] -= (*v++)*s1; 35 } 36 x[i] = s1; 37 } 38 /* backward solve the L^T */ 39 for (i=n-1; i>=0; i--){ 40 v = aa + diag[i] - 1; 41 vi = aj + diag[i] - 1; 42 nz = diag[i] - ai[i]; 43 s1 = x[i]; 44 while (nz--) { 45 x[*vi--] -= (*v--)*s1; 46 } 47 } 48 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 49 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 50 PetscLogFlops(2*(a->nz) - A->n); 51 PetscFunctionReturn(0); 52 } 53 54 #undef __FUNCT__ 55 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 56 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 57 { 58 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 59 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 60 int *diag = a->diag,oidx; 61 MatScalar *aa=a->a,*v; 62 PetscScalar s1,s2,x1,x2; 63 PetscScalar *x,*b; 64 65 PetscFunctionBegin; 66 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 67 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 68 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 69 70 /* forward solve the U^T */ 71 idx = 0; 72 for (i=0; i<n; i++) { 73 74 v = aa + 4*diag[i]; 75 /* multiply by the inverse of the block diagonal */ 76 x1 = x[idx]; x2 = x[1+idx]; 77 s1 = v[0]*x1 + v[1]*x2; 78 s2 = v[2]*x1 + v[3]*x2; 79 v += 4; 80 81 vi = aj + diag[i] + 1; 82 nz = ai[i+1] - diag[i] - 1; 83 while (nz--) { 84 oidx = 2*(*vi++); 85 x[oidx] -= v[0]*s1 + v[1]*s2; 86 x[oidx+1] -= v[2]*s1 + v[3]*s2; 87 v += 4; 88 } 89 x[idx] = s1;x[1+idx] = s2; 90 idx += 2; 91 } 92 /* backward solve the L^T */ 93 for (i=n-1; i>=0; i--){ 94 v = aa + 4*diag[i] - 4; 95 vi = aj + diag[i] - 1; 96 nz = diag[i] - ai[i]; 97 idt = 2*i; 98 s1 = x[idt]; s2 = x[1+idt]; 99 while (nz--) { 100 idx = 2*(*vi--); 101 x[idx] -= v[0]*s1 + v[1]*s2; 102 x[idx+1] -= v[2]*s1 + v[3]*s2; 103 v -= 4; 104 } 105 } 106 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 107 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 108 PetscLogFlops(2*4*(a->nz) - 2*A->n); 109 PetscFunctionReturn(0); 110 } 111 112 #undef __FUNCT__ 113 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 114 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 115 { 116 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 117 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 118 int *diag = a->diag,oidx; 119 MatScalar *aa=a->a,*v; 120 PetscScalar s1,s2,s3,x1,x2,x3; 121 PetscScalar *x,*b; 122 123 PetscFunctionBegin; 124 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 125 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 126 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 127 128 /* forward solve the U^T */ 129 idx = 0; 130 for (i=0; i<n; i++) { 131 132 v = aa + 9*diag[i]; 133 /* multiply by the inverse of the block diagonal */ 134 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 135 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 136 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 137 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 138 v += 9; 139 140 vi = aj + diag[i] + 1; 141 nz = ai[i+1] - diag[i] - 1; 142 while (nz--) { 143 oidx = 3*(*vi++); 144 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 145 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 146 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 147 v += 9; 148 } 149 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 150 idx += 3; 151 } 152 /* backward solve the L^T */ 153 for (i=n-1; i>=0; i--){ 154 v = aa + 9*diag[i] - 9; 155 vi = aj + diag[i] - 1; 156 nz = diag[i] - ai[i]; 157 idt = 3*i; 158 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 159 while (nz--) { 160 idx = 3*(*vi--); 161 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 162 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 163 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 164 v -= 9; 165 } 166 } 167 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 168 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 169 PetscLogFlops(2*9*(a->nz) - 3*A->n); 170 PetscFunctionReturn(0); 171 } 172 173 #undef __FUNCT__ 174 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 175 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 176 { 177 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 178 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 179 int *diag = a->diag,oidx; 180 MatScalar *aa=a->a,*v; 181 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 182 PetscScalar *x,*b; 183 184 PetscFunctionBegin; 185 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 186 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 187 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 188 189 /* forward solve the U^T */ 190 idx = 0; 191 for (i=0; i<n; i++) { 192 193 v = aa + 16*diag[i]; 194 /* multiply by the inverse of the block diagonal */ 195 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 196 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 197 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 198 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 199 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 200 v += 16; 201 202 vi = aj + diag[i] + 1; 203 nz = ai[i+1] - diag[i] - 1; 204 while (nz--) { 205 oidx = 4*(*vi++); 206 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 207 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 208 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 209 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 210 v += 16; 211 } 212 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 213 idx += 4; 214 } 215 /* backward solve the L^T */ 216 for (i=n-1; i>=0; i--){ 217 v = aa + 16*diag[i] - 16; 218 vi = aj + diag[i] - 1; 219 nz = diag[i] - ai[i]; 220 idt = 4*i; 221 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 222 while (nz--) { 223 idx = 4*(*vi--); 224 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 225 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 226 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 227 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 228 v -= 16; 229 } 230 } 231 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 232 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 233 PetscLogFlops(2*16*(a->nz) - 4*A->n); 234 PetscFunctionReturn(0); 235 } 236 237 #undef __FUNCT__ 238 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 239 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 240 { 241 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 242 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 243 int *diag = a->diag,oidx; 244 MatScalar *aa=a->a,*v; 245 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 246 PetscScalar *x,*b; 247 248 PetscFunctionBegin; 249 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 250 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 251 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 252 253 /* forward solve the U^T */ 254 idx = 0; 255 for (i=0; i<n; i++) { 256 257 v = aa + 25*diag[i]; 258 /* multiply by the inverse of the block diagonal */ 259 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 260 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 261 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 262 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 263 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 264 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 265 v += 25; 266 267 vi = aj + diag[i] + 1; 268 nz = ai[i+1] - diag[i] - 1; 269 while (nz--) { 270 oidx = 5*(*vi++); 271 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 272 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 273 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 274 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 275 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 276 v += 25; 277 } 278 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 279 idx += 5; 280 } 281 /* backward solve the L^T */ 282 for (i=n-1; i>=0; i--){ 283 v = aa + 25*diag[i] - 25; 284 vi = aj + diag[i] - 1; 285 nz = diag[i] - ai[i]; 286 idt = 5*i; 287 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 288 while (nz--) { 289 idx = 5*(*vi--); 290 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 291 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 292 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 293 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 294 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 295 v -= 25; 296 } 297 } 298 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 299 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 300 PetscLogFlops(2*25*(a->nz) - 5*A->n); 301 PetscFunctionReturn(0); 302 } 303 304 #undef __FUNCT__ 305 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 306 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 307 { 308 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 309 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 310 int *diag = a->diag,oidx; 311 MatScalar *aa=a->a,*v; 312 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 313 PetscScalar *x,*b; 314 315 PetscFunctionBegin; 316 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 317 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 318 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 319 320 /* forward solve the U^T */ 321 idx = 0; 322 for (i=0; i<n; i++) { 323 324 v = aa + 36*diag[i]; 325 /* multiply by the inverse of the block diagonal */ 326 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 327 x6 = x[5+idx]; 328 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 329 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 330 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 331 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 332 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 333 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 334 v += 36; 335 336 vi = aj + diag[i] + 1; 337 nz = ai[i+1] - diag[i] - 1; 338 while (nz--) { 339 oidx = 6*(*vi++); 340 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 341 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 342 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 343 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 344 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 345 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 346 v += 36; 347 } 348 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 349 x[5+idx] = s6; 350 idx += 6; 351 } 352 /* backward solve the L^T */ 353 for (i=n-1; i>=0; i--){ 354 v = aa + 36*diag[i] - 36; 355 vi = aj + diag[i] - 1; 356 nz = diag[i] - ai[i]; 357 idt = 6*i; 358 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 359 s6 = x[5+idt]; 360 while (nz--) { 361 idx = 6*(*vi--); 362 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 363 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 364 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 365 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 366 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 367 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 368 v -= 36; 369 } 370 } 371 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 372 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 373 PetscLogFlops(2*36*(a->nz) - 6*A->n); 374 PetscFunctionReturn(0); 375 } 376 377 #undef __FUNCT__ 378 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 379 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 380 { 381 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 382 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 383 int *diag = a->diag,oidx; 384 MatScalar *aa=a->a,*v; 385 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 386 PetscScalar *x,*b; 387 388 PetscFunctionBegin; 389 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 390 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 391 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 392 393 /* forward solve the U^T */ 394 idx = 0; 395 for (i=0; i<n; i++) { 396 397 v = aa + 49*diag[i]; 398 /* multiply by the inverse of the block diagonal */ 399 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 400 x6 = x[5+idx]; x7 = x[6+idx]; 401 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 402 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 403 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 404 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 405 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 406 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 407 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 408 v += 49; 409 410 vi = aj + diag[i] + 1; 411 nz = ai[i+1] - diag[i] - 1; 412 while (nz--) { 413 oidx = 7*(*vi++); 414 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 415 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 416 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 417 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 418 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 419 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 420 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 421 v += 49; 422 } 423 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 424 x[5+idx] = s6;x[6+idx] = s7; 425 idx += 7; 426 } 427 /* backward solve the L^T */ 428 for (i=n-1; i>=0; i--){ 429 v = aa + 49*diag[i] - 49; 430 vi = aj + diag[i] - 1; 431 nz = diag[i] - ai[i]; 432 idt = 7*i; 433 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 434 s6 = x[5+idt];s7 = x[6+idt]; 435 while (nz--) { 436 idx = 7*(*vi--); 437 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 438 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 439 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 440 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 441 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 442 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 443 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 444 v -= 49; 445 } 446 } 447 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 448 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 449 PetscLogFlops(2*49*(a->nz) - 7*A->n); 450 PetscFunctionReturn(0); 451 } 452 453 /*---------------------------------------------------------------------------------------------*/ 454 #undef __FUNCT__ 455 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 456 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 457 { 458 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 459 IS iscol=a->col,isrow=a->row; 460 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 461 int *diag = a->diag; 462 MatScalar *aa=a->a,*v; 463 PetscScalar s1,*x,*b,*t; 464 465 PetscFunctionBegin; 466 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 467 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 468 t = a->solve_work; 469 470 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 471 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 472 473 /* copy the b into temp work space according to permutation */ 474 for (i=0; i<n; i++) { 475 t[i] = b[c[i]]; 476 } 477 478 /* forward solve the U^T */ 479 for (i=0; i<n; i++) { 480 481 v = aa + diag[i]; 482 /* multiply by the inverse of the block diagonal */ 483 s1 = (*v++)*t[i]; 484 vi = aj + diag[i] + 1; 485 nz = ai[i+1] - diag[i] - 1; 486 while (nz--) { 487 t[*vi++] -= (*v++)*s1; 488 } 489 t[i] = s1; 490 } 491 /* backward solve the L^T */ 492 for (i=n-1; i>=0; i--){ 493 v = aa + diag[i] - 1; 494 vi = aj + diag[i] - 1; 495 nz = diag[i] - ai[i]; 496 s1 = t[i]; 497 while (nz--) { 498 t[*vi--] -= (*v--)*s1; 499 } 500 } 501 502 /* copy t into x according to permutation */ 503 for (i=0; i<n; i++) { 504 x[r[i]] = t[i]; 505 } 506 507 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 508 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 509 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 510 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 511 PetscLogFlops(2*(a->nz) - A->n); 512 PetscFunctionReturn(0); 513 } 514 515 #undef __FUNCT__ 516 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 517 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 518 { 519 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 520 IS iscol=a->col,isrow=a->row; 521 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 522 int *diag = a->diag,ii,ic,ir,oidx; 523 MatScalar *aa=a->a,*v; 524 PetscScalar s1,s2,x1,x2; 525 PetscScalar *x,*b,*t; 526 527 PetscFunctionBegin; 528 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 529 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 530 t = a->solve_work; 531 532 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 533 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 534 535 /* copy the b into temp work space according to permutation */ 536 ii = 0; 537 for (i=0; i<n; i++) { 538 ic = 2*c[i]; 539 t[ii] = b[ic]; 540 t[ii+1] = b[ic+1]; 541 ii += 2; 542 } 543 544 /* forward solve the U^T */ 545 idx = 0; 546 for (i=0; i<n; i++) { 547 548 v = aa + 4*diag[i]; 549 /* multiply by the inverse of the block diagonal */ 550 x1 = t[idx]; x2 = t[1+idx]; 551 s1 = v[0]*x1 + v[1]*x2; 552 s2 = v[2]*x1 + v[3]*x2; 553 v += 4; 554 555 vi = aj + diag[i] + 1; 556 nz = ai[i+1] - diag[i] - 1; 557 while (nz--) { 558 oidx = 2*(*vi++); 559 t[oidx] -= v[0]*s1 + v[1]*s2; 560 t[oidx+1] -= v[2]*s1 + v[3]*s2; 561 v += 4; 562 } 563 t[idx] = s1;t[1+idx] = s2; 564 idx += 2; 565 } 566 /* backward solve the L^T */ 567 for (i=n-1; i>=0; i--){ 568 v = aa + 4*diag[i] - 4; 569 vi = aj + diag[i] - 1; 570 nz = diag[i] - ai[i]; 571 idt = 2*i; 572 s1 = t[idt]; s2 = t[1+idt]; 573 while (nz--) { 574 idx = 2*(*vi--); 575 t[idx] -= v[0]*s1 + v[1]*s2; 576 t[idx+1] -= v[2]*s1 + v[3]*s2; 577 v -= 4; 578 } 579 } 580 581 /* copy t into x according to permutation */ 582 ii = 0; 583 for (i=0; i<n; i++) { 584 ir = 2*r[i]; 585 x[ir] = t[ii]; 586 x[ir+1] = t[ii+1]; 587 ii += 2; 588 } 589 590 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 591 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 592 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 593 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 594 PetscLogFlops(2*4*(a->nz) - 2*A->n); 595 PetscFunctionReturn(0); 596 } 597 598 #undef __FUNCT__ 599 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 600 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 601 { 602 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 603 IS iscol=a->col,isrow=a->row; 604 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 605 int *diag = a->diag,ii,ic,ir,oidx; 606 MatScalar *aa=a->a,*v; 607 PetscScalar s1,s2,s3,x1,x2,x3; 608 PetscScalar *x,*b,*t; 609 610 PetscFunctionBegin; 611 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 612 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 613 t = a->solve_work; 614 615 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 616 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 617 618 /* copy the b into temp work space according to permutation */ 619 ii = 0; 620 for (i=0; i<n; i++) { 621 ic = 3*c[i]; 622 t[ii] = b[ic]; 623 t[ii+1] = b[ic+1]; 624 t[ii+2] = b[ic+2]; 625 ii += 3; 626 } 627 628 /* forward solve the U^T */ 629 idx = 0; 630 for (i=0; i<n; i++) { 631 632 v = aa + 9*diag[i]; 633 /* multiply by the inverse of the block diagonal */ 634 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 635 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 636 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 637 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 638 v += 9; 639 640 vi = aj + diag[i] + 1; 641 nz = ai[i+1] - diag[i] - 1; 642 while (nz--) { 643 oidx = 3*(*vi++); 644 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 645 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 646 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 647 v += 9; 648 } 649 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 650 idx += 3; 651 } 652 /* backward solve the L^T */ 653 for (i=n-1; i>=0; i--){ 654 v = aa + 9*diag[i] - 9; 655 vi = aj + diag[i] - 1; 656 nz = diag[i] - ai[i]; 657 idt = 3*i; 658 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 659 while (nz--) { 660 idx = 3*(*vi--); 661 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 662 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 663 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 664 v -= 9; 665 } 666 } 667 668 /* copy t into x according to permutation */ 669 ii = 0; 670 for (i=0; i<n; i++) { 671 ir = 3*r[i]; 672 x[ir] = t[ii]; 673 x[ir+1] = t[ii+1]; 674 x[ir+2] = t[ii+2]; 675 ii += 3; 676 } 677 678 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 679 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 680 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 681 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 682 PetscLogFlops(2*9*(a->nz) - 3*A->n); 683 PetscFunctionReturn(0); 684 } 685 686 #undef __FUNCT__ 687 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 688 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 689 { 690 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 691 IS iscol=a->col,isrow=a->row; 692 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 693 int *diag = a->diag,ii,ic,ir,oidx; 694 MatScalar *aa=a->a,*v; 695 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 696 PetscScalar *x,*b,*t; 697 698 PetscFunctionBegin; 699 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 700 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 701 t = a->solve_work; 702 703 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 704 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 705 706 /* copy the b into temp work space according to permutation */ 707 ii = 0; 708 for (i=0; i<n; i++) { 709 ic = 4*c[i]; 710 t[ii] = b[ic]; 711 t[ii+1] = b[ic+1]; 712 t[ii+2] = b[ic+2]; 713 t[ii+3] = b[ic+3]; 714 ii += 4; 715 } 716 717 /* forward solve the U^T */ 718 idx = 0; 719 for (i=0; i<n; i++) { 720 721 v = aa + 16*diag[i]; 722 /* multiply by the inverse of the block diagonal */ 723 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 724 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 725 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 726 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 727 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 728 v += 16; 729 730 vi = aj + diag[i] + 1; 731 nz = ai[i+1] - diag[i] - 1; 732 while (nz--) { 733 oidx = 4*(*vi++); 734 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 735 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 736 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 737 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 738 v += 16; 739 } 740 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 741 idx += 4; 742 } 743 /* backward solve the L^T */ 744 for (i=n-1; i>=0; i--){ 745 v = aa + 16*diag[i] - 16; 746 vi = aj + diag[i] - 1; 747 nz = diag[i] - ai[i]; 748 idt = 4*i; 749 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 750 while (nz--) { 751 idx = 4*(*vi--); 752 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756 v -= 16; 757 } 758 } 759 760 /* copy t into x according to permutation */ 761 ii = 0; 762 for (i=0; i<n; i++) { 763 ir = 4*r[i]; 764 x[ir] = t[ii]; 765 x[ir+1] = t[ii+1]; 766 x[ir+2] = t[ii+2]; 767 x[ir+3] = t[ii+3]; 768 ii += 4; 769 } 770 771 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 772 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 773 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 774 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 775 PetscLogFlops(2*16*(a->nz) - 4*A->n); 776 PetscFunctionReturn(0); 777 } 778 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 781 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 784 IS iscol=a->col,isrow=a->row; 785 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 786 int *diag = a->diag,ii,ic,ir,oidx; 787 MatScalar *aa=a->a,*v; 788 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 789 PetscScalar *x,*b,*t; 790 791 PetscFunctionBegin; 792 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 793 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 794 t = a->solve_work; 795 796 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 797 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 798 799 /* copy the b into temp work space according to permutation */ 800 ii = 0; 801 for (i=0; i<n; i++) { 802 ic = 5*c[i]; 803 t[ii] = b[ic]; 804 t[ii+1] = b[ic+1]; 805 t[ii+2] = b[ic+2]; 806 t[ii+3] = b[ic+3]; 807 t[ii+4] = b[ic+4]; 808 ii += 5; 809 } 810 811 /* forward solve the U^T */ 812 idx = 0; 813 for (i=0; i<n; i++) { 814 815 v = aa + 25*diag[i]; 816 /* multiply by the inverse of the block diagonal */ 817 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 818 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 819 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 820 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 821 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 822 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 823 v += 25; 824 825 vi = aj + diag[i] + 1; 826 nz = ai[i+1] - diag[i] - 1; 827 while (nz--) { 828 oidx = 5*(*vi++); 829 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 830 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 831 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 832 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 833 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 834 v += 25; 835 } 836 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 837 idx += 5; 838 } 839 /* backward solve the L^T */ 840 for (i=n-1; i>=0; i--){ 841 v = aa + 25*diag[i] - 25; 842 vi = aj + diag[i] - 1; 843 nz = diag[i] - ai[i]; 844 idt = 5*i; 845 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 846 while (nz--) { 847 idx = 5*(*vi--); 848 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 849 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 850 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 851 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 852 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 853 v -= 25; 854 } 855 } 856 857 /* copy t into x according to permutation */ 858 ii = 0; 859 for (i=0; i<n; i++) { 860 ir = 5*r[i]; 861 x[ir] = t[ii]; 862 x[ir+1] = t[ii+1]; 863 x[ir+2] = t[ii+2]; 864 x[ir+3] = t[ii+3]; 865 x[ir+4] = t[ii+4]; 866 ii += 5; 867 } 868 869 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 870 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 871 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 872 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 873 PetscLogFlops(2*25*(a->nz) - 5*A->n); 874 PetscFunctionReturn(0); 875 } 876 877 #undef __FUNCT__ 878 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 879 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 880 { 881 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 882 IS iscol=a->col,isrow=a->row; 883 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 884 int *diag = a->diag,ii,ic,ir,oidx; 885 MatScalar *aa=a->a,*v; 886 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 887 PetscScalar *x,*b,*t; 888 889 PetscFunctionBegin; 890 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 891 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 892 t = a->solve_work; 893 894 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 895 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 896 897 /* copy the b into temp work space according to permutation */ 898 ii = 0; 899 for (i=0; i<n; i++) { 900 ic = 6*c[i]; 901 t[ii] = b[ic]; 902 t[ii+1] = b[ic+1]; 903 t[ii+2] = b[ic+2]; 904 t[ii+3] = b[ic+3]; 905 t[ii+4] = b[ic+4]; 906 t[ii+5] = b[ic+5]; 907 ii += 6; 908 } 909 910 /* forward solve the U^T */ 911 idx = 0; 912 for (i=0; i<n; i++) { 913 914 v = aa + 36*diag[i]; 915 /* multiply by the inverse of the block diagonal */ 916 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 917 x6 = t[5+idx]; 918 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 919 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 920 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 921 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 922 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 923 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 924 v += 36; 925 926 vi = aj + diag[i] + 1; 927 nz = ai[i+1] - diag[i] - 1; 928 while (nz--) { 929 oidx = 6*(*vi++); 930 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 931 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 932 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 933 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 934 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 935 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 936 v += 36; 937 } 938 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 939 t[5+idx] = s6; 940 idx += 6; 941 } 942 /* backward solve the L^T */ 943 for (i=n-1; i>=0; i--){ 944 v = aa + 36*diag[i] - 36; 945 vi = aj + diag[i] - 1; 946 nz = diag[i] - ai[i]; 947 idt = 6*i; 948 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 949 s6 = t[5+idt]; 950 while (nz--) { 951 idx = 6*(*vi--); 952 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958 v -= 36; 959 } 960 } 961 962 /* copy t into x according to permutation */ 963 ii = 0; 964 for (i=0; i<n; i++) { 965 ir = 6*r[i]; 966 x[ir] = t[ii]; 967 x[ir+1] = t[ii+1]; 968 x[ir+2] = t[ii+2]; 969 x[ir+3] = t[ii+3]; 970 x[ir+4] = t[ii+4]; 971 x[ir+5] = t[ii+5]; 972 ii += 6; 973 } 974 975 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 976 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 977 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 978 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 979 PetscLogFlops(2*36*(a->nz) - 6*A->n); 980 PetscFunctionReturn(0); 981 } 982 983 #undef __FUNCT__ 984 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 985 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 986 { 987 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 988 IS iscol=a->col,isrow=a->row; 989 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 990 int *diag = a->diag,ii,ic,ir,oidx; 991 MatScalar *aa=a->a,*v; 992 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 993 PetscScalar *x,*b,*t; 994 995 PetscFunctionBegin; 996 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 997 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 998 t = a->solve_work; 999 1000 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1001 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1002 1003 /* copy the b into temp work space according to permutation */ 1004 ii = 0; 1005 for (i=0; i<n; i++) { 1006 ic = 7*c[i]; 1007 t[ii] = b[ic]; 1008 t[ii+1] = b[ic+1]; 1009 t[ii+2] = b[ic+2]; 1010 t[ii+3] = b[ic+3]; 1011 t[ii+4] = b[ic+4]; 1012 t[ii+5] = b[ic+5]; 1013 t[ii+6] = b[ic+6]; 1014 ii += 7; 1015 } 1016 1017 /* forward solve the U^T */ 1018 idx = 0; 1019 for (i=0; i<n; i++) { 1020 1021 v = aa + 49*diag[i]; 1022 /* multiply by the inverse of the block diagonal */ 1023 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1024 x6 = t[5+idx]; x7 = t[6+idx]; 1025 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1026 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1027 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1028 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1029 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1030 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1031 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1032 v += 49; 1033 1034 vi = aj + diag[i] + 1; 1035 nz = ai[i+1] - diag[i] - 1; 1036 while (nz--) { 1037 oidx = 7*(*vi++); 1038 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1039 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1040 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1041 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1042 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1043 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1044 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1045 v += 49; 1046 } 1047 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1048 t[5+idx] = s6;t[6+idx] = s7; 1049 idx += 7; 1050 } 1051 /* backward solve the L^T */ 1052 for (i=n-1; i>=0; i--){ 1053 v = aa + 49*diag[i] - 49; 1054 vi = aj + diag[i] - 1; 1055 nz = diag[i] - ai[i]; 1056 idt = 7*i; 1057 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1058 s6 = t[5+idt];s7 = t[6+idt]; 1059 while (nz--) { 1060 idx = 7*(*vi--); 1061 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1062 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1063 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1064 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1065 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1066 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1067 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1068 v -= 49; 1069 } 1070 } 1071 1072 /* copy t into x according to permutation */ 1073 ii = 0; 1074 for (i=0; i<n; i++) { 1075 ir = 7*r[i]; 1076 x[ir] = t[ii]; 1077 x[ir+1] = t[ii+1]; 1078 x[ir+2] = t[ii+2]; 1079 x[ir+3] = t[ii+3]; 1080 x[ir+4] = t[ii+4]; 1081 x[ir+5] = t[ii+5]; 1082 x[ir+6] = t[ii+6]; 1083 ii += 7; 1084 } 1085 1086 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1087 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1088 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1089 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1090 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1091 PetscFunctionReturn(0); 1092 } 1093 1094 /* ----------------------------------------------------------- */ 1095 #undef __FUNCT__ 1096 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1097 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1098 { 1099 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1100 IS iscol=a->col,isrow=a->row; 1101 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1102 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1103 MatScalar *aa=a->a,*v; 1104 PetscScalar *x,*b,*s,*t,*ls; 1105 1106 PetscFunctionBegin; 1107 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1108 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1109 t = a->solve_work; 1110 1111 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1112 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1113 1114 /* forward solve the lower triangular */ 1115 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1116 for (i=1; i<n; i++) { 1117 v = aa + bs2*ai[i]; 1118 vi = aj + ai[i]; 1119 nz = a->diag[i] - ai[i]; 1120 s = t + bs*i; 1121 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1122 while (nz--) { 1123 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1124 v += bs2; 1125 } 1126 } 1127 /* backward solve the upper triangular */ 1128 ls = a->solve_work + A->n; 1129 for (i=n-1; i>=0; i--){ 1130 v = aa + bs2*(a->diag[i] + 1); 1131 vi = aj + a->diag[i] + 1; 1132 nz = ai[i+1] - a->diag[i] - 1; 1133 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1134 while (nz--) { 1135 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1136 v += bs2; 1137 } 1138 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1139 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1140 } 1141 1142 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1143 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1144 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1145 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1146 PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 1147 PetscFunctionReturn(0); 1148 } 1149 1150 #undef __FUNCT__ 1151 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1152 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1153 { 1154 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1155 IS iscol=a->col,isrow=a->row; 1156 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1157 int *diag = a->diag; 1158 MatScalar *aa=a->a,*v; 1159 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1160 PetscScalar *x,*b,*t; 1161 1162 PetscFunctionBegin; 1163 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1164 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1165 t = a->solve_work; 1166 1167 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1168 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1169 1170 /* forward solve the lower triangular */ 1171 idx = 7*(*r++); 1172 t[0] = b[idx]; t[1] = b[1+idx]; 1173 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1174 t[5] = b[5+idx]; t[6] = b[6+idx]; 1175 1176 for (i=1; i<n; i++) { 1177 v = aa + 49*ai[i]; 1178 vi = aj + ai[i]; 1179 nz = diag[i] - ai[i]; 1180 idx = 7*(*r++); 1181 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1182 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1183 while (nz--) { 1184 idx = 7*(*vi++); 1185 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1186 x4 = t[3+idx];x5 = t[4+idx]; 1187 x6 = t[5+idx];x7 = t[6+idx]; 1188 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1189 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1190 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1191 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1192 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1193 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1194 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1195 v += 49; 1196 } 1197 idx = 7*i; 1198 t[idx] = s1;t[1+idx] = s2; 1199 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1200 t[5+idx] = s6;t[6+idx] = s7; 1201 } 1202 /* backward solve the upper triangular */ 1203 for (i=n-1; i>=0; i--){ 1204 v = aa + 49*diag[i] + 49; 1205 vi = aj + diag[i] + 1; 1206 nz = ai[i+1] - diag[i] - 1; 1207 idt = 7*i; 1208 s1 = t[idt]; s2 = t[1+idt]; 1209 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1210 s6 = t[5+idt];s7 = t[6+idt]; 1211 while (nz--) { 1212 idx = 7*(*vi++); 1213 x1 = t[idx]; x2 = t[1+idx]; 1214 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1215 x6 = t[5+idx]; x7 = t[6+idx]; 1216 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1217 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1218 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1219 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1220 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1221 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1222 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1223 v += 49; 1224 } 1225 idc = 7*(*c--); 1226 v = aa + 49*diag[i]; 1227 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1228 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1229 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1230 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1231 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1232 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1233 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1234 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1235 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1236 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1237 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1238 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1239 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1240 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1241 } 1242 1243 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1244 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1245 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1246 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1247 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1248 PetscFunctionReturn(0); 1249 } 1250 1251 #undef __FUNCT__ 1252 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1253 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1254 { 1255 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1256 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1257 int ierr,*diag = a->diag,jdx; 1258 MatScalar *aa=a->a,*v; 1259 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1260 1261 PetscFunctionBegin; 1262 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1263 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1264 /* forward solve the lower triangular */ 1265 idx = 0; 1266 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1267 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1268 x[6] = b[6+idx]; 1269 for (i=1; i<n; i++) { 1270 v = aa + 49*ai[i]; 1271 vi = aj + ai[i]; 1272 nz = diag[i] - ai[i]; 1273 idx = 7*i; 1274 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1275 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1276 s7 = b[6+idx]; 1277 while (nz--) { 1278 jdx = 7*(*vi++); 1279 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1280 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1281 x7 = x[6+jdx]; 1282 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1283 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1284 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1285 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1286 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1287 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1288 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1289 v += 49; 1290 } 1291 x[idx] = s1; 1292 x[1+idx] = s2; 1293 x[2+idx] = s3; 1294 x[3+idx] = s4; 1295 x[4+idx] = s5; 1296 x[5+idx] = s6; 1297 x[6+idx] = s7; 1298 } 1299 /* backward solve the upper triangular */ 1300 for (i=n-1; i>=0; i--){ 1301 v = aa + 49*diag[i] + 49; 1302 vi = aj + diag[i] + 1; 1303 nz = ai[i+1] - diag[i] - 1; 1304 idt = 7*i; 1305 s1 = x[idt]; s2 = x[1+idt]; 1306 s3 = x[2+idt]; s4 = x[3+idt]; 1307 s5 = x[4+idt]; s6 = x[5+idt]; 1308 s7 = x[6+idt]; 1309 while (nz--) { 1310 idx = 7*(*vi++); 1311 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1312 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1313 x7 = x[6+idx]; 1314 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1315 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1316 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1317 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1318 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1319 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1320 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1321 v += 49; 1322 } 1323 v = aa + 49*diag[i]; 1324 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1325 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1326 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1327 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1328 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1329 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1330 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1331 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1332 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1333 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1334 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1335 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1336 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1337 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1338 } 1339 1340 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1341 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1342 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1343 PetscFunctionReturn(0); 1344 } 1345 1346 #undef __FUNCT__ 1347 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1348 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1349 { 1350 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1351 IS iscol=a->col,isrow=a->row; 1352 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1353 int *diag = a->diag; 1354 MatScalar *aa=a->a,*v; 1355 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1356 1357 PetscFunctionBegin; 1358 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1359 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1360 t = a->solve_work; 1361 1362 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1363 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1364 1365 /* forward solve the lower triangular */ 1366 idx = 6*(*r++); 1367 t[0] = b[idx]; t[1] = b[1+idx]; 1368 t[2] = b[2+idx]; t[3] = b[3+idx]; 1369 t[4] = b[4+idx]; t[5] = b[5+idx]; 1370 for (i=1; i<n; i++) { 1371 v = aa + 36*ai[i]; 1372 vi = aj + ai[i]; 1373 nz = diag[i] - ai[i]; 1374 idx = 6*(*r++); 1375 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1376 s5 = b[4+idx]; s6 = b[5+idx]; 1377 while (nz--) { 1378 idx = 6*(*vi++); 1379 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1380 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1381 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1382 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1383 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1384 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1385 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1386 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1387 v += 36; 1388 } 1389 idx = 6*i; 1390 t[idx] = s1;t[1+idx] = s2; 1391 t[2+idx] = s3;t[3+idx] = s4; 1392 t[4+idx] = s5;t[5+idx] = s6; 1393 } 1394 /* backward solve the upper triangular */ 1395 for (i=n-1; i>=0; i--){ 1396 v = aa + 36*diag[i] + 36; 1397 vi = aj + diag[i] + 1; 1398 nz = ai[i+1] - diag[i] - 1; 1399 idt = 6*i; 1400 s1 = t[idt]; s2 = t[1+idt]; 1401 s3 = t[2+idt];s4 = t[3+idt]; 1402 s5 = t[4+idt];s6 = t[5+idt]; 1403 while (nz--) { 1404 idx = 6*(*vi++); 1405 x1 = t[idx]; x2 = t[1+idx]; 1406 x3 = t[2+idx]; x4 = t[3+idx]; 1407 x5 = t[4+idx]; x6 = t[5+idx]; 1408 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1409 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1410 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1411 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1412 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1413 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1414 v += 36; 1415 } 1416 idc = 6*(*c--); 1417 v = aa + 36*diag[i]; 1418 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1419 v[18]*s4+v[24]*s5+v[30]*s6; 1420 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1421 v[19]*s4+v[25]*s5+v[31]*s6; 1422 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1423 v[20]*s4+v[26]*s5+v[32]*s6; 1424 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1425 v[21]*s4+v[27]*s5+v[33]*s6; 1426 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1427 v[22]*s4+v[28]*s5+v[34]*s6; 1428 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1429 v[23]*s4+v[29]*s5+v[35]*s6; 1430 } 1431 1432 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1433 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1434 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1435 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1436 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1437 PetscFunctionReturn(0); 1438 } 1439 1440 #undef __FUNCT__ 1441 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1442 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1443 { 1444 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1445 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1446 int ierr,*diag = a->diag,jdx; 1447 MatScalar *aa=a->a,*v; 1448 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1449 1450 PetscFunctionBegin; 1451 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1452 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1453 /* forward solve the lower triangular */ 1454 idx = 0; 1455 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1456 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1457 for (i=1; i<n; i++) { 1458 v = aa + 36*ai[i]; 1459 vi = aj + ai[i]; 1460 nz = diag[i] - ai[i]; 1461 idx = 6*i; 1462 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1463 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1464 while (nz--) { 1465 jdx = 6*(*vi++); 1466 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1467 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1468 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1469 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1470 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1471 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1472 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1473 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1474 v += 36; 1475 } 1476 x[idx] = s1; 1477 x[1+idx] = s2; 1478 x[2+idx] = s3; 1479 x[3+idx] = s4; 1480 x[4+idx] = s5; 1481 x[5+idx] = s6; 1482 } 1483 /* backward solve the upper triangular */ 1484 for (i=n-1; i>=0; i--){ 1485 v = aa + 36*diag[i] + 36; 1486 vi = aj + diag[i] + 1; 1487 nz = ai[i+1] - diag[i] - 1; 1488 idt = 6*i; 1489 s1 = x[idt]; s2 = x[1+idt]; 1490 s3 = x[2+idt]; s4 = x[3+idt]; 1491 s5 = x[4+idt]; s6 = x[5+idt]; 1492 while (nz--) { 1493 idx = 6*(*vi++); 1494 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1495 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1496 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1497 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1498 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1499 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1500 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1501 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1502 v += 36; 1503 } 1504 v = aa + 36*diag[i]; 1505 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1506 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1507 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1508 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1509 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1510 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1511 } 1512 1513 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1514 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1515 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1516 PetscFunctionReturn(0); 1517 } 1518 1519 #undef __FUNCT__ 1520 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1521 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1522 { 1523 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1524 IS iscol=a->col,isrow=a->row; 1525 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1526 int *diag = a->diag; 1527 MatScalar *aa=a->a,*v; 1528 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1529 1530 PetscFunctionBegin; 1531 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1532 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1533 t = a->solve_work; 1534 1535 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1536 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1537 1538 /* forward solve the lower triangular */ 1539 idx = 5*(*r++); 1540 t[0] = b[idx]; t[1] = b[1+idx]; 1541 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1542 for (i=1; i<n; i++) { 1543 v = aa + 25*ai[i]; 1544 vi = aj + ai[i]; 1545 nz = diag[i] - ai[i]; 1546 idx = 5*(*r++); 1547 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1548 s5 = b[4+idx]; 1549 while (nz--) { 1550 idx = 5*(*vi++); 1551 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1552 x4 = t[3+idx];x5 = t[4+idx]; 1553 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1554 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1555 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1556 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1557 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1558 v += 25; 1559 } 1560 idx = 5*i; 1561 t[idx] = s1;t[1+idx] = s2; 1562 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1563 } 1564 /* backward solve the upper triangular */ 1565 for (i=n-1; i>=0; i--){ 1566 v = aa + 25*diag[i] + 25; 1567 vi = aj + diag[i] + 1; 1568 nz = ai[i+1] - diag[i] - 1; 1569 idt = 5*i; 1570 s1 = t[idt]; s2 = t[1+idt]; 1571 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1572 while (nz--) { 1573 idx = 5*(*vi++); 1574 x1 = t[idx]; x2 = t[1+idx]; 1575 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1576 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1577 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1578 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1579 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1580 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1581 v += 25; 1582 } 1583 idc = 5*(*c--); 1584 v = aa + 25*diag[i]; 1585 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1586 v[15]*s4+v[20]*s5; 1587 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1588 v[16]*s4+v[21]*s5; 1589 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1590 v[17]*s4+v[22]*s5; 1591 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1592 v[18]*s4+v[23]*s5; 1593 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1594 v[19]*s4+v[24]*s5; 1595 } 1596 1597 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1598 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1599 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1600 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1601 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1602 PetscFunctionReturn(0); 1603 } 1604 1605 #undef __FUNCT__ 1606 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1607 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1608 { 1609 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1610 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1611 int ierr,*diag = a->diag,jdx; 1612 MatScalar *aa=a->a,*v; 1613 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1614 1615 PetscFunctionBegin; 1616 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1617 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1618 /* forward solve the lower triangular */ 1619 idx = 0; 1620 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1621 for (i=1; i<n; i++) { 1622 v = aa + 25*ai[i]; 1623 vi = aj + ai[i]; 1624 nz = diag[i] - ai[i]; 1625 idx = 5*i; 1626 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1627 while (nz--) { 1628 jdx = 5*(*vi++); 1629 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1630 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1631 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1632 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1633 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1634 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1635 v += 25; 1636 } 1637 x[idx] = s1; 1638 x[1+idx] = s2; 1639 x[2+idx] = s3; 1640 x[3+idx] = s4; 1641 x[4+idx] = s5; 1642 } 1643 /* backward solve the upper triangular */ 1644 for (i=n-1; i>=0; i--){ 1645 v = aa + 25*diag[i] + 25; 1646 vi = aj + diag[i] + 1; 1647 nz = ai[i+1] - diag[i] - 1; 1648 idt = 5*i; 1649 s1 = x[idt]; s2 = x[1+idt]; 1650 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1651 while (nz--) { 1652 idx = 5*(*vi++); 1653 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1654 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1655 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1656 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1657 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1658 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1659 v += 25; 1660 } 1661 v = aa + 25*diag[i]; 1662 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1663 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1664 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1665 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1666 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1667 } 1668 1669 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1670 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1671 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1672 PetscFunctionReturn(0); 1673 } 1674 1675 #undef __FUNCT__ 1676 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1677 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1678 { 1679 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1680 IS iscol=a->col,isrow=a->row; 1681 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1682 int *diag = a->diag; 1683 MatScalar *aa=a->a,*v; 1684 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1685 1686 PetscFunctionBegin; 1687 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1688 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1689 t = a->solve_work; 1690 1691 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1692 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1693 1694 /* forward solve the lower triangular */ 1695 idx = 4*(*r++); 1696 t[0] = b[idx]; t[1] = b[1+idx]; 1697 t[2] = b[2+idx]; t[3] = b[3+idx]; 1698 for (i=1; i<n; i++) { 1699 v = aa + 16*ai[i]; 1700 vi = aj + ai[i]; 1701 nz = diag[i] - ai[i]; 1702 idx = 4*(*r++); 1703 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1704 while (nz--) { 1705 idx = 4*(*vi++); 1706 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1707 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1708 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1709 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1710 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1711 v += 16; 1712 } 1713 idx = 4*i; 1714 t[idx] = s1;t[1+idx] = s2; 1715 t[2+idx] = s3;t[3+idx] = s4; 1716 } 1717 /* backward solve the upper triangular */ 1718 for (i=n-1; i>=0; i--){ 1719 v = aa + 16*diag[i] + 16; 1720 vi = aj + diag[i] + 1; 1721 nz = ai[i+1] - diag[i] - 1; 1722 idt = 4*i; 1723 s1 = t[idt]; s2 = t[1+idt]; 1724 s3 = t[2+idt];s4 = t[3+idt]; 1725 while (nz--) { 1726 idx = 4*(*vi++); 1727 x1 = t[idx]; x2 = t[1+idx]; 1728 x3 = t[2+idx]; x4 = t[3+idx]; 1729 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1730 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1731 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1732 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1733 v += 16; 1734 } 1735 idc = 4*(*c--); 1736 v = aa + 16*diag[i]; 1737 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1738 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1739 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1740 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1741 } 1742 1743 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1744 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1745 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1746 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1747 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1748 PetscFunctionReturn(0); 1749 } 1750 1751 #undef __FUNCT__ 1752 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1753 int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1754 { 1755 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1756 IS iscol=a->col,isrow=a->row; 1757 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1758 int *diag = a->diag; 1759 MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1760 PetscScalar *x,*b; 1761 1762 PetscFunctionBegin; 1763 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1764 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1765 t = (MatScalar *)a->solve_work; 1766 1767 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1768 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1769 1770 /* forward solve the lower triangular */ 1771 idx = 4*(*r++); 1772 t[0] = (MatScalar)b[idx]; 1773 t[1] = (MatScalar)b[1+idx]; 1774 t[2] = (MatScalar)b[2+idx]; 1775 t[3] = (MatScalar)b[3+idx]; 1776 for (i=1; i<n; i++) { 1777 v = aa + 16*ai[i]; 1778 vi = aj + ai[i]; 1779 nz = diag[i] - ai[i]; 1780 idx = 4*(*r++); 1781 s1 = (MatScalar)b[idx]; 1782 s2 = (MatScalar)b[1+idx]; 1783 s3 = (MatScalar)b[2+idx]; 1784 s4 = (MatScalar)b[3+idx]; 1785 while (nz--) { 1786 idx = 4*(*vi++); 1787 x1 = t[idx]; 1788 x2 = t[1+idx]; 1789 x3 = t[2+idx]; 1790 x4 = t[3+idx]; 1791 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1792 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1793 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1794 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1795 v += 16; 1796 } 1797 idx = 4*i; 1798 t[idx] = s1; 1799 t[1+idx] = s2; 1800 t[2+idx] = s3; 1801 t[3+idx] = s4; 1802 } 1803 /* backward solve the upper triangular */ 1804 for (i=n-1; i>=0; i--){ 1805 v = aa + 16*diag[i] + 16; 1806 vi = aj + diag[i] + 1; 1807 nz = ai[i+1] - diag[i] - 1; 1808 idt = 4*i; 1809 s1 = t[idt]; 1810 s2 = t[1+idt]; 1811 s3 = t[2+idt]; 1812 s4 = t[3+idt]; 1813 while (nz--) { 1814 idx = 4*(*vi++); 1815 x1 = t[idx]; 1816 x2 = t[1+idx]; 1817 x3 = t[2+idx]; 1818 x4 = t[3+idx]; 1819 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1820 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1821 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1822 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1823 v += 16; 1824 } 1825 idc = 4*(*c--); 1826 v = aa + 16*diag[i]; 1827 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1828 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1829 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1830 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1831 x[idc] = (PetscScalar)t[idt]; 1832 x[1+idc] = (PetscScalar)t[1+idt]; 1833 x[2+idc] = (PetscScalar)t[2+idt]; 1834 x[3+idc] = (PetscScalar)t[3+idt]; 1835 } 1836 1837 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1838 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1839 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1840 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1841 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1842 PetscFunctionReturn(0); 1843 } 1844 1845 #if defined (PETSC_HAVE_SSE) 1846 1847 #include PETSC_HAVE_SSE 1848 1849 #undef __FUNCT__ 1850 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1851 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1852 { 1853 /* 1854 Note: This code uses demotion of double 1855 to float when performing the mixed-mode computation. 1856 This may not be numerically reasonable for all applications. 1857 */ 1858 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1859 IS iscol=a->col,isrow=a->row; 1860 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1861 int *diag = a->diag,ai16; 1862 MatScalar *aa=a->a,*v; 1863 PetscScalar *x,*b,*t; 1864 1865 /* Make space in temp stack for 16 Byte Aligned arrays */ 1866 float ssealignedspace[11],*tmps,*tmpx; 1867 unsigned long offset; 1868 1869 PetscFunctionBegin; 1870 SSE_SCOPE_BEGIN; 1871 1872 offset = (unsigned long)ssealignedspace % 16; 1873 if (offset) offset = (16 - offset)/4; 1874 tmps = &ssealignedspace[offset]; 1875 tmpx = &ssealignedspace[offset+4]; 1876 PREFETCH_NTA(aa+16*ai[1]); 1877 1878 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1879 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1880 t = a->solve_work; 1881 1882 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1883 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1884 1885 /* forward solve the lower triangular */ 1886 idx = 4*(*r++); 1887 t[0] = b[idx]; t[1] = b[1+idx]; 1888 t[2] = b[2+idx]; t[3] = b[3+idx]; 1889 v = aa + 16*ai[1]; 1890 1891 for (i=1; i<n;) { 1892 PREFETCH_NTA(&v[8]); 1893 vi = aj + ai[i]; 1894 nz = diag[i] - ai[i]; 1895 idx = 4*(*r++); 1896 1897 /* Demote sum from double to float */ 1898 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1899 LOAD_PS(tmps,XMM7); 1900 1901 while (nz--) { 1902 PREFETCH_NTA(&v[16]); 1903 idx = 4*(*vi++); 1904 1905 /* Demote solution (so far) from double to float */ 1906 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1907 1908 /* 4x4 Matrix-Vector product with negative accumulation: */ 1909 SSE_INLINE_BEGIN_2(tmpx,v) 1910 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1911 1912 /* First Column */ 1913 SSE_COPY_PS(XMM0,XMM6) 1914 SSE_SHUFFLE(XMM0,XMM0,0x00) 1915 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1916 SSE_SUB_PS(XMM7,XMM0) 1917 1918 /* Second Column */ 1919 SSE_COPY_PS(XMM1,XMM6) 1920 SSE_SHUFFLE(XMM1,XMM1,0x55) 1921 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1922 SSE_SUB_PS(XMM7,XMM1) 1923 1924 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1925 1926 /* Third Column */ 1927 SSE_COPY_PS(XMM2,XMM6) 1928 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1929 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1930 SSE_SUB_PS(XMM7,XMM2) 1931 1932 /* Fourth Column */ 1933 SSE_COPY_PS(XMM3,XMM6) 1934 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1935 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1936 SSE_SUB_PS(XMM7,XMM3) 1937 SSE_INLINE_END_2 1938 1939 v += 16; 1940 } 1941 idx = 4*i; 1942 v = aa + 16*ai[++i]; 1943 PREFETCH_NTA(v); 1944 STORE_PS(tmps,XMM7); 1945 1946 /* Promote result from float to double */ 1947 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1948 } 1949 /* backward solve the upper triangular */ 1950 idt = 4*(n-1); 1951 ai16 = 16*diag[n-1]; 1952 v = aa + ai16 + 16; 1953 for (i=n-1; i>=0;){ 1954 PREFETCH_NTA(&v[8]); 1955 vi = aj + diag[i] + 1; 1956 nz = ai[i+1] - diag[i] - 1; 1957 1958 /* Demote accumulator from double to float */ 1959 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1960 LOAD_PS(tmps,XMM7); 1961 1962 while (nz--) { 1963 PREFETCH_NTA(&v[16]); 1964 idx = 4*(*vi++); 1965 1966 /* Demote solution (so far) from double to float */ 1967 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1968 1969 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1970 SSE_INLINE_BEGIN_2(tmpx,v) 1971 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1972 1973 /* First Column */ 1974 SSE_COPY_PS(XMM0,XMM6) 1975 SSE_SHUFFLE(XMM0,XMM0,0x00) 1976 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1977 SSE_SUB_PS(XMM7,XMM0) 1978 1979 /* Second Column */ 1980 SSE_COPY_PS(XMM1,XMM6) 1981 SSE_SHUFFLE(XMM1,XMM1,0x55) 1982 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1983 SSE_SUB_PS(XMM7,XMM1) 1984 1985 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1986 1987 /* Third Column */ 1988 SSE_COPY_PS(XMM2,XMM6) 1989 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1990 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1991 SSE_SUB_PS(XMM7,XMM2) 1992 1993 /* Fourth Column */ 1994 SSE_COPY_PS(XMM3,XMM6) 1995 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1996 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1997 SSE_SUB_PS(XMM7,XMM3) 1998 SSE_INLINE_END_2 1999 v += 16; 2000 } 2001 v = aa + ai16; 2002 ai16 = 16*diag[--i]; 2003 PREFETCH_NTA(aa+ai16+16); 2004 /* 2005 Scale the result by the diagonal 4x4 block, 2006 which was inverted as part of the factorization 2007 */ 2008 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2009 /* First Column */ 2010 SSE_COPY_PS(XMM0,XMM7) 2011 SSE_SHUFFLE(XMM0,XMM0,0x00) 2012 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2013 2014 /* Second Column */ 2015 SSE_COPY_PS(XMM1,XMM7) 2016 SSE_SHUFFLE(XMM1,XMM1,0x55) 2017 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2018 SSE_ADD_PS(XMM0,XMM1) 2019 2020 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2021 2022 /* Third Column */ 2023 SSE_COPY_PS(XMM2,XMM7) 2024 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2025 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2026 SSE_ADD_PS(XMM0,XMM2) 2027 2028 /* Fourth Column */ 2029 SSE_COPY_PS(XMM3,XMM7) 2030 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2031 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2032 SSE_ADD_PS(XMM0,XMM3) 2033 2034 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2035 SSE_INLINE_END_3 2036 2037 /* Promote solution from float to double */ 2038 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2039 2040 /* Apply reordering to t and stream into x. */ 2041 /* This way, x doesn't pollute the cache. */ 2042 /* Be careful with size: 2 doubles = 4 floats! */ 2043 idc = 4*(*c--); 2044 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2045 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2046 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2047 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2048 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2049 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2050 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2051 SSE_INLINE_END_2 2052 v = aa + ai16 + 16; 2053 idt -= 4; 2054 } 2055 2056 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2057 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2058 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2059 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2060 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2061 SSE_SCOPE_END; 2062 PetscFunctionReturn(0); 2063 } 2064 2065 #endif 2066 2067 2068 /* 2069 Special case where the matrix was ILU(0) factored in the natural 2070 ordering. This eliminates the need for the column and row permutation. 2071 */ 2072 #undef __FUNCT__ 2073 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2074 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2075 { 2076 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2077 int n=a->mbs,*ai=a->i,*aj=a->j; 2078 int ierr,*diag = a->diag; 2079 MatScalar *aa=a->a; 2080 PetscScalar *x,*b; 2081 2082 PetscFunctionBegin; 2083 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2084 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2085 2086 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2087 { 2088 static PetscScalar w[2000]; /* very BAD need to fix */ 2089 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2090 } 2091 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2092 { 2093 static PetscScalar w[2000]; /* very BAD need to fix */ 2094 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2095 } 2096 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2097 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2098 #else 2099 { 2100 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2101 MatScalar *v; 2102 int jdx,idt,idx,nz,*vi,i,ai16; 2103 2104 /* forward solve the lower triangular */ 2105 idx = 0; 2106 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2107 for (i=1; i<n; i++) { 2108 v = aa + 16*ai[i]; 2109 vi = aj + ai[i]; 2110 nz = diag[i] - ai[i]; 2111 idx += 4; 2112 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2113 while (nz--) { 2114 jdx = 4*(*vi++); 2115 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2116 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2117 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2118 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2119 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2120 v += 16; 2121 } 2122 x[idx] = s1; 2123 x[1+idx] = s2; 2124 x[2+idx] = s3; 2125 x[3+idx] = s4; 2126 } 2127 /* backward solve the upper triangular */ 2128 idt = 4*(n-1); 2129 for (i=n-1; i>=0; i--){ 2130 ai16 = 16*diag[i]; 2131 v = aa + ai16 + 16; 2132 vi = aj + diag[i] + 1; 2133 nz = ai[i+1] - diag[i] - 1; 2134 s1 = x[idt]; s2 = x[1+idt]; 2135 s3 = x[2+idt];s4 = x[3+idt]; 2136 while (nz--) { 2137 idx = 4*(*vi++); 2138 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2139 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2140 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2141 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2142 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2143 v += 16; 2144 } 2145 v = aa + ai16; 2146 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2147 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2148 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2149 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2150 idt -= 4; 2151 } 2152 } 2153 #endif 2154 2155 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2156 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2157 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2158 PetscFunctionReturn(0); 2159 } 2160 2161 #undef __FUNCT__ 2162 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2163 int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2164 { 2165 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2166 int n=a->mbs,*ai=a->i,*aj=a->j; 2167 int ierr,*diag = a->diag; 2168 MatScalar *aa=a->a; 2169 PetscScalar *x,*b; 2170 2171 PetscFunctionBegin; 2172 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2173 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2174 2175 { 2176 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2177 MatScalar *v,*t=(MatScalar *)x; 2178 int jdx,idt,idx,nz,*vi,i,ai16; 2179 2180 /* forward solve the lower triangular */ 2181 idx = 0; 2182 t[0] = (MatScalar)b[0]; 2183 t[1] = (MatScalar)b[1]; 2184 t[2] = (MatScalar)b[2]; 2185 t[3] = (MatScalar)b[3]; 2186 for (i=1; i<n; i++) { 2187 v = aa + 16*ai[i]; 2188 vi = aj + ai[i]; 2189 nz = diag[i] - ai[i]; 2190 idx += 4; 2191 s1 = (MatScalar)b[idx]; 2192 s2 = (MatScalar)b[1+idx]; 2193 s3 = (MatScalar)b[2+idx]; 2194 s4 = (MatScalar)b[3+idx]; 2195 while (nz--) { 2196 jdx = 4*(*vi++); 2197 x1 = t[jdx]; 2198 x2 = t[1+jdx]; 2199 x3 = t[2+jdx]; 2200 x4 = t[3+jdx]; 2201 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2202 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2203 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2204 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2205 v += 16; 2206 } 2207 t[idx] = s1; 2208 t[1+idx] = s2; 2209 t[2+idx] = s3; 2210 t[3+idx] = s4; 2211 } 2212 /* backward solve the upper triangular */ 2213 idt = 4*(n-1); 2214 for (i=n-1; i>=0; i--){ 2215 ai16 = 16*diag[i]; 2216 v = aa + ai16 + 16; 2217 vi = aj + diag[i] + 1; 2218 nz = ai[i+1] - diag[i] - 1; 2219 s1 = t[idt]; 2220 s2 = t[1+idt]; 2221 s3 = t[2+idt]; 2222 s4 = t[3+idt]; 2223 while (nz--) { 2224 idx = 4*(*vi++); 2225 x1 = (MatScalar)x[idx]; 2226 x2 = (MatScalar)x[1+idx]; 2227 x3 = (MatScalar)x[2+idx]; 2228 x4 = (MatScalar)x[3+idx]; 2229 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2230 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2231 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2232 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2233 v += 16; 2234 } 2235 v = aa + ai16; 2236 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2237 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2238 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2239 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2240 idt -= 4; 2241 } 2242 } 2243 2244 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2245 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2246 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2247 PetscFunctionReturn(0); 2248 } 2249 2250 #if defined (PETSC_HAVE_SSE) 2251 2252 #include PETSC_HAVE_SSE 2253 #undef __FUNCT__ 2254 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2255 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2256 { 2257 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2258 unsigned short *aj=(unsigned short *)a->j; 2259 int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 2260 MatScalar *aa=a->a; 2261 PetscScalar *x,*b; 2262 2263 PetscFunctionBegin; 2264 SSE_SCOPE_BEGIN; 2265 /* 2266 Note: This code currently uses demotion of double 2267 to float when performing the mixed-mode computation. 2268 This may not be numerically reasonable for all applications. 2269 */ 2270 PREFETCH_NTA(aa+16*ai[1]); 2271 2272 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2273 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2274 { 2275 /* x will first be computed in single precision then promoted inplace to double */ 2276 MatScalar *v,*t=(MatScalar *)x; 2277 int nz,i,idt,ai16; 2278 unsigned int jdx,idx; 2279 unsigned short *vi; 2280 /* Forward solve the lower triangular factor. */ 2281 2282 /* First block is the identity. */ 2283 idx = 0; 2284 CONVERT_DOUBLE4_FLOAT4(t,b); 2285 v = aa + 16*((unsigned int)ai[1]); 2286 2287 for (i=1; i<n;) { 2288 PREFETCH_NTA(&v[8]); 2289 vi = aj + ai[i]; 2290 nz = diag[i] - ai[i]; 2291 idx += 4; 2292 2293 /* Demote RHS from double to float. */ 2294 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2295 LOAD_PS(&t[idx],XMM7); 2296 2297 while (nz--) { 2298 PREFETCH_NTA(&v[16]); 2299 jdx = 4*((unsigned int)(*vi++)); 2300 2301 /* 4x4 Matrix-Vector product with negative accumulation: */ 2302 SSE_INLINE_BEGIN_2(&t[jdx],v) 2303 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2304 2305 /* First Column */ 2306 SSE_COPY_PS(XMM0,XMM6) 2307 SSE_SHUFFLE(XMM0,XMM0,0x00) 2308 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2309 SSE_SUB_PS(XMM7,XMM0) 2310 2311 /* Second Column */ 2312 SSE_COPY_PS(XMM1,XMM6) 2313 SSE_SHUFFLE(XMM1,XMM1,0x55) 2314 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2315 SSE_SUB_PS(XMM7,XMM1) 2316 2317 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2318 2319 /* Third Column */ 2320 SSE_COPY_PS(XMM2,XMM6) 2321 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2322 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2323 SSE_SUB_PS(XMM7,XMM2) 2324 2325 /* Fourth Column */ 2326 SSE_COPY_PS(XMM3,XMM6) 2327 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2328 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2329 SSE_SUB_PS(XMM7,XMM3) 2330 SSE_INLINE_END_2 2331 2332 v += 16; 2333 } 2334 v = aa + 16*ai[++i]; 2335 PREFETCH_NTA(v); 2336 STORE_PS(&t[idx],XMM7); 2337 } 2338 2339 /* Backward solve the upper triangular factor.*/ 2340 2341 idt = 4*(n-1); 2342 ai16 = 16*diag[n-1]; 2343 v = aa + ai16 + 16; 2344 for (i=n-1; i>=0;){ 2345 PREFETCH_NTA(&v[8]); 2346 vi = aj + diag[i] + 1; 2347 nz = ai[i+1] - diag[i] - 1; 2348 2349 LOAD_PS(&t[idt],XMM7); 2350 2351 while (nz--) { 2352 PREFETCH_NTA(&v[16]); 2353 idx = 4*((unsigned int)(*vi++)); 2354 2355 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2356 SSE_INLINE_BEGIN_2(&t[idx],v) 2357 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2358 2359 /* First Column */ 2360 SSE_COPY_PS(XMM0,XMM6) 2361 SSE_SHUFFLE(XMM0,XMM0,0x00) 2362 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2363 SSE_SUB_PS(XMM7,XMM0) 2364 2365 /* Second Column */ 2366 SSE_COPY_PS(XMM1,XMM6) 2367 SSE_SHUFFLE(XMM1,XMM1,0x55) 2368 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2369 SSE_SUB_PS(XMM7,XMM1) 2370 2371 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2372 2373 /* Third Column */ 2374 SSE_COPY_PS(XMM2,XMM6) 2375 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2376 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2377 SSE_SUB_PS(XMM7,XMM2) 2378 2379 /* Fourth Column */ 2380 SSE_COPY_PS(XMM3,XMM6) 2381 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2382 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2383 SSE_SUB_PS(XMM7,XMM3) 2384 SSE_INLINE_END_2 2385 v += 16; 2386 } 2387 v = aa + ai16; 2388 ai16 = 16*diag[--i]; 2389 PREFETCH_NTA(aa+ai16+16); 2390 /* 2391 Scale the result by the diagonal 4x4 block, 2392 which was inverted as part of the factorization 2393 */ 2394 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2395 /* First Column */ 2396 SSE_COPY_PS(XMM0,XMM7) 2397 SSE_SHUFFLE(XMM0,XMM0,0x00) 2398 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2399 2400 /* Second Column */ 2401 SSE_COPY_PS(XMM1,XMM7) 2402 SSE_SHUFFLE(XMM1,XMM1,0x55) 2403 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2404 SSE_ADD_PS(XMM0,XMM1) 2405 2406 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2407 2408 /* Third Column */ 2409 SSE_COPY_PS(XMM2,XMM7) 2410 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2411 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2412 SSE_ADD_PS(XMM0,XMM2) 2413 2414 /* Fourth Column */ 2415 SSE_COPY_PS(XMM3,XMM7) 2416 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2417 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2418 SSE_ADD_PS(XMM0,XMM3) 2419 2420 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2421 SSE_INLINE_END_3 2422 2423 v = aa + ai16 + 16; 2424 idt -= 4; 2425 } 2426 2427 /* Convert t from single precision back to double precision (inplace)*/ 2428 idt = 4*(n-1); 2429 for (i=n-1;i>=0;i--) { 2430 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2431 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2432 PetscScalar *xtemp=&x[idt]; 2433 MatScalar *ttemp=&t[idt]; 2434 xtemp[3] = (PetscScalar)ttemp[3]; 2435 xtemp[2] = (PetscScalar)ttemp[2]; 2436 xtemp[1] = (PetscScalar)ttemp[1]; 2437 xtemp[0] = (PetscScalar)ttemp[0]; 2438 idt -= 4; 2439 } 2440 2441 } /* End of artificial scope. */ 2442 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2443 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2444 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2445 SSE_SCOPE_END; 2446 PetscFunctionReturn(0); 2447 } 2448 2449 #undef __FUNCT__ 2450 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2451 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2452 { 2453 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2454 int *aj=a->j; 2455 int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 2456 MatScalar *aa=a->a; 2457 PetscScalar *x,*b; 2458 2459 PetscFunctionBegin; 2460 SSE_SCOPE_BEGIN; 2461 /* 2462 Note: This code currently uses demotion of double 2463 to float when performing the mixed-mode computation. 2464 This may not be numerically reasonable for all applications. 2465 */ 2466 PREFETCH_NTA(aa+16*ai[1]); 2467 2468 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2469 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2470 { 2471 /* x will first be computed in single precision then promoted inplace to double */ 2472 MatScalar *v,*t=(MatScalar *)x; 2473 int nz,i,idt,ai16; 2474 int jdx,idx; 2475 int *vi; 2476 /* Forward solve the lower triangular factor. */ 2477 2478 /* First block is the identity. */ 2479 idx = 0; 2480 CONVERT_DOUBLE4_FLOAT4(t,b); 2481 v = aa + 16*ai[1]; 2482 2483 for (i=1; i<n;) { 2484 PREFETCH_NTA(&v[8]); 2485 vi = aj + ai[i]; 2486 nz = diag[i] - ai[i]; 2487 idx += 4; 2488 2489 /* Demote RHS from double to float. */ 2490 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2491 LOAD_PS(&t[idx],XMM7); 2492 2493 while (nz--) { 2494 PREFETCH_NTA(&v[16]); 2495 jdx = 4*(*vi++); 2496 /* jdx = *vi++; */ 2497 2498 /* 4x4 Matrix-Vector product with negative accumulation: */ 2499 SSE_INLINE_BEGIN_2(&t[jdx],v) 2500 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2501 2502 /* First Column */ 2503 SSE_COPY_PS(XMM0,XMM6) 2504 SSE_SHUFFLE(XMM0,XMM0,0x00) 2505 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2506 SSE_SUB_PS(XMM7,XMM0) 2507 2508 /* Second Column */ 2509 SSE_COPY_PS(XMM1,XMM6) 2510 SSE_SHUFFLE(XMM1,XMM1,0x55) 2511 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2512 SSE_SUB_PS(XMM7,XMM1) 2513 2514 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2515 2516 /* Third Column */ 2517 SSE_COPY_PS(XMM2,XMM6) 2518 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2519 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2520 SSE_SUB_PS(XMM7,XMM2) 2521 2522 /* Fourth Column */ 2523 SSE_COPY_PS(XMM3,XMM6) 2524 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2525 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2526 SSE_SUB_PS(XMM7,XMM3) 2527 SSE_INLINE_END_2 2528 2529 v += 16; 2530 } 2531 v = aa + 16*ai[++i]; 2532 PREFETCH_NTA(v); 2533 STORE_PS(&t[idx],XMM7); 2534 } 2535 2536 /* Backward solve the upper triangular factor.*/ 2537 2538 idt = 4*(n-1); 2539 ai16 = 16*diag[n-1]; 2540 v = aa + ai16 + 16; 2541 for (i=n-1; i>=0;){ 2542 PREFETCH_NTA(&v[8]); 2543 vi = aj + diag[i] + 1; 2544 nz = ai[i+1] - diag[i] - 1; 2545 2546 LOAD_PS(&t[idt],XMM7); 2547 2548 while (nz--) { 2549 PREFETCH_NTA(&v[16]); 2550 idx = 4*(*vi++); 2551 /* idx = *vi++; */ 2552 2553 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2554 SSE_INLINE_BEGIN_2(&t[idx],v) 2555 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2556 2557 /* First Column */ 2558 SSE_COPY_PS(XMM0,XMM6) 2559 SSE_SHUFFLE(XMM0,XMM0,0x00) 2560 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2561 SSE_SUB_PS(XMM7,XMM0) 2562 2563 /* Second Column */ 2564 SSE_COPY_PS(XMM1,XMM6) 2565 SSE_SHUFFLE(XMM1,XMM1,0x55) 2566 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2567 SSE_SUB_PS(XMM7,XMM1) 2568 2569 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2570 2571 /* Third Column */ 2572 SSE_COPY_PS(XMM2,XMM6) 2573 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2574 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2575 SSE_SUB_PS(XMM7,XMM2) 2576 2577 /* Fourth Column */ 2578 SSE_COPY_PS(XMM3,XMM6) 2579 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2580 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2581 SSE_SUB_PS(XMM7,XMM3) 2582 SSE_INLINE_END_2 2583 v += 16; 2584 } 2585 v = aa + ai16; 2586 ai16 = 16*diag[--i]; 2587 PREFETCH_NTA(aa+ai16+16); 2588 /* 2589 Scale the result by the diagonal 4x4 block, 2590 which was inverted as part of the factorization 2591 */ 2592 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2593 /* First Column */ 2594 SSE_COPY_PS(XMM0,XMM7) 2595 SSE_SHUFFLE(XMM0,XMM0,0x00) 2596 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2597 2598 /* Second Column */ 2599 SSE_COPY_PS(XMM1,XMM7) 2600 SSE_SHUFFLE(XMM1,XMM1,0x55) 2601 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2602 SSE_ADD_PS(XMM0,XMM1) 2603 2604 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2605 2606 /* Third Column */ 2607 SSE_COPY_PS(XMM2,XMM7) 2608 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2609 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2610 SSE_ADD_PS(XMM0,XMM2) 2611 2612 /* Fourth Column */ 2613 SSE_COPY_PS(XMM3,XMM7) 2614 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2615 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2616 SSE_ADD_PS(XMM0,XMM3) 2617 2618 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2619 SSE_INLINE_END_3 2620 2621 v = aa + ai16 + 16; 2622 idt -= 4; 2623 } 2624 2625 /* Convert t from single precision back to double precision (inplace)*/ 2626 idt = 4*(n-1); 2627 for (i=n-1;i>=0;i--) { 2628 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2629 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2630 PetscScalar *xtemp=&x[idt]; 2631 MatScalar *ttemp=&t[idt]; 2632 xtemp[3] = (PetscScalar)ttemp[3]; 2633 xtemp[2] = (PetscScalar)ttemp[2]; 2634 xtemp[1] = (PetscScalar)ttemp[1]; 2635 xtemp[0] = (PetscScalar)ttemp[0]; 2636 idt -= 4; 2637 } 2638 2639 } /* End of artificial scope. */ 2640 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2641 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2642 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2643 SSE_SCOPE_END; 2644 PetscFunctionReturn(0); 2645 } 2646 2647 #endif 2648 #undef __FUNCT__ 2649 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2650 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2651 { 2652 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2653 IS iscol=a->col,isrow=a->row; 2654 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2655 int *diag = a->diag; 2656 MatScalar *aa=a->a,*v; 2657 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2658 2659 PetscFunctionBegin; 2660 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2661 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2662 t = a->solve_work; 2663 2664 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2665 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2666 2667 /* forward solve the lower triangular */ 2668 idx = 3*(*r++); 2669 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2670 for (i=1; i<n; i++) { 2671 v = aa + 9*ai[i]; 2672 vi = aj + ai[i]; 2673 nz = diag[i] - ai[i]; 2674 idx = 3*(*r++); 2675 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2676 while (nz--) { 2677 idx = 3*(*vi++); 2678 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2679 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2680 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2681 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2682 v += 9; 2683 } 2684 idx = 3*i; 2685 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2686 } 2687 /* backward solve the upper triangular */ 2688 for (i=n-1; i>=0; i--){ 2689 v = aa + 9*diag[i] + 9; 2690 vi = aj + diag[i] + 1; 2691 nz = ai[i+1] - diag[i] - 1; 2692 idt = 3*i; 2693 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2694 while (nz--) { 2695 idx = 3*(*vi++); 2696 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2697 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2698 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2699 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2700 v += 9; 2701 } 2702 idc = 3*(*c--); 2703 v = aa + 9*diag[i]; 2704 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2705 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2706 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2707 } 2708 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2709 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2710 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2711 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2712 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2713 PetscFunctionReturn(0); 2714 } 2715 2716 /* 2717 Special case where the matrix was ILU(0) factored in the natural 2718 ordering. This eliminates the need for the column and row permutation. 2719 */ 2720 #undef __FUNCT__ 2721 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2722 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2723 { 2724 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2725 int n=a->mbs,*ai=a->i,*aj=a->j; 2726 int ierr,*diag = a->diag; 2727 MatScalar *aa=a->a,*v; 2728 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2729 int jdx,idt,idx,nz,*vi,i; 2730 2731 PetscFunctionBegin; 2732 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2733 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2734 2735 2736 /* forward solve the lower triangular */ 2737 idx = 0; 2738 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2739 for (i=1; i<n; i++) { 2740 v = aa + 9*ai[i]; 2741 vi = aj + ai[i]; 2742 nz = diag[i] - ai[i]; 2743 idx += 3; 2744 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2745 while (nz--) { 2746 jdx = 3*(*vi++); 2747 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2748 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2749 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2750 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2751 v += 9; 2752 } 2753 x[idx] = s1; 2754 x[1+idx] = s2; 2755 x[2+idx] = s3; 2756 } 2757 /* backward solve the upper triangular */ 2758 for (i=n-1; i>=0; i--){ 2759 v = aa + 9*diag[i] + 9; 2760 vi = aj + diag[i] + 1; 2761 nz = ai[i+1] - diag[i] - 1; 2762 idt = 3*i; 2763 s1 = x[idt]; s2 = x[1+idt]; 2764 s3 = x[2+idt]; 2765 while (nz--) { 2766 idx = 3*(*vi++); 2767 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2768 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2769 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2770 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2771 v += 9; 2772 } 2773 v = aa + 9*diag[i]; 2774 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2775 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2776 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2777 } 2778 2779 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2780 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2781 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2782 PetscFunctionReturn(0); 2783 } 2784 2785 #undef __FUNCT__ 2786 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2787 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2788 { 2789 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2790 IS iscol=a->col,isrow=a->row; 2791 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2792 int *diag = a->diag; 2793 MatScalar *aa=a->a,*v; 2794 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2795 2796 PetscFunctionBegin; 2797 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2799 t = a->solve_work; 2800 2801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2803 2804 /* forward solve the lower triangular */ 2805 idx = 2*(*r++); 2806 t[0] = b[idx]; t[1] = b[1+idx]; 2807 for (i=1; i<n; i++) { 2808 v = aa + 4*ai[i]; 2809 vi = aj + ai[i]; 2810 nz = diag[i] - ai[i]; 2811 idx = 2*(*r++); 2812 s1 = b[idx]; s2 = b[1+idx]; 2813 while (nz--) { 2814 idx = 2*(*vi++); 2815 x1 = t[idx]; x2 = t[1+idx]; 2816 s1 -= v[0]*x1 + v[2]*x2; 2817 s2 -= v[1]*x1 + v[3]*x2; 2818 v += 4; 2819 } 2820 idx = 2*i; 2821 t[idx] = s1; t[1+idx] = s2; 2822 } 2823 /* backward solve the upper triangular */ 2824 for (i=n-1; i>=0; i--){ 2825 v = aa + 4*diag[i] + 4; 2826 vi = aj + diag[i] + 1; 2827 nz = ai[i+1] - diag[i] - 1; 2828 idt = 2*i; 2829 s1 = t[idt]; s2 = t[1+idt]; 2830 while (nz--) { 2831 idx = 2*(*vi++); 2832 x1 = t[idx]; x2 = t[1+idx]; 2833 s1 -= v[0]*x1 + v[2]*x2; 2834 s2 -= v[1]*x1 + v[3]*x2; 2835 v += 4; 2836 } 2837 idc = 2*(*c--); 2838 v = aa + 4*diag[i]; 2839 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2840 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2841 } 2842 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2843 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2844 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2845 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2846 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2847 PetscFunctionReturn(0); 2848 } 2849 2850 /* 2851 Special case where the matrix was ILU(0) factored in the natural 2852 ordering. This eliminates the need for the column and row permutation. 2853 */ 2854 #undef __FUNCT__ 2855 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2856 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2857 { 2858 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2859 int n=a->mbs,*ai=a->i,*aj=a->j; 2860 int ierr,*diag = a->diag; 2861 MatScalar *aa=a->a,*v; 2862 PetscScalar *x,*b,s1,s2,x1,x2; 2863 int jdx,idt,idx,nz,*vi,i; 2864 2865 PetscFunctionBegin; 2866 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2867 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2868 2869 /* forward solve the lower triangular */ 2870 idx = 0; 2871 x[0] = b[0]; x[1] = b[1]; 2872 for (i=1; i<n; i++) { 2873 v = aa + 4*ai[i]; 2874 vi = aj + ai[i]; 2875 nz = diag[i] - ai[i]; 2876 idx += 2; 2877 s1 = b[idx];s2 = b[1+idx]; 2878 while (nz--) { 2879 jdx = 2*(*vi++); 2880 x1 = x[jdx];x2 = x[1+jdx]; 2881 s1 -= v[0]*x1 + v[2]*x2; 2882 s2 -= v[1]*x1 + v[3]*x2; 2883 v += 4; 2884 } 2885 x[idx] = s1; 2886 x[1+idx] = s2; 2887 } 2888 /* backward solve the upper triangular */ 2889 for (i=n-1; i>=0; i--){ 2890 v = aa + 4*diag[i] + 4; 2891 vi = aj + diag[i] + 1; 2892 nz = ai[i+1] - diag[i] - 1; 2893 idt = 2*i; 2894 s1 = x[idt]; s2 = x[1+idt]; 2895 while (nz--) { 2896 idx = 2*(*vi++); 2897 x1 = x[idx]; x2 = x[1+idx]; 2898 s1 -= v[0]*x1 + v[2]*x2; 2899 s2 -= v[1]*x1 + v[3]*x2; 2900 v += 4; 2901 } 2902 v = aa + 4*diag[i]; 2903 x[idt] = v[0]*s1 + v[2]*s2; 2904 x[1+idt] = v[1]*s1 + v[3]*s2; 2905 } 2906 2907 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2908 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2909 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2910 PetscFunctionReturn(0); 2911 } 2912 2913 #undef __FUNCT__ 2914 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2915 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2916 { 2917 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2918 IS iscol=a->col,isrow=a->row; 2919 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2920 int *diag = a->diag; 2921 MatScalar *aa=a->a,*v; 2922 PetscScalar *x,*b,s1,*t; 2923 2924 PetscFunctionBegin; 2925 if (!n) PetscFunctionReturn(0); 2926 2927 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2928 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2929 t = a->solve_work; 2930 2931 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2932 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2933 2934 /* forward solve the lower triangular */ 2935 t[0] = b[*r++]; 2936 for (i=1; i<n; i++) { 2937 v = aa + ai[i]; 2938 vi = aj + ai[i]; 2939 nz = diag[i] - ai[i]; 2940 s1 = b[*r++]; 2941 while (nz--) { 2942 s1 -= (*v++)*t[*vi++]; 2943 } 2944 t[i] = s1; 2945 } 2946 /* backward solve the upper triangular */ 2947 for (i=n-1; i>=0; i--){ 2948 v = aa + diag[i] + 1; 2949 vi = aj + diag[i] + 1; 2950 nz = ai[i+1] - diag[i] - 1; 2951 s1 = t[i]; 2952 while (nz--) { 2953 s1 -= (*v++)*t[*vi++]; 2954 } 2955 x[*c--] = t[i] = aa[diag[i]]*s1; 2956 } 2957 2958 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2959 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2960 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2961 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2962 PetscLogFlops(2*1*(a->nz) - A->n); 2963 PetscFunctionReturn(0); 2964 } 2965 /* 2966 Special case where the matrix was ILU(0) factored in the natural 2967 ordering. This eliminates the need for the column and row permutation. 2968 */ 2969 #undef __FUNCT__ 2970 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 2971 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2972 { 2973 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2974 int n=a->mbs,*ai=a->i,*aj=a->j; 2975 int ierr,*diag = a->diag; 2976 MatScalar *aa=a->a; 2977 PetscScalar *x,*b; 2978 PetscScalar s1,x1; 2979 MatScalar *v; 2980 int jdx,idt,idx,nz,*vi,i; 2981 2982 PetscFunctionBegin; 2983 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2984 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2985 2986 /* forward solve the lower triangular */ 2987 idx = 0; 2988 x[0] = b[0]; 2989 for (i=1; i<n; i++) { 2990 v = aa + ai[i]; 2991 vi = aj + ai[i]; 2992 nz = diag[i] - ai[i]; 2993 idx += 1; 2994 s1 = b[idx]; 2995 while (nz--) { 2996 jdx = *vi++; 2997 x1 = x[jdx]; 2998 s1 -= v[0]*x1; 2999 v += 1; 3000 } 3001 x[idx] = s1; 3002 } 3003 /* backward solve the upper triangular */ 3004 for (i=n-1; i>=0; i--){ 3005 v = aa + diag[i] + 1; 3006 vi = aj + diag[i] + 1; 3007 nz = ai[i+1] - diag[i] - 1; 3008 idt = i; 3009 s1 = x[idt]; 3010 while (nz--) { 3011 idx = *vi++; 3012 x1 = x[idx]; 3013 s1 -= v[0]*x1; 3014 v += 1; 3015 } 3016 v = aa + diag[i]; 3017 x[idt] = v[0]*s1; 3018 } 3019 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3020 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3021 PetscLogFlops(2*(a->nz) - A->n); 3022 PetscFunctionReturn(0); 3023 } 3024 3025 /* ----------------------------------------------------------------*/ 3026 /* 3027 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3028 except that the data structure of Mat_SeqAIJ is slightly different. 3029 Not a good example of code reuse. 3030 */ 3031 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 3032 3033 #undef __FUNCT__ 3034 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3035 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 3036 { 3037 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3038 IS isicol; 3039 int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3040 int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3041 int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0; 3042 int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 3043 PetscTruth col_identity,row_identity; 3044 PetscReal f; 3045 3046 PetscFunctionBegin; 3047 f = info->fill; 3048 levels = (int)info->levels; 3049 diagonal_fill = (int)info->diagonal_fill; 3050 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3051 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3052 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3053 3054 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3055 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3056 (*fact)->factor = FACTOR_LU; 3057 b = (Mat_SeqBAIJ*)(*fact)->data; 3058 if (!b->diag) { 3059 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3060 } 3061 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3062 b->row = isrow; 3063 b->col = iscol; 3064 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3065 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3066 b->icol = isicol; 3067 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3068 ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3069 } else { /* general case perform the symbolic factorization */ 3070 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3071 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3072 3073 /* get new row pointers */ 3074 ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 3075 ainew[0] = 0; 3076 /* don't know how many column pointers are needed so estimate */ 3077 jmax = (int)(f*ai[n] + 1); 3078 ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 3079 /* ajfill is level of fill for each fill entry */ 3080 ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 3081 /* fill is a linked list of nonzeros in active row */ 3082 ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 3083 /* im is level for each filled value */ 3084 ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 3085 /* dloc is location of diagonal in factor */ 3086 ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 3087 dloc[0] = 0; 3088 for (prow=0; prow<n; prow++) { 3089 3090 /* copy prow into linked list */ 3091 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3092 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3093 xi = aj + ai[r[prow]]; 3094 fill[n] = n; 3095 fill[prow] = -1; /* marker for diagonal entry */ 3096 while (nz--) { 3097 fm = n; 3098 idx = ic[*xi++]; 3099 do { 3100 m = fm; 3101 fm = fill[m]; 3102 } while (fm < idx); 3103 fill[m] = idx; 3104 fill[idx] = fm; 3105 im[idx] = 0; 3106 } 3107 3108 /* make sure diagonal entry is included */ 3109 if (diagonal_fill && fill[prow] == -1) { 3110 fm = n; 3111 while (fill[fm] < prow) fm = fill[fm]; 3112 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3113 fill[fm] = prow; 3114 im[prow] = 0; 3115 nzf++; 3116 dcount++; 3117 } 3118 3119 nzi = 0; 3120 row = fill[n]; 3121 while (row < prow) { 3122 incrlev = im[row] + 1; 3123 nz = dloc[row]; 3124 xi = ajnew + ainew[row] + nz + 1; 3125 flev = ajfill + ainew[row] + nz + 1; 3126 nnz = ainew[row+1] - ainew[row] - nz - 1; 3127 fm = row; 3128 while (nnz-- > 0) { 3129 idx = *xi++; 3130 if (*flev + incrlev > levels) { 3131 flev++; 3132 continue; 3133 } 3134 do { 3135 m = fm; 3136 fm = fill[m]; 3137 } while (fm < idx); 3138 if (fm != idx) { 3139 im[idx] = *flev + incrlev; 3140 fill[m] = idx; 3141 fill[idx] = fm; 3142 fm = idx; 3143 nzf++; 3144 } else { 3145 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3146 } 3147 flev++; 3148 } 3149 row = fill[row]; 3150 nzi++; 3151 } 3152 /* copy new filled row into permanent storage */ 3153 ainew[prow+1] = ainew[prow] + nzf; 3154 if (ainew[prow+1] > jmax) { 3155 3156 /* estimate how much additional space we will need */ 3157 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3158 /* just double the memory each time */ 3159 int maxadd = jmax; 3160 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3161 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3162 jmax += maxadd; 3163 3164 /* allocate a longer ajnew and ajfill */ 3165 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3166 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3167 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3168 ajnew = xi; 3169 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3170 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3171 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3172 ajfill = xi; 3173 reallocate++; /* count how many reallocations are needed */ 3174 } 3175 xi = ajnew + ainew[prow]; 3176 flev = ajfill + ainew[prow]; 3177 dloc[prow] = nzi; 3178 fm = fill[n]; 3179 while (nzf--) { 3180 *xi++ = fm; 3181 *flev++ = im[fm]; 3182 fm = fill[fm]; 3183 } 3184 /* make sure row has diagonal entry */ 3185 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3186 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 3187 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 3188 } 3189 } 3190 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3191 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3192 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3193 ierr = PetscFree(fill);CHKERRQ(ierr); 3194 ierr = PetscFree(im);CHKERRQ(ierr); 3195 3196 { 3197 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3198 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af); 3199 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3200 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3201 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3202 if (diagonal_fill) { 3203 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount); 3204 } 3205 } 3206 3207 /* put together the new matrix */ 3208 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 3209 PetscLogObjectParent(*fact,isicol); 3210 b = (Mat_SeqBAIJ*)(*fact)->data; 3211 ierr = PetscFree(b->imax);CHKERRQ(ierr); 3212 b->singlemalloc = PETSC_FALSE; 3213 len = bs2*ainew[n]*sizeof(MatScalar); 3214 /* the next line frees the default space generated by the Create() */ 3215 ierr = PetscFree(b->a);CHKERRQ(ierr); 3216 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 3217 ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 3218 b->j = ajnew; 3219 b->i = ainew; 3220 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3221 b->diag = dloc; 3222 b->ilen = 0; 3223 b->imax = 0; 3224 b->row = isrow; 3225 b->col = iscol; 3226 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3227 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3228 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3229 b->icol = isicol; 3230 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3231 /* In b structure: Free imax, ilen, old a, old j. 3232 Allocate dloc, solve_work, new a, new j */ 3233 PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 3234 b->maxnz = b->nz = ainew[n]; 3235 (*fact)->factor = FACTOR_LU; 3236 3237 (*fact)->info.factor_mallocs = reallocate; 3238 (*fact)->info.fill_ratio_given = f; 3239 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3240 } 3241 3242 if (row_identity && col_identity) { 3243 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3244 } 3245 PetscFunctionReturn(0); 3246 } 3247 3248 #undef __FUNCT__ 3249 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3250 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3251 { 3252 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3253 /* int i,*AJ=a->j,nz=a->nz; */ 3254 PetscFunctionBegin; 3255 /* Undo Column scaling */ 3256 /* while (nz--) { */ 3257 /* AJ[i] = AJ[i]/4; */ 3258 /* } */ 3259 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3260 A->ops->setunfactored = PETSC_NULL; 3261 PetscFunctionReturn(0); 3262 } 3263 3264 #undef __FUNCT__ 3265 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3266 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3267 { 3268 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3269 int *AJ=a->j,nz=a->nz; 3270 unsigned short *aj=(unsigned short *)AJ; 3271 PetscFunctionBegin; 3272 /* Is this really necessary? */ 3273 while (nz--) { 3274 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3275 } 3276 A->ops->setunfactored = PETSC_NULL; 3277 PetscFunctionReturn(0); 3278 } 3279 3280 #undef __FUNCT__ 3281 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3282 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3283 { 3284 /* 3285 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3286 with natural ordering 3287 */ 3288 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3289 3290 PetscFunctionBegin; 3291 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3292 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3293 switch (a->bs) { 3294 case 1: 3295 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3296 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3297 break; 3298 case 2: 3299 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3300 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3301 break; 3302 case 3: 3303 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3304 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3305 break; 3306 case 4: 3307 #if defined(PETSC_USE_MAT_SINGLE) 3308 { 3309 PetscTruth sse_enabled_local; 3310 int ierr; 3311 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3312 if (sse_enabled_local) { 3313 # if defined(PETSC_HAVE_SSE) 3314 int i,*AJ=a->j,nz=a->nz,n=a->mbs; 3315 if (n==(unsigned short)n) { 3316 unsigned short *aj=(unsigned short *)AJ; 3317 for (i=0;i<nz;i++) { 3318 aj[i] = (unsigned short)AJ[i]; 3319 } 3320 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3321 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3322 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"); 3323 } else { 3324 /* Scale the column indices for easier indexing in MatSolve. */ 3325 /* for (i=0;i<nz;i++) { */ 3326 /* AJ[i] = AJ[i]*4; */ 3327 /* } */ 3328 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 3329 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3330 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"); 3331 } 3332 # else 3333 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3334 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3335 # endif 3336 } else { 3337 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3338 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3339 } 3340 } 3341 #else 3342 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3343 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3344 #endif 3345 break; 3346 case 5: 3347 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3348 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3349 break; 3350 case 6: 3351 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3352 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 3353 break; 3354 case 7: 3355 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3356 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3357 break; 3358 } 3359 PetscFunctionReturn(0); 3360 } 3361 3362 #undef __FUNCT__ 3363 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3364 int MatSeqBAIJ_UpdateSolvers(Mat A) 3365 { 3366 /* 3367 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3368 with natural ordering 3369 */ 3370 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3371 IS row = a->row, col = a->col; 3372 PetscTruth row_identity, col_identity; 3373 PetscTruth use_natural; 3374 int ierr; 3375 3376 PetscFunctionBegin; 3377 3378 use_natural = PETSC_FALSE; 3379 if (row && col) { 3380 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3381 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3382 3383 if (row_identity && col_identity) { 3384 use_natural = PETSC_TRUE; 3385 } 3386 } else { 3387 use_natural = PETSC_TRUE; 3388 } 3389 3390 switch (a->bs) { 3391 case 1: 3392 if (use_natural) { 3393 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3394 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3395 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3396 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3397 } else { 3398 A->ops->solve = MatSolve_SeqBAIJ_1; 3399 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3400 } 3401 break; 3402 case 2: 3403 if (use_natural) { 3404 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3405 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3406 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3407 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3408 } else { 3409 A->ops->solve = MatSolve_SeqBAIJ_2; 3410 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3411 } 3412 break; 3413 case 3: 3414 if (use_natural) { 3415 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3416 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3417 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3418 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3419 } else { 3420 A->ops->solve = MatSolve_SeqBAIJ_3; 3421 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3422 } 3423 break; 3424 case 4: 3425 { 3426 PetscTruth sse_enabled_local; 3427 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3428 if (use_natural) { 3429 #if defined(PETSC_USE_MAT_SINGLE) 3430 if (sse_enabled_local) { /* Natural + Single + SSE */ 3431 # if defined(PETSC_HAVE_SSE) 3432 int n=a->mbs; 3433 if (n==(unsigned short)n) { 3434 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3435 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"); 3436 } else { 3437 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3438 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"); 3439 } 3440 # else 3441 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3442 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3443 # endif 3444 } else { /* Natural + Single */ 3445 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3446 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"); 3447 } 3448 #else 3449 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3450 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3451 #endif 3452 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3453 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3454 } else { /* Arbitrary ordering */ 3455 #if defined(PETSC_USE_MAT_SINGLE) 3456 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3457 # if defined(PETSC_HAVE_SSE) 3458 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3459 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3460 # else 3461 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3462 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3463 # endif 3464 } else { /* Arbitrary + Single */ 3465 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3466 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3467 } 3468 #else 3469 A->ops->solve = MatSolve_SeqBAIJ_4; 3470 #endif 3471 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3472 } 3473 } 3474 break; 3475 case 5: 3476 if (use_natural) { 3477 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3478 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3479 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3480 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3481 } else { 3482 A->ops->solve = MatSolve_SeqBAIJ_5; 3483 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3484 } 3485 break; 3486 case 6: 3487 if (use_natural) { 3488 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3489 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3490 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3491 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3492 } else { 3493 A->ops->solve = MatSolve_SeqBAIJ_6; 3494 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3495 } 3496 break; 3497 case 7: 3498 if (use_natural) { 3499 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3500 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3501 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3502 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3503 } else { 3504 A->ops->solve = MatSolve_SeqBAIJ_7; 3505 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3506 } 3507 break; 3508 default: 3509 A->ops->solve = MatSolve_SeqBAIJ_N; 3510 break; 3511 } 3512 PetscFunctionReturn(0); 3513 } 3514 3515 #undef __FUNCT__ 3516 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3517 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3518 int ierr; 3519 3520 PetscFunctionBegin; 3521 ierr = MatSeqBAIJ_UpdateSolvers(A); 3522 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3523 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3524 } else { 3525 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3526 } 3527 PetscFunctionReturn(0); 3528 } 3529 3530 #undef __FUNCT__ 3531 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3532 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3533 int ierr; 3534 3535 PetscFunctionBegin; 3536 ierr = MatSeqBAIJ_UpdateSolvers(A); 3537 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3538 PetscFunctionReturn(0); 3539 } 3540 3541 3542