1 /* 2 Factorization code for BAIJ format. 3 */ 4 5 #include "src/mat/impls/baij/seq/baij.h" 6 #include "src/inline/ilu.h" 7 #include "src/inline/dot.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 11 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 12 { 13 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 14 PetscErrorCode ierr; 15 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 16 int *diag = a->diag; 17 MatScalar *aa=a->a,*v; 18 PetscScalar s1,*x,*b; 19 20 PetscFunctionBegin; 21 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 22 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24 25 /* forward solve the U^T */ 26 for (i=0; i<n; i++) { 27 28 v = aa + diag[i]; 29 /* multiply by the inverse of the block diagonal */ 30 s1 = (*v++)*x[i]; 31 vi = aj + diag[i] + 1; 32 nz = ai[i+1] - diag[i] - 1; 33 while (nz--) { 34 x[*vi++] -= (*v++)*s1; 35 } 36 x[i] = s1; 37 } 38 /* backward solve the L^T */ 39 for (i=n-1; i>=0; i--){ 40 v = aa + diag[i] - 1; 41 vi = aj + diag[i] - 1; 42 nz = diag[i] - ai[i]; 43 s1 = x[i]; 44 while (nz--) { 45 x[*vi--] -= (*v--)*s1; 46 } 47 } 48 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 49 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 50 PetscLogFlops(2*(a->nz) - A->n); 51 PetscFunctionReturn(0); 52 } 53 54 #undef __FUNCT__ 55 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 56 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 57 { 58 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 59 PetscErrorCode ierr; 60 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61 int *diag = a->diag,oidx; 62 MatScalar *aa=a->a,*v; 63 PetscScalar s1,s2,x1,x2; 64 PetscScalar *x,*b; 65 66 PetscFunctionBegin; 67 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 68 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 69 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 70 71 /* forward solve the U^T */ 72 idx = 0; 73 for (i=0; i<n; i++) { 74 75 v = aa + 4*diag[i]; 76 /* multiply by the inverse of the block diagonal */ 77 x1 = x[idx]; x2 = x[1+idx]; 78 s1 = v[0]*x1 + v[1]*x2; 79 s2 = v[2]*x1 + v[3]*x2; 80 v += 4; 81 82 vi = aj + diag[i] + 1; 83 nz = ai[i+1] - diag[i] - 1; 84 while (nz--) { 85 oidx = 2*(*vi++); 86 x[oidx] -= v[0]*s1 + v[1]*s2; 87 x[oidx+1] -= v[2]*s1 + v[3]*s2; 88 v += 4; 89 } 90 x[idx] = s1;x[1+idx] = s2; 91 idx += 2; 92 } 93 /* backward solve the L^T */ 94 for (i=n-1; i>=0; i--){ 95 v = aa + 4*diag[i] - 4; 96 vi = aj + diag[i] - 1; 97 nz = diag[i] - ai[i]; 98 idt = 2*i; 99 s1 = x[idt]; s2 = x[1+idt]; 100 while (nz--) { 101 idx = 2*(*vi--); 102 x[idx] -= v[0]*s1 + v[1]*s2; 103 x[idx+1] -= v[2]*s1 + v[3]*s2; 104 v -= 4; 105 } 106 } 107 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109 PetscLogFlops(2*4*(a->nz) - 2*A->n); 110 PetscFunctionReturn(0); 111 } 112 113 #undef __FUNCT__ 114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 115 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 116 { 117 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 118 PetscErrorCode ierr; 119 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 120 int *diag = a->diag,oidx; 121 MatScalar *aa=a->a,*v; 122 PetscScalar s1,s2,s3,x1,x2,x3; 123 PetscScalar *x,*b; 124 125 PetscFunctionBegin; 126 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 127 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 128 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 129 130 /* forward solve the U^T */ 131 idx = 0; 132 for (i=0; i<n; i++) { 133 134 v = aa + 9*diag[i]; 135 /* multiply by the inverse of the block diagonal */ 136 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 137 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 138 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 139 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 140 v += 9; 141 142 vi = aj + diag[i] + 1; 143 nz = ai[i+1] - diag[i] - 1; 144 while (nz--) { 145 oidx = 3*(*vi++); 146 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 147 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 148 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 149 v += 9; 150 } 151 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 152 idx += 3; 153 } 154 /* backward solve the L^T */ 155 for (i=n-1; i>=0; i--){ 156 v = aa + 9*diag[i] - 9; 157 vi = aj + diag[i] - 1; 158 nz = diag[i] - ai[i]; 159 idt = 3*i; 160 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 161 while (nz--) { 162 idx = 3*(*vi--); 163 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 164 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 165 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 166 v -= 9; 167 } 168 } 169 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 170 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 171 PetscLogFlops(2*9*(a->nz) - 3*A->n); 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 177 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 178 { 179 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 180 PetscErrorCode ierr; 181 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 182 int *diag = a->diag,oidx; 183 MatScalar *aa=a->a,*v; 184 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 185 PetscScalar *x,*b; 186 187 PetscFunctionBegin; 188 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 189 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 190 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 191 192 /* forward solve the U^T */ 193 idx = 0; 194 for (i=0; i<n; i++) { 195 196 v = aa + 16*diag[i]; 197 /* multiply by the inverse of the block diagonal */ 198 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 199 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 200 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 201 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 202 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 203 v += 16; 204 205 vi = aj + diag[i] + 1; 206 nz = ai[i+1] - diag[i] - 1; 207 while (nz--) { 208 oidx = 4*(*vi++); 209 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 210 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 211 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 212 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 213 v += 16; 214 } 215 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 216 idx += 4; 217 } 218 /* backward solve the L^T */ 219 for (i=n-1; i>=0; i--){ 220 v = aa + 16*diag[i] - 16; 221 vi = aj + diag[i] - 1; 222 nz = diag[i] - ai[i]; 223 idt = 4*i; 224 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 225 while (nz--) { 226 idx = 4*(*vi--); 227 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 228 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 229 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 230 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 231 v -= 16; 232 } 233 } 234 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 235 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 236 PetscLogFlops(2*16*(a->nz) - 4*A->n); 237 PetscFunctionReturn(0); 238 } 239 240 #undef __FUNCT__ 241 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 242 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 243 { 244 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 245 PetscErrorCode ierr; 246 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 247 int *diag = a->diag,oidx; 248 MatScalar *aa=a->a,*v; 249 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 250 PetscScalar *x,*b; 251 252 PetscFunctionBegin; 253 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 254 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 255 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 256 257 /* forward solve the U^T */ 258 idx = 0; 259 for (i=0; i<n; i++) { 260 261 v = aa + 25*diag[i]; 262 /* multiply by the inverse of the block diagonal */ 263 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 264 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 265 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 266 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 267 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 268 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 269 v += 25; 270 271 vi = aj + diag[i] + 1; 272 nz = ai[i+1] - diag[i] - 1; 273 while (nz--) { 274 oidx = 5*(*vi++); 275 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 276 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 277 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 278 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 279 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 280 v += 25; 281 } 282 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 283 idx += 5; 284 } 285 /* backward solve the L^T */ 286 for (i=n-1; i>=0; i--){ 287 v = aa + 25*diag[i] - 25; 288 vi = aj + diag[i] - 1; 289 nz = diag[i] - ai[i]; 290 idt = 5*i; 291 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 292 while (nz--) { 293 idx = 5*(*vi--); 294 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 295 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 296 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 297 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 298 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 299 v -= 25; 300 } 301 } 302 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 303 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 304 PetscLogFlops(2*25*(a->nz) - 5*A->n); 305 PetscFunctionReturn(0); 306 } 307 308 #undef __FUNCT__ 309 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 310 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 311 { 312 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 313 PetscErrorCode ierr; 314 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 315 int *diag = a->diag,oidx; 316 MatScalar *aa=a->a,*v; 317 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 318 PetscScalar *x,*b; 319 320 PetscFunctionBegin; 321 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 322 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 323 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 324 325 /* forward solve the U^T */ 326 idx = 0; 327 for (i=0; i<n; i++) { 328 329 v = aa + 36*diag[i]; 330 /* multiply by the inverse of the block diagonal */ 331 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 332 x6 = x[5+idx]; 333 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 334 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 335 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 336 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 337 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 338 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 339 v += 36; 340 341 vi = aj + diag[i] + 1; 342 nz = ai[i+1] - diag[i] - 1; 343 while (nz--) { 344 oidx = 6*(*vi++); 345 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 346 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 347 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 348 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 349 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 350 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 351 v += 36; 352 } 353 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 354 x[5+idx] = s6; 355 idx += 6; 356 } 357 /* backward solve the L^T */ 358 for (i=n-1; i>=0; i--){ 359 v = aa + 36*diag[i] - 36; 360 vi = aj + diag[i] - 1; 361 nz = diag[i] - ai[i]; 362 idt = 6*i; 363 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 364 s6 = x[5+idt]; 365 while (nz--) { 366 idx = 6*(*vi--); 367 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 368 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 369 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 370 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 371 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 372 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 373 v -= 36; 374 } 375 } 376 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 377 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 378 PetscLogFlops(2*36*(a->nz) - 6*A->n); 379 PetscFunctionReturn(0); 380 } 381 382 #undef __FUNCT__ 383 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 384 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 385 { 386 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 387 PetscErrorCode ierr; 388 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 389 int *diag = a->diag,oidx; 390 MatScalar *aa=a->a,*v; 391 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 392 PetscScalar *x,*b; 393 394 PetscFunctionBegin; 395 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 396 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 397 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 398 399 /* forward solve the U^T */ 400 idx = 0; 401 for (i=0; i<n; i++) { 402 403 v = aa + 49*diag[i]; 404 /* multiply by the inverse of the block diagonal */ 405 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 406 x6 = x[5+idx]; x7 = x[6+idx]; 407 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 408 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 409 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 410 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 411 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 412 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 413 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 414 v += 49; 415 416 vi = aj + diag[i] + 1; 417 nz = ai[i+1] - diag[i] - 1; 418 while (nz--) { 419 oidx = 7*(*vi++); 420 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 421 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 422 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 423 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 424 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 425 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 426 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 427 v += 49; 428 } 429 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 430 x[5+idx] = s6;x[6+idx] = s7; 431 idx += 7; 432 } 433 /* backward solve the L^T */ 434 for (i=n-1; i>=0; i--){ 435 v = aa + 49*diag[i] - 49; 436 vi = aj + diag[i] - 1; 437 nz = diag[i] - ai[i]; 438 idt = 7*i; 439 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 440 s6 = x[5+idt];s7 = x[6+idt]; 441 while (nz--) { 442 idx = 7*(*vi--); 443 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 444 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 445 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 446 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 447 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 448 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 449 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 450 v -= 49; 451 } 452 } 453 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 454 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 455 PetscLogFlops(2*49*(a->nz) - 7*A->n); 456 PetscFunctionReturn(0); 457 } 458 459 /*---------------------------------------------------------------------------------------------*/ 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 462 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 463 { 464 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 465 IS iscol=a->col,isrow=a->row; 466 PetscErrorCode ierr; 467 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 468 int *diag = a->diag; 469 MatScalar *aa=a->a,*v; 470 PetscScalar s1,*x,*b,*t; 471 472 PetscFunctionBegin; 473 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 474 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 475 t = a->solve_work; 476 477 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 478 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 479 480 /* copy the b into temp work space according to permutation */ 481 for (i=0; i<n; i++) { 482 t[i] = b[c[i]]; 483 } 484 485 /* forward solve the U^T */ 486 for (i=0; i<n; i++) { 487 488 v = aa + diag[i]; 489 /* multiply by the inverse of the block diagonal */ 490 s1 = (*v++)*t[i]; 491 vi = aj + diag[i] + 1; 492 nz = ai[i+1] - diag[i] - 1; 493 while (nz--) { 494 t[*vi++] -= (*v++)*s1; 495 } 496 t[i] = s1; 497 } 498 /* backward solve the L^T */ 499 for (i=n-1; i>=0; i--){ 500 v = aa + diag[i] - 1; 501 vi = aj + diag[i] - 1; 502 nz = diag[i] - ai[i]; 503 s1 = t[i]; 504 while (nz--) { 505 t[*vi--] -= (*v--)*s1; 506 } 507 } 508 509 /* copy t into x according to permutation */ 510 for (i=0; i<n; i++) { 511 x[r[i]] = t[i]; 512 } 513 514 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 515 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 516 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 517 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 518 PetscLogFlops(2*(a->nz) - A->n); 519 PetscFunctionReturn(0); 520 } 521 522 #undef __FUNCT__ 523 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 524 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 525 { 526 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 527 IS iscol=a->col,isrow=a->row; 528 PetscErrorCode ierr; 529 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 530 int *diag = a->diag,ii,ic,ir,oidx; 531 MatScalar *aa=a->a,*v; 532 PetscScalar s1,s2,x1,x2; 533 PetscScalar *x,*b,*t; 534 535 PetscFunctionBegin; 536 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 537 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 538 t = a->solve_work; 539 540 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 541 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 542 543 /* copy the b into temp work space according to permutation */ 544 ii = 0; 545 for (i=0; i<n; i++) { 546 ic = 2*c[i]; 547 t[ii] = b[ic]; 548 t[ii+1] = b[ic+1]; 549 ii += 2; 550 } 551 552 /* forward solve the U^T */ 553 idx = 0; 554 for (i=0; i<n; i++) { 555 556 v = aa + 4*diag[i]; 557 /* multiply by the inverse of the block diagonal */ 558 x1 = t[idx]; x2 = t[1+idx]; 559 s1 = v[0]*x1 + v[1]*x2; 560 s2 = v[2]*x1 + v[3]*x2; 561 v += 4; 562 563 vi = aj + diag[i] + 1; 564 nz = ai[i+1] - diag[i] - 1; 565 while (nz--) { 566 oidx = 2*(*vi++); 567 t[oidx] -= v[0]*s1 + v[1]*s2; 568 t[oidx+1] -= v[2]*s1 + v[3]*s2; 569 v += 4; 570 } 571 t[idx] = s1;t[1+idx] = s2; 572 idx += 2; 573 } 574 /* backward solve the L^T */ 575 for (i=n-1; i>=0; i--){ 576 v = aa + 4*diag[i] - 4; 577 vi = aj + diag[i] - 1; 578 nz = diag[i] - ai[i]; 579 idt = 2*i; 580 s1 = t[idt]; s2 = t[1+idt]; 581 while (nz--) { 582 idx = 2*(*vi--); 583 t[idx] -= v[0]*s1 + v[1]*s2; 584 t[idx+1] -= v[2]*s1 + v[3]*s2; 585 v -= 4; 586 } 587 } 588 589 /* copy t into x according to permutation */ 590 ii = 0; 591 for (i=0; i<n; i++) { 592 ir = 2*r[i]; 593 x[ir] = t[ii]; 594 x[ir+1] = t[ii+1]; 595 ii += 2; 596 } 597 598 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 599 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 600 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 601 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 602 PetscLogFlops(2*4*(a->nz) - 2*A->n); 603 PetscFunctionReturn(0); 604 } 605 606 #undef __FUNCT__ 607 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 608 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 609 { 610 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 611 IS iscol=a->col,isrow=a->row; 612 PetscErrorCode ierr; 613 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 614 int *diag = a->diag,ii,ic,ir,oidx; 615 MatScalar *aa=a->a,*v; 616 PetscScalar s1,s2,s3,x1,x2,x3; 617 PetscScalar *x,*b,*t; 618 619 PetscFunctionBegin; 620 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 621 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 622 t = a->solve_work; 623 624 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 625 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 626 627 /* copy the b into temp work space according to permutation */ 628 ii = 0; 629 for (i=0; i<n; i++) { 630 ic = 3*c[i]; 631 t[ii] = b[ic]; 632 t[ii+1] = b[ic+1]; 633 t[ii+2] = b[ic+2]; 634 ii += 3; 635 } 636 637 /* forward solve the U^T */ 638 idx = 0; 639 for (i=0; i<n; i++) { 640 641 v = aa + 9*diag[i]; 642 /* multiply by the inverse of the block diagonal */ 643 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 644 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 645 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 646 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 647 v += 9; 648 649 vi = aj + diag[i] + 1; 650 nz = ai[i+1] - diag[i] - 1; 651 while (nz--) { 652 oidx = 3*(*vi++); 653 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 654 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 655 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 656 v += 9; 657 } 658 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 659 idx += 3; 660 } 661 /* backward solve the L^T */ 662 for (i=n-1; i>=0; i--){ 663 v = aa + 9*diag[i] - 9; 664 vi = aj + diag[i] - 1; 665 nz = diag[i] - ai[i]; 666 idt = 3*i; 667 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 668 while (nz--) { 669 idx = 3*(*vi--); 670 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 671 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 672 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 673 v -= 9; 674 } 675 } 676 677 /* copy t into x according to permutation */ 678 ii = 0; 679 for (i=0; i<n; i++) { 680 ir = 3*r[i]; 681 x[ir] = t[ii]; 682 x[ir+1] = t[ii+1]; 683 x[ir+2] = t[ii+2]; 684 ii += 3; 685 } 686 687 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 688 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 689 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 690 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 691 PetscLogFlops(2*9*(a->nz) - 3*A->n); 692 PetscFunctionReturn(0); 693 } 694 695 #undef __FUNCT__ 696 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 697 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 698 { 699 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 700 IS iscol=a->col,isrow=a->row; 701 PetscErrorCode ierr; 702 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 703 int *diag = a->diag,ii,ic,ir,oidx; 704 MatScalar *aa=a->a,*v; 705 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 706 PetscScalar *x,*b,*t; 707 708 PetscFunctionBegin; 709 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 710 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 711 t = a->solve_work; 712 713 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 714 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 715 716 /* copy the b into temp work space according to permutation */ 717 ii = 0; 718 for (i=0; i<n; i++) { 719 ic = 4*c[i]; 720 t[ii] = b[ic]; 721 t[ii+1] = b[ic+1]; 722 t[ii+2] = b[ic+2]; 723 t[ii+3] = b[ic+3]; 724 ii += 4; 725 } 726 727 /* forward solve the U^T */ 728 idx = 0; 729 for (i=0; i<n; i++) { 730 731 v = aa + 16*diag[i]; 732 /* multiply by the inverse of the block diagonal */ 733 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 734 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 735 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 736 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 737 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 738 v += 16; 739 740 vi = aj + diag[i] + 1; 741 nz = ai[i+1] - diag[i] - 1; 742 while (nz--) { 743 oidx = 4*(*vi++); 744 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 745 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 746 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 747 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 748 v += 16; 749 } 750 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 751 idx += 4; 752 } 753 /* backward solve the L^T */ 754 for (i=n-1; i>=0; i--){ 755 v = aa + 16*diag[i] - 16; 756 vi = aj + diag[i] - 1; 757 nz = diag[i] - ai[i]; 758 idt = 4*i; 759 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 760 while (nz--) { 761 idx = 4*(*vi--); 762 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 763 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 764 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 765 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 766 v -= 16; 767 } 768 } 769 770 /* copy t into x according to permutation */ 771 ii = 0; 772 for (i=0; i<n; i++) { 773 ir = 4*r[i]; 774 x[ir] = t[ii]; 775 x[ir+1] = t[ii+1]; 776 x[ir+2] = t[ii+2]; 777 x[ir+3] = t[ii+3]; 778 ii += 4; 779 } 780 781 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 782 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 783 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 784 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 785 PetscLogFlops(2*16*(a->nz) - 4*A->n); 786 PetscFunctionReturn(0); 787 } 788 789 #undef __FUNCT__ 790 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 791 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 792 { 793 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 794 IS iscol=a->col,isrow=a->row; 795 PetscErrorCode ierr; 796 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 797 int *diag = a->diag,ii,ic,ir,oidx; 798 MatScalar *aa=a->a,*v; 799 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 800 PetscScalar *x,*b,*t; 801 802 PetscFunctionBegin; 803 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 804 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 805 t = a->solve_work; 806 807 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 808 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 809 810 /* copy the b into temp work space according to permutation */ 811 ii = 0; 812 for (i=0; i<n; i++) { 813 ic = 5*c[i]; 814 t[ii] = b[ic]; 815 t[ii+1] = b[ic+1]; 816 t[ii+2] = b[ic+2]; 817 t[ii+3] = b[ic+3]; 818 t[ii+4] = b[ic+4]; 819 ii += 5; 820 } 821 822 /* forward solve the U^T */ 823 idx = 0; 824 for (i=0; i<n; i++) { 825 826 v = aa + 25*diag[i]; 827 /* multiply by the inverse of the block diagonal */ 828 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 829 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 830 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 831 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 832 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 833 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 834 v += 25; 835 836 vi = aj + diag[i] + 1; 837 nz = ai[i+1] - diag[i] - 1; 838 while (nz--) { 839 oidx = 5*(*vi++); 840 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 841 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 842 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 843 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 844 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 845 v += 25; 846 } 847 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 848 idx += 5; 849 } 850 /* backward solve the L^T */ 851 for (i=n-1; i>=0; i--){ 852 v = aa + 25*diag[i] - 25; 853 vi = aj + diag[i] - 1; 854 nz = diag[i] - ai[i]; 855 idt = 5*i; 856 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 857 while (nz--) { 858 idx = 5*(*vi--); 859 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 860 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 861 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 862 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 863 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 864 v -= 25; 865 } 866 } 867 868 /* copy t into x according to permutation */ 869 ii = 0; 870 for (i=0; i<n; i++) { 871 ir = 5*r[i]; 872 x[ir] = t[ii]; 873 x[ir+1] = t[ii+1]; 874 x[ir+2] = t[ii+2]; 875 x[ir+3] = t[ii+3]; 876 x[ir+4] = t[ii+4]; 877 ii += 5; 878 } 879 880 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 881 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 882 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 883 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 884 PetscLogFlops(2*25*(a->nz) - 5*A->n); 885 PetscFunctionReturn(0); 886 } 887 888 #undef __FUNCT__ 889 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 890 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 891 { 892 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 893 IS iscol=a->col,isrow=a->row; 894 PetscErrorCode ierr; 895 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 896 int *diag = a->diag,ii,ic,ir,oidx; 897 MatScalar *aa=a->a,*v; 898 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 899 PetscScalar *x,*b,*t; 900 901 PetscFunctionBegin; 902 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 903 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 904 t = a->solve_work; 905 906 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 907 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 908 909 /* copy the b into temp work space according to permutation */ 910 ii = 0; 911 for (i=0; i<n; i++) { 912 ic = 6*c[i]; 913 t[ii] = b[ic]; 914 t[ii+1] = b[ic+1]; 915 t[ii+2] = b[ic+2]; 916 t[ii+3] = b[ic+3]; 917 t[ii+4] = b[ic+4]; 918 t[ii+5] = b[ic+5]; 919 ii += 6; 920 } 921 922 /* forward solve the U^T */ 923 idx = 0; 924 for (i=0; i<n; i++) { 925 926 v = aa + 36*diag[i]; 927 /* multiply by the inverse of the block diagonal */ 928 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 929 x6 = t[5+idx]; 930 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 931 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 932 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 933 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 934 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 935 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 936 v += 36; 937 938 vi = aj + diag[i] + 1; 939 nz = ai[i+1] - diag[i] - 1; 940 while (nz--) { 941 oidx = 6*(*vi++); 942 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 943 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 944 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 945 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 946 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 947 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 948 v += 36; 949 } 950 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 951 t[5+idx] = s6; 952 idx += 6; 953 } 954 /* backward solve the L^T */ 955 for (i=n-1; i>=0; i--){ 956 v = aa + 36*diag[i] - 36; 957 vi = aj + diag[i] - 1; 958 nz = diag[i] - ai[i]; 959 idt = 6*i; 960 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 961 s6 = t[5+idt]; 962 while (nz--) { 963 idx = 6*(*vi--); 964 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 965 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 966 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 967 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 968 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 969 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 970 v -= 36; 971 } 972 } 973 974 /* copy t into x according to permutation */ 975 ii = 0; 976 for (i=0; i<n; i++) { 977 ir = 6*r[i]; 978 x[ir] = t[ii]; 979 x[ir+1] = t[ii+1]; 980 x[ir+2] = t[ii+2]; 981 x[ir+3] = t[ii+3]; 982 x[ir+4] = t[ii+4]; 983 x[ir+5] = t[ii+5]; 984 ii += 6; 985 } 986 987 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 988 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 989 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 990 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 991 PetscLogFlops(2*36*(a->nz) - 6*A->n); 992 PetscFunctionReturn(0); 993 } 994 995 #undef __FUNCT__ 996 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 997 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 998 { 999 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1000 IS iscol=a->col,isrow=a->row; 1001 PetscErrorCode ierr; 1002 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 1003 int *diag = a->diag,ii,ic,ir,oidx; 1004 MatScalar *aa=a->a,*v; 1005 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1006 PetscScalar *x,*b,*t; 1007 1008 PetscFunctionBegin; 1009 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1010 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1011 t = a->solve_work; 1012 1013 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1014 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1015 1016 /* copy the b into temp work space according to permutation */ 1017 ii = 0; 1018 for (i=0; i<n; i++) { 1019 ic = 7*c[i]; 1020 t[ii] = b[ic]; 1021 t[ii+1] = b[ic+1]; 1022 t[ii+2] = b[ic+2]; 1023 t[ii+3] = b[ic+3]; 1024 t[ii+4] = b[ic+4]; 1025 t[ii+5] = b[ic+5]; 1026 t[ii+6] = b[ic+6]; 1027 ii += 7; 1028 } 1029 1030 /* forward solve the U^T */ 1031 idx = 0; 1032 for (i=0; i<n; i++) { 1033 1034 v = aa + 49*diag[i]; 1035 /* multiply by the inverse of the block diagonal */ 1036 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1037 x6 = t[5+idx]; x7 = t[6+idx]; 1038 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1039 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1040 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1041 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1042 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1043 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1044 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1045 v += 49; 1046 1047 vi = aj + diag[i] + 1; 1048 nz = ai[i+1] - diag[i] - 1; 1049 while (nz--) { 1050 oidx = 7*(*vi++); 1051 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1052 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1053 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1054 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1055 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1056 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1057 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1058 v += 49; 1059 } 1060 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1061 t[5+idx] = s6;t[6+idx] = s7; 1062 idx += 7; 1063 } 1064 /* backward solve the L^T */ 1065 for (i=n-1; i>=0; i--){ 1066 v = aa + 49*diag[i] - 49; 1067 vi = aj + diag[i] - 1; 1068 nz = diag[i] - ai[i]; 1069 idt = 7*i; 1070 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1071 s6 = t[5+idt];s7 = t[6+idt]; 1072 while (nz--) { 1073 idx = 7*(*vi--); 1074 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1075 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1076 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1077 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1078 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1079 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1080 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1081 v -= 49; 1082 } 1083 } 1084 1085 /* copy t into x according to permutation */ 1086 ii = 0; 1087 for (i=0; i<n; i++) { 1088 ir = 7*r[i]; 1089 x[ir] = t[ii]; 1090 x[ir+1] = t[ii+1]; 1091 x[ir+2] = t[ii+2]; 1092 x[ir+3] = t[ii+3]; 1093 x[ir+4] = t[ii+4]; 1094 x[ir+5] = t[ii+5]; 1095 x[ir+6] = t[ii+6]; 1096 ii += 7; 1097 } 1098 1099 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1100 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1101 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1102 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1103 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1104 PetscFunctionReturn(0); 1105 } 1106 1107 /* ----------------------------------------------------------- */ 1108 #undef __FUNCT__ 1109 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1110 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1111 { 1112 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1113 IS iscol=a->col,isrow=a->row; 1114 PetscErrorCode ierr; 1115 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1116 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1117 MatScalar *aa=a->a,*v; 1118 PetscScalar *x,*b,*s,*t,*ls; 1119 1120 PetscFunctionBegin; 1121 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1122 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1123 t = a->solve_work; 1124 1125 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1126 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1127 1128 /* forward solve the lower triangular */ 1129 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1130 for (i=1; i<n; i++) { 1131 v = aa + bs2*ai[i]; 1132 vi = aj + ai[i]; 1133 nz = a->diag[i] - ai[i]; 1134 s = t + bs*i; 1135 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1136 while (nz--) { 1137 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1138 v += bs2; 1139 } 1140 } 1141 /* backward solve the upper triangular */ 1142 ls = a->solve_work + A->n; 1143 for (i=n-1; i>=0; i--){ 1144 v = aa + bs2*(a->diag[i] + 1); 1145 vi = aj + a->diag[i] + 1; 1146 nz = ai[i+1] - a->diag[i] - 1; 1147 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1148 while (nz--) { 1149 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1150 v += bs2; 1151 } 1152 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1153 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1154 } 1155 1156 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1157 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1158 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1159 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1160 PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 1161 PetscFunctionReturn(0); 1162 } 1163 1164 #undef __FUNCT__ 1165 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1166 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1167 { 1168 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1169 IS iscol=a->col,isrow=a->row; 1170 PetscErrorCode ierr; 1171 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1172 int *diag = a->diag; 1173 MatScalar *aa=a->a,*v; 1174 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1175 PetscScalar *x,*b,*t; 1176 1177 PetscFunctionBegin; 1178 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1179 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1180 t = a->solve_work; 1181 1182 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1183 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1184 1185 /* forward solve the lower triangular */ 1186 idx = 7*(*r++); 1187 t[0] = b[idx]; t[1] = b[1+idx]; 1188 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1189 t[5] = b[5+idx]; t[6] = b[6+idx]; 1190 1191 for (i=1; i<n; i++) { 1192 v = aa + 49*ai[i]; 1193 vi = aj + ai[i]; 1194 nz = diag[i] - ai[i]; 1195 idx = 7*(*r++); 1196 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1197 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1198 while (nz--) { 1199 idx = 7*(*vi++); 1200 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1201 x4 = t[3+idx];x5 = t[4+idx]; 1202 x6 = t[5+idx];x7 = t[6+idx]; 1203 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1204 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1205 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1206 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1207 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1208 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1209 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1210 v += 49; 1211 } 1212 idx = 7*i; 1213 t[idx] = s1;t[1+idx] = s2; 1214 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1215 t[5+idx] = s6;t[6+idx] = s7; 1216 } 1217 /* backward solve the upper triangular */ 1218 for (i=n-1; i>=0; i--){ 1219 v = aa + 49*diag[i] + 49; 1220 vi = aj + diag[i] + 1; 1221 nz = ai[i+1] - diag[i] - 1; 1222 idt = 7*i; 1223 s1 = t[idt]; s2 = t[1+idt]; 1224 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1225 s6 = t[5+idt];s7 = t[6+idt]; 1226 while (nz--) { 1227 idx = 7*(*vi++); 1228 x1 = t[idx]; x2 = t[1+idx]; 1229 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1230 x6 = t[5+idx]; x7 = t[6+idx]; 1231 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1232 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1233 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1234 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1235 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1236 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1237 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1238 v += 49; 1239 } 1240 idc = 7*(*c--); 1241 v = aa + 49*diag[i]; 1242 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1243 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1244 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1245 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1246 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1247 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1248 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1249 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1250 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1251 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1252 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1253 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1254 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1255 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1256 } 1257 1258 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1259 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1260 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1261 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1262 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1263 PetscFunctionReturn(0); 1264 } 1265 1266 #undef __FUNCT__ 1267 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1268 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1269 { 1270 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1271 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1272 PetscErrorCode ierr; 1273 int *diag = a->diag,jdx; 1274 MatScalar *aa=a->a,*v; 1275 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1276 1277 PetscFunctionBegin; 1278 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1279 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1280 /* forward solve the lower triangular */ 1281 idx = 0; 1282 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1283 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1284 x[6] = b[6+idx]; 1285 for (i=1; i<n; i++) { 1286 v = aa + 49*ai[i]; 1287 vi = aj + ai[i]; 1288 nz = diag[i] - ai[i]; 1289 idx = 7*i; 1290 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1291 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1292 s7 = b[6+idx]; 1293 while (nz--) { 1294 jdx = 7*(*vi++); 1295 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1296 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1297 x7 = x[6+jdx]; 1298 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1299 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1300 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1301 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1302 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1303 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1304 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1305 v += 49; 1306 } 1307 x[idx] = s1; 1308 x[1+idx] = s2; 1309 x[2+idx] = s3; 1310 x[3+idx] = s4; 1311 x[4+idx] = s5; 1312 x[5+idx] = s6; 1313 x[6+idx] = s7; 1314 } 1315 /* backward solve the upper triangular */ 1316 for (i=n-1; i>=0; i--){ 1317 v = aa + 49*diag[i] + 49; 1318 vi = aj + diag[i] + 1; 1319 nz = ai[i+1] - diag[i] - 1; 1320 idt = 7*i; 1321 s1 = x[idt]; s2 = x[1+idt]; 1322 s3 = x[2+idt]; s4 = x[3+idt]; 1323 s5 = x[4+idt]; s6 = x[5+idt]; 1324 s7 = x[6+idt]; 1325 while (nz--) { 1326 idx = 7*(*vi++); 1327 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1328 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1329 x7 = x[6+idx]; 1330 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1331 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1332 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1333 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1334 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1335 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1336 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1337 v += 49; 1338 } 1339 v = aa + 49*diag[i]; 1340 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1341 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1342 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1343 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1344 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1345 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1346 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1347 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1348 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1349 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1350 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1351 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1352 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1353 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1354 } 1355 1356 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1357 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1358 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1359 PetscFunctionReturn(0); 1360 } 1361 1362 #undef __FUNCT__ 1363 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1364 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1365 { 1366 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1367 IS iscol=a->col,isrow=a->row; 1368 PetscErrorCode ierr; 1369 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1370 int *diag = a->diag; 1371 MatScalar *aa=a->a,*v; 1372 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1373 1374 PetscFunctionBegin; 1375 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1376 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1377 t = a->solve_work; 1378 1379 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1380 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1381 1382 /* forward solve the lower triangular */ 1383 idx = 6*(*r++); 1384 t[0] = b[idx]; t[1] = b[1+idx]; 1385 t[2] = b[2+idx]; t[3] = b[3+idx]; 1386 t[4] = b[4+idx]; t[5] = b[5+idx]; 1387 for (i=1; i<n; i++) { 1388 v = aa + 36*ai[i]; 1389 vi = aj + ai[i]; 1390 nz = diag[i] - ai[i]; 1391 idx = 6*(*r++); 1392 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1393 s5 = b[4+idx]; s6 = b[5+idx]; 1394 while (nz--) { 1395 idx = 6*(*vi++); 1396 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1397 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1398 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1399 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1400 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1401 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1402 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1403 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1404 v += 36; 1405 } 1406 idx = 6*i; 1407 t[idx] = s1;t[1+idx] = s2; 1408 t[2+idx] = s3;t[3+idx] = s4; 1409 t[4+idx] = s5;t[5+idx] = s6; 1410 } 1411 /* backward solve the upper triangular */ 1412 for (i=n-1; i>=0; i--){ 1413 v = aa + 36*diag[i] + 36; 1414 vi = aj + diag[i] + 1; 1415 nz = ai[i+1] - diag[i] - 1; 1416 idt = 6*i; 1417 s1 = t[idt]; s2 = t[1+idt]; 1418 s3 = t[2+idt];s4 = t[3+idt]; 1419 s5 = t[4+idt];s6 = t[5+idt]; 1420 while (nz--) { 1421 idx = 6*(*vi++); 1422 x1 = t[idx]; x2 = t[1+idx]; 1423 x3 = t[2+idx]; x4 = t[3+idx]; 1424 x5 = t[4+idx]; x6 = t[5+idx]; 1425 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1426 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1427 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1428 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1429 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1430 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1431 v += 36; 1432 } 1433 idc = 6*(*c--); 1434 v = aa + 36*diag[i]; 1435 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1436 v[18]*s4+v[24]*s5+v[30]*s6; 1437 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1438 v[19]*s4+v[25]*s5+v[31]*s6; 1439 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1440 v[20]*s4+v[26]*s5+v[32]*s6; 1441 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1442 v[21]*s4+v[27]*s5+v[33]*s6; 1443 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1444 v[22]*s4+v[28]*s5+v[34]*s6; 1445 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1446 v[23]*s4+v[29]*s5+v[35]*s6; 1447 } 1448 1449 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1450 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1451 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1452 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1453 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1454 PetscFunctionReturn(0); 1455 } 1456 1457 #undef __FUNCT__ 1458 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1459 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1460 { 1461 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1462 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1463 PetscErrorCode ierr; 1464 int *diag = a->diag,jdx; 1465 MatScalar *aa=a->a,*v; 1466 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1467 1468 PetscFunctionBegin; 1469 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1470 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1471 /* forward solve the lower triangular */ 1472 idx = 0; 1473 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1474 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1475 for (i=1; i<n; i++) { 1476 v = aa + 36*ai[i]; 1477 vi = aj + ai[i]; 1478 nz = diag[i] - ai[i]; 1479 idx = 6*i; 1480 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1481 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1482 while (nz--) { 1483 jdx = 6*(*vi++); 1484 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1485 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1486 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1487 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1488 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1489 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1490 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1491 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1492 v += 36; 1493 } 1494 x[idx] = s1; 1495 x[1+idx] = s2; 1496 x[2+idx] = s3; 1497 x[3+idx] = s4; 1498 x[4+idx] = s5; 1499 x[5+idx] = s6; 1500 } 1501 /* backward solve the upper triangular */ 1502 for (i=n-1; i>=0; i--){ 1503 v = aa + 36*diag[i] + 36; 1504 vi = aj + diag[i] + 1; 1505 nz = ai[i+1] - diag[i] - 1; 1506 idt = 6*i; 1507 s1 = x[idt]; s2 = x[1+idt]; 1508 s3 = x[2+idt]; s4 = x[3+idt]; 1509 s5 = x[4+idt]; s6 = x[5+idt]; 1510 while (nz--) { 1511 idx = 6*(*vi++); 1512 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1513 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1514 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1515 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1516 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1517 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1518 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1519 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1520 v += 36; 1521 } 1522 v = aa + 36*diag[i]; 1523 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1524 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1525 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1526 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1527 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1528 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1529 } 1530 1531 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1532 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1533 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1534 PetscFunctionReturn(0); 1535 } 1536 1537 #undef __FUNCT__ 1538 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1539 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1540 { 1541 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1542 IS iscol=a->col,isrow=a->row; 1543 PetscErrorCode ierr; 1544 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1545 int *diag = a->diag; 1546 MatScalar *aa=a->a,*v; 1547 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1548 1549 PetscFunctionBegin; 1550 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1551 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1552 t = a->solve_work; 1553 1554 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1555 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1556 1557 /* forward solve the lower triangular */ 1558 idx = 5*(*r++); 1559 t[0] = b[idx]; t[1] = b[1+idx]; 1560 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1561 for (i=1; i<n; i++) { 1562 v = aa + 25*ai[i]; 1563 vi = aj + ai[i]; 1564 nz = diag[i] - ai[i]; 1565 idx = 5*(*r++); 1566 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1567 s5 = b[4+idx]; 1568 while (nz--) { 1569 idx = 5*(*vi++); 1570 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1571 x4 = t[3+idx];x5 = t[4+idx]; 1572 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1573 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1574 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1575 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1576 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1577 v += 25; 1578 } 1579 idx = 5*i; 1580 t[idx] = s1;t[1+idx] = s2; 1581 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1582 } 1583 /* backward solve the upper triangular */ 1584 for (i=n-1; i>=0; i--){ 1585 v = aa + 25*diag[i] + 25; 1586 vi = aj + diag[i] + 1; 1587 nz = ai[i+1] - diag[i] - 1; 1588 idt = 5*i; 1589 s1 = t[idt]; s2 = t[1+idt]; 1590 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1591 while (nz--) { 1592 idx = 5*(*vi++); 1593 x1 = t[idx]; x2 = t[1+idx]; 1594 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1595 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1596 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1597 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1598 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1599 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1600 v += 25; 1601 } 1602 idc = 5*(*c--); 1603 v = aa + 25*diag[i]; 1604 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1605 v[15]*s4+v[20]*s5; 1606 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1607 v[16]*s4+v[21]*s5; 1608 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1609 v[17]*s4+v[22]*s5; 1610 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1611 v[18]*s4+v[23]*s5; 1612 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1613 v[19]*s4+v[24]*s5; 1614 } 1615 1616 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1617 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1618 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1619 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1620 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1621 PetscFunctionReturn(0); 1622 } 1623 1624 #undef __FUNCT__ 1625 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1626 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1627 { 1628 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1629 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1630 PetscErrorCode ierr; 1631 int *diag = a->diag,jdx; 1632 MatScalar *aa=a->a,*v; 1633 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1634 1635 PetscFunctionBegin; 1636 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1637 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1638 /* forward solve the lower triangular */ 1639 idx = 0; 1640 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1641 for (i=1; i<n; i++) { 1642 v = aa + 25*ai[i]; 1643 vi = aj + ai[i]; 1644 nz = diag[i] - ai[i]; 1645 idx = 5*i; 1646 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1647 while (nz--) { 1648 jdx = 5*(*vi++); 1649 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1650 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1651 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1652 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1653 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1654 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1655 v += 25; 1656 } 1657 x[idx] = s1; 1658 x[1+idx] = s2; 1659 x[2+idx] = s3; 1660 x[3+idx] = s4; 1661 x[4+idx] = s5; 1662 } 1663 /* backward solve the upper triangular */ 1664 for (i=n-1; i>=0; i--){ 1665 v = aa + 25*diag[i] + 25; 1666 vi = aj + diag[i] + 1; 1667 nz = ai[i+1] - diag[i] - 1; 1668 idt = 5*i; 1669 s1 = x[idt]; s2 = x[1+idt]; 1670 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1671 while (nz--) { 1672 idx = 5*(*vi++); 1673 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1674 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1675 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1676 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1677 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1678 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1679 v += 25; 1680 } 1681 v = aa + 25*diag[i]; 1682 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1683 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1684 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1685 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1686 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1687 } 1688 1689 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1690 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1691 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1692 PetscFunctionReturn(0); 1693 } 1694 1695 #undef __FUNCT__ 1696 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1697 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1698 { 1699 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1700 IS iscol=a->col,isrow=a->row; 1701 PetscErrorCode ierr; 1702 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1703 int *diag = a->diag; 1704 MatScalar *aa=a->a,*v; 1705 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1706 1707 PetscFunctionBegin; 1708 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1709 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1710 t = a->solve_work; 1711 1712 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1713 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1714 1715 /* forward solve the lower triangular */ 1716 idx = 4*(*r++); 1717 t[0] = b[idx]; t[1] = b[1+idx]; 1718 t[2] = b[2+idx]; t[3] = b[3+idx]; 1719 for (i=1; i<n; i++) { 1720 v = aa + 16*ai[i]; 1721 vi = aj + ai[i]; 1722 nz = diag[i] - ai[i]; 1723 idx = 4*(*r++); 1724 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1725 while (nz--) { 1726 idx = 4*(*vi++); 1727 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1728 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1729 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1730 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1731 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1732 v += 16; 1733 } 1734 idx = 4*i; 1735 t[idx] = s1;t[1+idx] = s2; 1736 t[2+idx] = s3;t[3+idx] = s4; 1737 } 1738 /* backward solve the upper triangular */ 1739 for (i=n-1; i>=0; i--){ 1740 v = aa + 16*diag[i] + 16; 1741 vi = aj + diag[i] + 1; 1742 nz = ai[i+1] - diag[i] - 1; 1743 idt = 4*i; 1744 s1 = t[idt]; s2 = t[1+idt]; 1745 s3 = t[2+idt];s4 = t[3+idt]; 1746 while (nz--) { 1747 idx = 4*(*vi++); 1748 x1 = t[idx]; x2 = t[1+idx]; 1749 x3 = t[2+idx]; x4 = t[3+idx]; 1750 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1751 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1752 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1753 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1754 v += 16; 1755 } 1756 idc = 4*(*c--); 1757 v = aa + 16*diag[i]; 1758 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1759 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1760 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1761 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1762 } 1763 1764 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1765 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1766 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1767 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1768 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1769 PetscFunctionReturn(0); 1770 } 1771 1772 #undef __FUNCT__ 1773 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1774 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1775 { 1776 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1777 IS iscol=a->col,isrow=a->row; 1778 PetscErrorCode ierr; 1779 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1780 int *diag = a->diag; 1781 MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1782 PetscScalar *x,*b; 1783 1784 PetscFunctionBegin; 1785 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1786 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1787 t = (MatScalar *)a->solve_work; 1788 1789 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1790 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1791 1792 /* forward solve the lower triangular */ 1793 idx = 4*(*r++); 1794 t[0] = (MatScalar)b[idx]; 1795 t[1] = (MatScalar)b[1+idx]; 1796 t[2] = (MatScalar)b[2+idx]; 1797 t[3] = (MatScalar)b[3+idx]; 1798 for (i=1; i<n; i++) { 1799 v = aa + 16*ai[i]; 1800 vi = aj + ai[i]; 1801 nz = diag[i] - ai[i]; 1802 idx = 4*(*r++); 1803 s1 = (MatScalar)b[idx]; 1804 s2 = (MatScalar)b[1+idx]; 1805 s3 = (MatScalar)b[2+idx]; 1806 s4 = (MatScalar)b[3+idx]; 1807 while (nz--) { 1808 idx = 4*(*vi++); 1809 x1 = t[idx]; 1810 x2 = t[1+idx]; 1811 x3 = t[2+idx]; 1812 x4 = t[3+idx]; 1813 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1814 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1815 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1816 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1817 v += 16; 1818 } 1819 idx = 4*i; 1820 t[idx] = s1; 1821 t[1+idx] = s2; 1822 t[2+idx] = s3; 1823 t[3+idx] = s4; 1824 } 1825 /* backward solve the upper triangular */ 1826 for (i=n-1; i>=0; i--){ 1827 v = aa + 16*diag[i] + 16; 1828 vi = aj + diag[i] + 1; 1829 nz = ai[i+1] - diag[i] - 1; 1830 idt = 4*i; 1831 s1 = t[idt]; 1832 s2 = t[1+idt]; 1833 s3 = t[2+idt]; 1834 s4 = t[3+idt]; 1835 while (nz--) { 1836 idx = 4*(*vi++); 1837 x1 = t[idx]; 1838 x2 = t[1+idx]; 1839 x3 = t[2+idx]; 1840 x4 = t[3+idx]; 1841 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1842 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1843 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1844 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1845 v += 16; 1846 } 1847 idc = 4*(*c--); 1848 v = aa + 16*diag[i]; 1849 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1850 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1851 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1852 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1853 x[idc] = (PetscScalar)t[idt]; 1854 x[1+idc] = (PetscScalar)t[1+idt]; 1855 x[2+idc] = (PetscScalar)t[2+idt]; 1856 x[3+idc] = (PetscScalar)t[3+idt]; 1857 } 1858 1859 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1860 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1861 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1862 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1863 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1864 PetscFunctionReturn(0); 1865 } 1866 1867 #if defined (PETSC_HAVE_SSE) 1868 1869 #include PETSC_HAVE_SSE 1870 1871 #undef __FUNCT__ 1872 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1873 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1874 { 1875 /* 1876 Note: This code uses demotion of double 1877 to float when performing the mixed-mode computation. 1878 This may not be numerically reasonable for all applications. 1879 */ 1880 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1881 IS iscol=a->col,isrow=a->row; 1882 PetscErrorCode ierr; 1883 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1884 int *diag = a->diag,ai16; 1885 MatScalar *aa=a->a,*v; 1886 PetscScalar *x,*b,*t; 1887 1888 /* Make space in temp stack for 16 Byte Aligned arrays */ 1889 float ssealignedspace[11],*tmps,*tmpx; 1890 unsigned long offset; 1891 1892 PetscFunctionBegin; 1893 SSE_SCOPE_BEGIN; 1894 1895 offset = (unsigned long)ssealignedspace % 16; 1896 if (offset) offset = (16 - offset)/4; 1897 tmps = &ssealignedspace[offset]; 1898 tmpx = &ssealignedspace[offset+4]; 1899 PREFETCH_NTA(aa+16*ai[1]); 1900 1901 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1902 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1903 t = a->solve_work; 1904 1905 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1906 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1907 1908 /* forward solve the lower triangular */ 1909 idx = 4*(*r++); 1910 t[0] = b[idx]; t[1] = b[1+idx]; 1911 t[2] = b[2+idx]; t[3] = b[3+idx]; 1912 v = aa + 16*ai[1]; 1913 1914 for (i=1; i<n;) { 1915 PREFETCH_NTA(&v[8]); 1916 vi = aj + ai[i]; 1917 nz = diag[i] - ai[i]; 1918 idx = 4*(*r++); 1919 1920 /* Demote sum from double to float */ 1921 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1922 LOAD_PS(tmps,XMM7); 1923 1924 while (nz--) { 1925 PREFETCH_NTA(&v[16]); 1926 idx = 4*(*vi++); 1927 1928 /* Demote solution (so far) from double to float */ 1929 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1930 1931 /* 4x4 Matrix-Vector product with negative accumulation: */ 1932 SSE_INLINE_BEGIN_2(tmpx,v) 1933 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1934 1935 /* First Column */ 1936 SSE_COPY_PS(XMM0,XMM6) 1937 SSE_SHUFFLE(XMM0,XMM0,0x00) 1938 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1939 SSE_SUB_PS(XMM7,XMM0) 1940 1941 /* Second Column */ 1942 SSE_COPY_PS(XMM1,XMM6) 1943 SSE_SHUFFLE(XMM1,XMM1,0x55) 1944 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1945 SSE_SUB_PS(XMM7,XMM1) 1946 1947 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1948 1949 /* Third Column */ 1950 SSE_COPY_PS(XMM2,XMM6) 1951 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1952 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1953 SSE_SUB_PS(XMM7,XMM2) 1954 1955 /* Fourth Column */ 1956 SSE_COPY_PS(XMM3,XMM6) 1957 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1958 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1959 SSE_SUB_PS(XMM7,XMM3) 1960 SSE_INLINE_END_2 1961 1962 v += 16; 1963 } 1964 idx = 4*i; 1965 v = aa + 16*ai[++i]; 1966 PREFETCH_NTA(v); 1967 STORE_PS(tmps,XMM7); 1968 1969 /* Promote result from float to double */ 1970 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1971 } 1972 /* backward solve the upper triangular */ 1973 idt = 4*(n-1); 1974 ai16 = 16*diag[n-1]; 1975 v = aa + ai16 + 16; 1976 for (i=n-1; i>=0;){ 1977 PREFETCH_NTA(&v[8]); 1978 vi = aj + diag[i] + 1; 1979 nz = ai[i+1] - diag[i] - 1; 1980 1981 /* Demote accumulator from double to float */ 1982 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1983 LOAD_PS(tmps,XMM7); 1984 1985 while (nz--) { 1986 PREFETCH_NTA(&v[16]); 1987 idx = 4*(*vi++); 1988 1989 /* Demote solution (so far) from double to float */ 1990 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1991 1992 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1993 SSE_INLINE_BEGIN_2(tmpx,v) 1994 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1995 1996 /* First Column */ 1997 SSE_COPY_PS(XMM0,XMM6) 1998 SSE_SHUFFLE(XMM0,XMM0,0x00) 1999 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2000 SSE_SUB_PS(XMM7,XMM0) 2001 2002 /* Second Column */ 2003 SSE_COPY_PS(XMM1,XMM6) 2004 SSE_SHUFFLE(XMM1,XMM1,0x55) 2005 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2006 SSE_SUB_PS(XMM7,XMM1) 2007 2008 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2009 2010 /* Third Column */ 2011 SSE_COPY_PS(XMM2,XMM6) 2012 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2013 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2014 SSE_SUB_PS(XMM7,XMM2) 2015 2016 /* Fourth Column */ 2017 SSE_COPY_PS(XMM3,XMM6) 2018 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2019 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2020 SSE_SUB_PS(XMM7,XMM3) 2021 SSE_INLINE_END_2 2022 v += 16; 2023 } 2024 v = aa + ai16; 2025 ai16 = 16*diag[--i]; 2026 PREFETCH_NTA(aa+ai16+16); 2027 /* 2028 Scale the result by the diagonal 4x4 block, 2029 which was inverted as part of the factorization 2030 */ 2031 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2032 /* First Column */ 2033 SSE_COPY_PS(XMM0,XMM7) 2034 SSE_SHUFFLE(XMM0,XMM0,0x00) 2035 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2036 2037 /* Second Column */ 2038 SSE_COPY_PS(XMM1,XMM7) 2039 SSE_SHUFFLE(XMM1,XMM1,0x55) 2040 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2041 SSE_ADD_PS(XMM0,XMM1) 2042 2043 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2044 2045 /* Third Column */ 2046 SSE_COPY_PS(XMM2,XMM7) 2047 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2048 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2049 SSE_ADD_PS(XMM0,XMM2) 2050 2051 /* Fourth Column */ 2052 SSE_COPY_PS(XMM3,XMM7) 2053 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2054 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2055 SSE_ADD_PS(XMM0,XMM3) 2056 2057 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2058 SSE_INLINE_END_3 2059 2060 /* Promote solution from float to double */ 2061 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2062 2063 /* Apply reordering to t and stream into x. */ 2064 /* This way, x doesn't pollute the cache. */ 2065 /* Be careful with size: 2 doubles = 4 floats! */ 2066 idc = 4*(*c--); 2067 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2068 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2069 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2070 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2071 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2072 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2073 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2074 SSE_INLINE_END_2 2075 v = aa + ai16 + 16; 2076 idt -= 4; 2077 } 2078 2079 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2080 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2081 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2082 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2083 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2084 SSE_SCOPE_END; 2085 PetscFunctionReturn(0); 2086 } 2087 2088 #endif 2089 2090 2091 /* 2092 Special case where the matrix was ILU(0) factored in the natural 2093 ordering. This eliminates the need for the column and row permutation. 2094 */ 2095 #undef __FUNCT__ 2096 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2097 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2098 { 2099 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2100 int n=a->mbs,*ai=a->i,*aj=a->j; 2101 PetscErrorCode ierr; 2102 int *diag = a->diag; 2103 MatScalar *aa=a->a; 2104 PetscScalar *x,*b; 2105 2106 PetscFunctionBegin; 2107 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2108 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2109 2110 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2111 { 2112 static PetscScalar w[2000]; /* very BAD need to fix */ 2113 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2114 } 2115 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2116 { 2117 static PetscScalar w[2000]; /* very BAD need to fix */ 2118 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2119 } 2120 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2121 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2122 #else 2123 { 2124 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2125 MatScalar *v; 2126 int jdx,idt,idx,nz,*vi,i,ai16; 2127 2128 /* forward solve the lower triangular */ 2129 idx = 0; 2130 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2131 for (i=1; i<n; i++) { 2132 v = aa + 16*ai[i]; 2133 vi = aj + ai[i]; 2134 nz = diag[i] - ai[i]; 2135 idx += 4; 2136 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2137 while (nz--) { 2138 jdx = 4*(*vi++); 2139 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2140 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2141 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2142 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2143 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2144 v += 16; 2145 } 2146 x[idx] = s1; 2147 x[1+idx] = s2; 2148 x[2+idx] = s3; 2149 x[3+idx] = s4; 2150 } 2151 /* backward solve the upper triangular */ 2152 idt = 4*(n-1); 2153 for (i=n-1; i>=0; i--){ 2154 ai16 = 16*diag[i]; 2155 v = aa + ai16 + 16; 2156 vi = aj + diag[i] + 1; 2157 nz = ai[i+1] - diag[i] - 1; 2158 s1 = x[idt]; s2 = x[1+idt]; 2159 s3 = x[2+idt];s4 = x[3+idt]; 2160 while (nz--) { 2161 idx = 4*(*vi++); 2162 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2163 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2164 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2165 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2166 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2167 v += 16; 2168 } 2169 v = aa + ai16; 2170 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2171 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2172 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2173 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2174 idt -= 4; 2175 } 2176 } 2177 #endif 2178 2179 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2180 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2181 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2182 PetscFunctionReturn(0); 2183 } 2184 2185 #undef __FUNCT__ 2186 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2187 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2188 { 2189 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2190 int n=a->mbs,*ai=a->i,*aj=a->j; 2191 PetscErrorCode ierr; 2192 int *diag = a->diag; 2193 MatScalar *aa=a->a; 2194 PetscScalar *x,*b; 2195 2196 PetscFunctionBegin; 2197 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2198 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2199 2200 { 2201 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2202 MatScalar *v,*t=(MatScalar *)x; 2203 int jdx,idt,idx,nz,*vi,i,ai16; 2204 2205 /* forward solve the lower triangular */ 2206 idx = 0; 2207 t[0] = (MatScalar)b[0]; 2208 t[1] = (MatScalar)b[1]; 2209 t[2] = (MatScalar)b[2]; 2210 t[3] = (MatScalar)b[3]; 2211 for (i=1; i<n; i++) { 2212 v = aa + 16*ai[i]; 2213 vi = aj + ai[i]; 2214 nz = diag[i] - ai[i]; 2215 idx += 4; 2216 s1 = (MatScalar)b[idx]; 2217 s2 = (MatScalar)b[1+idx]; 2218 s3 = (MatScalar)b[2+idx]; 2219 s4 = (MatScalar)b[3+idx]; 2220 while (nz--) { 2221 jdx = 4*(*vi++); 2222 x1 = t[jdx]; 2223 x2 = t[1+jdx]; 2224 x3 = t[2+jdx]; 2225 x4 = t[3+jdx]; 2226 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2227 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2228 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2229 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2230 v += 16; 2231 } 2232 t[idx] = s1; 2233 t[1+idx] = s2; 2234 t[2+idx] = s3; 2235 t[3+idx] = s4; 2236 } 2237 /* backward solve the upper triangular */ 2238 idt = 4*(n-1); 2239 for (i=n-1; i>=0; i--){ 2240 ai16 = 16*diag[i]; 2241 v = aa + ai16 + 16; 2242 vi = aj + diag[i] + 1; 2243 nz = ai[i+1] - diag[i] - 1; 2244 s1 = t[idt]; 2245 s2 = t[1+idt]; 2246 s3 = t[2+idt]; 2247 s4 = t[3+idt]; 2248 while (nz--) { 2249 idx = 4*(*vi++); 2250 x1 = (MatScalar)x[idx]; 2251 x2 = (MatScalar)x[1+idx]; 2252 x3 = (MatScalar)x[2+idx]; 2253 x4 = (MatScalar)x[3+idx]; 2254 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2255 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2256 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2257 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2258 v += 16; 2259 } 2260 v = aa + ai16; 2261 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2262 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2263 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2264 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2265 idt -= 4; 2266 } 2267 } 2268 2269 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2270 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2271 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2272 PetscFunctionReturn(0); 2273 } 2274 2275 #if defined (PETSC_HAVE_SSE) 2276 2277 #include PETSC_HAVE_SSE 2278 #undef __FUNCT__ 2279 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2280 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2281 { 2282 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2283 unsigned short *aj=(unsigned short *)a->j; 2284 PetscErrorCode ierr; 2285 int *ai=a->i,n=a->mbs,*diag = a->diag; 2286 MatScalar *aa=a->a; 2287 PetscScalar *x,*b; 2288 2289 PetscFunctionBegin; 2290 SSE_SCOPE_BEGIN; 2291 /* 2292 Note: This code currently uses demotion of double 2293 to float when performing the mixed-mode computation. 2294 This may not be numerically reasonable for all applications. 2295 */ 2296 PREFETCH_NTA(aa+16*ai[1]); 2297 2298 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2299 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2300 { 2301 /* x will first be computed in single precision then promoted inplace to double */ 2302 MatScalar *v,*t=(MatScalar *)x; 2303 int nz,i,idt,ai16; 2304 unsigned int jdx,idx; 2305 unsigned short *vi; 2306 /* Forward solve the lower triangular factor. */ 2307 2308 /* First block is the identity. */ 2309 idx = 0; 2310 CONVERT_DOUBLE4_FLOAT4(t,b); 2311 v = aa + 16*((unsigned int)ai[1]); 2312 2313 for (i=1; i<n;) { 2314 PREFETCH_NTA(&v[8]); 2315 vi = aj + ai[i]; 2316 nz = diag[i] - ai[i]; 2317 idx += 4; 2318 2319 /* Demote RHS from double to float. */ 2320 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2321 LOAD_PS(&t[idx],XMM7); 2322 2323 while (nz--) { 2324 PREFETCH_NTA(&v[16]); 2325 jdx = 4*((unsigned int)(*vi++)); 2326 2327 /* 4x4 Matrix-Vector product with negative accumulation: */ 2328 SSE_INLINE_BEGIN_2(&t[jdx],v) 2329 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2330 2331 /* First Column */ 2332 SSE_COPY_PS(XMM0,XMM6) 2333 SSE_SHUFFLE(XMM0,XMM0,0x00) 2334 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2335 SSE_SUB_PS(XMM7,XMM0) 2336 2337 /* Second Column */ 2338 SSE_COPY_PS(XMM1,XMM6) 2339 SSE_SHUFFLE(XMM1,XMM1,0x55) 2340 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2341 SSE_SUB_PS(XMM7,XMM1) 2342 2343 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2344 2345 /* Third Column */ 2346 SSE_COPY_PS(XMM2,XMM6) 2347 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2348 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2349 SSE_SUB_PS(XMM7,XMM2) 2350 2351 /* Fourth Column */ 2352 SSE_COPY_PS(XMM3,XMM6) 2353 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2354 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2355 SSE_SUB_PS(XMM7,XMM3) 2356 SSE_INLINE_END_2 2357 2358 v += 16; 2359 } 2360 v = aa + 16*ai[++i]; 2361 PREFETCH_NTA(v); 2362 STORE_PS(&t[idx],XMM7); 2363 } 2364 2365 /* Backward solve the upper triangular factor.*/ 2366 2367 idt = 4*(n-1); 2368 ai16 = 16*diag[n-1]; 2369 v = aa + ai16 + 16; 2370 for (i=n-1; i>=0;){ 2371 PREFETCH_NTA(&v[8]); 2372 vi = aj + diag[i] + 1; 2373 nz = ai[i+1] - diag[i] - 1; 2374 2375 LOAD_PS(&t[idt],XMM7); 2376 2377 while (nz--) { 2378 PREFETCH_NTA(&v[16]); 2379 idx = 4*((unsigned int)(*vi++)); 2380 2381 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2382 SSE_INLINE_BEGIN_2(&t[idx],v) 2383 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2384 2385 /* First Column */ 2386 SSE_COPY_PS(XMM0,XMM6) 2387 SSE_SHUFFLE(XMM0,XMM0,0x00) 2388 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2389 SSE_SUB_PS(XMM7,XMM0) 2390 2391 /* Second Column */ 2392 SSE_COPY_PS(XMM1,XMM6) 2393 SSE_SHUFFLE(XMM1,XMM1,0x55) 2394 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2395 SSE_SUB_PS(XMM7,XMM1) 2396 2397 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2398 2399 /* Third Column */ 2400 SSE_COPY_PS(XMM2,XMM6) 2401 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2402 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2403 SSE_SUB_PS(XMM7,XMM2) 2404 2405 /* Fourth Column */ 2406 SSE_COPY_PS(XMM3,XMM6) 2407 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2408 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2409 SSE_SUB_PS(XMM7,XMM3) 2410 SSE_INLINE_END_2 2411 v += 16; 2412 } 2413 v = aa + ai16; 2414 ai16 = 16*diag[--i]; 2415 PREFETCH_NTA(aa+ai16+16); 2416 /* 2417 Scale the result by the diagonal 4x4 block, 2418 which was inverted as part of the factorization 2419 */ 2420 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2421 /* First Column */ 2422 SSE_COPY_PS(XMM0,XMM7) 2423 SSE_SHUFFLE(XMM0,XMM0,0x00) 2424 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2425 2426 /* Second Column */ 2427 SSE_COPY_PS(XMM1,XMM7) 2428 SSE_SHUFFLE(XMM1,XMM1,0x55) 2429 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2430 SSE_ADD_PS(XMM0,XMM1) 2431 2432 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2433 2434 /* Third Column */ 2435 SSE_COPY_PS(XMM2,XMM7) 2436 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2437 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2438 SSE_ADD_PS(XMM0,XMM2) 2439 2440 /* Fourth Column */ 2441 SSE_COPY_PS(XMM3,XMM7) 2442 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2443 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2444 SSE_ADD_PS(XMM0,XMM3) 2445 2446 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2447 SSE_INLINE_END_3 2448 2449 v = aa + ai16 + 16; 2450 idt -= 4; 2451 } 2452 2453 /* Convert t from single precision back to double precision (inplace)*/ 2454 idt = 4*(n-1); 2455 for (i=n-1;i>=0;i--) { 2456 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2457 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2458 PetscScalar *xtemp=&x[idt]; 2459 MatScalar *ttemp=&t[idt]; 2460 xtemp[3] = (PetscScalar)ttemp[3]; 2461 xtemp[2] = (PetscScalar)ttemp[2]; 2462 xtemp[1] = (PetscScalar)ttemp[1]; 2463 xtemp[0] = (PetscScalar)ttemp[0]; 2464 idt -= 4; 2465 } 2466 2467 } /* End of artificial scope. */ 2468 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2469 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2470 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2471 SSE_SCOPE_END; 2472 PetscFunctionReturn(0); 2473 } 2474 2475 #undef __FUNCT__ 2476 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2477 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2478 { 2479 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2480 int *aj=a->j; 2481 PetscErrorCode ierr; 2482 int *ai=a->i,n=a->mbs,*diag = a->diag; 2483 MatScalar *aa=a->a; 2484 PetscScalar *x,*b; 2485 2486 PetscFunctionBegin; 2487 SSE_SCOPE_BEGIN; 2488 /* 2489 Note: This code currently uses demotion of double 2490 to float when performing the mixed-mode computation. 2491 This may not be numerically reasonable for all applications. 2492 */ 2493 PREFETCH_NTA(aa+16*ai[1]); 2494 2495 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2496 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2497 { 2498 /* x will first be computed in single precision then promoted inplace to double */ 2499 MatScalar *v,*t=(MatScalar *)x; 2500 int nz,i,idt,ai16; 2501 int jdx,idx; 2502 int *vi; 2503 /* Forward solve the lower triangular factor. */ 2504 2505 /* First block is the identity. */ 2506 idx = 0; 2507 CONVERT_DOUBLE4_FLOAT4(t,b); 2508 v = aa + 16*ai[1]; 2509 2510 for (i=1; i<n;) { 2511 PREFETCH_NTA(&v[8]); 2512 vi = aj + ai[i]; 2513 nz = diag[i] - ai[i]; 2514 idx += 4; 2515 2516 /* Demote RHS from double to float. */ 2517 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2518 LOAD_PS(&t[idx],XMM7); 2519 2520 while (nz--) { 2521 PREFETCH_NTA(&v[16]); 2522 jdx = 4*(*vi++); 2523 /* jdx = *vi++; */ 2524 2525 /* 4x4 Matrix-Vector product with negative accumulation: */ 2526 SSE_INLINE_BEGIN_2(&t[jdx],v) 2527 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2528 2529 /* First Column */ 2530 SSE_COPY_PS(XMM0,XMM6) 2531 SSE_SHUFFLE(XMM0,XMM0,0x00) 2532 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2533 SSE_SUB_PS(XMM7,XMM0) 2534 2535 /* Second Column */ 2536 SSE_COPY_PS(XMM1,XMM6) 2537 SSE_SHUFFLE(XMM1,XMM1,0x55) 2538 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2539 SSE_SUB_PS(XMM7,XMM1) 2540 2541 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2542 2543 /* Third Column */ 2544 SSE_COPY_PS(XMM2,XMM6) 2545 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2546 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2547 SSE_SUB_PS(XMM7,XMM2) 2548 2549 /* Fourth Column */ 2550 SSE_COPY_PS(XMM3,XMM6) 2551 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2552 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2553 SSE_SUB_PS(XMM7,XMM3) 2554 SSE_INLINE_END_2 2555 2556 v += 16; 2557 } 2558 v = aa + 16*ai[++i]; 2559 PREFETCH_NTA(v); 2560 STORE_PS(&t[idx],XMM7); 2561 } 2562 2563 /* Backward solve the upper triangular factor.*/ 2564 2565 idt = 4*(n-1); 2566 ai16 = 16*diag[n-1]; 2567 v = aa + ai16 + 16; 2568 for (i=n-1; i>=0;){ 2569 PREFETCH_NTA(&v[8]); 2570 vi = aj + diag[i] + 1; 2571 nz = ai[i+1] - diag[i] - 1; 2572 2573 LOAD_PS(&t[idt],XMM7); 2574 2575 while (nz--) { 2576 PREFETCH_NTA(&v[16]); 2577 idx = 4*(*vi++); 2578 /* idx = *vi++; */ 2579 2580 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2581 SSE_INLINE_BEGIN_2(&t[idx],v) 2582 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2583 2584 /* First Column */ 2585 SSE_COPY_PS(XMM0,XMM6) 2586 SSE_SHUFFLE(XMM0,XMM0,0x00) 2587 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2588 SSE_SUB_PS(XMM7,XMM0) 2589 2590 /* Second Column */ 2591 SSE_COPY_PS(XMM1,XMM6) 2592 SSE_SHUFFLE(XMM1,XMM1,0x55) 2593 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2594 SSE_SUB_PS(XMM7,XMM1) 2595 2596 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2597 2598 /* Third Column */ 2599 SSE_COPY_PS(XMM2,XMM6) 2600 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2601 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2602 SSE_SUB_PS(XMM7,XMM2) 2603 2604 /* Fourth Column */ 2605 SSE_COPY_PS(XMM3,XMM6) 2606 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2607 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2608 SSE_SUB_PS(XMM7,XMM3) 2609 SSE_INLINE_END_2 2610 v += 16; 2611 } 2612 v = aa + ai16; 2613 ai16 = 16*diag[--i]; 2614 PREFETCH_NTA(aa+ai16+16); 2615 /* 2616 Scale the result by the diagonal 4x4 block, 2617 which was inverted as part of the factorization 2618 */ 2619 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2620 /* First Column */ 2621 SSE_COPY_PS(XMM0,XMM7) 2622 SSE_SHUFFLE(XMM0,XMM0,0x00) 2623 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2624 2625 /* Second Column */ 2626 SSE_COPY_PS(XMM1,XMM7) 2627 SSE_SHUFFLE(XMM1,XMM1,0x55) 2628 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2629 SSE_ADD_PS(XMM0,XMM1) 2630 2631 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2632 2633 /* Third Column */ 2634 SSE_COPY_PS(XMM2,XMM7) 2635 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2636 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2637 SSE_ADD_PS(XMM0,XMM2) 2638 2639 /* Fourth Column */ 2640 SSE_COPY_PS(XMM3,XMM7) 2641 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2642 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2643 SSE_ADD_PS(XMM0,XMM3) 2644 2645 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2646 SSE_INLINE_END_3 2647 2648 v = aa + ai16 + 16; 2649 idt -= 4; 2650 } 2651 2652 /* Convert t from single precision back to double precision (inplace)*/ 2653 idt = 4*(n-1); 2654 for (i=n-1;i>=0;i--) { 2655 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2656 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2657 PetscScalar *xtemp=&x[idt]; 2658 MatScalar *ttemp=&t[idt]; 2659 xtemp[3] = (PetscScalar)ttemp[3]; 2660 xtemp[2] = (PetscScalar)ttemp[2]; 2661 xtemp[1] = (PetscScalar)ttemp[1]; 2662 xtemp[0] = (PetscScalar)ttemp[0]; 2663 idt -= 4; 2664 } 2665 2666 } /* End of artificial scope. */ 2667 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2668 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2669 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2670 SSE_SCOPE_END; 2671 PetscFunctionReturn(0); 2672 } 2673 2674 #endif 2675 #undef __FUNCT__ 2676 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2677 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2678 { 2679 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2680 IS iscol=a->col,isrow=a->row; 2681 PetscErrorCode ierr; 2682 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2683 int *diag = a->diag; 2684 MatScalar *aa=a->a,*v; 2685 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2686 2687 PetscFunctionBegin; 2688 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2689 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2690 t = a->solve_work; 2691 2692 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2693 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2694 2695 /* forward solve the lower triangular */ 2696 idx = 3*(*r++); 2697 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2698 for (i=1; i<n; i++) { 2699 v = aa + 9*ai[i]; 2700 vi = aj + ai[i]; 2701 nz = diag[i] - ai[i]; 2702 idx = 3*(*r++); 2703 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2704 while (nz--) { 2705 idx = 3*(*vi++); 2706 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2707 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2708 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2709 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2710 v += 9; 2711 } 2712 idx = 3*i; 2713 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2714 } 2715 /* backward solve the upper triangular */ 2716 for (i=n-1; i>=0; i--){ 2717 v = aa + 9*diag[i] + 9; 2718 vi = aj + diag[i] + 1; 2719 nz = ai[i+1] - diag[i] - 1; 2720 idt = 3*i; 2721 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2722 while (nz--) { 2723 idx = 3*(*vi++); 2724 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2725 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2726 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2727 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2728 v += 9; 2729 } 2730 idc = 3*(*c--); 2731 v = aa + 9*diag[i]; 2732 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2733 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2734 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2735 } 2736 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2737 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2738 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2739 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2740 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2741 PetscFunctionReturn(0); 2742 } 2743 2744 /* 2745 Special case where the matrix was ILU(0) factored in the natural 2746 ordering. This eliminates the need for the column and row permutation. 2747 */ 2748 #undef __FUNCT__ 2749 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2750 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2751 { 2752 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2753 int n=a->mbs,*ai=a->i,*aj=a->j; 2754 PetscErrorCode ierr; 2755 int *diag = a->diag; 2756 MatScalar *aa=a->a,*v; 2757 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2758 int jdx,idt,idx,nz,*vi,i; 2759 2760 PetscFunctionBegin; 2761 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2762 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2763 2764 2765 /* forward solve the lower triangular */ 2766 idx = 0; 2767 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2768 for (i=1; i<n; i++) { 2769 v = aa + 9*ai[i]; 2770 vi = aj + ai[i]; 2771 nz = diag[i] - ai[i]; 2772 idx += 3; 2773 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2774 while (nz--) { 2775 jdx = 3*(*vi++); 2776 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2777 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2778 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2779 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2780 v += 9; 2781 } 2782 x[idx] = s1; 2783 x[1+idx] = s2; 2784 x[2+idx] = s3; 2785 } 2786 /* backward solve the upper triangular */ 2787 for (i=n-1; i>=0; i--){ 2788 v = aa + 9*diag[i] + 9; 2789 vi = aj + diag[i] + 1; 2790 nz = ai[i+1] - diag[i] - 1; 2791 idt = 3*i; 2792 s1 = x[idt]; s2 = x[1+idt]; 2793 s3 = x[2+idt]; 2794 while (nz--) { 2795 idx = 3*(*vi++); 2796 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2797 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2798 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2799 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2800 v += 9; 2801 } 2802 v = aa + 9*diag[i]; 2803 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2804 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2805 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2806 } 2807 2808 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2809 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2810 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2811 PetscFunctionReturn(0); 2812 } 2813 2814 #undef __FUNCT__ 2815 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2816 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2817 { 2818 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2819 IS iscol=a->col,isrow=a->row; 2820 PetscErrorCode ierr; 2821 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2822 int *diag = a->diag; 2823 MatScalar *aa=a->a,*v; 2824 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2825 2826 PetscFunctionBegin; 2827 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2828 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2829 t = a->solve_work; 2830 2831 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2832 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2833 2834 /* forward solve the lower triangular */ 2835 idx = 2*(*r++); 2836 t[0] = b[idx]; t[1] = b[1+idx]; 2837 for (i=1; i<n; i++) { 2838 v = aa + 4*ai[i]; 2839 vi = aj + ai[i]; 2840 nz = diag[i] - ai[i]; 2841 idx = 2*(*r++); 2842 s1 = b[idx]; s2 = b[1+idx]; 2843 while (nz--) { 2844 idx = 2*(*vi++); 2845 x1 = t[idx]; x2 = t[1+idx]; 2846 s1 -= v[0]*x1 + v[2]*x2; 2847 s2 -= v[1]*x1 + v[3]*x2; 2848 v += 4; 2849 } 2850 idx = 2*i; 2851 t[idx] = s1; t[1+idx] = s2; 2852 } 2853 /* backward solve the upper triangular */ 2854 for (i=n-1; i>=0; i--){ 2855 v = aa + 4*diag[i] + 4; 2856 vi = aj + diag[i] + 1; 2857 nz = ai[i+1] - diag[i] - 1; 2858 idt = 2*i; 2859 s1 = t[idt]; s2 = t[1+idt]; 2860 while (nz--) { 2861 idx = 2*(*vi++); 2862 x1 = t[idx]; x2 = t[1+idx]; 2863 s1 -= v[0]*x1 + v[2]*x2; 2864 s2 -= v[1]*x1 + v[3]*x2; 2865 v += 4; 2866 } 2867 idc = 2*(*c--); 2868 v = aa + 4*diag[i]; 2869 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2870 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2871 } 2872 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2873 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2874 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2875 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2876 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2877 PetscFunctionReturn(0); 2878 } 2879 2880 /* 2881 Special case where the matrix was ILU(0) factored in the natural 2882 ordering. This eliminates the need for the column and row permutation. 2883 */ 2884 #undef __FUNCT__ 2885 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2886 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2887 { 2888 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2889 int n=a->mbs,*ai=a->i,*aj=a->j; 2890 PetscErrorCode ierr; 2891 int *diag = a->diag; 2892 MatScalar *aa=a->a,*v; 2893 PetscScalar *x,*b,s1,s2,x1,x2; 2894 int jdx,idt,idx,nz,*vi,i; 2895 2896 PetscFunctionBegin; 2897 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2898 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2899 2900 /* forward solve the lower triangular */ 2901 idx = 0; 2902 x[0] = b[0]; x[1] = b[1]; 2903 for (i=1; i<n; i++) { 2904 v = aa + 4*ai[i]; 2905 vi = aj + ai[i]; 2906 nz = diag[i] - ai[i]; 2907 idx += 2; 2908 s1 = b[idx];s2 = b[1+idx]; 2909 while (nz--) { 2910 jdx = 2*(*vi++); 2911 x1 = x[jdx];x2 = x[1+jdx]; 2912 s1 -= v[0]*x1 + v[2]*x2; 2913 s2 -= v[1]*x1 + v[3]*x2; 2914 v += 4; 2915 } 2916 x[idx] = s1; 2917 x[1+idx] = s2; 2918 } 2919 /* backward solve the upper triangular */ 2920 for (i=n-1; i>=0; i--){ 2921 v = aa + 4*diag[i] + 4; 2922 vi = aj + diag[i] + 1; 2923 nz = ai[i+1] - diag[i] - 1; 2924 idt = 2*i; 2925 s1 = x[idt]; s2 = x[1+idt]; 2926 while (nz--) { 2927 idx = 2*(*vi++); 2928 x1 = x[idx]; x2 = x[1+idx]; 2929 s1 -= v[0]*x1 + v[2]*x2; 2930 s2 -= v[1]*x1 + v[3]*x2; 2931 v += 4; 2932 } 2933 v = aa + 4*diag[i]; 2934 x[idt] = v[0]*s1 + v[2]*s2; 2935 x[1+idt] = v[1]*s1 + v[3]*s2; 2936 } 2937 2938 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2939 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2940 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2941 PetscFunctionReturn(0); 2942 } 2943 2944 #undef __FUNCT__ 2945 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2946 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2947 { 2948 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2949 IS iscol=a->col,isrow=a->row; 2950 PetscErrorCode ierr; 2951 int *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2952 int *diag = a->diag; 2953 MatScalar *aa=a->a,*v; 2954 PetscScalar *x,*b,s1,*t; 2955 2956 PetscFunctionBegin; 2957 if (!n) PetscFunctionReturn(0); 2958 2959 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2960 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2961 t = a->solve_work; 2962 2963 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2964 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2965 2966 /* forward solve the lower triangular */ 2967 t[0] = b[*r++]; 2968 for (i=1; i<n; i++) { 2969 v = aa + ai[i]; 2970 vi = aj + ai[i]; 2971 nz = diag[i] - ai[i]; 2972 s1 = b[*r++]; 2973 while (nz--) { 2974 s1 -= (*v++)*t[*vi++]; 2975 } 2976 t[i] = s1; 2977 } 2978 /* backward solve the upper triangular */ 2979 for (i=n-1; i>=0; i--){ 2980 v = aa + diag[i] + 1; 2981 vi = aj + diag[i] + 1; 2982 nz = ai[i+1] - diag[i] - 1; 2983 s1 = t[i]; 2984 while (nz--) { 2985 s1 -= (*v++)*t[*vi++]; 2986 } 2987 x[*c--] = t[i] = aa[diag[i]]*s1; 2988 } 2989 2990 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2991 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2992 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2993 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2994 PetscLogFlops(2*1*(a->nz) - A->n); 2995 PetscFunctionReturn(0); 2996 } 2997 /* 2998 Special case where the matrix was ILU(0) factored in the natural 2999 ordering. This eliminates the need for the column and row permutation. 3000 */ 3001 #undef __FUNCT__ 3002 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3003 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3004 { 3005 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3006 int n=a->mbs,*ai=a->i,*aj=a->j; 3007 PetscErrorCode ierr; 3008 int *diag = a->diag; 3009 MatScalar *aa=a->a; 3010 PetscScalar *x,*b; 3011 PetscScalar s1,x1; 3012 MatScalar *v; 3013 int jdx,idt,idx,nz,*vi,i; 3014 3015 PetscFunctionBegin; 3016 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3017 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3018 3019 /* forward solve the lower triangular */ 3020 idx = 0; 3021 x[0] = b[0]; 3022 for (i=1; i<n; i++) { 3023 v = aa + ai[i]; 3024 vi = aj + ai[i]; 3025 nz = diag[i] - ai[i]; 3026 idx += 1; 3027 s1 = b[idx]; 3028 while (nz--) { 3029 jdx = *vi++; 3030 x1 = x[jdx]; 3031 s1 -= v[0]*x1; 3032 v += 1; 3033 } 3034 x[idx] = s1; 3035 } 3036 /* backward solve the upper triangular */ 3037 for (i=n-1; i>=0; i--){ 3038 v = aa + diag[i] + 1; 3039 vi = aj + diag[i] + 1; 3040 nz = ai[i+1] - diag[i] - 1; 3041 idt = i; 3042 s1 = x[idt]; 3043 while (nz--) { 3044 idx = *vi++; 3045 x1 = x[idx]; 3046 s1 -= v[0]*x1; 3047 v += 1; 3048 } 3049 v = aa + diag[i]; 3050 x[idt] = v[0]*s1; 3051 } 3052 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3053 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3054 PetscLogFlops(2*(a->nz) - A->n); 3055 PetscFunctionReturn(0); 3056 } 3057 3058 /* ----------------------------------------------------------------*/ 3059 /* 3060 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3061 except that the data structure of Mat_SeqAIJ is slightly different. 3062 Not a good example of code reuse. 3063 */ 3064 EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat); 3065 3066 #undef __FUNCT__ 3067 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3068 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 3069 { 3070 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3071 IS isicol; 3072 PetscErrorCode ierr; 3073 int *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3074 int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3075 int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0; 3076 int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 3077 PetscTruth col_identity,row_identity; 3078 PetscReal f; 3079 3080 PetscFunctionBegin; 3081 f = info->fill; 3082 levels = (int)info->levels; 3083 diagonal_fill = (int)info->diagonal_fill; 3084 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3085 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3086 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3087 3088 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3089 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3090 (*fact)->factor = FACTOR_LU; 3091 b = (Mat_SeqBAIJ*)(*fact)->data; 3092 if (!b->diag) { 3093 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3094 } 3095 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3096 b->row = isrow; 3097 b->col = iscol; 3098 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3099 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3100 b->icol = isicol; 3101 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3102 ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3103 } else { /* general case perform the symbolic factorization */ 3104 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3105 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3106 3107 /* get new row pointers */ 3108 ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 3109 ainew[0] = 0; 3110 /* don't know how many column pointers are needed so estimate */ 3111 jmax = (int)(f*ai[n] + 1); 3112 ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 3113 /* ajfill is level of fill for each fill entry */ 3114 ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 3115 /* fill is a linked list of nonzeros in active row */ 3116 ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 3117 /* im is level for each filled value */ 3118 ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 3119 /* dloc is location of diagonal in factor */ 3120 ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 3121 dloc[0] = 0; 3122 for (prow=0; prow<n; prow++) { 3123 3124 /* copy prow into linked list */ 3125 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3126 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3127 xi = aj + ai[r[prow]]; 3128 fill[n] = n; 3129 fill[prow] = -1; /* marker for diagonal entry */ 3130 while (nz--) { 3131 fm = n; 3132 idx = ic[*xi++]; 3133 do { 3134 m = fm; 3135 fm = fill[m]; 3136 } while (fm < idx); 3137 fill[m] = idx; 3138 fill[idx] = fm; 3139 im[idx] = 0; 3140 } 3141 3142 /* make sure diagonal entry is included */ 3143 if (diagonal_fill && fill[prow] == -1) { 3144 fm = n; 3145 while (fill[fm] < prow) fm = fill[fm]; 3146 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3147 fill[fm] = prow; 3148 im[prow] = 0; 3149 nzf++; 3150 dcount++; 3151 } 3152 3153 nzi = 0; 3154 row = fill[n]; 3155 while (row < prow) { 3156 incrlev = im[row] + 1; 3157 nz = dloc[row]; 3158 xi = ajnew + ainew[row] + nz + 1; 3159 flev = ajfill + ainew[row] + nz + 1; 3160 nnz = ainew[row+1] - ainew[row] - nz - 1; 3161 fm = row; 3162 while (nnz-- > 0) { 3163 idx = *xi++; 3164 if (*flev + incrlev > levels) { 3165 flev++; 3166 continue; 3167 } 3168 do { 3169 m = fm; 3170 fm = fill[m]; 3171 } while (fm < idx); 3172 if (fm != idx) { 3173 im[idx] = *flev + incrlev; 3174 fill[m] = idx; 3175 fill[idx] = fm; 3176 fm = idx; 3177 nzf++; 3178 } else { 3179 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3180 } 3181 flev++; 3182 } 3183 row = fill[row]; 3184 nzi++; 3185 } 3186 /* copy new filled row into permanent storage */ 3187 ainew[prow+1] = ainew[prow] + nzf; 3188 if (ainew[prow+1] > jmax) { 3189 3190 /* estimate how much additional space we will need */ 3191 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3192 /* just double the memory each time */ 3193 int maxadd = jmax; 3194 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3195 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3196 jmax += maxadd; 3197 3198 /* allocate a longer ajnew and ajfill */ 3199 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3200 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3201 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3202 ajnew = xi; 3203 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3204 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3205 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3206 ajfill = xi; 3207 reallocate++; /* count how many reallocations are needed */ 3208 } 3209 xi = ajnew + ainew[prow]; 3210 flev = ajfill + ainew[prow]; 3211 dloc[prow] = nzi; 3212 fm = fill[n]; 3213 while (nzf--) { 3214 *xi++ = fm; 3215 *flev++ = im[fm]; 3216 fm = fill[fm]; 3217 } 3218 /* make sure row has diagonal entry */ 3219 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3220 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 3221 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 3222 } 3223 } 3224 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3225 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3226 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3227 ierr = PetscFree(fill);CHKERRQ(ierr); 3228 ierr = PetscFree(im);CHKERRQ(ierr); 3229 3230 { 3231 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3232 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af); 3233 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3234 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3235 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3236 if (diagonal_fill) { 3237 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount); 3238 } 3239 } 3240 3241 /* put together the new matrix */ 3242 ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr); 3243 ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr); 3244 ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr); 3245 PetscLogObjectParent(*fact,isicol); 3246 b = (Mat_SeqBAIJ*)(*fact)->data; 3247 ierr = PetscFree(b->imax);CHKERRQ(ierr); 3248 b->singlemalloc = PETSC_FALSE; 3249 len = bs2*ainew[n]*sizeof(MatScalar); 3250 /* the next line frees the default space generated by the Create() */ 3251 ierr = PetscFree(b->a);CHKERRQ(ierr); 3252 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 3253 ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 3254 b->j = ajnew; 3255 b->i = ainew; 3256 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3257 b->diag = dloc; 3258 b->ilen = 0; 3259 b->imax = 0; 3260 b->row = isrow; 3261 b->col = iscol; 3262 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3263 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3264 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3265 b->icol = isicol; 3266 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3267 /* In b structure: Free imax, ilen, old a, old j. 3268 Allocate dloc, solve_work, new a, new j */ 3269 PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 3270 b->maxnz = b->nz = ainew[n]; 3271 (*fact)->factor = FACTOR_LU; 3272 3273 (*fact)->info.factor_mallocs = reallocate; 3274 (*fact)->info.fill_ratio_given = f; 3275 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3276 } 3277 3278 if (row_identity && col_identity) { 3279 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3280 } 3281 PetscFunctionReturn(0); 3282 } 3283 3284 #undef __FUNCT__ 3285 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3286 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3287 { 3288 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3289 /* int i,*AJ=a->j,nz=a->nz; */ 3290 PetscFunctionBegin; 3291 /* Undo Column scaling */ 3292 /* while (nz--) { */ 3293 /* AJ[i] = AJ[i]/4; */ 3294 /* } */ 3295 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3296 A->ops->setunfactored = PETSC_NULL; 3297 PetscFunctionReturn(0); 3298 } 3299 3300 #undef __FUNCT__ 3301 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3302 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3303 { 3304 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3305 int *AJ=a->j,nz=a->nz; 3306 unsigned short *aj=(unsigned short *)AJ; 3307 PetscFunctionBegin; 3308 /* Is this really necessary? */ 3309 while (nz--) { 3310 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3311 } 3312 A->ops->setunfactored = PETSC_NULL; 3313 PetscFunctionReturn(0); 3314 } 3315 3316 #undef __FUNCT__ 3317 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3318 PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3319 { 3320 /* 3321 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3322 with natural ordering 3323 */ 3324 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3325 3326 PetscFunctionBegin; 3327 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3328 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3329 switch (a->bs) { 3330 case 1: 3331 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3332 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3333 break; 3334 case 2: 3335 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3336 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3337 break; 3338 case 3: 3339 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3340 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3341 break; 3342 case 4: 3343 #if defined(PETSC_USE_MAT_SINGLE) 3344 { 3345 PetscTruth sse_enabled_local; 3346 PetscErrorCode ierr; 3347 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3348 if (sse_enabled_local) { 3349 # if defined(PETSC_HAVE_SSE) 3350 int i,*AJ=a->j,nz=a->nz,n=a->mbs; 3351 if (n==(unsigned short)n) { 3352 unsigned short *aj=(unsigned short *)AJ; 3353 for (i=0;i<nz;i++) { 3354 aj[i] = (unsigned short)AJ[i]; 3355 } 3356 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3357 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3358 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"); 3359 } else { 3360 /* Scale the column indices for easier indexing in MatSolve. */ 3361 /* for (i=0;i<nz;i++) { */ 3362 /* AJ[i] = AJ[i]*4; */ 3363 /* } */ 3364 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 3365 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3366 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"); 3367 } 3368 # else 3369 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3370 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3371 # endif 3372 } else { 3373 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3374 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3375 } 3376 } 3377 #else 3378 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3379 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3380 #endif 3381 break; 3382 case 5: 3383 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3384 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3385 break; 3386 case 6: 3387 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3388 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 3389 break; 3390 case 7: 3391 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3392 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3393 break; 3394 } 3395 PetscFunctionReturn(0); 3396 } 3397 3398 #undef __FUNCT__ 3399 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3400 PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A) 3401 { 3402 /* 3403 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3404 with natural ordering 3405 */ 3406 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3407 IS row = a->row, col = a->col; 3408 PetscTruth row_identity, col_identity; 3409 PetscTruth use_natural; 3410 PetscErrorCode ierr; 3411 3412 PetscFunctionBegin; 3413 3414 use_natural = PETSC_FALSE; 3415 if (row && col) { 3416 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3417 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3418 3419 if (row_identity && col_identity) { 3420 use_natural = PETSC_TRUE; 3421 } 3422 } else { 3423 use_natural = PETSC_TRUE; 3424 } 3425 3426 switch (a->bs) { 3427 case 1: 3428 if (use_natural) { 3429 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3430 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3431 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3432 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3433 } else { 3434 A->ops->solve = MatSolve_SeqBAIJ_1; 3435 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3436 } 3437 break; 3438 case 2: 3439 if (use_natural) { 3440 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3441 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3442 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3443 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3444 } else { 3445 A->ops->solve = MatSolve_SeqBAIJ_2; 3446 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3447 } 3448 break; 3449 case 3: 3450 if (use_natural) { 3451 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3452 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3453 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3454 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3455 } else { 3456 A->ops->solve = MatSolve_SeqBAIJ_3; 3457 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3458 } 3459 break; 3460 case 4: 3461 { 3462 PetscTruth sse_enabled_local; 3463 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3464 if (use_natural) { 3465 #if defined(PETSC_USE_MAT_SINGLE) 3466 if (sse_enabled_local) { /* Natural + Single + SSE */ 3467 # if defined(PETSC_HAVE_SSE) 3468 int n=a->mbs; 3469 if (n==(unsigned short)n) { 3470 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3471 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"); 3472 } else { 3473 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3474 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"); 3475 } 3476 # else 3477 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3478 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3479 # endif 3480 } else { /* Natural + Single */ 3481 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3482 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"); 3483 } 3484 #else 3485 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3486 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3487 #endif 3488 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3489 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3490 } else { /* Arbitrary ordering */ 3491 #if defined(PETSC_USE_MAT_SINGLE) 3492 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3493 # if defined(PETSC_HAVE_SSE) 3494 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3495 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3496 # else 3497 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3498 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3499 # endif 3500 } else { /* Arbitrary + Single */ 3501 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3502 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3503 } 3504 #else 3505 A->ops->solve = MatSolve_SeqBAIJ_4; 3506 #endif 3507 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3508 } 3509 } 3510 break; 3511 case 5: 3512 if (use_natural) { 3513 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3514 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3515 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3516 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3517 } else { 3518 A->ops->solve = MatSolve_SeqBAIJ_5; 3519 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3520 } 3521 break; 3522 case 6: 3523 if (use_natural) { 3524 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3525 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3526 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3527 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3528 } else { 3529 A->ops->solve = MatSolve_SeqBAIJ_6; 3530 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3531 } 3532 break; 3533 case 7: 3534 if (use_natural) { 3535 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3536 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3537 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3538 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3539 } else { 3540 A->ops->solve = MatSolve_SeqBAIJ_7; 3541 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3542 } 3543 break; 3544 default: 3545 A->ops->solve = MatSolve_SeqBAIJ_N; 3546 break; 3547 } 3548 PetscFunctionReturn(0); 3549 } 3550 3551 #undef __FUNCT__ 3552 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3553 PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3554 PetscErrorCode ierr; 3555 3556 PetscFunctionBegin; 3557 ierr = MatSeqBAIJ_UpdateSolvers(A); 3558 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3559 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3560 } else { 3561 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3562 } 3563 PetscFunctionReturn(0); 3564 } 3565 3566 #undef __FUNCT__ 3567 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3568 PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3569 PetscErrorCode ierr; 3570 3571 PetscFunctionBegin; 3572 ierr = MatSeqBAIJ_UpdateSolvers(A); 3573 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3574 PetscFunctionReturn(0); 3575 } 3576 3577 3578