1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 7 #include "src/mat/impls/baij/seq/baij.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 PetscErrorCode ierr; 17 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 PetscInt *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 PetscScalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 24 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 27 /* forward solve the U^T */ 28 for (i=0; i<n; i++) { 29 30 v = aa + diag[i]; 31 /* multiply by the inverse of the block diagonal */ 32 s1 = (*v++)*x[i]; 33 vi = aj + diag[i] + 1; 34 nz = ai[i+1] - diag[i] - 1; 35 while (nz--) { 36 x[*vi++] -= (*v++)*s1; 37 } 38 x[i] = s1; 39 } 40 /* backward solve the L^T */ 41 for (i=n-1; i>=0; i--){ 42 v = aa + diag[i] - 1; 43 vi = aj + diag[i] - 1; 44 nz = diag[i] - ai[i]; 45 s1 = x[i]; 46 while (nz--) { 47 x[*vi--] -= (*v--)*s1; 48 } 49 } 50 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 51 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52 ierr = PetscLogFlops(2*(a->nz) - A->cmap.n);CHKERRQ(ierr); 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 59 { 60 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 61 PetscErrorCode ierr; 62 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 63 PetscInt *diag = a->diag,oidx; 64 MatScalar *aa=a->a,*v; 65 PetscScalar s1,s2,x1,x2; 66 PetscScalar *x,*b; 67 68 PetscFunctionBegin; 69 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 70 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 71 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 72 73 /* forward solve the U^T */ 74 idx = 0; 75 for (i=0; i<n; i++) { 76 77 v = aa + 4*diag[i]; 78 /* multiply by the inverse of the block diagonal */ 79 x1 = x[idx]; x2 = x[1+idx]; 80 s1 = v[0]*x1 + v[1]*x2; 81 s2 = v[2]*x1 + v[3]*x2; 82 v += 4; 83 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 oidx = 2*(*vi++); 88 x[oidx] -= v[0]*s1 + v[1]*s2; 89 x[oidx+1] -= v[2]*s1 + v[3]*s2; 90 v += 4; 91 } 92 x[idx] = s1;x[1+idx] = s2; 93 idx += 2; 94 } 95 /* backward solve the L^T */ 96 for (i=n-1; i>=0; i--){ 97 v = aa + 4*diag[i] - 4; 98 vi = aj + diag[i] - 1; 99 nz = diag[i] - ai[i]; 100 idt = 2*i; 101 s1 = x[idt]; s2 = x[1+idt]; 102 while (nz--) { 103 idx = 2*(*vi--); 104 x[idx] -= v[0]*s1 + v[1]*s2; 105 x[idx+1] -= v[2]*s1 + v[3]*s2; 106 v -= 4; 107 } 108 } 109 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 111 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr); 112 PetscFunctionReturn(0); 113 } 114 115 #undef __FUNCT__ 116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 118 { 119 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 120 PetscErrorCode ierr; 121 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 122 PetscInt *diag = a->diag,oidx; 123 MatScalar *aa=a->a,*v; 124 PetscScalar s1,s2,s3,x1,x2,x3; 125 PetscScalar *x,*b; 126 127 PetscFunctionBegin; 128 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 129 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 130 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131 132 /* forward solve the U^T */ 133 idx = 0; 134 for (i=0; i<n; i++) { 135 136 v = aa + 9*diag[i]; 137 /* multiply by the inverse of the block diagonal */ 138 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 140 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 141 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 142 v += 9; 143 144 vi = aj + diag[i] + 1; 145 nz = ai[i+1] - diag[i] - 1; 146 while (nz--) { 147 oidx = 3*(*vi++); 148 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 149 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 150 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 151 v += 9; 152 } 153 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 154 idx += 3; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + 9*diag[i] - 9; 159 vi = aj + diag[i] - 1; 160 nz = diag[i] - ai[i]; 161 idt = 3*i; 162 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 163 while (nz--) { 164 idx = 3*(*vi--); 165 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 166 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 167 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 168 v -= 9; 169 } 170 } 171 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 172 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 173 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr); 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 180 { 181 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182 PetscErrorCode ierr; 183 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 184 PetscInt *diag = a->diag,oidx; 185 MatScalar *aa=a->a,*v; 186 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 187 PetscScalar *x,*b; 188 189 PetscFunctionBegin; 190 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 191 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 192 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193 194 /* forward solve the U^T */ 195 idx = 0; 196 for (i=0; i<n; i++) { 197 198 v = aa + 16*diag[i]; 199 /* multiply by the inverse of the block diagonal */ 200 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 201 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 202 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 203 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 204 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 205 v += 16; 206 207 vi = aj + diag[i] + 1; 208 nz = ai[i+1] - diag[i] - 1; 209 while (nz--) { 210 oidx = 4*(*vi++); 211 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 212 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 213 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 214 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 215 v += 16; 216 } 217 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 218 idx += 4; 219 } 220 /* backward solve the L^T */ 221 for (i=n-1; i>=0; i--){ 222 v = aa + 16*diag[i] - 16; 223 vi = aj + diag[i] - 1; 224 nz = diag[i] - ai[i]; 225 idt = 4*i; 226 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 227 while (nz--) { 228 idx = 4*(*vi--); 229 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 230 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 231 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 232 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 233 v -= 16; 234 } 235 } 236 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 237 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 238 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 #undef __FUNCT__ 243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245 { 246 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 247 PetscErrorCode ierr; 248 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 249 PetscInt *diag = a->diag,oidx; 250 MatScalar *aa=a->a,*v; 251 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 252 PetscScalar *x,*b; 253 254 PetscFunctionBegin; 255 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 256 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 257 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 258 259 /* forward solve the U^T */ 260 idx = 0; 261 for (i=0; i<n; i++) { 262 263 v = aa + 25*diag[i]; 264 /* multiply by the inverse of the block diagonal */ 265 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 266 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 267 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 268 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 269 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 270 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 271 v += 25; 272 273 vi = aj + diag[i] + 1; 274 nz = ai[i+1] - diag[i] - 1; 275 while (nz--) { 276 oidx = 5*(*vi++); 277 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 278 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 279 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 280 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 281 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 282 v += 25; 283 } 284 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 285 idx += 5; 286 } 287 /* backward solve the L^T */ 288 for (i=n-1; i>=0; i--){ 289 v = aa + 25*diag[i] - 25; 290 vi = aj + diag[i] - 1; 291 nz = diag[i] - ai[i]; 292 idt = 5*i; 293 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 294 while (nz--) { 295 idx = 5*(*vi--); 296 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 297 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 298 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 299 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 300 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 301 v -= 25; 302 } 303 } 304 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 305 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 #undef __FUNCT__ 311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 313 { 314 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 315 PetscErrorCode ierr; 316 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 317 PetscInt *diag = a->diag,oidx; 318 MatScalar *aa=a->a,*v; 319 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 320 PetscScalar *x,*b; 321 322 PetscFunctionBegin; 323 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 324 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 325 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 326 327 /* forward solve the U^T */ 328 idx = 0; 329 for (i=0; i<n; i++) { 330 331 v = aa + 36*diag[i]; 332 /* multiply by the inverse of the block diagonal */ 333 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 334 x6 = x[5+idx]; 335 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 336 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 337 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 338 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 339 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 340 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 341 v += 36; 342 343 vi = aj + diag[i] + 1; 344 nz = ai[i+1] - diag[i] - 1; 345 while (nz--) { 346 oidx = 6*(*vi++); 347 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 348 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 349 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 350 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 351 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 352 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 353 v += 36; 354 } 355 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 356 x[5+idx] = s6; 357 idx += 6; 358 } 359 /* backward solve the L^T */ 360 for (i=n-1; i>=0; i--){ 361 v = aa + 36*diag[i] - 36; 362 vi = aj + diag[i] - 1; 363 nz = diag[i] - ai[i]; 364 idt = 6*i; 365 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 366 s6 = x[5+idt]; 367 while (nz--) { 368 idx = 6*(*vi--); 369 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 370 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 371 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 372 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 373 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 374 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 375 v -= 36; 376 } 377 } 378 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 379 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 381 PetscFunctionReturn(0); 382 } 383 384 #undef __FUNCT__ 385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 387 { 388 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 389 PetscErrorCode ierr; 390 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 391 PetscInt *diag = a->diag,oidx; 392 MatScalar *aa=a->a,*v; 393 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 394 PetscScalar *x,*b; 395 396 PetscFunctionBegin; 397 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 398 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 399 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 400 401 /* forward solve the U^T */ 402 idx = 0; 403 for (i=0; i<n; i++) { 404 405 v = aa + 49*diag[i]; 406 /* multiply by the inverse of the block diagonal */ 407 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 408 x6 = x[5+idx]; x7 = x[6+idx]; 409 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 410 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 411 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 412 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 413 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 414 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 415 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 416 v += 49; 417 418 vi = aj + diag[i] + 1; 419 nz = ai[i+1] - diag[i] - 1; 420 while (nz--) { 421 oidx = 7*(*vi++); 422 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 423 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 424 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 425 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 426 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 427 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 428 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 429 v += 49; 430 } 431 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 432 x[5+idx] = s6;x[6+idx] = s7; 433 idx += 7; 434 } 435 /* backward solve the L^T */ 436 for (i=n-1; i>=0; i--){ 437 v = aa + 49*diag[i] - 49; 438 vi = aj + diag[i] - 1; 439 nz = diag[i] - ai[i]; 440 idt = 7*i; 441 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 442 s6 = x[5+idt];s7 = x[6+idt]; 443 while (nz--) { 444 idx = 7*(*vi--); 445 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 446 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 447 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 448 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 449 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 450 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 451 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 452 v -= 49; 453 } 454 } 455 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);CHKERRQ(ierr); 458 PetscFunctionReturn(0); 459 } 460 461 /*---------------------------------------------------------------------------------------------*/ 462 #undef __FUNCT__ 463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 467 IS iscol=a->col,isrow=a->row; 468 PetscErrorCode ierr; 469 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 470 PetscInt *diag = a->diag; 471 MatScalar *aa=a->a,*v; 472 PetscScalar s1,*x,*b,*t; 473 474 PetscFunctionBegin; 475 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 476 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 477 t = a->solve_work; 478 479 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 480 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 481 482 /* copy the b into temp work space according to permutation */ 483 for (i=0; i<n; i++) { 484 t[i] = b[c[i]]; 485 } 486 487 /* forward solve the U^T */ 488 for (i=0; i<n; i++) { 489 490 v = aa + diag[i]; 491 /* multiply by the inverse of the block diagonal */ 492 s1 = (*v++)*t[i]; 493 vi = aj + diag[i] + 1; 494 nz = ai[i+1] - diag[i] - 1; 495 while (nz--) { 496 t[*vi++] -= (*v++)*s1; 497 } 498 t[i] = s1; 499 } 500 /* backward solve the L^T */ 501 for (i=n-1; i>=0; i--){ 502 v = aa + diag[i] - 1; 503 vi = aj + diag[i] - 1; 504 nz = diag[i] - ai[i]; 505 s1 = t[i]; 506 while (nz--) { 507 t[*vi--] -= (*v--)*s1; 508 } 509 } 510 511 /* copy t into x according to permutation */ 512 for (i=0; i<n; i++) { 513 x[r[i]] = t[i]; 514 } 515 516 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 517 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 518 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 519 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 520 ierr = PetscLogFlops(2*(a->nz) - A->cmap.n);CHKERRQ(ierr); 521 PetscFunctionReturn(0); 522 } 523 524 #undef __FUNCT__ 525 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 526 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 527 { 528 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 529 IS iscol=a->col,isrow=a->row; 530 PetscErrorCode ierr; 531 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 532 PetscInt *diag = a->diag,ii,ic,ir,oidx; 533 MatScalar *aa=a->a,*v; 534 PetscScalar s1,s2,x1,x2; 535 PetscScalar *x,*b,*t; 536 537 PetscFunctionBegin; 538 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 539 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 540 t = a->solve_work; 541 542 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 543 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 544 545 /* copy the b into temp work space according to permutation */ 546 ii = 0; 547 for (i=0; i<n; i++) { 548 ic = 2*c[i]; 549 t[ii] = b[ic]; 550 t[ii+1] = b[ic+1]; 551 ii += 2; 552 } 553 554 /* forward solve the U^T */ 555 idx = 0; 556 for (i=0; i<n; i++) { 557 558 v = aa + 4*diag[i]; 559 /* multiply by the inverse of the block diagonal */ 560 x1 = t[idx]; x2 = t[1+idx]; 561 s1 = v[0]*x1 + v[1]*x2; 562 s2 = v[2]*x1 + v[3]*x2; 563 v += 4; 564 565 vi = aj + diag[i] + 1; 566 nz = ai[i+1] - diag[i] - 1; 567 while (nz--) { 568 oidx = 2*(*vi++); 569 t[oidx] -= v[0]*s1 + v[1]*s2; 570 t[oidx+1] -= v[2]*s1 + v[3]*s2; 571 v += 4; 572 } 573 t[idx] = s1;t[1+idx] = s2; 574 idx += 2; 575 } 576 /* backward solve the L^T */ 577 for (i=n-1; i>=0; i--){ 578 v = aa + 4*diag[i] - 4; 579 vi = aj + diag[i] - 1; 580 nz = diag[i] - ai[i]; 581 idt = 2*i; 582 s1 = t[idt]; s2 = t[1+idt]; 583 while (nz--) { 584 idx = 2*(*vi--); 585 t[idx] -= v[0]*s1 + v[1]*s2; 586 t[idx+1] -= v[2]*s1 + v[3]*s2; 587 v -= 4; 588 } 589 } 590 591 /* copy t into x according to permutation */ 592 ii = 0; 593 for (i=0; i<n; i++) { 594 ir = 2*r[i]; 595 x[ir] = t[ii]; 596 x[ir+1] = t[ii+1]; 597 ii += 2; 598 } 599 600 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 601 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 602 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 603 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 604 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr); 605 PetscFunctionReturn(0); 606 } 607 608 #undef __FUNCT__ 609 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 610 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 611 { 612 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 613 IS iscol=a->col,isrow=a->row; 614 PetscErrorCode ierr; 615 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 616 PetscInt *diag = a->diag,ii,ic,ir,oidx; 617 MatScalar *aa=a->a,*v; 618 PetscScalar s1,s2,s3,x1,x2,x3; 619 PetscScalar *x,*b,*t; 620 621 PetscFunctionBegin; 622 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 623 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 624 t = a->solve_work; 625 626 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 627 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 628 629 /* copy the b into temp work space according to permutation */ 630 ii = 0; 631 for (i=0; i<n; i++) { 632 ic = 3*c[i]; 633 t[ii] = b[ic]; 634 t[ii+1] = b[ic+1]; 635 t[ii+2] = b[ic+2]; 636 ii += 3; 637 } 638 639 /* forward solve the U^T */ 640 idx = 0; 641 for (i=0; i<n; i++) { 642 643 v = aa + 9*diag[i]; 644 /* multiply by the inverse of the block diagonal */ 645 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 646 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 647 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 648 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 649 v += 9; 650 651 vi = aj + diag[i] + 1; 652 nz = ai[i+1] - diag[i] - 1; 653 while (nz--) { 654 oidx = 3*(*vi++); 655 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 656 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 657 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 658 v += 9; 659 } 660 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 661 idx += 3; 662 } 663 /* backward solve the L^T */ 664 for (i=n-1; i>=0; i--){ 665 v = aa + 9*diag[i] - 9; 666 vi = aj + diag[i] - 1; 667 nz = diag[i] - ai[i]; 668 idt = 3*i; 669 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 670 while (nz--) { 671 idx = 3*(*vi--); 672 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 673 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 674 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 675 v -= 9; 676 } 677 } 678 679 /* copy t into x according to permutation */ 680 ii = 0; 681 for (i=0; i<n; i++) { 682 ir = 3*r[i]; 683 x[ir] = t[ii]; 684 x[ir+1] = t[ii+1]; 685 x[ir+2] = t[ii+2]; 686 ii += 3; 687 } 688 689 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 690 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 691 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 692 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 693 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr); 694 PetscFunctionReturn(0); 695 } 696 697 #undef __FUNCT__ 698 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 699 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 700 { 701 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 702 IS iscol=a->col,isrow=a->row; 703 PetscErrorCode ierr; 704 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 705 PetscInt *diag = a->diag,ii,ic,ir,oidx; 706 MatScalar *aa=a->a,*v; 707 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 708 PetscScalar *x,*b,*t; 709 710 PetscFunctionBegin; 711 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 712 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 713 t = a->solve_work; 714 715 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 716 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 717 718 /* copy the b into temp work space according to permutation */ 719 ii = 0; 720 for (i=0; i<n; i++) { 721 ic = 4*c[i]; 722 t[ii] = b[ic]; 723 t[ii+1] = b[ic+1]; 724 t[ii+2] = b[ic+2]; 725 t[ii+3] = b[ic+3]; 726 ii += 4; 727 } 728 729 /* forward solve the U^T */ 730 idx = 0; 731 for (i=0; i<n; i++) { 732 733 v = aa + 16*diag[i]; 734 /* multiply by the inverse of the block diagonal */ 735 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 736 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 737 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 738 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 739 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 740 v += 16; 741 742 vi = aj + diag[i] + 1; 743 nz = ai[i+1] - diag[i] - 1; 744 while (nz--) { 745 oidx = 4*(*vi++); 746 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 747 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 748 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 749 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 750 v += 16; 751 } 752 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 753 idx += 4; 754 } 755 /* backward solve the L^T */ 756 for (i=n-1; i>=0; i--){ 757 v = aa + 16*diag[i] - 16; 758 vi = aj + diag[i] - 1; 759 nz = diag[i] - ai[i]; 760 idt = 4*i; 761 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 762 while (nz--) { 763 idx = 4*(*vi--); 764 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 765 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 766 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 767 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 768 v -= 16; 769 } 770 } 771 772 /* copy t into x according to permutation */ 773 ii = 0; 774 for (i=0; i<n; i++) { 775 ir = 4*r[i]; 776 x[ir] = t[ii]; 777 x[ir+1] = t[ii+1]; 778 x[ir+2] = t[ii+2]; 779 x[ir+3] = t[ii+3]; 780 ii += 4; 781 } 782 783 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 784 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 785 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 786 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 787 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 788 PetscFunctionReturn(0); 789 } 790 791 #undef __FUNCT__ 792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 793 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 794 { 795 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796 IS iscol=a->col,isrow=a->row; 797 PetscErrorCode ierr; 798 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 799 PetscInt *diag = a->diag,ii,ic,ir,oidx; 800 MatScalar *aa=a->a,*v; 801 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 802 PetscScalar *x,*b,*t; 803 804 PetscFunctionBegin; 805 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 806 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 807 t = a->solve_work; 808 809 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 810 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 811 812 /* copy the b into temp work space according to permutation */ 813 ii = 0; 814 for (i=0; i<n; i++) { 815 ic = 5*c[i]; 816 t[ii] = b[ic]; 817 t[ii+1] = b[ic+1]; 818 t[ii+2] = b[ic+2]; 819 t[ii+3] = b[ic+3]; 820 t[ii+4] = b[ic+4]; 821 ii += 5; 822 } 823 824 /* forward solve the U^T */ 825 idx = 0; 826 for (i=0; i<n; i++) { 827 828 v = aa + 25*diag[i]; 829 /* multiply by the inverse of the block diagonal */ 830 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 831 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 832 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 833 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 834 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 835 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 836 v += 25; 837 838 vi = aj + diag[i] + 1; 839 nz = ai[i+1] - diag[i] - 1; 840 while (nz--) { 841 oidx = 5*(*vi++); 842 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 843 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 844 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 845 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 846 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 847 v += 25; 848 } 849 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 850 idx += 5; 851 } 852 /* backward solve the L^T */ 853 for (i=n-1; i>=0; i--){ 854 v = aa + 25*diag[i] - 25; 855 vi = aj + diag[i] - 1; 856 nz = diag[i] - ai[i]; 857 idt = 5*i; 858 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 859 while (nz--) { 860 idx = 5*(*vi--); 861 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 862 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 863 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 864 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 865 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 866 v -= 25; 867 } 868 } 869 870 /* copy t into x according to permutation */ 871 ii = 0; 872 for (i=0; i<n; i++) { 873 ir = 5*r[i]; 874 x[ir] = t[ii]; 875 x[ir+1] = t[ii+1]; 876 x[ir+2] = t[ii+2]; 877 x[ir+3] = t[ii+3]; 878 x[ir+4] = t[ii+4]; 879 ii += 5; 880 } 881 882 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 883 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 884 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 885 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 886 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr); 887 PetscFunctionReturn(0); 888 } 889 890 #undef __FUNCT__ 891 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 892 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 893 { 894 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 895 IS iscol=a->col,isrow=a->row; 896 PetscErrorCode ierr; 897 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 898 PetscInt *diag = a->diag,ii,ic,ir,oidx; 899 MatScalar *aa=a->a,*v; 900 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 901 PetscScalar *x,*b,*t; 902 903 PetscFunctionBegin; 904 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 905 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 906 t = a->solve_work; 907 908 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 909 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 910 911 /* copy the b into temp work space according to permutation */ 912 ii = 0; 913 for (i=0; i<n; i++) { 914 ic = 6*c[i]; 915 t[ii] = b[ic]; 916 t[ii+1] = b[ic+1]; 917 t[ii+2] = b[ic+2]; 918 t[ii+3] = b[ic+3]; 919 t[ii+4] = b[ic+4]; 920 t[ii+5] = b[ic+5]; 921 ii += 6; 922 } 923 924 /* forward solve the U^T */ 925 idx = 0; 926 for (i=0; i<n; i++) { 927 928 v = aa + 36*diag[i]; 929 /* multiply by the inverse of the block diagonal */ 930 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 931 x6 = t[5+idx]; 932 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 933 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 934 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 935 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 936 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 937 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 938 v += 36; 939 940 vi = aj + diag[i] + 1; 941 nz = ai[i+1] - diag[i] - 1; 942 while (nz--) { 943 oidx = 6*(*vi++); 944 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 945 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 946 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 947 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 948 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 949 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 950 v += 36; 951 } 952 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 953 t[5+idx] = s6; 954 idx += 6; 955 } 956 /* backward solve the L^T */ 957 for (i=n-1; i>=0; i--){ 958 v = aa + 36*diag[i] - 36; 959 vi = aj + diag[i] - 1; 960 nz = diag[i] - ai[i]; 961 idt = 6*i; 962 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 963 s6 = t[5+idt]; 964 while (nz--) { 965 idx = 6*(*vi--); 966 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 967 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 968 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 969 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 970 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 971 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 972 v -= 36; 973 } 974 } 975 976 /* copy t into x according to permutation */ 977 ii = 0; 978 for (i=0; i<n; i++) { 979 ir = 6*r[i]; 980 x[ir] = t[ii]; 981 x[ir+1] = t[ii+1]; 982 x[ir+2] = t[ii+2]; 983 x[ir+3] = t[ii+3]; 984 x[ir+4] = t[ii+4]; 985 x[ir+5] = t[ii+5]; 986 ii += 6; 987 } 988 989 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 990 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 991 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 992 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 993 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 994 PetscFunctionReturn(0); 995 } 996 997 #undef __FUNCT__ 998 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 999 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1000 { 1001 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1002 IS iscol=a->col,isrow=a->row; 1003 PetscErrorCode ierr; 1004 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 1005 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1006 MatScalar *aa=a->a,*v; 1007 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1008 PetscScalar *x,*b,*t; 1009 1010 PetscFunctionBegin; 1011 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1012 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1013 t = a->solve_work; 1014 1015 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1016 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1017 1018 /* copy the b into temp work space according to permutation */ 1019 ii = 0; 1020 for (i=0; i<n; i++) { 1021 ic = 7*c[i]; 1022 t[ii] = b[ic]; 1023 t[ii+1] = b[ic+1]; 1024 t[ii+2] = b[ic+2]; 1025 t[ii+3] = b[ic+3]; 1026 t[ii+4] = b[ic+4]; 1027 t[ii+5] = b[ic+5]; 1028 t[ii+6] = b[ic+6]; 1029 ii += 7; 1030 } 1031 1032 /* forward solve the U^T */ 1033 idx = 0; 1034 for (i=0; i<n; i++) { 1035 1036 v = aa + 49*diag[i]; 1037 /* multiply by the inverse of the block diagonal */ 1038 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1039 x6 = t[5+idx]; x7 = t[6+idx]; 1040 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1041 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1042 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1043 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1044 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1045 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1046 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1047 v += 49; 1048 1049 vi = aj + diag[i] + 1; 1050 nz = ai[i+1] - diag[i] - 1; 1051 while (nz--) { 1052 oidx = 7*(*vi++); 1053 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1054 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1055 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1056 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1057 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1058 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1059 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1060 v += 49; 1061 } 1062 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1063 t[5+idx] = s6;t[6+idx] = s7; 1064 idx += 7; 1065 } 1066 /* backward solve the L^T */ 1067 for (i=n-1; i>=0; i--){ 1068 v = aa + 49*diag[i] - 49; 1069 vi = aj + diag[i] - 1; 1070 nz = diag[i] - ai[i]; 1071 idt = 7*i; 1072 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1073 s6 = t[5+idt];s7 = t[6+idt]; 1074 while (nz--) { 1075 idx = 7*(*vi--); 1076 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1077 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1078 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1079 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1080 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1081 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1082 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1083 v -= 49; 1084 } 1085 } 1086 1087 /* copy t into x according to permutation */ 1088 ii = 0; 1089 for (i=0; i<n; i++) { 1090 ir = 7*r[i]; 1091 x[ir] = t[ii]; 1092 x[ir+1] = t[ii+1]; 1093 x[ir+2] = t[ii+2]; 1094 x[ir+3] = t[ii+3]; 1095 x[ir+4] = t[ii+4]; 1096 x[ir+5] = t[ii+5]; 1097 x[ir+6] = t[ii+6]; 1098 ii += 7; 1099 } 1100 1101 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1102 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1103 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1104 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1105 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);CHKERRQ(ierr); 1106 PetscFunctionReturn(0); 1107 } 1108 1109 /* ----------------------------------------------------------- */ 1110 #undef __FUNCT__ 1111 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1112 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1113 { 1114 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1115 IS iscol=a->col,isrow=a->row; 1116 PetscErrorCode ierr; 1117 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1118 PetscInt nz,bs=A->rmap.bs,bs2=a->bs2,*rout,*cout; 1119 MatScalar *aa=a->a,*v; 1120 PetscScalar *x,*b,*s,*t,*ls; 1121 1122 PetscFunctionBegin; 1123 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1124 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1125 t = a->solve_work; 1126 1127 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1128 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1129 1130 /* forward solve the lower triangular */ 1131 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1132 for (i=1; i<n; i++) { 1133 v = aa + bs2*ai[i]; 1134 vi = aj + ai[i]; 1135 nz = a->diag[i] - ai[i]; 1136 s = t + bs*i; 1137 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1138 while (nz--) { 1139 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1140 v += bs2; 1141 } 1142 } 1143 /* backward solve the upper triangular */ 1144 ls = a->solve_work + A->cmap.n; 1145 for (i=n-1; i>=0; i--){ 1146 v = aa + bs2*(a->diag[i] + 1); 1147 vi = aj + a->diag[i] + 1; 1148 nz = ai[i+1] - a->diag[i] - 1; 1149 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1150 while (nz--) { 1151 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1152 v += bs2; 1153 } 1154 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1155 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1156 } 1157 1158 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1159 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1160 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1161 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1162 ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap.bs*A->cmap.n);CHKERRQ(ierr); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 #undef __FUNCT__ 1167 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1168 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1169 { 1170 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1171 IS iscol=a->col,isrow=a->row; 1172 PetscErrorCode ierr; 1173 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1174 PetscInt *diag = a->diag; 1175 MatScalar *aa=a->a,*v; 1176 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1177 PetscScalar *x,*b,*t; 1178 1179 PetscFunctionBegin; 1180 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1181 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1182 t = a->solve_work; 1183 1184 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1185 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1186 1187 /* forward solve the lower triangular */ 1188 idx = 7*(*r++); 1189 t[0] = b[idx]; t[1] = b[1+idx]; 1190 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1191 t[5] = b[5+idx]; t[6] = b[6+idx]; 1192 1193 for (i=1; i<n; i++) { 1194 v = aa + 49*ai[i]; 1195 vi = aj + ai[i]; 1196 nz = diag[i] - ai[i]; 1197 idx = 7*(*r++); 1198 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1199 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1200 while (nz--) { 1201 idx = 7*(*vi++); 1202 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1203 x4 = t[3+idx];x5 = t[4+idx]; 1204 x6 = t[5+idx];x7 = t[6+idx]; 1205 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1206 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1207 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1208 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1209 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1210 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1211 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1212 v += 49; 1213 } 1214 idx = 7*i; 1215 t[idx] = s1;t[1+idx] = s2; 1216 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1217 t[5+idx] = s6;t[6+idx] = s7; 1218 } 1219 /* backward solve the upper triangular */ 1220 for (i=n-1; i>=0; i--){ 1221 v = aa + 49*diag[i] + 49; 1222 vi = aj + diag[i] + 1; 1223 nz = ai[i+1] - diag[i] - 1; 1224 idt = 7*i; 1225 s1 = t[idt]; s2 = t[1+idt]; 1226 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1227 s6 = t[5+idt];s7 = t[6+idt]; 1228 while (nz--) { 1229 idx = 7*(*vi++); 1230 x1 = t[idx]; x2 = t[1+idx]; 1231 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1232 x6 = t[5+idx]; x7 = t[6+idx]; 1233 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1234 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1235 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1236 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1237 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1238 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1239 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1240 v += 49; 1241 } 1242 idc = 7*(*c--); 1243 v = aa + 49*diag[i]; 1244 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1245 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1246 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1247 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1248 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1249 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1250 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1251 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1252 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1253 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1254 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1255 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1256 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1257 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1258 } 1259 1260 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1261 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1262 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1263 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1264 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);CHKERRQ(ierr); 1265 PetscFunctionReturn(0); 1266 } 1267 1268 #undef __FUNCT__ 1269 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1270 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1271 { 1272 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1273 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1274 PetscErrorCode ierr; 1275 PetscInt *diag = a->diag,jdx; 1276 const MatScalar *aa=a->a,*v; 1277 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1278 const PetscScalar *b; 1279 1280 PetscFunctionBegin; 1281 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1282 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1283 /* forward solve the lower triangular */ 1284 idx = 0; 1285 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1286 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1287 x[6] = b[6+idx]; 1288 for (i=1; i<n; i++) { 1289 v = aa + 49*ai[i]; 1290 vi = aj + ai[i]; 1291 nz = diag[i] - ai[i]; 1292 idx = 7*i; 1293 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1294 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1295 s7 = b[6+idx]; 1296 while (nz--) { 1297 jdx = 7*(*vi++); 1298 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1299 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1300 x7 = x[6+jdx]; 1301 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1302 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1303 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1304 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1305 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1306 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1307 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1308 v += 49; 1309 } 1310 x[idx] = s1; 1311 x[1+idx] = s2; 1312 x[2+idx] = s3; 1313 x[3+idx] = s4; 1314 x[4+idx] = s5; 1315 x[5+idx] = s6; 1316 x[6+idx] = s7; 1317 } 1318 /* backward solve the upper triangular */ 1319 for (i=n-1; i>=0; i--){ 1320 v = aa + 49*diag[i] + 49; 1321 vi = aj + diag[i] + 1; 1322 nz = ai[i+1] - diag[i] - 1; 1323 idt = 7*i; 1324 s1 = x[idt]; s2 = x[1+idt]; 1325 s3 = x[2+idt]; s4 = x[3+idt]; 1326 s5 = x[4+idt]; s6 = x[5+idt]; 1327 s7 = x[6+idt]; 1328 while (nz--) { 1329 idx = 7*(*vi++); 1330 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1331 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1332 x7 = x[6+idx]; 1333 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1334 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1335 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1336 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1337 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1338 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1339 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1340 v += 49; 1341 } 1342 v = aa + 49*diag[i]; 1343 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1344 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1345 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1346 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1347 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1348 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1349 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1350 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1351 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1352 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1353 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1354 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1355 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1356 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1357 } 1358 1359 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1361 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 1362 PetscFunctionReturn(0); 1363 } 1364 1365 #undef __FUNCT__ 1366 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1367 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1368 { 1369 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1370 IS iscol=a->col,isrow=a->row; 1371 PetscErrorCode ierr; 1372 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1373 PetscInt *diag = a->diag; 1374 const MatScalar *aa=a->a,*v; 1375 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1376 const PetscScalar *b; 1377 PetscFunctionBegin; 1378 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1379 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1380 t = a->solve_work; 1381 1382 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1383 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1384 1385 /* forward solve the lower triangular */ 1386 idx = 6*(*r++); 1387 t[0] = b[idx]; t[1] = b[1+idx]; 1388 t[2] = b[2+idx]; t[3] = b[3+idx]; 1389 t[4] = b[4+idx]; t[5] = b[5+idx]; 1390 for (i=1; i<n; i++) { 1391 v = aa + 36*ai[i]; 1392 vi = aj + ai[i]; 1393 nz = diag[i] - ai[i]; 1394 idx = 6*(*r++); 1395 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1396 s5 = b[4+idx]; s6 = b[5+idx]; 1397 while (nz--) { 1398 idx = 6*(*vi++); 1399 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1400 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1401 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1402 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1403 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1404 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1405 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1406 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1407 v += 36; 1408 } 1409 idx = 6*i; 1410 t[idx] = s1;t[1+idx] = s2; 1411 t[2+idx] = s3;t[3+idx] = s4; 1412 t[4+idx] = s5;t[5+idx] = s6; 1413 } 1414 /* backward solve the upper triangular */ 1415 for (i=n-1; i>=0; i--){ 1416 v = aa + 36*diag[i] + 36; 1417 vi = aj + diag[i] + 1; 1418 nz = ai[i+1] - diag[i] - 1; 1419 idt = 6*i; 1420 s1 = t[idt]; s2 = t[1+idt]; 1421 s3 = t[2+idt];s4 = t[3+idt]; 1422 s5 = t[4+idt];s6 = t[5+idt]; 1423 while (nz--) { 1424 idx = 6*(*vi++); 1425 x1 = t[idx]; x2 = t[1+idx]; 1426 x3 = t[2+idx]; x4 = t[3+idx]; 1427 x5 = t[4+idx]; x6 = t[5+idx]; 1428 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1429 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1430 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1431 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1432 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1433 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1434 v += 36; 1435 } 1436 idc = 6*(*c--); 1437 v = aa + 36*diag[i]; 1438 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1439 v[18]*s4+v[24]*s5+v[30]*s6; 1440 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1441 v[19]*s4+v[25]*s5+v[31]*s6; 1442 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1443 v[20]*s4+v[26]*s5+v[32]*s6; 1444 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1445 v[21]*s4+v[27]*s5+v[33]*s6; 1446 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1447 v[22]*s4+v[28]*s5+v[34]*s6; 1448 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1449 v[23]*s4+v[29]*s5+v[35]*s6; 1450 } 1451 1452 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1453 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1454 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1455 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1456 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 1457 PetscFunctionReturn(0); 1458 } 1459 1460 #undef __FUNCT__ 1461 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1462 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1463 { 1464 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1465 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1466 PetscErrorCode ierr; 1467 PetscInt *diag = a->diag,jdx; 1468 const MatScalar *aa=a->a,*v; 1469 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1470 const PetscScalar *b; 1471 1472 PetscFunctionBegin; 1473 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1474 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1475 /* forward solve the lower triangular */ 1476 idx = 0; 1477 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1478 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1479 for (i=1; i<n; i++) { 1480 v = aa + 36*ai[i]; 1481 vi = aj + ai[i]; 1482 nz = diag[i] - ai[i]; 1483 idx = 6*i; 1484 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1485 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1486 while (nz--) { 1487 jdx = 6*(*vi++); 1488 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1489 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1490 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1491 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1492 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1493 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1494 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1495 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1496 v += 36; 1497 } 1498 x[idx] = s1; 1499 x[1+idx] = s2; 1500 x[2+idx] = s3; 1501 x[3+idx] = s4; 1502 x[4+idx] = s5; 1503 x[5+idx] = s6; 1504 } 1505 /* backward solve the upper triangular */ 1506 for (i=n-1; i>=0; i--){ 1507 v = aa + 36*diag[i] + 36; 1508 vi = aj + diag[i] + 1; 1509 nz = ai[i+1] - diag[i] - 1; 1510 idt = 6*i; 1511 s1 = x[idt]; s2 = x[1+idt]; 1512 s3 = x[2+idt]; s4 = x[3+idt]; 1513 s5 = x[4+idt]; s6 = x[5+idt]; 1514 while (nz--) { 1515 idx = 6*(*vi++); 1516 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1517 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1518 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1519 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1520 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1521 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1522 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1523 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1524 v += 36; 1525 } 1526 v = aa + 36*diag[i]; 1527 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1528 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1529 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1530 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1531 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1532 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1533 } 1534 1535 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1536 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1537 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 1538 PetscFunctionReturn(0); 1539 } 1540 1541 #undef __FUNCT__ 1542 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1543 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1544 { 1545 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1546 IS iscol=a->col,isrow=a->row; 1547 PetscErrorCode ierr; 1548 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1549 PetscInt *diag = a->diag; 1550 const MatScalar *aa=a->a,*v; 1551 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1552 const PetscScalar *b; 1553 1554 PetscFunctionBegin; 1555 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1556 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1557 t = a->solve_work; 1558 1559 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1560 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1561 1562 /* forward solve the lower triangular */ 1563 idx = 5*(*r++); 1564 t[0] = b[idx]; t[1] = b[1+idx]; 1565 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1566 for (i=1; i<n; i++) { 1567 v = aa + 25*ai[i]; 1568 vi = aj + ai[i]; 1569 nz = diag[i] - ai[i]; 1570 idx = 5*(*r++); 1571 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1572 s5 = b[4+idx]; 1573 while (nz--) { 1574 idx = 5*(*vi++); 1575 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1576 x4 = t[3+idx];x5 = t[4+idx]; 1577 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1578 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1579 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1580 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1581 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1582 v += 25; 1583 } 1584 idx = 5*i; 1585 t[idx] = s1;t[1+idx] = s2; 1586 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1587 } 1588 /* backward solve the upper triangular */ 1589 for (i=n-1; i>=0; i--){ 1590 v = aa + 25*diag[i] + 25; 1591 vi = aj + diag[i] + 1; 1592 nz = ai[i+1] - diag[i] - 1; 1593 idt = 5*i; 1594 s1 = t[idt]; s2 = t[1+idt]; 1595 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1596 while (nz--) { 1597 idx = 5*(*vi++); 1598 x1 = t[idx]; x2 = t[1+idx]; 1599 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1600 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1601 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1602 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1603 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1604 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1605 v += 25; 1606 } 1607 idc = 5*(*c--); 1608 v = aa + 25*diag[i]; 1609 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1610 v[15]*s4+v[20]*s5; 1611 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1612 v[16]*s4+v[21]*s5; 1613 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1614 v[17]*s4+v[22]*s5; 1615 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1616 v[18]*s4+v[23]*s5; 1617 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1618 v[19]*s4+v[24]*s5; 1619 } 1620 1621 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1622 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1623 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1624 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1625 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr); 1626 PetscFunctionReturn(0); 1627 } 1628 1629 #undef __FUNCT__ 1630 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1631 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1632 { 1633 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1634 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1635 PetscErrorCode ierr; 1636 PetscInt *diag = a->diag,jdx; 1637 const MatScalar *aa=a->a,*v; 1638 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1639 const PetscScalar *b; 1640 1641 PetscFunctionBegin; 1642 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1643 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1644 /* forward solve the lower triangular */ 1645 idx = 0; 1646 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1647 for (i=1; i<n; i++) { 1648 v = aa + 25*ai[i]; 1649 vi = aj + ai[i]; 1650 nz = diag[i] - ai[i]; 1651 idx = 5*i; 1652 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1653 while (nz--) { 1654 jdx = 5*(*vi++); 1655 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1656 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1657 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1658 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1659 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1660 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1661 v += 25; 1662 } 1663 x[idx] = s1; 1664 x[1+idx] = s2; 1665 x[2+idx] = s3; 1666 x[3+idx] = s4; 1667 x[4+idx] = s5; 1668 } 1669 /* backward solve the upper triangular */ 1670 for (i=n-1; i>=0; i--){ 1671 v = aa + 25*diag[i] + 25; 1672 vi = aj + diag[i] + 1; 1673 nz = ai[i+1] - diag[i] - 1; 1674 idt = 5*i; 1675 s1 = x[idt]; s2 = x[1+idt]; 1676 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1677 while (nz--) { 1678 idx = 5*(*vi++); 1679 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1680 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1681 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1682 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1683 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1684 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1685 v += 25; 1686 } 1687 v = aa + 25*diag[i]; 1688 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1689 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1690 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1691 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1692 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1693 } 1694 1695 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1696 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1697 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr); 1698 PetscFunctionReturn(0); 1699 } 1700 1701 #undef __FUNCT__ 1702 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1703 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1704 { 1705 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1706 IS iscol=a->col,isrow=a->row; 1707 PetscErrorCode ierr; 1708 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1709 PetscInt *diag = a->diag; 1710 const MatScalar *aa=a->a,*v; 1711 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1712 const PetscScalar *b; 1713 1714 PetscFunctionBegin; 1715 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1716 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1717 t = a->solve_work; 1718 1719 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1720 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1721 1722 /* forward solve the lower triangular */ 1723 idx = 4*(*r++); 1724 t[0] = b[idx]; t[1] = b[1+idx]; 1725 t[2] = b[2+idx]; t[3] = b[3+idx]; 1726 for (i=1; i<n; i++) { 1727 v = aa + 16*ai[i]; 1728 vi = aj + ai[i]; 1729 nz = diag[i] - ai[i]; 1730 idx = 4*(*r++); 1731 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1732 while (nz--) { 1733 idx = 4*(*vi++); 1734 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1735 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1736 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1737 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1738 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1739 v += 16; 1740 } 1741 idx = 4*i; 1742 t[idx] = s1;t[1+idx] = s2; 1743 t[2+idx] = s3;t[3+idx] = s4; 1744 } 1745 /* backward solve the upper triangular */ 1746 for (i=n-1; i>=0; i--){ 1747 v = aa + 16*diag[i] + 16; 1748 vi = aj + diag[i] + 1; 1749 nz = ai[i+1] - diag[i] - 1; 1750 idt = 4*i; 1751 s1 = t[idt]; s2 = t[1+idt]; 1752 s3 = t[2+idt];s4 = t[3+idt]; 1753 while (nz--) { 1754 idx = 4*(*vi++); 1755 x1 = t[idx]; x2 = t[1+idx]; 1756 x3 = t[2+idx]; x4 = t[3+idx]; 1757 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1758 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1759 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1760 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1761 v += 16; 1762 } 1763 idc = 4*(*c--); 1764 v = aa + 16*diag[i]; 1765 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1766 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1767 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1768 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1769 } 1770 1771 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1772 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1773 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1774 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1775 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 1776 PetscFunctionReturn(0); 1777 } 1778 1779 #undef __FUNCT__ 1780 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1781 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1782 { 1783 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1784 IS iscol=a->col,isrow=a->row; 1785 PetscErrorCode ierr; 1786 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1787 PetscInt *diag = a->diag; 1788 const MatScalar *aa=a->a,*v; 1789 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1790 PetscScalar *x; 1791 const PetscScalar *b; 1792 1793 PetscFunctionBegin; 1794 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1795 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1796 t = (MatScalar *)a->solve_work; 1797 1798 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1799 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1800 1801 /* forward solve the lower triangular */ 1802 idx = 4*(*r++); 1803 t[0] = (MatScalar)b[idx]; 1804 t[1] = (MatScalar)b[1+idx]; 1805 t[2] = (MatScalar)b[2+idx]; 1806 t[3] = (MatScalar)b[3+idx]; 1807 for (i=1; i<n; i++) { 1808 v = aa + 16*ai[i]; 1809 vi = aj + ai[i]; 1810 nz = diag[i] - ai[i]; 1811 idx = 4*(*r++); 1812 s1 = (MatScalar)b[idx]; 1813 s2 = (MatScalar)b[1+idx]; 1814 s3 = (MatScalar)b[2+idx]; 1815 s4 = (MatScalar)b[3+idx]; 1816 while (nz--) { 1817 idx = 4*(*vi++); 1818 x1 = t[idx]; 1819 x2 = t[1+idx]; 1820 x3 = t[2+idx]; 1821 x4 = t[3+idx]; 1822 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1823 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1824 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1825 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1826 v += 16; 1827 } 1828 idx = 4*i; 1829 t[idx] = s1; 1830 t[1+idx] = s2; 1831 t[2+idx] = s3; 1832 t[3+idx] = s4; 1833 } 1834 /* backward solve the upper triangular */ 1835 for (i=n-1; i>=0; i--){ 1836 v = aa + 16*diag[i] + 16; 1837 vi = aj + diag[i] + 1; 1838 nz = ai[i+1] - diag[i] - 1; 1839 idt = 4*i; 1840 s1 = t[idt]; 1841 s2 = t[1+idt]; 1842 s3 = t[2+idt]; 1843 s4 = t[3+idt]; 1844 while (nz--) { 1845 idx = 4*(*vi++); 1846 x1 = t[idx]; 1847 x2 = t[1+idx]; 1848 x3 = t[2+idx]; 1849 x4 = t[3+idx]; 1850 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1851 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1852 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1853 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1854 v += 16; 1855 } 1856 idc = 4*(*c--); 1857 v = aa + 16*diag[i]; 1858 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1859 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1860 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1861 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1862 x[idc] = (PetscScalar)t[idt]; 1863 x[1+idc] = (PetscScalar)t[1+idt]; 1864 x[2+idc] = (PetscScalar)t[2+idt]; 1865 x[3+idc] = (PetscScalar)t[3+idt]; 1866 } 1867 1868 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1869 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1870 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1871 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1872 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 1873 PetscFunctionReturn(0); 1874 } 1875 1876 #if defined (PETSC_HAVE_SSE) 1877 1878 #include PETSC_HAVE_SSE 1879 1880 #undef __FUNCT__ 1881 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1882 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1883 { 1884 /* 1885 Note: This code uses demotion of double 1886 to float when performing the mixed-mode computation. 1887 This may not be numerically reasonable for all applications. 1888 */ 1889 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1890 IS iscol=a->col,isrow=a->row; 1891 PetscErrorCode ierr; 1892 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1893 PetscInt *diag = a->diag,ai16; 1894 MatScalar *aa=a->a,*v; 1895 PetscScalar *x,*b,*t; 1896 1897 /* Make space in temp stack for 16 Byte Aligned arrays */ 1898 float ssealignedspace[11],*tmps,*tmpx; 1899 unsigned long offset; 1900 1901 PetscFunctionBegin; 1902 SSE_SCOPE_BEGIN; 1903 1904 offset = (unsigned long)ssealignedspace % 16; 1905 if (offset) offset = (16 - offset)/4; 1906 tmps = &ssealignedspace[offset]; 1907 tmpx = &ssealignedspace[offset+4]; 1908 PREFETCH_NTA(aa+16*ai[1]); 1909 1910 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1911 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1912 t = a->solve_work; 1913 1914 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1915 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1916 1917 /* forward solve the lower triangular */ 1918 idx = 4*(*r++); 1919 t[0] = b[idx]; t[1] = b[1+idx]; 1920 t[2] = b[2+idx]; t[3] = b[3+idx]; 1921 v = aa + 16*ai[1]; 1922 1923 for (i=1; i<n;) { 1924 PREFETCH_NTA(&v[8]); 1925 vi = aj + ai[i]; 1926 nz = diag[i] - ai[i]; 1927 idx = 4*(*r++); 1928 1929 /* Demote sum from double to float */ 1930 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1931 LOAD_PS(tmps,XMM7); 1932 1933 while (nz--) { 1934 PREFETCH_NTA(&v[16]); 1935 idx = 4*(*vi++); 1936 1937 /* Demote solution (so far) from double to float */ 1938 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1939 1940 /* 4x4 Matrix-Vector product with negative accumulation: */ 1941 SSE_INLINE_BEGIN_2(tmpx,v) 1942 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1943 1944 /* First Column */ 1945 SSE_COPY_PS(XMM0,XMM6) 1946 SSE_SHUFFLE(XMM0,XMM0,0x00) 1947 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1948 SSE_SUB_PS(XMM7,XMM0) 1949 1950 /* Second Column */ 1951 SSE_COPY_PS(XMM1,XMM6) 1952 SSE_SHUFFLE(XMM1,XMM1,0x55) 1953 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1954 SSE_SUB_PS(XMM7,XMM1) 1955 1956 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1957 1958 /* Third Column */ 1959 SSE_COPY_PS(XMM2,XMM6) 1960 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1961 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1962 SSE_SUB_PS(XMM7,XMM2) 1963 1964 /* Fourth Column */ 1965 SSE_COPY_PS(XMM3,XMM6) 1966 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1967 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1968 SSE_SUB_PS(XMM7,XMM3) 1969 SSE_INLINE_END_2 1970 1971 v += 16; 1972 } 1973 idx = 4*i; 1974 v = aa + 16*ai[++i]; 1975 PREFETCH_NTA(v); 1976 STORE_PS(tmps,XMM7); 1977 1978 /* Promote result from float to double */ 1979 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1980 } 1981 /* backward solve the upper triangular */ 1982 idt = 4*(n-1); 1983 ai16 = 16*diag[n-1]; 1984 v = aa + ai16 + 16; 1985 for (i=n-1; i>=0;){ 1986 PREFETCH_NTA(&v[8]); 1987 vi = aj + diag[i] + 1; 1988 nz = ai[i+1] - diag[i] - 1; 1989 1990 /* Demote accumulator from double to float */ 1991 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1992 LOAD_PS(tmps,XMM7); 1993 1994 while (nz--) { 1995 PREFETCH_NTA(&v[16]); 1996 idx = 4*(*vi++); 1997 1998 /* Demote solution (so far) from double to float */ 1999 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2000 2001 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2002 SSE_INLINE_BEGIN_2(tmpx,v) 2003 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2004 2005 /* First Column */ 2006 SSE_COPY_PS(XMM0,XMM6) 2007 SSE_SHUFFLE(XMM0,XMM0,0x00) 2008 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2009 SSE_SUB_PS(XMM7,XMM0) 2010 2011 /* Second Column */ 2012 SSE_COPY_PS(XMM1,XMM6) 2013 SSE_SHUFFLE(XMM1,XMM1,0x55) 2014 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2015 SSE_SUB_PS(XMM7,XMM1) 2016 2017 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2018 2019 /* Third Column */ 2020 SSE_COPY_PS(XMM2,XMM6) 2021 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2022 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2023 SSE_SUB_PS(XMM7,XMM2) 2024 2025 /* Fourth Column */ 2026 SSE_COPY_PS(XMM3,XMM6) 2027 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2028 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2029 SSE_SUB_PS(XMM7,XMM3) 2030 SSE_INLINE_END_2 2031 v += 16; 2032 } 2033 v = aa + ai16; 2034 ai16 = 16*diag[--i]; 2035 PREFETCH_NTA(aa+ai16+16); 2036 /* 2037 Scale the result by the diagonal 4x4 block, 2038 which was inverted as part of the factorization 2039 */ 2040 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2041 /* First Column */ 2042 SSE_COPY_PS(XMM0,XMM7) 2043 SSE_SHUFFLE(XMM0,XMM0,0x00) 2044 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2045 2046 /* Second Column */ 2047 SSE_COPY_PS(XMM1,XMM7) 2048 SSE_SHUFFLE(XMM1,XMM1,0x55) 2049 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2050 SSE_ADD_PS(XMM0,XMM1) 2051 2052 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2053 2054 /* Third Column */ 2055 SSE_COPY_PS(XMM2,XMM7) 2056 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2057 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2058 SSE_ADD_PS(XMM0,XMM2) 2059 2060 /* Fourth Column */ 2061 SSE_COPY_PS(XMM3,XMM7) 2062 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2063 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2064 SSE_ADD_PS(XMM0,XMM3) 2065 2066 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2067 SSE_INLINE_END_3 2068 2069 /* Promote solution from float to double */ 2070 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2071 2072 /* Apply reordering to t and stream into x. */ 2073 /* This way, x doesn't pollute the cache. */ 2074 /* Be careful with size: 2 doubles = 4 floats! */ 2075 idc = 4*(*c--); 2076 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2077 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2078 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2079 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2080 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2081 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2082 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2083 SSE_INLINE_END_2 2084 v = aa + ai16 + 16; 2085 idt -= 4; 2086 } 2087 2088 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2089 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2090 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2091 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2092 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2093 SSE_SCOPE_END; 2094 PetscFunctionReturn(0); 2095 } 2096 2097 #endif 2098 2099 2100 /* 2101 Special case where the matrix was ILU(0) factored in the natural 2102 ordering. This eliminates the need for the column and row permutation. 2103 */ 2104 #undef __FUNCT__ 2105 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2106 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2107 { 2108 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2109 PetscInt n=a->mbs; 2110 const PetscInt *ai=a->i,*aj=a->j; 2111 PetscErrorCode ierr; 2112 const PetscInt *diag = a->diag; 2113 const MatScalar *aa=a->a; 2114 PetscScalar *x; 2115 const PetscScalar *b; 2116 2117 PetscFunctionBegin; 2118 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2119 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2120 2121 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2122 { 2123 static PetscScalar w[2000]; /* very BAD need to fix */ 2124 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2125 } 2126 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2127 { 2128 static PetscScalar w[2000]; /* very BAD need to fix */ 2129 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2130 } 2131 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2132 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2133 #else 2134 { 2135 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2136 const MatScalar *v; 2137 PetscInt jdx,idt,idx,nz,i,ai16; 2138 const PetscInt *vi; 2139 2140 /* forward solve the lower triangular */ 2141 idx = 0; 2142 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2143 for (i=1; i<n; i++) { 2144 v = aa + 16*ai[i]; 2145 vi = aj + ai[i]; 2146 nz = diag[i] - ai[i]; 2147 idx += 4; 2148 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2149 while (nz--) { 2150 jdx = 4*(*vi++); 2151 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2152 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2153 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2154 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2155 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2156 v += 16; 2157 } 2158 x[idx] = s1; 2159 x[1+idx] = s2; 2160 x[2+idx] = s3; 2161 x[3+idx] = s4; 2162 } 2163 /* backward solve the upper triangular */ 2164 idt = 4*(n-1); 2165 for (i=n-1; i>=0; i--){ 2166 ai16 = 16*diag[i]; 2167 v = aa + ai16 + 16; 2168 vi = aj + diag[i] + 1; 2169 nz = ai[i+1] - diag[i] - 1; 2170 s1 = x[idt]; s2 = x[1+idt]; 2171 s3 = x[2+idt];s4 = x[3+idt]; 2172 while (nz--) { 2173 idx = 4*(*vi++); 2174 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2175 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2176 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2177 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2178 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2179 v += 16; 2180 } 2181 v = aa + ai16; 2182 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2183 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2184 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2185 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2186 idt -= 4; 2187 } 2188 } 2189 #endif 2190 2191 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2192 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2193 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2194 PetscFunctionReturn(0); 2195 } 2196 2197 #undef __FUNCT__ 2198 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2199 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2200 { 2201 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2202 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2203 PetscErrorCode ierr; 2204 PetscInt *diag = a->diag; 2205 MatScalar *aa=a->a; 2206 PetscScalar *x,*b; 2207 2208 PetscFunctionBegin; 2209 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2210 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2211 2212 { 2213 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2214 MatScalar *v,*t=(MatScalar *)x; 2215 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2216 2217 /* forward solve the lower triangular */ 2218 idx = 0; 2219 t[0] = (MatScalar)b[0]; 2220 t[1] = (MatScalar)b[1]; 2221 t[2] = (MatScalar)b[2]; 2222 t[3] = (MatScalar)b[3]; 2223 for (i=1; i<n; i++) { 2224 v = aa + 16*ai[i]; 2225 vi = aj + ai[i]; 2226 nz = diag[i] - ai[i]; 2227 idx += 4; 2228 s1 = (MatScalar)b[idx]; 2229 s2 = (MatScalar)b[1+idx]; 2230 s3 = (MatScalar)b[2+idx]; 2231 s4 = (MatScalar)b[3+idx]; 2232 while (nz--) { 2233 jdx = 4*(*vi++); 2234 x1 = t[jdx]; 2235 x2 = t[1+jdx]; 2236 x3 = t[2+jdx]; 2237 x4 = t[3+jdx]; 2238 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2239 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2240 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2241 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2242 v += 16; 2243 } 2244 t[idx] = s1; 2245 t[1+idx] = s2; 2246 t[2+idx] = s3; 2247 t[3+idx] = s4; 2248 } 2249 /* backward solve the upper triangular */ 2250 idt = 4*(n-1); 2251 for (i=n-1; i>=0; i--){ 2252 ai16 = 16*diag[i]; 2253 v = aa + ai16 + 16; 2254 vi = aj + diag[i] + 1; 2255 nz = ai[i+1] - diag[i] - 1; 2256 s1 = t[idt]; 2257 s2 = t[1+idt]; 2258 s3 = t[2+idt]; 2259 s4 = t[3+idt]; 2260 while (nz--) { 2261 idx = 4*(*vi++); 2262 x1 = (MatScalar)x[idx]; 2263 x2 = (MatScalar)x[1+idx]; 2264 x3 = (MatScalar)x[2+idx]; 2265 x4 = (MatScalar)x[3+idx]; 2266 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2267 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2268 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2269 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2270 v += 16; 2271 } 2272 v = aa + ai16; 2273 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2274 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2275 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2276 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2277 idt -= 4; 2278 } 2279 } 2280 2281 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2282 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2283 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2284 PetscFunctionReturn(0); 2285 } 2286 2287 #if defined (PETSC_HAVE_SSE) 2288 2289 #include PETSC_HAVE_SSE 2290 #undef __FUNCT__ 2291 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2292 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2293 { 2294 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2295 unsigned short *aj=(unsigned short *)a->j; 2296 PetscErrorCode ierr; 2297 int *ai=a->i,n=a->mbs,*diag = a->diag; 2298 MatScalar *aa=a->a; 2299 PetscScalar *x,*b; 2300 2301 PetscFunctionBegin; 2302 SSE_SCOPE_BEGIN; 2303 /* 2304 Note: This code currently uses demotion of double 2305 to float when performing the mixed-mode computation. 2306 This may not be numerically reasonable for all applications. 2307 */ 2308 PREFETCH_NTA(aa+16*ai[1]); 2309 2310 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2311 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2312 { 2313 /* x will first be computed in single precision then promoted inplace to double */ 2314 MatScalar *v,*t=(MatScalar *)x; 2315 int nz,i,idt,ai16; 2316 unsigned int jdx,idx; 2317 unsigned short *vi; 2318 /* Forward solve the lower triangular factor. */ 2319 2320 /* First block is the identity. */ 2321 idx = 0; 2322 CONVERT_DOUBLE4_FLOAT4(t,b); 2323 v = aa + 16*((unsigned int)ai[1]); 2324 2325 for (i=1; i<n;) { 2326 PREFETCH_NTA(&v[8]); 2327 vi = aj + ai[i]; 2328 nz = diag[i] - ai[i]; 2329 idx += 4; 2330 2331 /* Demote RHS from double to float. */ 2332 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2333 LOAD_PS(&t[idx],XMM7); 2334 2335 while (nz--) { 2336 PREFETCH_NTA(&v[16]); 2337 jdx = 4*((unsigned int)(*vi++)); 2338 2339 /* 4x4 Matrix-Vector product with negative accumulation: */ 2340 SSE_INLINE_BEGIN_2(&t[jdx],v) 2341 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2342 2343 /* First Column */ 2344 SSE_COPY_PS(XMM0,XMM6) 2345 SSE_SHUFFLE(XMM0,XMM0,0x00) 2346 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2347 SSE_SUB_PS(XMM7,XMM0) 2348 2349 /* Second Column */ 2350 SSE_COPY_PS(XMM1,XMM6) 2351 SSE_SHUFFLE(XMM1,XMM1,0x55) 2352 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2353 SSE_SUB_PS(XMM7,XMM1) 2354 2355 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2356 2357 /* Third Column */ 2358 SSE_COPY_PS(XMM2,XMM6) 2359 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2360 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2361 SSE_SUB_PS(XMM7,XMM2) 2362 2363 /* Fourth Column */ 2364 SSE_COPY_PS(XMM3,XMM6) 2365 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2366 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2367 SSE_SUB_PS(XMM7,XMM3) 2368 SSE_INLINE_END_2 2369 2370 v += 16; 2371 } 2372 v = aa + 16*ai[++i]; 2373 PREFETCH_NTA(v); 2374 STORE_PS(&t[idx],XMM7); 2375 } 2376 2377 /* Backward solve the upper triangular factor.*/ 2378 2379 idt = 4*(n-1); 2380 ai16 = 16*diag[n-1]; 2381 v = aa + ai16 + 16; 2382 for (i=n-1; i>=0;){ 2383 PREFETCH_NTA(&v[8]); 2384 vi = aj + diag[i] + 1; 2385 nz = ai[i+1] - diag[i] - 1; 2386 2387 LOAD_PS(&t[idt],XMM7); 2388 2389 while (nz--) { 2390 PREFETCH_NTA(&v[16]); 2391 idx = 4*((unsigned int)(*vi++)); 2392 2393 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2394 SSE_INLINE_BEGIN_2(&t[idx],v) 2395 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2396 2397 /* First Column */ 2398 SSE_COPY_PS(XMM0,XMM6) 2399 SSE_SHUFFLE(XMM0,XMM0,0x00) 2400 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2401 SSE_SUB_PS(XMM7,XMM0) 2402 2403 /* Second Column */ 2404 SSE_COPY_PS(XMM1,XMM6) 2405 SSE_SHUFFLE(XMM1,XMM1,0x55) 2406 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2407 SSE_SUB_PS(XMM7,XMM1) 2408 2409 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2410 2411 /* Third Column */ 2412 SSE_COPY_PS(XMM2,XMM6) 2413 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2414 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2415 SSE_SUB_PS(XMM7,XMM2) 2416 2417 /* Fourth Column */ 2418 SSE_COPY_PS(XMM3,XMM6) 2419 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2420 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2421 SSE_SUB_PS(XMM7,XMM3) 2422 SSE_INLINE_END_2 2423 v += 16; 2424 } 2425 v = aa + ai16; 2426 ai16 = 16*diag[--i]; 2427 PREFETCH_NTA(aa+ai16+16); 2428 /* 2429 Scale the result by the diagonal 4x4 block, 2430 which was inverted as part of the factorization 2431 */ 2432 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2433 /* First Column */ 2434 SSE_COPY_PS(XMM0,XMM7) 2435 SSE_SHUFFLE(XMM0,XMM0,0x00) 2436 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2437 2438 /* Second Column */ 2439 SSE_COPY_PS(XMM1,XMM7) 2440 SSE_SHUFFLE(XMM1,XMM1,0x55) 2441 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2442 SSE_ADD_PS(XMM0,XMM1) 2443 2444 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2445 2446 /* Third Column */ 2447 SSE_COPY_PS(XMM2,XMM7) 2448 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2449 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2450 SSE_ADD_PS(XMM0,XMM2) 2451 2452 /* Fourth Column */ 2453 SSE_COPY_PS(XMM3,XMM7) 2454 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2455 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2456 SSE_ADD_PS(XMM0,XMM3) 2457 2458 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2459 SSE_INLINE_END_3 2460 2461 v = aa + ai16 + 16; 2462 idt -= 4; 2463 } 2464 2465 /* Convert t from single precision back to double precision (inplace)*/ 2466 idt = 4*(n-1); 2467 for (i=n-1;i>=0;i--) { 2468 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2469 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2470 PetscScalar *xtemp=&x[idt]; 2471 MatScalar *ttemp=&t[idt]; 2472 xtemp[3] = (PetscScalar)ttemp[3]; 2473 xtemp[2] = (PetscScalar)ttemp[2]; 2474 xtemp[1] = (PetscScalar)ttemp[1]; 2475 xtemp[0] = (PetscScalar)ttemp[0]; 2476 idt -= 4; 2477 } 2478 2479 } /* End of artificial scope. */ 2480 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2481 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2482 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2483 SSE_SCOPE_END; 2484 PetscFunctionReturn(0); 2485 } 2486 2487 #undef __FUNCT__ 2488 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2489 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2490 { 2491 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2492 int *aj=a->j; 2493 PetscErrorCode ierr; 2494 int *ai=a->i,n=a->mbs,*diag = a->diag; 2495 MatScalar *aa=a->a; 2496 PetscScalar *x,*b; 2497 2498 PetscFunctionBegin; 2499 SSE_SCOPE_BEGIN; 2500 /* 2501 Note: This code currently uses demotion of double 2502 to float when performing the mixed-mode computation. 2503 This may not be numerically reasonable for all applications. 2504 */ 2505 PREFETCH_NTA(aa+16*ai[1]); 2506 2507 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2508 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2509 { 2510 /* x will first be computed in single precision then promoted inplace to double */ 2511 MatScalar *v,*t=(MatScalar *)x; 2512 int nz,i,idt,ai16; 2513 int jdx,idx; 2514 int *vi; 2515 /* Forward solve the lower triangular factor. */ 2516 2517 /* First block is the identity. */ 2518 idx = 0; 2519 CONVERT_DOUBLE4_FLOAT4(t,b); 2520 v = aa + 16*ai[1]; 2521 2522 for (i=1; i<n;) { 2523 PREFETCH_NTA(&v[8]); 2524 vi = aj + ai[i]; 2525 nz = diag[i] - ai[i]; 2526 idx += 4; 2527 2528 /* Demote RHS from double to float. */ 2529 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2530 LOAD_PS(&t[idx],XMM7); 2531 2532 while (nz--) { 2533 PREFETCH_NTA(&v[16]); 2534 jdx = 4*(*vi++); 2535 /* jdx = *vi++; */ 2536 2537 /* 4x4 Matrix-Vector product with negative accumulation: */ 2538 SSE_INLINE_BEGIN_2(&t[jdx],v) 2539 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2540 2541 /* First Column */ 2542 SSE_COPY_PS(XMM0,XMM6) 2543 SSE_SHUFFLE(XMM0,XMM0,0x00) 2544 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2545 SSE_SUB_PS(XMM7,XMM0) 2546 2547 /* Second Column */ 2548 SSE_COPY_PS(XMM1,XMM6) 2549 SSE_SHUFFLE(XMM1,XMM1,0x55) 2550 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2551 SSE_SUB_PS(XMM7,XMM1) 2552 2553 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2554 2555 /* Third Column */ 2556 SSE_COPY_PS(XMM2,XMM6) 2557 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2558 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2559 SSE_SUB_PS(XMM7,XMM2) 2560 2561 /* Fourth Column */ 2562 SSE_COPY_PS(XMM3,XMM6) 2563 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2564 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2565 SSE_SUB_PS(XMM7,XMM3) 2566 SSE_INLINE_END_2 2567 2568 v += 16; 2569 } 2570 v = aa + 16*ai[++i]; 2571 PREFETCH_NTA(v); 2572 STORE_PS(&t[idx],XMM7); 2573 } 2574 2575 /* Backward solve the upper triangular factor.*/ 2576 2577 idt = 4*(n-1); 2578 ai16 = 16*diag[n-1]; 2579 v = aa + ai16 + 16; 2580 for (i=n-1; i>=0;){ 2581 PREFETCH_NTA(&v[8]); 2582 vi = aj + diag[i] + 1; 2583 nz = ai[i+1] - diag[i] - 1; 2584 2585 LOAD_PS(&t[idt],XMM7); 2586 2587 while (nz--) { 2588 PREFETCH_NTA(&v[16]); 2589 idx = 4*(*vi++); 2590 /* idx = *vi++; */ 2591 2592 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2593 SSE_INLINE_BEGIN_2(&t[idx],v) 2594 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2595 2596 /* First Column */ 2597 SSE_COPY_PS(XMM0,XMM6) 2598 SSE_SHUFFLE(XMM0,XMM0,0x00) 2599 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2600 SSE_SUB_PS(XMM7,XMM0) 2601 2602 /* Second Column */ 2603 SSE_COPY_PS(XMM1,XMM6) 2604 SSE_SHUFFLE(XMM1,XMM1,0x55) 2605 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2606 SSE_SUB_PS(XMM7,XMM1) 2607 2608 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2609 2610 /* Third Column */ 2611 SSE_COPY_PS(XMM2,XMM6) 2612 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2613 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2614 SSE_SUB_PS(XMM7,XMM2) 2615 2616 /* Fourth Column */ 2617 SSE_COPY_PS(XMM3,XMM6) 2618 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2619 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2620 SSE_SUB_PS(XMM7,XMM3) 2621 SSE_INLINE_END_2 2622 v += 16; 2623 } 2624 v = aa + ai16; 2625 ai16 = 16*diag[--i]; 2626 PREFETCH_NTA(aa+ai16+16); 2627 /* 2628 Scale the result by the diagonal 4x4 block, 2629 which was inverted as part of the factorization 2630 */ 2631 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2632 /* First Column */ 2633 SSE_COPY_PS(XMM0,XMM7) 2634 SSE_SHUFFLE(XMM0,XMM0,0x00) 2635 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2636 2637 /* Second Column */ 2638 SSE_COPY_PS(XMM1,XMM7) 2639 SSE_SHUFFLE(XMM1,XMM1,0x55) 2640 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2641 SSE_ADD_PS(XMM0,XMM1) 2642 2643 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2644 2645 /* Third Column */ 2646 SSE_COPY_PS(XMM2,XMM7) 2647 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2648 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2649 SSE_ADD_PS(XMM0,XMM2) 2650 2651 /* Fourth Column */ 2652 SSE_COPY_PS(XMM3,XMM7) 2653 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2654 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2655 SSE_ADD_PS(XMM0,XMM3) 2656 2657 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2658 SSE_INLINE_END_3 2659 2660 v = aa + ai16 + 16; 2661 idt -= 4; 2662 } 2663 2664 /* Convert t from single precision back to double precision (inplace)*/ 2665 idt = 4*(n-1); 2666 for (i=n-1;i>=0;i--) { 2667 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2668 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2669 PetscScalar *xtemp=&x[idt]; 2670 MatScalar *ttemp=&t[idt]; 2671 xtemp[3] = (PetscScalar)ttemp[3]; 2672 xtemp[2] = (PetscScalar)ttemp[2]; 2673 xtemp[1] = (PetscScalar)ttemp[1]; 2674 xtemp[0] = (PetscScalar)ttemp[0]; 2675 idt -= 4; 2676 } 2677 2678 } /* End of artificial scope. */ 2679 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2680 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2681 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2682 SSE_SCOPE_END; 2683 PetscFunctionReturn(0); 2684 } 2685 2686 #endif 2687 #undef __FUNCT__ 2688 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2689 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2690 { 2691 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2692 IS iscol=a->col,isrow=a->row; 2693 PetscErrorCode ierr; 2694 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2695 PetscInt *diag = a->diag; 2696 const MatScalar *aa=a->a,*v; 2697 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2698 const PetscScalar *b; 2699 2700 PetscFunctionBegin; 2701 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2702 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2703 t = a->solve_work; 2704 2705 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2706 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2707 2708 /* forward solve the lower triangular */ 2709 idx = 3*(*r++); 2710 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2711 for (i=1; i<n; i++) { 2712 v = aa + 9*ai[i]; 2713 vi = aj + ai[i]; 2714 nz = diag[i] - ai[i]; 2715 idx = 3*(*r++); 2716 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2717 while (nz--) { 2718 idx = 3*(*vi++); 2719 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2720 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2721 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2722 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2723 v += 9; 2724 } 2725 idx = 3*i; 2726 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2727 } 2728 /* backward solve the upper triangular */ 2729 for (i=n-1; i>=0; i--){ 2730 v = aa + 9*diag[i] + 9; 2731 vi = aj + diag[i] + 1; 2732 nz = ai[i+1] - diag[i] - 1; 2733 idt = 3*i; 2734 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2735 while (nz--) { 2736 idx = 3*(*vi++); 2737 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2738 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2739 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2740 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2741 v += 9; 2742 } 2743 idc = 3*(*c--); 2744 v = aa + 9*diag[i]; 2745 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2746 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2747 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2748 } 2749 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2750 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2751 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2752 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2753 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr); 2754 PetscFunctionReturn(0); 2755 } 2756 2757 /* 2758 Special case where the matrix was ILU(0) factored in the natural 2759 ordering. This eliminates the need for the column and row permutation. 2760 */ 2761 #undef __FUNCT__ 2762 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2763 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2764 { 2765 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2766 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2767 PetscErrorCode ierr; 2768 PetscInt *diag = a->diag; 2769 const MatScalar *aa=a->a,*v; 2770 PetscScalar *x,s1,s2,s3,x1,x2,x3; 2771 const PetscScalar *b; 2772 PetscInt jdx,idt,idx,nz,*vi,i; 2773 2774 PetscFunctionBegin; 2775 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2776 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2777 2778 /* forward solve the lower triangular */ 2779 idx = 0; 2780 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2781 for (i=1; i<n; i++) { 2782 v = aa + 9*ai[i]; 2783 vi = aj + ai[i]; 2784 nz = diag[i] - ai[i]; 2785 idx += 3; 2786 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2787 while (nz--) { 2788 jdx = 3*(*vi++); 2789 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2790 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2791 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2792 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2793 v += 9; 2794 } 2795 x[idx] = s1; 2796 x[1+idx] = s2; 2797 x[2+idx] = s3; 2798 } 2799 /* backward solve the upper triangular */ 2800 for (i=n-1; i>=0; i--){ 2801 v = aa + 9*diag[i] + 9; 2802 vi = aj + diag[i] + 1; 2803 nz = ai[i+1] - diag[i] - 1; 2804 idt = 3*i; 2805 s1 = x[idt]; s2 = x[1+idt]; 2806 s3 = x[2+idt]; 2807 while (nz--) { 2808 idx = 3*(*vi++); 2809 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2810 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2811 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2812 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2813 v += 9; 2814 } 2815 v = aa + 9*diag[i]; 2816 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2817 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2818 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2819 } 2820 2821 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2822 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2823 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr); 2824 PetscFunctionReturn(0); 2825 } 2826 2827 #undef __FUNCT__ 2828 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2829 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2830 { 2831 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2832 IS iscol=a->col,isrow=a->row; 2833 PetscErrorCode ierr; 2834 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2835 PetscInt *diag = a->diag; 2836 const MatScalar *aa=a->a,*v; 2837 PetscScalar *x,s1,s2,x1,x2,*t; 2838 const PetscScalar *b; 2839 2840 PetscFunctionBegin; 2841 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2842 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2843 t = a->solve_work; 2844 2845 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2846 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2847 2848 /* forward solve the lower triangular */ 2849 idx = 2*(*r++); 2850 t[0] = b[idx]; t[1] = b[1+idx]; 2851 for (i=1; i<n; i++) { 2852 v = aa + 4*ai[i]; 2853 vi = aj + ai[i]; 2854 nz = diag[i] - ai[i]; 2855 idx = 2*(*r++); 2856 s1 = b[idx]; s2 = b[1+idx]; 2857 while (nz--) { 2858 idx = 2*(*vi++); 2859 x1 = t[idx]; x2 = t[1+idx]; 2860 s1 -= v[0]*x1 + v[2]*x2; 2861 s2 -= v[1]*x1 + v[3]*x2; 2862 v += 4; 2863 } 2864 idx = 2*i; 2865 t[idx] = s1; t[1+idx] = s2; 2866 } 2867 /* backward solve the upper triangular */ 2868 for (i=n-1; i>=0; i--){ 2869 v = aa + 4*diag[i] + 4; 2870 vi = aj + diag[i] + 1; 2871 nz = ai[i+1] - diag[i] - 1; 2872 idt = 2*i; 2873 s1 = t[idt]; s2 = t[1+idt]; 2874 while (nz--) { 2875 idx = 2*(*vi++); 2876 x1 = t[idx]; x2 = t[1+idx]; 2877 s1 -= v[0]*x1 + v[2]*x2; 2878 s2 -= v[1]*x1 + v[3]*x2; 2879 v += 4; 2880 } 2881 idc = 2*(*c--); 2882 v = aa + 4*diag[i]; 2883 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2884 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2885 } 2886 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2887 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2888 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2889 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2890 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr); 2891 PetscFunctionReturn(0); 2892 } 2893 2894 /* 2895 Special case where the matrix was ILU(0) factored in the natural 2896 ordering. This eliminates the need for the column and row permutation. 2897 */ 2898 #undef __FUNCT__ 2899 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2900 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2901 { 2902 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2903 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2904 PetscErrorCode ierr; 2905 PetscInt *diag = a->diag; 2906 const MatScalar *aa=a->a,*v; 2907 PetscScalar *x,s1,s2,x1,x2; 2908 const PetscScalar *b; 2909 PetscInt jdx,idt,idx,nz,*vi,i; 2910 2911 PetscFunctionBegin; 2912 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2914 2915 /* forward solve the lower triangular */ 2916 idx = 0; 2917 x[0] = b[0]; x[1] = b[1]; 2918 for (i=1; i<n; i++) { 2919 v = aa + 4*ai[i]; 2920 vi = aj + ai[i]; 2921 nz = diag[i] - ai[i]; 2922 idx += 2; 2923 s1 = b[idx];s2 = b[1+idx]; 2924 while (nz--) { 2925 jdx = 2*(*vi++); 2926 x1 = x[jdx];x2 = x[1+jdx]; 2927 s1 -= v[0]*x1 + v[2]*x2; 2928 s2 -= v[1]*x1 + v[3]*x2; 2929 v += 4; 2930 } 2931 x[idx] = s1; 2932 x[1+idx] = s2; 2933 } 2934 /* backward solve the upper triangular */ 2935 for (i=n-1; i>=0; i--){ 2936 v = aa + 4*diag[i] + 4; 2937 vi = aj + diag[i] + 1; 2938 nz = ai[i+1] - diag[i] - 1; 2939 idt = 2*i; 2940 s1 = x[idt]; s2 = x[1+idt]; 2941 while (nz--) { 2942 idx = 2*(*vi++); 2943 x1 = x[idx]; x2 = x[1+idx]; 2944 s1 -= v[0]*x1 + v[2]*x2; 2945 s2 -= v[1]*x1 + v[3]*x2; 2946 v += 4; 2947 } 2948 v = aa + 4*diag[i]; 2949 x[idt] = v[0]*s1 + v[2]*s2; 2950 x[1+idt] = v[1]*s1 + v[3]*s2; 2951 } 2952 2953 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2954 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2955 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr); 2956 PetscFunctionReturn(0); 2957 } 2958 2959 #undef __FUNCT__ 2960 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2961 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2962 { 2963 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2964 IS iscol=a->col,isrow=a->row; 2965 PetscErrorCode ierr; 2966 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2967 PetscInt *diag = a->diag; 2968 MatScalar *aa=a->a,*v; 2969 PetscScalar *x,*b,s1,*t; 2970 2971 PetscFunctionBegin; 2972 if (!n) PetscFunctionReturn(0); 2973 2974 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2975 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2976 t = a->solve_work; 2977 2978 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2979 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2980 2981 /* forward solve the lower triangular */ 2982 t[0] = b[*r++]; 2983 for (i=1; i<n; i++) { 2984 v = aa + ai[i]; 2985 vi = aj + ai[i]; 2986 nz = diag[i] - ai[i]; 2987 s1 = b[*r++]; 2988 while (nz--) { 2989 s1 -= (*v++)*t[*vi++]; 2990 } 2991 t[i] = s1; 2992 } 2993 /* backward solve the upper triangular */ 2994 for (i=n-1; i>=0; i--){ 2995 v = aa + diag[i] + 1; 2996 vi = aj + diag[i] + 1; 2997 nz = ai[i+1] - diag[i] - 1; 2998 s1 = t[i]; 2999 while (nz--) { 3000 s1 -= (*v++)*t[*vi++]; 3001 } 3002 x[*c--] = t[i] = aa[diag[i]]*s1; 3003 } 3004 3005 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3006 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3007 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3008 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3009 ierr = PetscLogFlops(2*1*(a->nz) - A->cmap.n);CHKERRQ(ierr); 3010 PetscFunctionReturn(0); 3011 } 3012 /* 3013 Special case where the matrix was ILU(0) factored in the natural 3014 ordering. This eliminates the need for the column and row permutation. 3015 */ 3016 #undef __FUNCT__ 3017 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3018 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3019 { 3020 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3021 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3022 PetscErrorCode ierr; 3023 PetscInt *diag = a->diag; 3024 MatScalar *aa=a->a; 3025 PetscScalar *x,*b; 3026 PetscScalar s1,x1; 3027 MatScalar *v; 3028 PetscInt jdx,idt,idx,nz,*vi,i; 3029 3030 PetscFunctionBegin; 3031 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3032 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3033 3034 /* forward solve the lower triangular */ 3035 idx = 0; 3036 x[0] = b[0]; 3037 for (i=1; i<n; i++) { 3038 v = aa + ai[i]; 3039 vi = aj + ai[i]; 3040 nz = diag[i] - ai[i]; 3041 idx += 1; 3042 s1 = b[idx]; 3043 while (nz--) { 3044 jdx = *vi++; 3045 x1 = x[jdx]; 3046 s1 -= v[0]*x1; 3047 v += 1; 3048 } 3049 x[idx] = s1; 3050 } 3051 /* backward solve the upper triangular */ 3052 for (i=n-1; i>=0; i--){ 3053 v = aa + diag[i] + 1; 3054 vi = aj + diag[i] + 1; 3055 nz = ai[i+1] - diag[i] - 1; 3056 idt = i; 3057 s1 = x[idt]; 3058 while (nz--) { 3059 idx = *vi++; 3060 x1 = x[idx]; 3061 s1 -= v[0]*x1; 3062 v += 1; 3063 } 3064 v = aa + diag[i]; 3065 x[idt] = v[0]*s1; 3066 } 3067 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3068 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3069 ierr = PetscLogFlops(2*(a->nz) - A->cmap.n);CHKERRQ(ierr); 3070 PetscFunctionReturn(0); 3071 } 3072 3073 /* ----------------------------------------------------------------*/ 3074 /* 3075 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3076 except that the data structure of Mat_SeqAIJ is slightly different. 3077 Not a good example of code reuse. 3078 */ 3079 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,MatDuplicateOption,Mat*); 3080 3081 #undef __FUNCT__ 3082 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3083 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 3084 { 3085 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3086 IS isicol; 3087 PetscErrorCode ierr; 3088 PetscInt *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3089 PetscInt *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3090 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3091 PetscInt incrlev,nnz,i,bs = A->rmap.bs,bs2 = a->bs2,levels,diagonal_fill,dd; 3092 PetscTruth col_identity,row_identity,flg; 3093 PetscReal f; 3094 3095 PetscFunctionBegin; 3096 f = info->fill; 3097 levels = (PetscInt)info->levels; 3098 diagonal_fill = (PetscInt)info->diagonal_fill; 3099 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3100 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3101 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3102 3103 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3104 ierr = MatDuplicateNoCreate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3105 (*fact)->factor = MAT_FACTOR_LU; 3106 b = (Mat_SeqBAIJ*)(*fact)->data; 3107 ierr = MatMissingDiagonal_SeqBAIJ(*fact,&flg,&dd);CHKERRQ(ierr); 3108 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd); 3109 b->row = isrow; 3110 b->col = iscol; 3111 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3112 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3113 b->icol = isicol; 3114 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3115 ierr = PetscMalloc(((*fact)->rmap.N+1+(*fact)->rmap.bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3116 } else { /* general case perform the symbolic factorization */ 3117 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3118 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3119 3120 /* get new row pointers */ 3121 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 3122 ainew[0] = 0; 3123 /* don't know how many column pointers are needed so estimate */ 3124 jmax = (PetscInt)(f*ai[n] + 1); 3125 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 3126 /* ajfill is level of fill for each fill entry */ 3127 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 3128 /* fill is a linked list of nonzeros in active row */ 3129 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 3130 /* im is level for each filled value */ 3131 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 3132 /* dloc is location of diagonal in factor */ 3133 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 3134 dloc[0] = 0; 3135 for (prow=0; prow<n; prow++) { 3136 3137 /* copy prow into linked list */ 3138 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3139 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3140 xi = aj + ai[r[prow]]; 3141 fill[n] = n; 3142 fill[prow] = -1; /* marker for diagonal entry */ 3143 while (nz--) { 3144 fm = n; 3145 idx = ic[*xi++]; 3146 do { 3147 m = fm; 3148 fm = fill[m]; 3149 } while (fm < idx); 3150 fill[m] = idx; 3151 fill[idx] = fm; 3152 im[idx] = 0; 3153 } 3154 3155 /* make sure diagonal entry is included */ 3156 if (diagonal_fill && fill[prow] == -1) { 3157 fm = n; 3158 while (fill[fm] < prow) fm = fill[fm]; 3159 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3160 fill[fm] = prow; 3161 im[prow] = 0; 3162 nzf++; 3163 dcount++; 3164 } 3165 3166 nzi = 0; 3167 row = fill[n]; 3168 while (row < prow) { 3169 incrlev = im[row] + 1; 3170 nz = dloc[row]; 3171 xi = ajnew + ainew[row] + nz + 1; 3172 flev = ajfill + ainew[row] + nz + 1; 3173 nnz = ainew[row+1] - ainew[row] - nz - 1; 3174 fm = row; 3175 while (nnz-- > 0) { 3176 idx = *xi++; 3177 if (*flev + incrlev > levels) { 3178 flev++; 3179 continue; 3180 } 3181 do { 3182 m = fm; 3183 fm = fill[m]; 3184 } while (fm < idx); 3185 if (fm != idx) { 3186 im[idx] = *flev + incrlev; 3187 fill[m] = idx; 3188 fill[idx] = fm; 3189 fm = idx; 3190 nzf++; 3191 } else { 3192 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3193 } 3194 flev++; 3195 } 3196 row = fill[row]; 3197 nzi++; 3198 } 3199 /* copy new filled row into permanent storage */ 3200 ainew[prow+1] = ainew[prow] + nzf; 3201 if (ainew[prow+1] > jmax) { 3202 3203 /* estimate how much additional space we will need */ 3204 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3205 /* just double the memory each time */ 3206 PetscInt maxadd = jmax; 3207 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3208 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3209 jmax += maxadd; 3210 3211 /* allocate a longer ajnew and ajfill */ 3212 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3213 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3214 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3215 ajnew = xi; 3216 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3217 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3218 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3219 ajfill = xi; 3220 reallocate++; /* count how many reallocations are needed */ 3221 } 3222 xi = ajnew + ainew[prow]; 3223 flev = ajfill + ainew[prow]; 3224 dloc[prow] = nzi; 3225 fm = fill[n]; 3226 while (nzf--) { 3227 *xi++ = fm; 3228 *flev++ = im[fm]; 3229 fm = fill[fm]; 3230 } 3231 /* make sure row has diagonal entry */ 3232 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3233 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3234 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3235 } 3236 } 3237 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3238 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3239 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3240 ierr = PetscFree(fill);CHKERRQ(ierr); 3241 ierr = PetscFree(im);CHKERRQ(ierr); 3242 3243 #if defined(PETSC_USE_INFO) 3244 { 3245 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3246 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3247 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3248 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3249 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3250 if (diagonal_fill) { 3251 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3252 } 3253 } 3254 #endif 3255 3256 /* put together the new matrix */ 3257 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3258 ierr = PetscLogObjectParent(*fact,isicol);CHKERRQ(ierr); 3259 b = (Mat_SeqBAIJ*)(*fact)->data; 3260 b->free_a = PETSC_TRUE; 3261 b->free_ij = PETSC_TRUE; 3262 b->singlemalloc = PETSC_FALSE; 3263 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3264 b->j = ajnew; 3265 b->i = ainew; 3266 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3267 b->diag = dloc; 3268 b->ilen = 0; 3269 b->imax = 0; 3270 b->row = isrow; 3271 b->col = iscol; 3272 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3273 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3274 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3275 b->icol = isicol; 3276 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3277 /* In b structure: Free imax, ilen, old a, old j. 3278 Allocate dloc, solve_work, new a, new j */ 3279 ierr = PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 3280 b->maxnz = b->nz = ainew[n]; 3281 (*fact)->factor = MAT_FACTOR_LU; 3282 3283 (*fact)->info.factor_mallocs = reallocate; 3284 (*fact)->info.fill_ratio_given = f; 3285 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3286 } 3287 3288 if (row_identity && col_identity) { 3289 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3290 } 3291 PetscFunctionReturn(0); 3292 } 3293 3294 #undef __FUNCT__ 3295 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3296 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3297 { 3298 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3299 /* int i,*AJ=a->j,nz=a->nz; */ 3300 PetscFunctionBegin; 3301 /* Undo Column scaling */ 3302 /* while (nz--) { */ 3303 /* AJ[i] = AJ[i]/4; */ 3304 /* } */ 3305 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3306 A->ops->setunfactored = PETSC_NULL; 3307 PetscFunctionReturn(0); 3308 } 3309 3310 #undef __FUNCT__ 3311 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3312 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3313 { 3314 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3315 PetscInt *AJ=a->j,nz=a->nz; 3316 unsigned short *aj=(unsigned short *)AJ; 3317 PetscFunctionBegin; 3318 /* Is this really necessary? */ 3319 while (nz--) { 3320 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3321 } 3322 A->ops->setunfactored = PETSC_NULL; 3323 PetscFunctionReturn(0); 3324 } 3325 3326 #undef __FUNCT__ 3327 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3328 PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3329 { 3330 PetscErrorCode ierr; 3331 /* 3332 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3333 with natural ordering 3334 */ 3335 PetscFunctionBegin; 3336 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3337 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3338 switch (inA->rmap.bs) { 3339 case 1: 3340 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3341 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=1\n");CHKERRQ(ierr); 3342 break; 3343 case 2: 3344 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3345 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=2\n");CHKERRQ(ierr); 3346 break; 3347 case 3: 3348 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3349 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=3\n");CHKERRQ(ierr); 3350 break; 3351 case 4: 3352 #if defined(PETSC_USE_MAT_SINGLE) 3353 { 3354 PetscTruth sse_enabled_local; 3355 PetscErrorCode ierr; 3356 ierr = PetscSSEIsEnabled(((PetscObject)inA)->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3357 if (sse_enabled_local) { 3358 # if defined(PETSC_HAVE_SSE) 3359 int i,*AJ=a->j,nz=a->nz,n=a->mbs; 3360 if (n==(unsigned short)n) { 3361 unsigned short *aj=(unsigned short *)AJ; 3362 for (i=0;i<nz;i++) { 3363 aj[i] = (unsigned short)AJ[i]; 3364 } 3365 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3366 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3367 ierr = PetscInfo(inA,"Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");CHKERRQ(ierr); 3368 } else { 3369 /* Scale the column indices for easier indexing in MatSolve. */ 3370 /* for (i=0;i<nz;i++) { */ 3371 /* AJ[i] = AJ[i]*4; */ 3372 /* } */ 3373 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 3374 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3375 ierr = PetscInfo(inA,"Using special SSE, in-place natural ordering, int j index factor BS=4\n");CHKERRQ(ierr); 3376 } 3377 # else 3378 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3379 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3380 # endif 3381 } else { 3382 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3383 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");CHKERRQ(ierr); 3384 } 3385 } 3386 #else 3387 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3388 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");CHKERRQ(ierr); 3389 #endif 3390 break; 3391 case 5: 3392 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3393 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=5\n");CHKERRQ(ierr); 3394 break; 3395 case 6: 3396 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3397 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=6\n");CHKERRQ(ierr); 3398 break; 3399 case 7: 3400 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3401 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=7\n");CHKERRQ(ierr); 3402 break; 3403 } 3404 PetscFunctionReturn(0); 3405 } 3406 3407 #undef __FUNCT__ 3408 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3409 PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A) 3410 { 3411 /* 3412 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3413 with natural ordering 3414 */ 3415 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3416 IS row = a->row, col = a->col; 3417 PetscTruth row_identity, col_identity; 3418 PetscTruth use_natural; 3419 PetscErrorCode ierr; 3420 3421 PetscFunctionBegin; 3422 3423 use_natural = PETSC_FALSE; 3424 if (row && col) { 3425 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3426 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3427 3428 if (row_identity && col_identity) { 3429 use_natural = PETSC_TRUE; 3430 } 3431 } else { 3432 use_natural = PETSC_TRUE; 3433 } 3434 3435 switch (A->rmap.bs) { 3436 case 1: 3437 if (use_natural) { 3438 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3439 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3440 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=1\n");CHKERRQ(ierr); 3441 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");CHKERRQ(ierr); 3442 } else { 3443 A->ops->solve = MatSolve_SeqBAIJ_1; 3444 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3445 } 3446 break; 3447 case 2: 3448 if (use_natural) { 3449 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3450 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3451 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=2\n");CHKERRQ(ierr); 3452 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");CHKERRQ(ierr); 3453 } else { 3454 A->ops->solve = MatSolve_SeqBAIJ_2; 3455 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3456 } 3457 break; 3458 case 3: 3459 if (use_natural) { 3460 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3461 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3462 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=3\n");CHKERRQ(ierr); 3463 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");CHKERRQ(ierr); 3464 } else { 3465 A->ops->solve = MatSolve_SeqBAIJ_3; 3466 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3467 } 3468 break; 3469 case 4: 3470 { 3471 PetscTruth sse_enabled_local; 3472 ierr = PetscSSEIsEnabled(((PetscObject)A)->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3473 if (use_natural) { 3474 #if defined(PETSC_USE_MAT_SINGLE) 3475 if (sse_enabled_local) { /* Natural + Single + SSE */ 3476 # if defined(PETSC_HAVE_SSE) 3477 PetscInt n=a->mbs; 3478 if (n==(unsigned short)n) { 3479 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3480 ierr = PetscInfo(A,"Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");CHKERRQ(ierr); 3481 } else { 3482 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3483 ierr = PetscInfo(A,"Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");CHKERRQ(ierr); 3484 } 3485 # else 3486 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3487 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3488 # endif 3489 } else { /* Natural + Single */ 3490 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3491 ierr = PetscInfo(A,"Using single precision, in-place, natural ordering solve BS=4\n");CHKERRQ(ierr); 3492 } 3493 #else 3494 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3495 ierr = PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");CHKERRQ(ierr); 3496 #endif 3497 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3498 ierr = PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");CHKERRQ(ierr); 3499 } else { /* Arbitrary ordering */ 3500 #if defined(PETSC_USE_MAT_SINGLE) 3501 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3502 # if defined(PETSC_HAVE_SSE) 3503 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3504 ierr = PetscInfo(A,"Using single precision, SSE solve BS=4\n");CHKERRQ(ierr); 3505 # else 3506 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3507 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3508 # endif 3509 } else { /* Arbitrary + Single */ 3510 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3511 ierr = PetscInfo(A,"Using single precision solve BS=4\n");CHKERRQ(ierr); 3512 } 3513 #else 3514 A->ops->solve = MatSolve_SeqBAIJ_4; 3515 #endif 3516 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3517 } 3518 } 3519 break; 3520 case 5: 3521 if (use_natural) { 3522 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3523 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3524 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");CHKERRQ(ierr); 3525 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");CHKERRQ(ierr); 3526 } else { 3527 A->ops->solve = MatSolve_SeqBAIJ_5; 3528 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3529 } 3530 break; 3531 case 6: 3532 if (use_natural) { 3533 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3534 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3535 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");CHKERRQ(ierr); 3536 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");CHKERRQ(ierr); 3537 } else { 3538 A->ops->solve = MatSolve_SeqBAIJ_6; 3539 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3540 } 3541 break; 3542 case 7: 3543 if (use_natural) { 3544 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3545 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3546 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");CHKERRQ(ierr); 3547 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");CHKERRQ(ierr); 3548 } else { 3549 A->ops->solve = MatSolve_SeqBAIJ_7; 3550 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3551 } 3552 break; 3553 default: 3554 A->ops->solve = MatSolve_SeqBAIJ_N; 3555 break; 3556 } 3557 PetscFunctionReturn(0); 3558 } 3559 3560 #undef __FUNCT__ 3561 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3562 PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) 3563 { 3564 PetscErrorCode ierr; 3565 3566 PetscFunctionBegin; 3567 ierr = MatSeqBAIJ_UpdateSolvers(A); 3568 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3569 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3570 } else { 3571 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3572 } 3573 PetscFunctionReturn(0); 3574 } 3575 3576 #undef __FUNCT__ 3577 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3578 PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) 3579 { 3580 PetscErrorCode ierr; 3581 3582 PetscFunctionBegin; 3583 ierr = MatSeqBAIJ_UpdateSolvers(A); 3584 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3585 PetscFunctionReturn(0); 3586 } 3587 3588 3589