1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 7 #include "src/mat/impls/baij/seq/baij.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 PetscErrorCode ierr; 17 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 PetscInt *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 PetscScalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 24 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 27 /* forward solve the U^T */ 28 for (i=0; i<n; i++) { 29 30 v = aa + diag[i]; 31 /* multiply by the inverse of the block diagonal */ 32 s1 = (*v++)*x[i]; 33 vi = aj + diag[i] + 1; 34 nz = ai[i+1] - diag[i] - 1; 35 while (nz--) { 36 x[*vi++] -= (*v++)*s1; 37 } 38 x[i] = s1; 39 } 40 /* backward solve the L^T */ 41 for (i=n-1; i>=0; i--){ 42 v = aa + diag[i] - 1; 43 vi = aj + diag[i] - 1; 44 nz = diag[i] - ai[i]; 45 s1 = x[i]; 46 while (nz--) { 47 x[*vi--] -= (*v--)*s1; 48 } 49 } 50 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 51 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52 ierr = PetscLogFlops(2*(a->nz) - A->cmap.n);CHKERRQ(ierr); 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 59 { 60 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 61 PetscErrorCode ierr; 62 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 63 PetscInt *diag = a->diag,oidx; 64 MatScalar *aa=a->a,*v; 65 PetscScalar s1,s2,x1,x2; 66 PetscScalar *x,*b; 67 68 PetscFunctionBegin; 69 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 70 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 71 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 72 73 /* forward solve the U^T */ 74 idx = 0; 75 for (i=0; i<n; i++) { 76 77 v = aa + 4*diag[i]; 78 /* multiply by the inverse of the block diagonal */ 79 x1 = x[idx]; x2 = x[1+idx]; 80 s1 = v[0]*x1 + v[1]*x2; 81 s2 = v[2]*x1 + v[3]*x2; 82 v += 4; 83 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 oidx = 2*(*vi++); 88 x[oidx] -= v[0]*s1 + v[1]*s2; 89 x[oidx+1] -= v[2]*s1 + v[3]*s2; 90 v += 4; 91 } 92 x[idx] = s1;x[1+idx] = s2; 93 idx += 2; 94 } 95 /* backward solve the L^T */ 96 for (i=n-1; i>=0; i--){ 97 v = aa + 4*diag[i] - 4; 98 vi = aj + diag[i] - 1; 99 nz = diag[i] - ai[i]; 100 idt = 2*i; 101 s1 = x[idt]; s2 = x[1+idt]; 102 while (nz--) { 103 idx = 2*(*vi--); 104 x[idx] -= v[0]*s1 + v[1]*s2; 105 x[idx+1] -= v[2]*s1 + v[3]*s2; 106 v -= 4; 107 } 108 } 109 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 111 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr); 112 PetscFunctionReturn(0); 113 } 114 115 #undef __FUNCT__ 116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 118 { 119 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 120 PetscErrorCode ierr; 121 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 122 PetscInt *diag = a->diag,oidx; 123 MatScalar *aa=a->a,*v; 124 PetscScalar s1,s2,s3,x1,x2,x3; 125 PetscScalar *x,*b; 126 127 PetscFunctionBegin; 128 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 129 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 130 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131 132 /* forward solve the U^T */ 133 idx = 0; 134 for (i=0; i<n; i++) { 135 136 v = aa + 9*diag[i]; 137 /* multiply by the inverse of the block diagonal */ 138 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 140 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 141 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 142 v += 9; 143 144 vi = aj + diag[i] + 1; 145 nz = ai[i+1] - diag[i] - 1; 146 while (nz--) { 147 oidx = 3*(*vi++); 148 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 149 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 150 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 151 v += 9; 152 } 153 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 154 idx += 3; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + 9*diag[i] - 9; 159 vi = aj + diag[i] - 1; 160 nz = diag[i] - ai[i]; 161 idt = 3*i; 162 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 163 while (nz--) { 164 idx = 3*(*vi--); 165 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 166 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 167 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 168 v -= 9; 169 } 170 } 171 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 172 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 173 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr); 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 180 { 181 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182 PetscErrorCode ierr; 183 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 184 PetscInt *diag = a->diag,oidx; 185 MatScalar *aa=a->a,*v; 186 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 187 PetscScalar *x,*b; 188 189 PetscFunctionBegin; 190 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 191 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 192 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193 194 /* forward solve the U^T */ 195 idx = 0; 196 for (i=0; i<n; i++) { 197 198 v = aa + 16*diag[i]; 199 /* multiply by the inverse of the block diagonal */ 200 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 201 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 202 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 203 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 204 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 205 v += 16; 206 207 vi = aj + diag[i] + 1; 208 nz = ai[i+1] - diag[i] - 1; 209 while (nz--) { 210 oidx = 4*(*vi++); 211 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 212 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 213 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 214 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 215 v += 16; 216 } 217 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 218 idx += 4; 219 } 220 /* backward solve the L^T */ 221 for (i=n-1; i>=0; i--){ 222 v = aa + 16*diag[i] - 16; 223 vi = aj + diag[i] - 1; 224 nz = diag[i] - ai[i]; 225 idt = 4*i; 226 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 227 while (nz--) { 228 idx = 4*(*vi--); 229 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 230 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 231 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 232 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 233 v -= 16; 234 } 235 } 236 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 237 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 238 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 #undef __FUNCT__ 243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245 { 246 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 247 PetscErrorCode ierr; 248 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 249 PetscInt *diag = a->diag,oidx; 250 MatScalar *aa=a->a,*v; 251 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 252 PetscScalar *x,*b; 253 254 PetscFunctionBegin; 255 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 256 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 257 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 258 259 /* forward solve the U^T */ 260 idx = 0; 261 for (i=0; i<n; i++) { 262 263 v = aa + 25*diag[i]; 264 /* multiply by the inverse of the block diagonal */ 265 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 266 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 267 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 268 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 269 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 270 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 271 v += 25; 272 273 vi = aj + diag[i] + 1; 274 nz = ai[i+1] - diag[i] - 1; 275 while (nz--) { 276 oidx = 5*(*vi++); 277 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 278 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 279 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 280 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 281 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 282 v += 25; 283 } 284 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 285 idx += 5; 286 } 287 /* backward solve the L^T */ 288 for (i=n-1; i>=0; i--){ 289 v = aa + 25*diag[i] - 25; 290 vi = aj + diag[i] - 1; 291 nz = diag[i] - ai[i]; 292 idt = 5*i; 293 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 294 while (nz--) { 295 idx = 5*(*vi--); 296 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 297 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 298 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 299 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 300 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 301 v -= 25; 302 } 303 } 304 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 305 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 #undef __FUNCT__ 311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 313 { 314 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 315 PetscErrorCode ierr; 316 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 317 PetscInt *diag = a->diag,oidx; 318 MatScalar *aa=a->a,*v; 319 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 320 PetscScalar *x,*b; 321 322 PetscFunctionBegin; 323 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 324 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 325 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 326 327 /* forward solve the U^T */ 328 idx = 0; 329 for (i=0; i<n; i++) { 330 331 v = aa + 36*diag[i]; 332 /* multiply by the inverse of the block diagonal */ 333 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 334 x6 = x[5+idx]; 335 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 336 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 337 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 338 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 339 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 340 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 341 v += 36; 342 343 vi = aj + diag[i] + 1; 344 nz = ai[i+1] - diag[i] - 1; 345 while (nz--) { 346 oidx = 6*(*vi++); 347 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 348 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 349 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 350 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 351 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 352 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 353 v += 36; 354 } 355 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 356 x[5+idx] = s6; 357 idx += 6; 358 } 359 /* backward solve the L^T */ 360 for (i=n-1; i>=0; i--){ 361 v = aa + 36*diag[i] - 36; 362 vi = aj + diag[i] - 1; 363 nz = diag[i] - ai[i]; 364 idt = 6*i; 365 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 366 s6 = x[5+idt]; 367 while (nz--) { 368 idx = 6*(*vi--); 369 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 370 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 371 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 372 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 373 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 374 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 375 v -= 36; 376 } 377 } 378 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 379 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 381 PetscFunctionReturn(0); 382 } 383 384 #undef __FUNCT__ 385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 387 { 388 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 389 PetscErrorCode ierr; 390 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 391 PetscInt *diag = a->diag,oidx; 392 MatScalar *aa=a->a,*v; 393 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 394 PetscScalar *x,*b; 395 396 PetscFunctionBegin; 397 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 398 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 399 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 400 401 /* forward solve the U^T */ 402 idx = 0; 403 for (i=0; i<n; i++) { 404 405 v = aa + 49*diag[i]; 406 /* multiply by the inverse of the block diagonal */ 407 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 408 x6 = x[5+idx]; x7 = x[6+idx]; 409 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 410 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 411 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 412 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 413 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 414 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 415 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 416 v += 49; 417 418 vi = aj + diag[i] + 1; 419 nz = ai[i+1] - diag[i] - 1; 420 while (nz--) { 421 oidx = 7*(*vi++); 422 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 423 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 424 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 425 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 426 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 427 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 428 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 429 v += 49; 430 } 431 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 432 x[5+idx] = s6;x[6+idx] = s7; 433 idx += 7; 434 } 435 /* backward solve the L^T */ 436 for (i=n-1; i>=0; i--){ 437 v = aa + 49*diag[i] - 49; 438 vi = aj + diag[i] - 1; 439 nz = diag[i] - ai[i]; 440 idt = 7*i; 441 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 442 s6 = x[5+idt];s7 = x[6+idt]; 443 while (nz--) { 444 idx = 7*(*vi--); 445 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 446 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 447 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 448 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 449 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 450 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 451 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 452 v -= 49; 453 } 454 } 455 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);CHKERRQ(ierr); 458 PetscFunctionReturn(0); 459 } 460 461 /*---------------------------------------------------------------------------------------------*/ 462 #undef __FUNCT__ 463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 467 IS iscol=a->col,isrow=a->row; 468 PetscErrorCode ierr; 469 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 470 PetscInt *diag = a->diag; 471 MatScalar *aa=a->a,*v; 472 PetscScalar s1,*x,*b,*t; 473 474 PetscFunctionBegin; 475 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 476 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 477 t = a->solve_work; 478 479 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 480 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 481 482 /* copy the b into temp work space according to permutation */ 483 for (i=0; i<n; i++) { 484 t[i] = b[c[i]]; 485 } 486 487 /* forward solve the U^T */ 488 for (i=0; i<n; i++) { 489 490 v = aa + diag[i]; 491 /* multiply by the inverse of the block diagonal */ 492 s1 = (*v++)*t[i]; 493 vi = aj + diag[i] + 1; 494 nz = ai[i+1] - diag[i] - 1; 495 while (nz--) { 496 t[*vi++] -= (*v++)*s1; 497 } 498 t[i] = s1; 499 } 500 /* backward solve the L^T */ 501 for (i=n-1; i>=0; i--){ 502 v = aa + diag[i] - 1; 503 vi = aj + diag[i] - 1; 504 nz = diag[i] - ai[i]; 505 s1 = t[i]; 506 while (nz--) { 507 t[*vi--] -= (*v--)*s1; 508 } 509 } 510 511 /* copy t into x according to permutation */ 512 for (i=0; i<n; i++) { 513 x[r[i]] = t[i]; 514 } 515 516 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 517 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 518 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 519 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 520 ierr = PetscLogFlops(2*(a->nz) - A->cmap.n);CHKERRQ(ierr); 521 PetscFunctionReturn(0); 522 } 523 524 #undef __FUNCT__ 525 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 526 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 527 { 528 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 529 IS iscol=a->col,isrow=a->row; 530 PetscErrorCode ierr; 531 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 532 PetscInt *diag = a->diag,ii,ic,ir,oidx; 533 MatScalar *aa=a->a,*v; 534 PetscScalar s1,s2,x1,x2; 535 PetscScalar *x,*b,*t; 536 537 PetscFunctionBegin; 538 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 539 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 540 t = a->solve_work; 541 542 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 543 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 544 545 /* copy the b into temp work space according to permutation */ 546 ii = 0; 547 for (i=0; i<n; i++) { 548 ic = 2*c[i]; 549 t[ii] = b[ic]; 550 t[ii+1] = b[ic+1]; 551 ii += 2; 552 } 553 554 /* forward solve the U^T */ 555 idx = 0; 556 for (i=0; i<n; i++) { 557 558 v = aa + 4*diag[i]; 559 /* multiply by the inverse of the block diagonal */ 560 x1 = t[idx]; x2 = t[1+idx]; 561 s1 = v[0]*x1 + v[1]*x2; 562 s2 = v[2]*x1 + v[3]*x2; 563 v += 4; 564 565 vi = aj + diag[i] + 1; 566 nz = ai[i+1] - diag[i] - 1; 567 while (nz--) { 568 oidx = 2*(*vi++); 569 t[oidx] -= v[0]*s1 + v[1]*s2; 570 t[oidx+1] -= v[2]*s1 + v[3]*s2; 571 v += 4; 572 } 573 t[idx] = s1;t[1+idx] = s2; 574 idx += 2; 575 } 576 /* backward solve the L^T */ 577 for (i=n-1; i>=0; i--){ 578 v = aa + 4*diag[i] - 4; 579 vi = aj + diag[i] - 1; 580 nz = diag[i] - ai[i]; 581 idt = 2*i; 582 s1 = t[idt]; s2 = t[1+idt]; 583 while (nz--) { 584 idx = 2*(*vi--); 585 t[idx] -= v[0]*s1 + v[1]*s2; 586 t[idx+1] -= v[2]*s1 + v[3]*s2; 587 v -= 4; 588 } 589 } 590 591 /* copy t into x according to permutation */ 592 ii = 0; 593 for (i=0; i<n; i++) { 594 ir = 2*r[i]; 595 x[ir] = t[ii]; 596 x[ir+1] = t[ii+1]; 597 ii += 2; 598 } 599 600 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 601 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 602 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 603 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 604 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr); 605 PetscFunctionReturn(0); 606 } 607 608 #undef __FUNCT__ 609 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 610 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 611 { 612 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 613 IS iscol=a->col,isrow=a->row; 614 PetscErrorCode ierr; 615 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 616 PetscInt *diag = a->diag,ii,ic,ir,oidx; 617 MatScalar *aa=a->a,*v; 618 PetscScalar s1,s2,s3,x1,x2,x3; 619 PetscScalar *x,*b,*t; 620 621 PetscFunctionBegin; 622 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 623 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 624 t = a->solve_work; 625 626 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 627 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 628 629 /* copy the b into temp work space according to permutation */ 630 ii = 0; 631 for (i=0; i<n; i++) { 632 ic = 3*c[i]; 633 t[ii] = b[ic]; 634 t[ii+1] = b[ic+1]; 635 t[ii+2] = b[ic+2]; 636 ii += 3; 637 } 638 639 /* forward solve the U^T */ 640 idx = 0; 641 for (i=0; i<n; i++) { 642 643 v = aa + 9*diag[i]; 644 /* multiply by the inverse of the block diagonal */ 645 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 646 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 647 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 648 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 649 v += 9; 650 651 vi = aj + diag[i] + 1; 652 nz = ai[i+1] - diag[i] - 1; 653 while (nz--) { 654 oidx = 3*(*vi++); 655 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 656 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 657 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 658 v += 9; 659 } 660 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 661 idx += 3; 662 } 663 /* backward solve the L^T */ 664 for (i=n-1; i>=0; i--){ 665 v = aa + 9*diag[i] - 9; 666 vi = aj + diag[i] - 1; 667 nz = diag[i] - ai[i]; 668 idt = 3*i; 669 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 670 while (nz--) { 671 idx = 3*(*vi--); 672 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 673 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 674 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 675 v -= 9; 676 } 677 } 678 679 /* copy t into x according to permutation */ 680 ii = 0; 681 for (i=0; i<n; i++) { 682 ir = 3*r[i]; 683 x[ir] = t[ii]; 684 x[ir+1] = t[ii+1]; 685 x[ir+2] = t[ii+2]; 686 ii += 3; 687 } 688 689 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 690 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 691 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 692 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 693 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr); 694 PetscFunctionReturn(0); 695 } 696 697 #undef __FUNCT__ 698 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 699 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 700 { 701 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 702 IS iscol=a->col,isrow=a->row; 703 PetscErrorCode ierr; 704 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 705 PetscInt *diag = a->diag,ii,ic,ir,oidx; 706 MatScalar *aa=a->a,*v; 707 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 708 PetscScalar *x,*b,*t; 709 710 PetscFunctionBegin; 711 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 712 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 713 t = a->solve_work; 714 715 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 716 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 717 718 /* copy the b into temp work space according to permutation */ 719 ii = 0; 720 for (i=0; i<n; i++) { 721 ic = 4*c[i]; 722 t[ii] = b[ic]; 723 t[ii+1] = b[ic+1]; 724 t[ii+2] = b[ic+2]; 725 t[ii+3] = b[ic+3]; 726 ii += 4; 727 } 728 729 /* forward solve the U^T */ 730 idx = 0; 731 for (i=0; i<n; i++) { 732 733 v = aa + 16*diag[i]; 734 /* multiply by the inverse of the block diagonal */ 735 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 736 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 737 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 738 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 739 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 740 v += 16; 741 742 vi = aj + diag[i] + 1; 743 nz = ai[i+1] - diag[i] - 1; 744 while (nz--) { 745 oidx = 4*(*vi++); 746 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 747 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 748 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 749 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 750 v += 16; 751 } 752 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 753 idx += 4; 754 } 755 /* backward solve the L^T */ 756 for (i=n-1; i>=0; i--){ 757 v = aa + 16*diag[i] - 16; 758 vi = aj + diag[i] - 1; 759 nz = diag[i] - ai[i]; 760 idt = 4*i; 761 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 762 while (nz--) { 763 idx = 4*(*vi--); 764 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 765 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 766 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 767 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 768 v -= 16; 769 } 770 } 771 772 /* copy t into x according to permutation */ 773 ii = 0; 774 for (i=0; i<n; i++) { 775 ir = 4*r[i]; 776 x[ir] = t[ii]; 777 x[ir+1] = t[ii+1]; 778 x[ir+2] = t[ii+2]; 779 x[ir+3] = t[ii+3]; 780 ii += 4; 781 } 782 783 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 784 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 785 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 786 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 787 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 788 PetscFunctionReturn(0); 789 } 790 791 #undef __FUNCT__ 792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 793 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 794 { 795 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796 IS iscol=a->col,isrow=a->row; 797 PetscErrorCode ierr; 798 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 799 PetscInt *diag = a->diag,ii,ic,ir,oidx; 800 MatScalar *aa=a->a,*v; 801 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 802 PetscScalar *x,*b,*t; 803 804 PetscFunctionBegin; 805 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 806 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 807 t = a->solve_work; 808 809 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 810 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 811 812 /* copy the b into temp work space according to permutation */ 813 ii = 0; 814 for (i=0; i<n; i++) { 815 ic = 5*c[i]; 816 t[ii] = b[ic]; 817 t[ii+1] = b[ic+1]; 818 t[ii+2] = b[ic+2]; 819 t[ii+3] = b[ic+3]; 820 t[ii+4] = b[ic+4]; 821 ii += 5; 822 } 823 824 /* forward solve the U^T */ 825 idx = 0; 826 for (i=0; i<n; i++) { 827 828 v = aa + 25*diag[i]; 829 /* multiply by the inverse of the block diagonal */ 830 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 831 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 832 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 833 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 834 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 835 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 836 v += 25; 837 838 vi = aj + diag[i] + 1; 839 nz = ai[i+1] - diag[i] - 1; 840 while (nz--) { 841 oidx = 5*(*vi++); 842 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 843 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 844 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 845 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 846 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 847 v += 25; 848 } 849 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 850 idx += 5; 851 } 852 /* backward solve the L^T */ 853 for (i=n-1; i>=0; i--){ 854 v = aa + 25*diag[i] - 25; 855 vi = aj + diag[i] - 1; 856 nz = diag[i] - ai[i]; 857 idt = 5*i; 858 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 859 while (nz--) { 860 idx = 5*(*vi--); 861 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 862 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 863 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 864 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 865 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 866 v -= 25; 867 } 868 } 869 870 /* copy t into x according to permutation */ 871 ii = 0; 872 for (i=0; i<n; i++) { 873 ir = 5*r[i]; 874 x[ir] = t[ii]; 875 x[ir+1] = t[ii+1]; 876 x[ir+2] = t[ii+2]; 877 x[ir+3] = t[ii+3]; 878 x[ir+4] = t[ii+4]; 879 ii += 5; 880 } 881 882 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 883 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 884 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 885 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 886 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr); 887 PetscFunctionReturn(0); 888 } 889 890 #undef __FUNCT__ 891 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 892 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 893 { 894 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 895 IS iscol=a->col,isrow=a->row; 896 PetscErrorCode ierr; 897 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 898 PetscInt *diag = a->diag,ii,ic,ir,oidx; 899 MatScalar *aa=a->a,*v; 900 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 901 PetscScalar *x,*b,*t; 902 903 PetscFunctionBegin; 904 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 905 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 906 t = a->solve_work; 907 908 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 909 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 910 911 /* copy the b into temp work space according to permutation */ 912 ii = 0; 913 for (i=0; i<n; i++) { 914 ic = 6*c[i]; 915 t[ii] = b[ic]; 916 t[ii+1] = b[ic+1]; 917 t[ii+2] = b[ic+2]; 918 t[ii+3] = b[ic+3]; 919 t[ii+4] = b[ic+4]; 920 t[ii+5] = b[ic+5]; 921 ii += 6; 922 } 923 924 /* forward solve the U^T */ 925 idx = 0; 926 for (i=0; i<n; i++) { 927 928 v = aa + 36*diag[i]; 929 /* multiply by the inverse of the block diagonal */ 930 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 931 x6 = t[5+idx]; 932 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 933 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 934 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 935 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 936 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 937 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 938 v += 36; 939 940 vi = aj + diag[i] + 1; 941 nz = ai[i+1] - diag[i] - 1; 942 while (nz--) { 943 oidx = 6*(*vi++); 944 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 945 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 946 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 947 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 948 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 949 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 950 v += 36; 951 } 952 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 953 t[5+idx] = s6; 954 idx += 6; 955 } 956 /* backward solve the L^T */ 957 for (i=n-1; i>=0; i--){ 958 v = aa + 36*diag[i] - 36; 959 vi = aj + diag[i] - 1; 960 nz = diag[i] - ai[i]; 961 idt = 6*i; 962 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 963 s6 = t[5+idt]; 964 while (nz--) { 965 idx = 6*(*vi--); 966 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 967 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 968 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 969 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 970 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 971 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 972 v -= 36; 973 } 974 } 975 976 /* copy t into x according to permutation */ 977 ii = 0; 978 for (i=0; i<n; i++) { 979 ir = 6*r[i]; 980 x[ir] = t[ii]; 981 x[ir+1] = t[ii+1]; 982 x[ir+2] = t[ii+2]; 983 x[ir+3] = t[ii+3]; 984 x[ir+4] = t[ii+4]; 985 x[ir+5] = t[ii+5]; 986 ii += 6; 987 } 988 989 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 990 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 991 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 992 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 993 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 994 PetscFunctionReturn(0); 995 } 996 997 #undef __FUNCT__ 998 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 999 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1000 { 1001 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1002 IS iscol=a->col,isrow=a->row; 1003 PetscErrorCode ierr; 1004 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 1005 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1006 MatScalar *aa=a->a,*v; 1007 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1008 PetscScalar *x,*b,*t; 1009 1010 PetscFunctionBegin; 1011 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1012 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1013 t = a->solve_work; 1014 1015 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1016 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1017 1018 /* copy the b into temp work space according to permutation */ 1019 ii = 0; 1020 for (i=0; i<n; i++) { 1021 ic = 7*c[i]; 1022 t[ii] = b[ic]; 1023 t[ii+1] = b[ic+1]; 1024 t[ii+2] = b[ic+2]; 1025 t[ii+3] = b[ic+3]; 1026 t[ii+4] = b[ic+4]; 1027 t[ii+5] = b[ic+5]; 1028 t[ii+6] = b[ic+6]; 1029 ii += 7; 1030 } 1031 1032 /* forward solve the U^T */ 1033 idx = 0; 1034 for (i=0; i<n; i++) { 1035 1036 v = aa + 49*diag[i]; 1037 /* multiply by the inverse of the block diagonal */ 1038 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1039 x6 = t[5+idx]; x7 = t[6+idx]; 1040 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1041 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1042 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1043 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1044 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1045 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1046 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1047 v += 49; 1048 1049 vi = aj + diag[i] + 1; 1050 nz = ai[i+1] - diag[i] - 1; 1051 while (nz--) { 1052 oidx = 7*(*vi++); 1053 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1054 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1055 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1056 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1057 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1058 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1059 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1060 v += 49; 1061 } 1062 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1063 t[5+idx] = s6;t[6+idx] = s7; 1064 idx += 7; 1065 } 1066 /* backward solve the L^T */ 1067 for (i=n-1; i>=0; i--){ 1068 v = aa + 49*diag[i] - 49; 1069 vi = aj + diag[i] - 1; 1070 nz = diag[i] - ai[i]; 1071 idt = 7*i; 1072 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1073 s6 = t[5+idt];s7 = t[6+idt]; 1074 while (nz--) { 1075 idx = 7*(*vi--); 1076 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1077 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1078 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1079 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1080 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1081 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1082 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1083 v -= 49; 1084 } 1085 } 1086 1087 /* copy t into x according to permutation */ 1088 ii = 0; 1089 for (i=0; i<n; i++) { 1090 ir = 7*r[i]; 1091 x[ir] = t[ii]; 1092 x[ir+1] = t[ii+1]; 1093 x[ir+2] = t[ii+2]; 1094 x[ir+3] = t[ii+3]; 1095 x[ir+4] = t[ii+4]; 1096 x[ir+5] = t[ii+5]; 1097 x[ir+6] = t[ii+6]; 1098 ii += 7; 1099 } 1100 1101 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1102 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1103 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1104 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1105 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);CHKERRQ(ierr); 1106 PetscFunctionReturn(0); 1107 } 1108 1109 /* ----------------------------------------------------------- */ 1110 #undef __FUNCT__ 1111 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1112 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1113 { 1114 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1115 IS iscol=a->col,isrow=a->row; 1116 PetscErrorCode ierr; 1117 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1118 PetscInt nz,bs=A->rmap.bs,bs2=a->bs2,*rout,*cout; 1119 MatScalar *aa=a->a,*v; 1120 PetscScalar *x,*b,*s,*t,*ls; 1121 1122 PetscFunctionBegin; 1123 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1124 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1125 t = a->solve_work; 1126 1127 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1128 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1129 1130 /* forward solve the lower triangular */ 1131 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1132 for (i=1; i<n; i++) { 1133 v = aa + bs2*ai[i]; 1134 vi = aj + ai[i]; 1135 nz = a->diag[i] - ai[i]; 1136 s = t + bs*i; 1137 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1138 while (nz--) { 1139 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1140 v += bs2; 1141 } 1142 } 1143 /* backward solve the upper triangular */ 1144 ls = a->solve_work + A->cmap.n; 1145 for (i=n-1; i>=0; i--){ 1146 v = aa + bs2*(a->diag[i] + 1); 1147 vi = aj + a->diag[i] + 1; 1148 nz = ai[i+1] - a->diag[i] - 1; 1149 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1150 while (nz--) { 1151 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1152 v += bs2; 1153 } 1154 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1155 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1156 } 1157 1158 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1159 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1160 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1161 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1162 ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap.bs*A->cmap.n);CHKERRQ(ierr); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 #undef __FUNCT__ 1167 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1168 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1169 { 1170 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1171 IS iscol=a->col,isrow=a->row; 1172 PetscErrorCode ierr; 1173 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1174 PetscInt *diag = a->diag; 1175 MatScalar *aa=a->a,*v; 1176 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1177 PetscScalar *x,*b,*t; 1178 1179 PetscFunctionBegin; 1180 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1181 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1182 t = a->solve_work; 1183 1184 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1185 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1186 1187 /* forward solve the lower triangular */ 1188 idx = 7*(*r++); 1189 t[0] = b[idx]; t[1] = b[1+idx]; 1190 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1191 t[5] = b[5+idx]; t[6] = b[6+idx]; 1192 1193 for (i=1; i<n; i++) { 1194 v = aa + 49*ai[i]; 1195 vi = aj + ai[i]; 1196 nz = diag[i] - ai[i]; 1197 idx = 7*(*r++); 1198 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1199 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1200 while (nz--) { 1201 idx = 7*(*vi++); 1202 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1203 x4 = t[3+idx];x5 = t[4+idx]; 1204 x6 = t[5+idx];x7 = t[6+idx]; 1205 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1206 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1207 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1208 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1209 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1210 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1211 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1212 v += 49; 1213 } 1214 idx = 7*i; 1215 t[idx] = s1;t[1+idx] = s2; 1216 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1217 t[5+idx] = s6;t[6+idx] = s7; 1218 } 1219 /* backward solve the upper triangular */ 1220 for (i=n-1; i>=0; i--){ 1221 v = aa + 49*diag[i] + 49; 1222 vi = aj + diag[i] + 1; 1223 nz = ai[i+1] - diag[i] - 1; 1224 idt = 7*i; 1225 s1 = t[idt]; s2 = t[1+idt]; 1226 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1227 s6 = t[5+idt];s7 = t[6+idt]; 1228 while (nz--) { 1229 idx = 7*(*vi++); 1230 x1 = t[idx]; x2 = t[1+idx]; 1231 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1232 x6 = t[5+idx]; x7 = t[6+idx]; 1233 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1234 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1235 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1236 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1237 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1238 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1239 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1240 v += 49; 1241 } 1242 idc = 7*(*c--); 1243 v = aa + 49*diag[i]; 1244 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1245 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1246 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1247 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1248 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1249 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1250 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1251 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1252 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1253 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1254 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1255 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1256 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1257 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1258 } 1259 1260 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1261 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1262 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1263 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1264 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);CHKERRQ(ierr); 1265 PetscFunctionReturn(0); 1266 } 1267 1268 #undef __FUNCT__ 1269 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1270 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1271 { 1272 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1273 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1274 PetscErrorCode ierr; 1275 PetscInt *diag = a->diag,jdx; 1276 const MatScalar *aa=a->a,*v; 1277 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1278 const PetscScalar *b; 1279 1280 PetscFunctionBegin; 1281 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1282 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1283 /* forward solve the lower triangular */ 1284 idx = 0; 1285 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1286 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1287 x[6] = b[6+idx]; 1288 for (i=1; i<n; i++) { 1289 v = aa + 49*ai[i]; 1290 vi = aj + ai[i]; 1291 nz = diag[i] - ai[i]; 1292 idx = 7*i; 1293 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1294 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1295 s7 = b[6+idx]; 1296 while (nz--) { 1297 jdx = 7*(*vi++); 1298 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1299 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1300 x7 = x[6+jdx]; 1301 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1302 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1303 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1304 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1305 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1306 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1307 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1308 v += 49; 1309 } 1310 x[idx] = s1; 1311 x[1+idx] = s2; 1312 x[2+idx] = s3; 1313 x[3+idx] = s4; 1314 x[4+idx] = s5; 1315 x[5+idx] = s6; 1316 x[6+idx] = s7; 1317 } 1318 /* backward solve the upper triangular */ 1319 for (i=n-1; i>=0; i--){ 1320 v = aa + 49*diag[i] + 49; 1321 vi = aj + diag[i] + 1; 1322 nz = ai[i+1] - diag[i] - 1; 1323 idt = 7*i; 1324 s1 = x[idt]; s2 = x[1+idt]; 1325 s3 = x[2+idt]; s4 = x[3+idt]; 1326 s5 = x[4+idt]; s6 = x[5+idt]; 1327 s7 = x[6+idt]; 1328 while (nz--) { 1329 idx = 7*(*vi++); 1330 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1331 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1332 x7 = x[6+idx]; 1333 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1334 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1335 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1336 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1337 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1338 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1339 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1340 v += 49; 1341 } 1342 v = aa + 49*diag[i]; 1343 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1344 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1345 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1346 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1347 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1348 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1349 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1350 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1351 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1352 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1353 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1354 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1355 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1356 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1357 } 1358 1359 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1361 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 1362 PetscFunctionReturn(0); 1363 } 1364 1365 #undef __FUNCT__ 1366 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1367 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1368 { 1369 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1370 IS iscol=a->col,isrow=a->row; 1371 PetscErrorCode ierr; 1372 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1373 PetscInt *diag = a->diag; 1374 const MatScalar *aa=a->a,*v; 1375 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1376 const PetscScalar *b; 1377 PetscFunctionBegin; 1378 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1379 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1380 t = a->solve_work; 1381 1382 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1383 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1384 1385 /* forward solve the lower triangular */ 1386 idx = 6*(*r++); 1387 t[0] = b[idx]; t[1] = b[1+idx]; 1388 t[2] = b[2+idx]; t[3] = b[3+idx]; 1389 t[4] = b[4+idx]; t[5] = b[5+idx]; 1390 for (i=1; i<n; i++) { 1391 v = aa + 36*ai[i]; 1392 vi = aj + ai[i]; 1393 nz = diag[i] - ai[i]; 1394 idx = 6*(*r++); 1395 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1396 s5 = b[4+idx]; s6 = b[5+idx]; 1397 while (nz--) { 1398 idx = 6*(*vi++); 1399 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1400 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1401 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1402 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1403 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1404 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1405 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1406 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1407 v += 36; 1408 } 1409 idx = 6*i; 1410 t[idx] = s1;t[1+idx] = s2; 1411 t[2+idx] = s3;t[3+idx] = s4; 1412 t[4+idx] = s5;t[5+idx] = s6; 1413 } 1414 /* backward solve the upper triangular */ 1415 for (i=n-1; i>=0; i--){ 1416 v = aa + 36*diag[i] + 36; 1417 vi = aj + diag[i] + 1; 1418 nz = ai[i+1] - diag[i] - 1; 1419 idt = 6*i; 1420 s1 = t[idt]; s2 = t[1+idt]; 1421 s3 = t[2+idt];s4 = t[3+idt]; 1422 s5 = t[4+idt];s6 = t[5+idt]; 1423 while (nz--) { 1424 idx = 6*(*vi++); 1425 x1 = t[idx]; x2 = t[1+idx]; 1426 x3 = t[2+idx]; x4 = t[3+idx]; 1427 x5 = t[4+idx]; x6 = t[5+idx]; 1428 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1429 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1430 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1431 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1432 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1433 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1434 v += 36; 1435 } 1436 idc = 6*(*c--); 1437 v = aa + 36*diag[i]; 1438 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1439 v[18]*s4+v[24]*s5+v[30]*s6; 1440 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1441 v[19]*s4+v[25]*s5+v[31]*s6; 1442 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1443 v[20]*s4+v[26]*s5+v[32]*s6; 1444 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1445 v[21]*s4+v[27]*s5+v[33]*s6; 1446 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1447 v[22]*s4+v[28]*s5+v[34]*s6; 1448 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1449 v[23]*s4+v[29]*s5+v[35]*s6; 1450 } 1451 1452 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1453 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1454 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1455 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1456 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 1457 PetscFunctionReturn(0); 1458 } 1459 1460 #undef __FUNCT__ 1461 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1462 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1463 { 1464 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1465 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1466 PetscErrorCode ierr; 1467 PetscInt *diag = a->diag,jdx; 1468 const MatScalar *aa=a->a,*v; 1469 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1470 const PetscScalar *b; 1471 1472 PetscFunctionBegin; 1473 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1474 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1475 /* forward solve the lower triangular */ 1476 idx = 0; 1477 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1478 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1479 for (i=1; i<n; i++) { 1480 v = aa + 36*ai[i]; 1481 vi = aj + ai[i]; 1482 nz = diag[i] - ai[i]; 1483 idx = 6*i; 1484 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1485 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1486 while (nz--) { 1487 jdx = 6*(*vi++); 1488 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1489 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1490 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1491 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1492 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1493 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1494 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1495 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1496 v += 36; 1497 } 1498 x[idx] = s1; 1499 x[1+idx] = s2; 1500 x[2+idx] = s3; 1501 x[3+idx] = s4; 1502 x[4+idx] = s5; 1503 x[5+idx] = s6; 1504 } 1505 /* backward solve the upper triangular */ 1506 for (i=n-1; i>=0; i--){ 1507 v = aa + 36*diag[i] + 36; 1508 vi = aj + diag[i] + 1; 1509 nz = ai[i+1] - diag[i] - 1; 1510 idt = 6*i; 1511 s1 = x[idt]; s2 = x[1+idt]; 1512 s3 = x[2+idt]; s4 = x[3+idt]; 1513 s5 = x[4+idt]; s6 = x[5+idt]; 1514 while (nz--) { 1515 idx = 6*(*vi++); 1516 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1517 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1518 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1519 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1520 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1521 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1522 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1523 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1524 v += 36; 1525 } 1526 v = aa + 36*diag[i]; 1527 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1528 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1529 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1530 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1531 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1532 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1533 } 1534 1535 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1536 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1537 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);CHKERRQ(ierr); 1538 PetscFunctionReturn(0); 1539 } 1540 1541 #undef __FUNCT__ 1542 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1543 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1544 { 1545 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1546 IS iscol=a->col,isrow=a->row; 1547 PetscErrorCode ierr; 1548 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1549 PetscInt *diag = a->diag; 1550 const MatScalar *aa=a->a,*v; 1551 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1552 const PetscScalar *b; 1553 1554 PetscFunctionBegin; 1555 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1556 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1557 t = a->solve_work; 1558 1559 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1560 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1561 1562 /* forward solve the lower triangular */ 1563 idx = 5*(*r++); 1564 t[0] = b[idx]; t[1] = b[1+idx]; 1565 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1566 for (i=1; i<n; i++) { 1567 v = aa + 25*ai[i]; 1568 vi = aj + ai[i]; 1569 nz = diag[i] - ai[i]; 1570 idx = 5*(*r++); 1571 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1572 s5 = b[4+idx]; 1573 while (nz--) { 1574 idx = 5*(*vi++); 1575 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1576 x4 = t[3+idx];x5 = t[4+idx]; 1577 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1578 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1579 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1580 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1581 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1582 v += 25; 1583 } 1584 idx = 5*i; 1585 t[idx] = s1;t[1+idx] = s2; 1586 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1587 } 1588 /* backward solve the upper triangular */ 1589 for (i=n-1; i>=0; i--){ 1590 v = aa + 25*diag[i] + 25; 1591 vi = aj + diag[i] + 1; 1592 nz = ai[i+1] - diag[i] - 1; 1593 idt = 5*i; 1594 s1 = t[idt]; s2 = t[1+idt]; 1595 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1596 while (nz--) { 1597 idx = 5*(*vi++); 1598 x1 = t[idx]; x2 = t[1+idx]; 1599 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1600 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1601 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1602 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1603 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1604 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1605 v += 25; 1606 } 1607 idc = 5*(*c--); 1608 v = aa + 25*diag[i]; 1609 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1610 v[15]*s4+v[20]*s5; 1611 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1612 v[16]*s4+v[21]*s5; 1613 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1614 v[17]*s4+v[22]*s5; 1615 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1616 v[18]*s4+v[23]*s5; 1617 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1618 v[19]*s4+v[24]*s5; 1619 } 1620 1621 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1622 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1623 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1624 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1625 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr); 1626 PetscFunctionReturn(0); 1627 } 1628 1629 #undef __FUNCT__ 1630 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1631 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1632 { 1633 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1634 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1635 PetscErrorCode ierr; 1636 PetscInt *diag = a->diag,jdx; 1637 const MatScalar *aa=a->a,*v; 1638 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1639 const PetscScalar *b; 1640 1641 PetscFunctionBegin; 1642 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1643 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1644 /* forward solve the lower triangular */ 1645 idx = 0; 1646 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1647 for (i=1; i<n; i++) { 1648 v = aa + 25*ai[i]; 1649 vi = aj + ai[i]; 1650 nz = diag[i] - ai[i]; 1651 idx = 5*i; 1652 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1653 while (nz--) { 1654 jdx = 5*(*vi++); 1655 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1656 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1657 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1658 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1659 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1660 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1661 v += 25; 1662 } 1663 x[idx] = s1; 1664 x[1+idx] = s2; 1665 x[2+idx] = s3; 1666 x[3+idx] = s4; 1667 x[4+idx] = s5; 1668 } 1669 /* backward solve the upper triangular */ 1670 for (i=n-1; i>=0; i--){ 1671 v = aa + 25*diag[i] + 25; 1672 vi = aj + diag[i] + 1; 1673 nz = ai[i+1] - diag[i] - 1; 1674 idt = 5*i; 1675 s1 = x[idt]; s2 = x[1+idt]; 1676 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1677 while (nz--) { 1678 idx = 5*(*vi++); 1679 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1680 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1681 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1682 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1683 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1684 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1685 v += 25; 1686 } 1687 v = aa + 25*diag[i]; 1688 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1689 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1690 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1691 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1692 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1693 } 1694 1695 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1696 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1697 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);CHKERRQ(ierr); 1698 PetscFunctionReturn(0); 1699 } 1700 1701 #undef __FUNCT__ 1702 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1703 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1704 { 1705 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1706 IS iscol=a->col,isrow=a->row; 1707 PetscErrorCode ierr; 1708 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1709 PetscInt *diag = a->diag; 1710 const MatScalar *aa=a->a,*v; 1711 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1712 const PetscScalar *b; 1713 1714 PetscFunctionBegin; 1715 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1716 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1717 t = a->solve_work; 1718 1719 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1720 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1721 1722 /* forward solve the lower triangular */ 1723 idx = 4*(*r++); 1724 t[0] = b[idx]; t[1] = b[1+idx]; 1725 t[2] = b[2+idx]; t[3] = b[3+idx]; 1726 for (i=1; i<n; i++) { 1727 v = aa + 16*ai[i]; 1728 vi = aj + ai[i]; 1729 nz = diag[i] - ai[i]; 1730 idx = 4*(*r++); 1731 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1732 while (nz--) { 1733 idx = 4*(*vi++); 1734 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1735 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1736 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1737 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1738 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1739 v += 16; 1740 } 1741 idx = 4*i; 1742 t[idx] = s1;t[1+idx] = s2; 1743 t[2+idx] = s3;t[3+idx] = s4; 1744 } 1745 /* backward solve the upper triangular */ 1746 for (i=n-1; i>=0; i--){ 1747 v = aa + 16*diag[i] + 16; 1748 vi = aj + diag[i] + 1; 1749 nz = ai[i+1] - diag[i] - 1; 1750 idt = 4*i; 1751 s1 = t[idt]; s2 = t[1+idt]; 1752 s3 = t[2+idt];s4 = t[3+idt]; 1753 while (nz--) { 1754 idx = 4*(*vi++); 1755 x1 = t[idx]; x2 = t[1+idx]; 1756 x3 = t[2+idx]; x4 = t[3+idx]; 1757 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1758 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1759 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1760 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1761 v += 16; 1762 } 1763 idc = 4*(*c--); 1764 v = aa + 16*diag[i]; 1765 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1766 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1767 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1768 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1769 } 1770 1771 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1772 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1773 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1774 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1775 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 1776 PetscFunctionReturn(0); 1777 } 1778 1779 #undef __FUNCT__ 1780 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1781 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1782 { 1783 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1784 IS iscol=a->col,isrow=a->row; 1785 PetscErrorCode ierr; 1786 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1787 PetscInt *diag = a->diag; 1788 const MatScalar *aa=a->a,*v; 1789 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1790 PetscScalar *x; 1791 const PetscScalar *b; 1792 1793 PetscFunctionBegin; 1794 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1795 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1796 t = (MatScalar *)a->solve_work; 1797 1798 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1799 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1800 1801 /* forward solve the lower triangular */ 1802 idx = 4*(*r++); 1803 t[0] = (MatScalar)b[idx]; 1804 t[1] = (MatScalar)b[1+idx]; 1805 t[2] = (MatScalar)b[2+idx]; 1806 t[3] = (MatScalar)b[3+idx]; 1807 for (i=1; i<n; i++) { 1808 v = aa + 16*ai[i]; 1809 vi = aj + ai[i]; 1810 nz = diag[i] - ai[i]; 1811 idx = 4*(*r++); 1812 s1 = (MatScalar)b[idx]; 1813 s2 = (MatScalar)b[1+idx]; 1814 s3 = (MatScalar)b[2+idx]; 1815 s4 = (MatScalar)b[3+idx]; 1816 while (nz--) { 1817 idx = 4*(*vi++); 1818 x1 = t[idx]; 1819 x2 = t[1+idx]; 1820 x3 = t[2+idx]; 1821 x4 = t[3+idx]; 1822 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1823 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1824 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1825 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1826 v += 16; 1827 } 1828 idx = 4*i; 1829 t[idx] = s1; 1830 t[1+idx] = s2; 1831 t[2+idx] = s3; 1832 t[3+idx] = s4; 1833 } 1834 /* backward solve the upper triangular */ 1835 for (i=n-1; i>=0; i--){ 1836 v = aa + 16*diag[i] + 16; 1837 vi = aj + diag[i] + 1; 1838 nz = ai[i+1] - diag[i] - 1; 1839 idt = 4*i; 1840 s1 = t[idt]; 1841 s2 = t[1+idt]; 1842 s3 = t[2+idt]; 1843 s4 = t[3+idt]; 1844 while (nz--) { 1845 idx = 4*(*vi++); 1846 x1 = t[idx]; 1847 x2 = t[1+idx]; 1848 x3 = t[2+idx]; 1849 x4 = t[3+idx]; 1850 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1851 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1852 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1853 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1854 v += 16; 1855 } 1856 idc = 4*(*c--); 1857 v = aa + 16*diag[i]; 1858 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1859 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1860 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1861 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1862 x[idc] = (PetscScalar)t[idt]; 1863 x[1+idc] = (PetscScalar)t[1+idt]; 1864 x[2+idc] = (PetscScalar)t[2+idt]; 1865 x[3+idc] = (PetscScalar)t[3+idt]; 1866 } 1867 1868 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1869 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1870 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1871 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1872 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 1873 PetscFunctionReturn(0); 1874 } 1875 1876 #if defined (PETSC_HAVE_SSE) 1877 1878 #include PETSC_HAVE_SSE 1879 1880 #undef __FUNCT__ 1881 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1882 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1883 { 1884 /* 1885 Note: This code uses demotion of double 1886 to float when performing the mixed-mode computation. 1887 This may not be numerically reasonable for all applications. 1888 */ 1889 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1890 IS iscol=a->col,isrow=a->row; 1891 PetscErrorCode ierr; 1892 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1893 PetscInt *diag = a->diag,ai16; 1894 MatScalar *aa=a->a,*v; 1895 PetscScalar *x,*b,*t; 1896 1897 /* Make space in temp stack for 16 Byte Aligned arrays */ 1898 float ssealignedspace[11],*tmps,*tmpx; 1899 unsigned long offset; 1900 1901 PetscFunctionBegin; 1902 SSE_SCOPE_BEGIN; 1903 1904 offset = (unsigned long)ssealignedspace % 16; 1905 if (offset) offset = (16 - offset)/4; 1906 tmps = &ssealignedspace[offset]; 1907 tmpx = &ssealignedspace[offset+4]; 1908 PREFETCH_NTA(aa+16*ai[1]); 1909 1910 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1911 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1912 t = a->solve_work; 1913 1914 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1915 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1916 1917 /* forward solve the lower triangular */ 1918 idx = 4*(*r++); 1919 t[0] = b[idx]; t[1] = b[1+idx]; 1920 t[2] = b[2+idx]; t[3] = b[3+idx]; 1921 v = aa + 16*ai[1]; 1922 1923 for (i=1; i<n;) { 1924 PREFETCH_NTA(&v[8]); 1925 vi = aj + ai[i]; 1926 nz = diag[i] - ai[i]; 1927 idx = 4*(*r++); 1928 1929 /* Demote sum from double to float */ 1930 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1931 LOAD_PS(tmps,XMM7); 1932 1933 while (nz--) { 1934 PREFETCH_NTA(&v[16]); 1935 idx = 4*(*vi++); 1936 1937 /* Demote solution (so far) from double to float */ 1938 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1939 1940 /* 4x4 Matrix-Vector product with negative accumulation: */ 1941 SSE_INLINE_BEGIN_2(tmpx,v) 1942 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1943 1944 /* First Column */ 1945 SSE_COPY_PS(XMM0,XMM6) 1946 SSE_SHUFFLE(XMM0,XMM0,0x00) 1947 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1948 SSE_SUB_PS(XMM7,XMM0) 1949 1950 /* Second Column */ 1951 SSE_COPY_PS(XMM1,XMM6) 1952 SSE_SHUFFLE(XMM1,XMM1,0x55) 1953 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1954 SSE_SUB_PS(XMM7,XMM1) 1955 1956 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1957 1958 /* Third Column */ 1959 SSE_COPY_PS(XMM2,XMM6) 1960 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1961 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1962 SSE_SUB_PS(XMM7,XMM2) 1963 1964 /* Fourth Column */ 1965 SSE_COPY_PS(XMM3,XMM6) 1966 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1967 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1968 SSE_SUB_PS(XMM7,XMM3) 1969 SSE_INLINE_END_2 1970 1971 v += 16; 1972 } 1973 idx = 4*i; 1974 v = aa + 16*ai[++i]; 1975 PREFETCH_NTA(v); 1976 STORE_PS(tmps,XMM7); 1977 1978 /* Promote result from float to double */ 1979 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1980 } 1981 /* backward solve the upper triangular */ 1982 idt = 4*(n-1); 1983 ai16 = 16*diag[n-1]; 1984 v = aa + ai16 + 16; 1985 for (i=n-1; i>=0;){ 1986 PREFETCH_NTA(&v[8]); 1987 vi = aj + diag[i] + 1; 1988 nz = ai[i+1] - diag[i] - 1; 1989 1990 /* Demote accumulator from double to float */ 1991 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1992 LOAD_PS(tmps,XMM7); 1993 1994 while (nz--) { 1995 PREFETCH_NTA(&v[16]); 1996 idx = 4*(*vi++); 1997 1998 /* Demote solution (so far) from double to float */ 1999 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2000 2001 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2002 SSE_INLINE_BEGIN_2(tmpx,v) 2003 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2004 2005 /* First Column */ 2006 SSE_COPY_PS(XMM0,XMM6) 2007 SSE_SHUFFLE(XMM0,XMM0,0x00) 2008 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2009 SSE_SUB_PS(XMM7,XMM0) 2010 2011 /* Second Column */ 2012 SSE_COPY_PS(XMM1,XMM6) 2013 SSE_SHUFFLE(XMM1,XMM1,0x55) 2014 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2015 SSE_SUB_PS(XMM7,XMM1) 2016 2017 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2018 2019 /* Third Column */ 2020 SSE_COPY_PS(XMM2,XMM6) 2021 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2022 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2023 SSE_SUB_PS(XMM7,XMM2) 2024 2025 /* Fourth Column */ 2026 SSE_COPY_PS(XMM3,XMM6) 2027 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2028 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2029 SSE_SUB_PS(XMM7,XMM3) 2030 SSE_INLINE_END_2 2031 v += 16; 2032 } 2033 v = aa + ai16; 2034 ai16 = 16*diag[--i]; 2035 PREFETCH_NTA(aa+ai16+16); 2036 /* 2037 Scale the result by the diagonal 4x4 block, 2038 which was inverted as part of the factorization 2039 */ 2040 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2041 /* First Column */ 2042 SSE_COPY_PS(XMM0,XMM7) 2043 SSE_SHUFFLE(XMM0,XMM0,0x00) 2044 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2045 2046 /* Second Column */ 2047 SSE_COPY_PS(XMM1,XMM7) 2048 SSE_SHUFFLE(XMM1,XMM1,0x55) 2049 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2050 SSE_ADD_PS(XMM0,XMM1) 2051 2052 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2053 2054 /* Third Column */ 2055 SSE_COPY_PS(XMM2,XMM7) 2056 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2057 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2058 SSE_ADD_PS(XMM0,XMM2) 2059 2060 /* Fourth Column */ 2061 SSE_COPY_PS(XMM3,XMM7) 2062 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2063 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2064 SSE_ADD_PS(XMM0,XMM3) 2065 2066 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2067 SSE_INLINE_END_3 2068 2069 /* Promote solution from float to double */ 2070 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2071 2072 /* Apply reordering to t and stream into x. */ 2073 /* This way, x doesn't pollute the cache. */ 2074 /* Be careful with size: 2 doubles = 4 floats! */ 2075 idc = 4*(*c--); 2076 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2077 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2078 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2079 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2080 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2081 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2082 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2083 SSE_INLINE_END_2 2084 v = aa + ai16 + 16; 2085 idt -= 4; 2086 } 2087 2088 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2089 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2090 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2091 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2092 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2093 SSE_SCOPE_END; 2094 PetscFunctionReturn(0); 2095 } 2096 2097 #endif 2098 2099 2100 /* 2101 Special case where the matrix was ILU(0) factored in the natural 2102 ordering. This eliminates the need for the column and row permutation. 2103 */ 2104 #undef __FUNCT__ 2105 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2106 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2107 { 2108 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2109 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2110 PetscErrorCode ierr; 2111 PetscInt *diag = a->diag; 2112 const MatScalar *aa=a->a; 2113 PetscScalar *x; 2114 const PetscScalar *b; 2115 2116 PetscFunctionBegin; 2117 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2118 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2119 2120 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2121 { 2122 static PetscScalar w[2000]; /* very BAD need to fix */ 2123 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2124 } 2125 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2126 { 2127 static PetscScalar w[2000]; /* very BAD need to fix */ 2128 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2129 } 2130 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2131 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2132 #else 2133 { 2134 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2135 const MatScalar *v; 2136 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2137 2138 /* forward solve the lower triangular */ 2139 idx = 0; 2140 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2141 for (i=1; i<n; i++) { 2142 v = aa + 16*ai[i]; 2143 vi = aj + ai[i]; 2144 nz = diag[i] - ai[i]; 2145 idx += 4; 2146 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2147 while (nz--) { 2148 jdx = 4*(*vi++); 2149 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2150 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2151 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2152 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2153 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2154 v += 16; 2155 } 2156 x[idx] = s1; 2157 x[1+idx] = s2; 2158 x[2+idx] = s3; 2159 x[3+idx] = s4; 2160 } 2161 /* backward solve the upper triangular */ 2162 idt = 4*(n-1); 2163 for (i=n-1; i>=0; i--){ 2164 ai16 = 16*diag[i]; 2165 v = aa + ai16 + 16; 2166 vi = aj + diag[i] + 1; 2167 nz = ai[i+1] - diag[i] - 1; 2168 s1 = x[idt]; s2 = x[1+idt]; 2169 s3 = x[2+idt];s4 = x[3+idt]; 2170 while (nz--) { 2171 idx = 4*(*vi++); 2172 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2173 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2174 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2175 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2176 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2177 v += 16; 2178 } 2179 v = aa + ai16; 2180 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2181 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2182 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2183 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2184 idt -= 4; 2185 } 2186 } 2187 #endif 2188 2189 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2190 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2191 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2192 PetscFunctionReturn(0); 2193 } 2194 2195 #undef __FUNCT__ 2196 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2197 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2198 { 2199 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2200 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2201 PetscErrorCode ierr; 2202 PetscInt *diag = a->diag; 2203 MatScalar *aa=a->a; 2204 PetscScalar *x,*b; 2205 2206 PetscFunctionBegin; 2207 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2208 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2209 2210 { 2211 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2212 MatScalar *v,*t=(MatScalar *)x; 2213 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2214 2215 /* forward solve the lower triangular */ 2216 idx = 0; 2217 t[0] = (MatScalar)b[0]; 2218 t[1] = (MatScalar)b[1]; 2219 t[2] = (MatScalar)b[2]; 2220 t[3] = (MatScalar)b[3]; 2221 for (i=1; i<n; i++) { 2222 v = aa + 16*ai[i]; 2223 vi = aj + ai[i]; 2224 nz = diag[i] - ai[i]; 2225 idx += 4; 2226 s1 = (MatScalar)b[idx]; 2227 s2 = (MatScalar)b[1+idx]; 2228 s3 = (MatScalar)b[2+idx]; 2229 s4 = (MatScalar)b[3+idx]; 2230 while (nz--) { 2231 jdx = 4*(*vi++); 2232 x1 = t[jdx]; 2233 x2 = t[1+jdx]; 2234 x3 = t[2+jdx]; 2235 x4 = t[3+jdx]; 2236 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2237 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2238 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2239 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2240 v += 16; 2241 } 2242 t[idx] = s1; 2243 t[1+idx] = s2; 2244 t[2+idx] = s3; 2245 t[3+idx] = s4; 2246 } 2247 /* backward solve the upper triangular */ 2248 idt = 4*(n-1); 2249 for (i=n-1; i>=0; i--){ 2250 ai16 = 16*diag[i]; 2251 v = aa + ai16 + 16; 2252 vi = aj + diag[i] + 1; 2253 nz = ai[i+1] - diag[i] - 1; 2254 s1 = t[idt]; 2255 s2 = t[1+idt]; 2256 s3 = t[2+idt]; 2257 s4 = t[3+idt]; 2258 while (nz--) { 2259 idx = 4*(*vi++); 2260 x1 = (MatScalar)x[idx]; 2261 x2 = (MatScalar)x[1+idx]; 2262 x3 = (MatScalar)x[2+idx]; 2263 x4 = (MatScalar)x[3+idx]; 2264 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2265 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2266 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2267 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2268 v += 16; 2269 } 2270 v = aa + ai16; 2271 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2272 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2273 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2274 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2275 idt -= 4; 2276 } 2277 } 2278 2279 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2280 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2281 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2282 PetscFunctionReturn(0); 2283 } 2284 2285 #if defined (PETSC_HAVE_SSE) 2286 2287 #include PETSC_HAVE_SSE 2288 #undef __FUNCT__ 2289 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2290 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2291 { 2292 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2293 unsigned short *aj=(unsigned short *)a->j; 2294 PetscErrorCode ierr; 2295 int *ai=a->i,n=a->mbs,*diag = a->diag; 2296 MatScalar *aa=a->a; 2297 PetscScalar *x,*b; 2298 2299 PetscFunctionBegin; 2300 SSE_SCOPE_BEGIN; 2301 /* 2302 Note: This code currently uses demotion of double 2303 to float when performing the mixed-mode computation. 2304 This may not be numerically reasonable for all applications. 2305 */ 2306 PREFETCH_NTA(aa+16*ai[1]); 2307 2308 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2309 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2310 { 2311 /* x will first be computed in single precision then promoted inplace to double */ 2312 MatScalar *v,*t=(MatScalar *)x; 2313 int nz,i,idt,ai16; 2314 unsigned int jdx,idx; 2315 unsigned short *vi; 2316 /* Forward solve the lower triangular factor. */ 2317 2318 /* First block is the identity. */ 2319 idx = 0; 2320 CONVERT_DOUBLE4_FLOAT4(t,b); 2321 v = aa + 16*((unsigned int)ai[1]); 2322 2323 for (i=1; i<n;) { 2324 PREFETCH_NTA(&v[8]); 2325 vi = aj + ai[i]; 2326 nz = diag[i] - ai[i]; 2327 idx += 4; 2328 2329 /* Demote RHS from double to float. */ 2330 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2331 LOAD_PS(&t[idx],XMM7); 2332 2333 while (nz--) { 2334 PREFETCH_NTA(&v[16]); 2335 jdx = 4*((unsigned int)(*vi++)); 2336 2337 /* 4x4 Matrix-Vector product with negative accumulation: */ 2338 SSE_INLINE_BEGIN_2(&t[jdx],v) 2339 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2340 2341 /* First Column */ 2342 SSE_COPY_PS(XMM0,XMM6) 2343 SSE_SHUFFLE(XMM0,XMM0,0x00) 2344 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2345 SSE_SUB_PS(XMM7,XMM0) 2346 2347 /* Second Column */ 2348 SSE_COPY_PS(XMM1,XMM6) 2349 SSE_SHUFFLE(XMM1,XMM1,0x55) 2350 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2351 SSE_SUB_PS(XMM7,XMM1) 2352 2353 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2354 2355 /* Third Column */ 2356 SSE_COPY_PS(XMM2,XMM6) 2357 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2358 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2359 SSE_SUB_PS(XMM7,XMM2) 2360 2361 /* Fourth Column */ 2362 SSE_COPY_PS(XMM3,XMM6) 2363 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2364 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2365 SSE_SUB_PS(XMM7,XMM3) 2366 SSE_INLINE_END_2 2367 2368 v += 16; 2369 } 2370 v = aa + 16*ai[++i]; 2371 PREFETCH_NTA(v); 2372 STORE_PS(&t[idx],XMM7); 2373 } 2374 2375 /* Backward solve the upper triangular factor.*/ 2376 2377 idt = 4*(n-1); 2378 ai16 = 16*diag[n-1]; 2379 v = aa + ai16 + 16; 2380 for (i=n-1; i>=0;){ 2381 PREFETCH_NTA(&v[8]); 2382 vi = aj + diag[i] + 1; 2383 nz = ai[i+1] - diag[i] - 1; 2384 2385 LOAD_PS(&t[idt],XMM7); 2386 2387 while (nz--) { 2388 PREFETCH_NTA(&v[16]); 2389 idx = 4*((unsigned int)(*vi++)); 2390 2391 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2392 SSE_INLINE_BEGIN_2(&t[idx],v) 2393 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2394 2395 /* First Column */ 2396 SSE_COPY_PS(XMM0,XMM6) 2397 SSE_SHUFFLE(XMM0,XMM0,0x00) 2398 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2399 SSE_SUB_PS(XMM7,XMM0) 2400 2401 /* Second Column */ 2402 SSE_COPY_PS(XMM1,XMM6) 2403 SSE_SHUFFLE(XMM1,XMM1,0x55) 2404 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2405 SSE_SUB_PS(XMM7,XMM1) 2406 2407 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2408 2409 /* Third Column */ 2410 SSE_COPY_PS(XMM2,XMM6) 2411 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2412 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2413 SSE_SUB_PS(XMM7,XMM2) 2414 2415 /* Fourth Column */ 2416 SSE_COPY_PS(XMM3,XMM6) 2417 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2418 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2419 SSE_SUB_PS(XMM7,XMM3) 2420 SSE_INLINE_END_2 2421 v += 16; 2422 } 2423 v = aa + ai16; 2424 ai16 = 16*diag[--i]; 2425 PREFETCH_NTA(aa+ai16+16); 2426 /* 2427 Scale the result by the diagonal 4x4 block, 2428 which was inverted as part of the factorization 2429 */ 2430 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2431 /* First Column */ 2432 SSE_COPY_PS(XMM0,XMM7) 2433 SSE_SHUFFLE(XMM0,XMM0,0x00) 2434 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2435 2436 /* Second Column */ 2437 SSE_COPY_PS(XMM1,XMM7) 2438 SSE_SHUFFLE(XMM1,XMM1,0x55) 2439 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2440 SSE_ADD_PS(XMM0,XMM1) 2441 2442 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2443 2444 /* Third Column */ 2445 SSE_COPY_PS(XMM2,XMM7) 2446 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2447 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2448 SSE_ADD_PS(XMM0,XMM2) 2449 2450 /* Fourth Column */ 2451 SSE_COPY_PS(XMM3,XMM7) 2452 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2453 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2454 SSE_ADD_PS(XMM0,XMM3) 2455 2456 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2457 SSE_INLINE_END_3 2458 2459 v = aa + ai16 + 16; 2460 idt -= 4; 2461 } 2462 2463 /* Convert t from single precision back to double precision (inplace)*/ 2464 idt = 4*(n-1); 2465 for (i=n-1;i>=0;i--) { 2466 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2467 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2468 PetscScalar *xtemp=&x[idt]; 2469 MatScalar *ttemp=&t[idt]; 2470 xtemp[3] = (PetscScalar)ttemp[3]; 2471 xtemp[2] = (PetscScalar)ttemp[2]; 2472 xtemp[1] = (PetscScalar)ttemp[1]; 2473 xtemp[0] = (PetscScalar)ttemp[0]; 2474 idt -= 4; 2475 } 2476 2477 } /* End of artificial scope. */ 2478 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2479 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2480 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2481 SSE_SCOPE_END; 2482 PetscFunctionReturn(0); 2483 } 2484 2485 #undef __FUNCT__ 2486 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2487 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2488 { 2489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2490 int *aj=a->j; 2491 PetscErrorCode ierr; 2492 int *ai=a->i,n=a->mbs,*diag = a->diag; 2493 MatScalar *aa=a->a; 2494 PetscScalar *x,*b; 2495 2496 PetscFunctionBegin; 2497 SSE_SCOPE_BEGIN; 2498 /* 2499 Note: This code currently uses demotion of double 2500 to float when performing the mixed-mode computation. 2501 This may not be numerically reasonable for all applications. 2502 */ 2503 PREFETCH_NTA(aa+16*ai[1]); 2504 2505 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2506 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2507 { 2508 /* x will first be computed in single precision then promoted inplace to double */ 2509 MatScalar *v,*t=(MatScalar *)x; 2510 int nz,i,idt,ai16; 2511 int jdx,idx; 2512 int *vi; 2513 /* Forward solve the lower triangular factor. */ 2514 2515 /* First block is the identity. */ 2516 idx = 0; 2517 CONVERT_DOUBLE4_FLOAT4(t,b); 2518 v = aa + 16*ai[1]; 2519 2520 for (i=1; i<n;) { 2521 PREFETCH_NTA(&v[8]); 2522 vi = aj + ai[i]; 2523 nz = diag[i] - ai[i]; 2524 idx += 4; 2525 2526 /* Demote RHS from double to float. */ 2527 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2528 LOAD_PS(&t[idx],XMM7); 2529 2530 while (nz--) { 2531 PREFETCH_NTA(&v[16]); 2532 jdx = 4*(*vi++); 2533 /* jdx = *vi++; */ 2534 2535 /* 4x4 Matrix-Vector product with negative accumulation: */ 2536 SSE_INLINE_BEGIN_2(&t[jdx],v) 2537 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2538 2539 /* First Column */ 2540 SSE_COPY_PS(XMM0,XMM6) 2541 SSE_SHUFFLE(XMM0,XMM0,0x00) 2542 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2543 SSE_SUB_PS(XMM7,XMM0) 2544 2545 /* Second Column */ 2546 SSE_COPY_PS(XMM1,XMM6) 2547 SSE_SHUFFLE(XMM1,XMM1,0x55) 2548 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2549 SSE_SUB_PS(XMM7,XMM1) 2550 2551 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2552 2553 /* Third Column */ 2554 SSE_COPY_PS(XMM2,XMM6) 2555 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2556 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2557 SSE_SUB_PS(XMM7,XMM2) 2558 2559 /* Fourth Column */ 2560 SSE_COPY_PS(XMM3,XMM6) 2561 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2562 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2563 SSE_SUB_PS(XMM7,XMM3) 2564 SSE_INLINE_END_2 2565 2566 v += 16; 2567 } 2568 v = aa + 16*ai[++i]; 2569 PREFETCH_NTA(v); 2570 STORE_PS(&t[idx],XMM7); 2571 } 2572 2573 /* Backward solve the upper triangular factor.*/ 2574 2575 idt = 4*(n-1); 2576 ai16 = 16*diag[n-1]; 2577 v = aa + ai16 + 16; 2578 for (i=n-1; i>=0;){ 2579 PREFETCH_NTA(&v[8]); 2580 vi = aj + diag[i] + 1; 2581 nz = ai[i+1] - diag[i] - 1; 2582 2583 LOAD_PS(&t[idt],XMM7); 2584 2585 while (nz--) { 2586 PREFETCH_NTA(&v[16]); 2587 idx = 4*(*vi++); 2588 /* idx = *vi++; */ 2589 2590 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2591 SSE_INLINE_BEGIN_2(&t[idx],v) 2592 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2593 2594 /* First Column */ 2595 SSE_COPY_PS(XMM0,XMM6) 2596 SSE_SHUFFLE(XMM0,XMM0,0x00) 2597 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2598 SSE_SUB_PS(XMM7,XMM0) 2599 2600 /* Second Column */ 2601 SSE_COPY_PS(XMM1,XMM6) 2602 SSE_SHUFFLE(XMM1,XMM1,0x55) 2603 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2604 SSE_SUB_PS(XMM7,XMM1) 2605 2606 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2607 2608 /* Third Column */ 2609 SSE_COPY_PS(XMM2,XMM6) 2610 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2611 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2612 SSE_SUB_PS(XMM7,XMM2) 2613 2614 /* Fourth Column */ 2615 SSE_COPY_PS(XMM3,XMM6) 2616 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2617 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2618 SSE_SUB_PS(XMM7,XMM3) 2619 SSE_INLINE_END_2 2620 v += 16; 2621 } 2622 v = aa + ai16; 2623 ai16 = 16*diag[--i]; 2624 PREFETCH_NTA(aa+ai16+16); 2625 /* 2626 Scale the result by the diagonal 4x4 block, 2627 which was inverted as part of the factorization 2628 */ 2629 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2630 /* First Column */ 2631 SSE_COPY_PS(XMM0,XMM7) 2632 SSE_SHUFFLE(XMM0,XMM0,0x00) 2633 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2634 2635 /* Second Column */ 2636 SSE_COPY_PS(XMM1,XMM7) 2637 SSE_SHUFFLE(XMM1,XMM1,0x55) 2638 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2639 SSE_ADD_PS(XMM0,XMM1) 2640 2641 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2642 2643 /* Third Column */ 2644 SSE_COPY_PS(XMM2,XMM7) 2645 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2646 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2647 SSE_ADD_PS(XMM0,XMM2) 2648 2649 /* Fourth Column */ 2650 SSE_COPY_PS(XMM3,XMM7) 2651 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2652 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2653 SSE_ADD_PS(XMM0,XMM3) 2654 2655 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2656 SSE_INLINE_END_3 2657 2658 v = aa + ai16 + 16; 2659 idt -= 4; 2660 } 2661 2662 /* Convert t from single precision back to double precision (inplace)*/ 2663 idt = 4*(n-1); 2664 for (i=n-1;i>=0;i--) { 2665 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2666 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2667 PetscScalar *xtemp=&x[idt]; 2668 MatScalar *ttemp=&t[idt]; 2669 xtemp[3] = (PetscScalar)ttemp[3]; 2670 xtemp[2] = (PetscScalar)ttemp[2]; 2671 xtemp[1] = (PetscScalar)ttemp[1]; 2672 xtemp[0] = (PetscScalar)ttemp[0]; 2673 idt -= 4; 2674 } 2675 2676 } /* End of artificial scope. */ 2677 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2678 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2679 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);CHKERRQ(ierr); 2680 SSE_SCOPE_END; 2681 PetscFunctionReturn(0); 2682 } 2683 2684 #endif 2685 #undef __FUNCT__ 2686 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2687 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2688 { 2689 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2690 IS iscol=a->col,isrow=a->row; 2691 PetscErrorCode ierr; 2692 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2693 PetscInt *diag = a->diag; 2694 const MatScalar *aa=a->a,*v; 2695 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2696 const PetscScalar *b; 2697 2698 PetscFunctionBegin; 2699 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2700 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2701 t = a->solve_work; 2702 2703 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2704 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2705 2706 /* forward solve the lower triangular */ 2707 idx = 3*(*r++); 2708 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2709 for (i=1; i<n; i++) { 2710 v = aa + 9*ai[i]; 2711 vi = aj + ai[i]; 2712 nz = diag[i] - ai[i]; 2713 idx = 3*(*r++); 2714 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2715 while (nz--) { 2716 idx = 3*(*vi++); 2717 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2718 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2719 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2720 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2721 v += 9; 2722 } 2723 idx = 3*i; 2724 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2725 } 2726 /* backward solve the upper triangular */ 2727 for (i=n-1; i>=0; i--){ 2728 v = aa + 9*diag[i] + 9; 2729 vi = aj + diag[i] + 1; 2730 nz = ai[i+1] - diag[i] - 1; 2731 idt = 3*i; 2732 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2733 while (nz--) { 2734 idx = 3*(*vi++); 2735 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2736 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2737 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2738 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2739 v += 9; 2740 } 2741 idc = 3*(*c--); 2742 v = aa + 9*diag[i]; 2743 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2744 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2745 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2746 } 2747 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2748 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2749 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2750 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2751 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr); 2752 PetscFunctionReturn(0); 2753 } 2754 2755 /* 2756 Special case where the matrix was ILU(0) factored in the natural 2757 ordering. This eliminates the need for the column and row permutation. 2758 */ 2759 #undef __FUNCT__ 2760 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2761 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2762 { 2763 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2764 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2765 PetscErrorCode ierr; 2766 PetscInt *diag = a->diag; 2767 const MatScalar *aa=a->a,*v; 2768 PetscScalar *x,s1,s2,s3,x1,x2,x3; 2769 const PetscScalar *b; 2770 PetscInt jdx,idt,idx,nz,*vi,i; 2771 2772 PetscFunctionBegin; 2773 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2774 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2775 2776 /* forward solve the lower triangular */ 2777 idx = 0; 2778 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2779 for (i=1; i<n; i++) { 2780 v = aa + 9*ai[i]; 2781 vi = aj + ai[i]; 2782 nz = diag[i] - ai[i]; 2783 idx += 3; 2784 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2785 while (nz--) { 2786 jdx = 3*(*vi++); 2787 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2788 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2789 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2790 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2791 v += 9; 2792 } 2793 x[idx] = s1; 2794 x[1+idx] = s2; 2795 x[2+idx] = s3; 2796 } 2797 /* backward solve the upper triangular */ 2798 for (i=n-1; i>=0; i--){ 2799 v = aa + 9*diag[i] + 9; 2800 vi = aj + diag[i] + 1; 2801 nz = ai[i+1] - diag[i] - 1; 2802 idt = 3*i; 2803 s1 = x[idt]; s2 = x[1+idt]; 2804 s3 = x[2+idt]; 2805 while (nz--) { 2806 idx = 3*(*vi++); 2807 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2808 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2809 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2810 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2811 v += 9; 2812 } 2813 v = aa + 9*diag[i]; 2814 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2815 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2816 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2817 } 2818 2819 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2820 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2821 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);CHKERRQ(ierr); 2822 PetscFunctionReturn(0); 2823 } 2824 2825 #undef __FUNCT__ 2826 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2827 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2828 { 2829 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2830 IS iscol=a->col,isrow=a->row; 2831 PetscErrorCode ierr; 2832 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2833 PetscInt *diag = a->diag; 2834 const MatScalar *aa=a->a,*v; 2835 PetscScalar *x,s1,s2,x1,x2,*t; 2836 const PetscScalar *b; 2837 2838 PetscFunctionBegin; 2839 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2840 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2841 t = a->solve_work; 2842 2843 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2844 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2845 2846 /* forward solve the lower triangular */ 2847 idx = 2*(*r++); 2848 t[0] = b[idx]; t[1] = b[1+idx]; 2849 for (i=1; i<n; i++) { 2850 v = aa + 4*ai[i]; 2851 vi = aj + ai[i]; 2852 nz = diag[i] - ai[i]; 2853 idx = 2*(*r++); 2854 s1 = b[idx]; s2 = b[1+idx]; 2855 while (nz--) { 2856 idx = 2*(*vi++); 2857 x1 = t[idx]; x2 = t[1+idx]; 2858 s1 -= v[0]*x1 + v[2]*x2; 2859 s2 -= v[1]*x1 + v[3]*x2; 2860 v += 4; 2861 } 2862 idx = 2*i; 2863 t[idx] = s1; t[1+idx] = s2; 2864 } 2865 /* backward solve the upper triangular */ 2866 for (i=n-1; i>=0; i--){ 2867 v = aa + 4*diag[i] + 4; 2868 vi = aj + diag[i] + 1; 2869 nz = ai[i+1] - diag[i] - 1; 2870 idt = 2*i; 2871 s1 = t[idt]; s2 = t[1+idt]; 2872 while (nz--) { 2873 idx = 2*(*vi++); 2874 x1 = t[idx]; x2 = t[1+idx]; 2875 s1 -= v[0]*x1 + v[2]*x2; 2876 s2 -= v[1]*x1 + v[3]*x2; 2877 v += 4; 2878 } 2879 idc = 2*(*c--); 2880 v = aa + 4*diag[i]; 2881 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2882 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2883 } 2884 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2885 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2886 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2887 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2888 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr); 2889 PetscFunctionReturn(0); 2890 } 2891 2892 /* 2893 Special case where the matrix was ILU(0) factored in the natural 2894 ordering. This eliminates the need for the column and row permutation. 2895 */ 2896 #undef __FUNCT__ 2897 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2898 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2899 { 2900 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2901 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2902 PetscErrorCode ierr; 2903 PetscInt *diag = a->diag; 2904 const MatScalar *aa=a->a,*v; 2905 PetscScalar *x,s1,s2,x1,x2; 2906 const PetscScalar *b; 2907 PetscInt jdx,idt,idx,nz,*vi,i; 2908 2909 PetscFunctionBegin; 2910 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2911 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2912 2913 /* forward solve the lower triangular */ 2914 idx = 0; 2915 x[0] = b[0]; x[1] = b[1]; 2916 for (i=1; i<n; i++) { 2917 v = aa + 4*ai[i]; 2918 vi = aj + ai[i]; 2919 nz = diag[i] - ai[i]; 2920 idx += 2; 2921 s1 = b[idx];s2 = b[1+idx]; 2922 while (nz--) { 2923 jdx = 2*(*vi++); 2924 x1 = x[jdx];x2 = x[1+jdx]; 2925 s1 -= v[0]*x1 + v[2]*x2; 2926 s2 -= v[1]*x1 + v[3]*x2; 2927 v += 4; 2928 } 2929 x[idx] = s1; 2930 x[1+idx] = s2; 2931 } 2932 /* backward solve the upper triangular */ 2933 for (i=n-1; i>=0; i--){ 2934 v = aa + 4*diag[i] + 4; 2935 vi = aj + diag[i] + 1; 2936 nz = ai[i+1] - diag[i] - 1; 2937 idt = 2*i; 2938 s1 = x[idt]; s2 = x[1+idt]; 2939 while (nz--) { 2940 idx = 2*(*vi++); 2941 x1 = x[idx]; x2 = x[1+idx]; 2942 s1 -= v[0]*x1 + v[2]*x2; 2943 s2 -= v[1]*x1 + v[3]*x2; 2944 v += 4; 2945 } 2946 v = aa + 4*diag[i]; 2947 x[idt] = v[0]*s1 + v[2]*s2; 2948 x[1+idt] = v[1]*s1 + v[3]*s2; 2949 } 2950 2951 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2952 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2953 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);CHKERRQ(ierr); 2954 PetscFunctionReturn(0); 2955 } 2956 2957 #undef __FUNCT__ 2958 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2959 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2960 { 2961 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2962 IS iscol=a->col,isrow=a->row; 2963 PetscErrorCode ierr; 2964 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2965 PetscInt *diag = a->diag; 2966 MatScalar *aa=a->a,*v; 2967 PetscScalar *x,*b,s1,*t; 2968 2969 PetscFunctionBegin; 2970 if (!n) PetscFunctionReturn(0); 2971 2972 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2973 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2974 t = a->solve_work; 2975 2976 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2977 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2978 2979 /* forward solve the lower triangular */ 2980 t[0] = b[*r++]; 2981 for (i=1; i<n; i++) { 2982 v = aa + ai[i]; 2983 vi = aj + ai[i]; 2984 nz = diag[i] - ai[i]; 2985 s1 = b[*r++]; 2986 while (nz--) { 2987 s1 -= (*v++)*t[*vi++]; 2988 } 2989 t[i] = s1; 2990 } 2991 /* backward solve the upper triangular */ 2992 for (i=n-1; i>=0; i--){ 2993 v = aa + diag[i] + 1; 2994 vi = aj + diag[i] + 1; 2995 nz = ai[i+1] - diag[i] - 1; 2996 s1 = t[i]; 2997 while (nz--) { 2998 s1 -= (*v++)*t[*vi++]; 2999 } 3000 x[*c--] = t[i] = aa[diag[i]]*s1; 3001 } 3002 3003 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3004 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3005 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3006 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3007 ierr = PetscLogFlops(2*1*(a->nz) - A->cmap.n);CHKERRQ(ierr); 3008 PetscFunctionReturn(0); 3009 } 3010 /* 3011 Special case where the matrix was ILU(0) factored in the natural 3012 ordering. This eliminates the need for the column and row permutation. 3013 */ 3014 #undef __FUNCT__ 3015 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3016 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3017 { 3018 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3019 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3020 PetscErrorCode ierr; 3021 PetscInt *diag = a->diag; 3022 MatScalar *aa=a->a; 3023 PetscScalar *x,*b; 3024 PetscScalar s1,x1; 3025 MatScalar *v; 3026 PetscInt jdx,idt,idx,nz,*vi,i; 3027 3028 PetscFunctionBegin; 3029 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3030 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3031 3032 /* forward solve the lower triangular */ 3033 idx = 0; 3034 x[0] = b[0]; 3035 for (i=1; i<n; i++) { 3036 v = aa + ai[i]; 3037 vi = aj + ai[i]; 3038 nz = diag[i] - ai[i]; 3039 idx += 1; 3040 s1 = b[idx]; 3041 while (nz--) { 3042 jdx = *vi++; 3043 x1 = x[jdx]; 3044 s1 -= v[0]*x1; 3045 v += 1; 3046 } 3047 x[idx] = s1; 3048 } 3049 /* backward solve the upper triangular */ 3050 for (i=n-1; i>=0; i--){ 3051 v = aa + diag[i] + 1; 3052 vi = aj + diag[i] + 1; 3053 nz = ai[i+1] - diag[i] - 1; 3054 idt = i; 3055 s1 = x[idt]; 3056 while (nz--) { 3057 idx = *vi++; 3058 x1 = x[idx]; 3059 s1 -= v[0]*x1; 3060 v += 1; 3061 } 3062 v = aa + diag[i]; 3063 x[idt] = v[0]*s1; 3064 } 3065 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3066 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3067 ierr = PetscLogFlops(2*(a->nz) - A->cmap.n);CHKERRQ(ierr); 3068 PetscFunctionReturn(0); 3069 } 3070 3071 /* ----------------------------------------------------------------*/ 3072 /* 3073 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3074 except that the data structure of Mat_SeqAIJ is slightly different. 3075 Not a good example of code reuse. 3076 */ 3077 3078 #undef __FUNCT__ 3079 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3080 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 3081 { 3082 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3083 IS isicol; 3084 PetscErrorCode ierr; 3085 PetscInt *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3086 PetscInt *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3087 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3088 PetscInt incrlev,nnz,i,bs = A->rmap.bs,bs2 = a->bs2,levels,diagonal_fill,dd; 3089 PetscTruth col_identity,row_identity,flg; 3090 PetscReal f; 3091 3092 PetscFunctionBegin; 3093 f = info->fill; 3094 levels = (PetscInt)info->levels; 3095 diagonal_fill = (PetscInt)info->diagonal_fill; 3096 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3097 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3098 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3099 3100 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3101 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3102 (*fact)->factor = FACTOR_LU; 3103 b = (Mat_SeqBAIJ*)(*fact)->data; 3104 ierr = MatMissingDiagonal_SeqBAIJ(*fact,&flg,&dd);CHKERRQ(ierr); 3105 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd); 3106 b->row = isrow; 3107 b->col = iscol; 3108 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3109 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3110 b->icol = isicol; 3111 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3112 ierr = PetscMalloc(((*fact)->rmap.N+1+(*fact)->rmap.bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3113 } else { /* general case perform the symbolic factorization */ 3114 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3115 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3116 3117 /* get new row pointers */ 3118 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 3119 ainew[0] = 0; 3120 /* don't know how many column pointers are needed so estimate */ 3121 jmax = (PetscInt)(f*ai[n] + 1); 3122 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 3123 /* ajfill is level of fill for each fill entry */ 3124 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 3125 /* fill is a linked list of nonzeros in active row */ 3126 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 3127 /* im is level for each filled value */ 3128 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 3129 /* dloc is location of diagonal in factor */ 3130 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 3131 dloc[0] = 0; 3132 for (prow=0; prow<n; prow++) { 3133 3134 /* copy prow into linked list */ 3135 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3136 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3137 xi = aj + ai[r[prow]]; 3138 fill[n] = n; 3139 fill[prow] = -1; /* marker for diagonal entry */ 3140 while (nz--) { 3141 fm = n; 3142 idx = ic[*xi++]; 3143 do { 3144 m = fm; 3145 fm = fill[m]; 3146 } while (fm < idx); 3147 fill[m] = idx; 3148 fill[idx] = fm; 3149 im[idx] = 0; 3150 } 3151 3152 /* make sure diagonal entry is included */ 3153 if (diagonal_fill && fill[prow] == -1) { 3154 fm = n; 3155 while (fill[fm] < prow) fm = fill[fm]; 3156 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3157 fill[fm] = prow; 3158 im[prow] = 0; 3159 nzf++; 3160 dcount++; 3161 } 3162 3163 nzi = 0; 3164 row = fill[n]; 3165 while (row < prow) { 3166 incrlev = im[row] + 1; 3167 nz = dloc[row]; 3168 xi = ajnew + ainew[row] + nz + 1; 3169 flev = ajfill + ainew[row] + nz + 1; 3170 nnz = ainew[row+1] - ainew[row] - nz - 1; 3171 fm = row; 3172 while (nnz-- > 0) { 3173 idx = *xi++; 3174 if (*flev + incrlev > levels) { 3175 flev++; 3176 continue; 3177 } 3178 do { 3179 m = fm; 3180 fm = fill[m]; 3181 } while (fm < idx); 3182 if (fm != idx) { 3183 im[idx] = *flev + incrlev; 3184 fill[m] = idx; 3185 fill[idx] = fm; 3186 fm = idx; 3187 nzf++; 3188 } else { 3189 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3190 } 3191 flev++; 3192 } 3193 row = fill[row]; 3194 nzi++; 3195 } 3196 /* copy new filled row into permanent storage */ 3197 ainew[prow+1] = ainew[prow] + nzf; 3198 if (ainew[prow+1] > jmax) { 3199 3200 /* estimate how much additional space we will need */ 3201 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3202 /* just double the memory each time */ 3203 PetscInt maxadd = jmax; 3204 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3205 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3206 jmax += maxadd; 3207 3208 /* allocate a longer ajnew and ajfill */ 3209 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3210 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3211 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3212 ajnew = xi; 3213 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3214 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3215 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3216 ajfill = xi; 3217 reallocate++; /* count how many reallocations are needed */ 3218 } 3219 xi = ajnew + ainew[prow]; 3220 flev = ajfill + ainew[prow]; 3221 dloc[prow] = nzi; 3222 fm = fill[n]; 3223 while (nzf--) { 3224 *xi++ = fm; 3225 *flev++ = im[fm]; 3226 fm = fill[fm]; 3227 } 3228 /* make sure row has diagonal entry */ 3229 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3230 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3231 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3232 } 3233 } 3234 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3235 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3236 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3237 ierr = PetscFree(fill);CHKERRQ(ierr); 3238 ierr = PetscFree(im);CHKERRQ(ierr); 3239 3240 #if defined(PETSC_USE_INFO) 3241 { 3242 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3243 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3244 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3245 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3246 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3247 if (diagonal_fill) { 3248 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3249 } 3250 } 3251 #endif 3252 3253 /* put together the new matrix */ 3254 ierr = MatCreate(((PetscObject)A)->comm,fact);CHKERRQ(ierr); 3255 ierr = MatSetSizes(*fact,bs*n,bs*n,bs*n,bs*n);CHKERRQ(ierr); 3256 ierr = MatSetType(*fact,((PetscObject)A)->type_name);CHKERRQ(ierr); 3257 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3258 ierr = PetscLogObjectParent(*fact,isicol);CHKERRQ(ierr); 3259 b = (Mat_SeqBAIJ*)(*fact)->data; 3260 b->free_a = PETSC_TRUE; 3261 b->free_ij = PETSC_TRUE; 3262 b->singlemalloc = PETSC_FALSE; 3263 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3264 b->j = ajnew; 3265 b->i = ainew; 3266 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3267 b->diag = dloc; 3268 b->ilen = 0; 3269 b->imax = 0; 3270 b->row = isrow; 3271 b->col = iscol; 3272 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3273 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3274 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3275 b->icol = isicol; 3276 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3277 /* In b structure: Free imax, ilen, old a, old j. 3278 Allocate dloc, solve_work, new a, new j */ 3279 ierr = PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 3280 b->maxnz = b->nz = ainew[n]; 3281 (*fact)->factor = FACTOR_LU; 3282 3283 (*fact)->info.factor_mallocs = reallocate; 3284 (*fact)->info.fill_ratio_given = f; 3285 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3286 } 3287 3288 if (row_identity && col_identity) { 3289 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3290 } 3291 PetscFunctionReturn(0); 3292 } 3293 3294 #undef __FUNCT__ 3295 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3296 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3297 { 3298 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3299 /* int i,*AJ=a->j,nz=a->nz; */ 3300 PetscFunctionBegin; 3301 /* Undo Column scaling */ 3302 /* while (nz--) { */ 3303 /* AJ[i] = AJ[i]/4; */ 3304 /* } */ 3305 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3306 A->ops->setunfactored = PETSC_NULL; 3307 PetscFunctionReturn(0); 3308 } 3309 3310 #undef __FUNCT__ 3311 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3312 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3313 { 3314 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3315 PetscInt *AJ=a->j,nz=a->nz; 3316 unsigned short *aj=(unsigned short *)AJ; 3317 PetscFunctionBegin; 3318 /* Is this really necessary? */ 3319 while (nz--) { 3320 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3321 } 3322 A->ops->setunfactored = PETSC_NULL; 3323 PetscFunctionReturn(0); 3324 } 3325 3326 #undef __FUNCT__ 3327 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3328 PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3329 { 3330 PetscErrorCode ierr; 3331 /* 3332 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3333 with natural ordering 3334 */ 3335 PetscFunctionBegin; 3336 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3337 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3338 switch (inA->rmap.bs) { 3339 case 1: 3340 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3341 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=1\n");CHKERRQ(ierr); 3342 break; 3343 case 2: 3344 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3345 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=2\n");CHKERRQ(ierr); 3346 break; 3347 case 3: 3348 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3349 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=3\n");CHKERRQ(ierr); 3350 break; 3351 case 4: 3352 #if defined(PETSC_USE_MAT_SINGLE) 3353 { 3354 PetscTruth sse_enabled_local; 3355 PetscErrorCode ierr; 3356 ierr = PetscSSEIsEnabled(((PetscObject)inA)->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3357 if (sse_enabled_local) { 3358 # if defined(PETSC_HAVE_SSE) 3359 int i,*AJ=a->j,nz=a->nz,n=a->mbs; 3360 if (n==(unsigned short)n) { 3361 unsigned short *aj=(unsigned short *)AJ; 3362 for (i=0;i<nz;i++) { 3363 aj[i] = (unsigned short)AJ[i]; 3364 } 3365 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3366 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3367 ierr = PetscInfo(inA,"Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");CHKERRQ(ierr); 3368 } else { 3369 /* Scale the column indices for easier indexing in MatSolve. */ 3370 /* for (i=0;i<nz;i++) { */ 3371 /* AJ[i] = AJ[i]*4; */ 3372 /* } */ 3373 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 3374 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3375 ierr = PetscInfo(inA,"Using special SSE, in-place natural ordering, int j index factor BS=4\n");CHKERRQ(ierr); 3376 } 3377 # else 3378 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3379 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3380 # endif 3381 } else { 3382 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3383 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");CHKERRQ(ierr); 3384 } 3385 } 3386 #else 3387 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3388 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");CHKERRQ(ierr); 3389 #endif 3390 break; 3391 case 5: 3392 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3393 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=5\n");CHKERRQ(ierr); 3394 break; 3395 case 6: 3396 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3397 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=6\n");CHKERRQ(ierr); 3398 break; 3399 case 7: 3400 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3401 ierr = PetscInfo(inA,"Using special in-place natural ordering factor BS=7\n");CHKERRQ(ierr); 3402 break; 3403 } 3404 PetscFunctionReturn(0); 3405 } 3406 3407 #undef __FUNCT__ 3408 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3409 PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A) 3410 { 3411 /* 3412 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3413 with natural ordering 3414 */ 3415 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3416 IS row = a->row, col = a->col; 3417 PetscTruth row_identity, col_identity; 3418 PetscTruth use_natural; 3419 PetscErrorCode ierr; 3420 3421 PetscFunctionBegin; 3422 3423 use_natural = PETSC_FALSE; 3424 if (row && col) { 3425 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3426 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3427 3428 if (row_identity && col_identity) { 3429 use_natural = PETSC_TRUE; 3430 } 3431 } else { 3432 use_natural = PETSC_TRUE; 3433 } 3434 3435 switch (A->rmap.bs) { 3436 case 1: 3437 if (use_natural) { 3438 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3439 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3440 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=1\n");CHKERRQ(ierr); 3441 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");CHKERRQ(ierr); 3442 } else { 3443 A->ops->solve = MatSolve_SeqBAIJ_1; 3444 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3445 } 3446 break; 3447 case 2: 3448 if (use_natural) { 3449 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3450 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3451 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=2\n");CHKERRQ(ierr); 3452 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");CHKERRQ(ierr); 3453 } else { 3454 A->ops->solve = MatSolve_SeqBAIJ_2; 3455 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3456 } 3457 break; 3458 case 3: 3459 if (use_natural) { 3460 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3461 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3462 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=3\n");CHKERRQ(ierr); 3463 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");CHKERRQ(ierr); 3464 } else { 3465 A->ops->solve = MatSolve_SeqBAIJ_3; 3466 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3467 } 3468 break; 3469 case 4: 3470 { 3471 PetscTruth sse_enabled_local; 3472 ierr = PetscSSEIsEnabled(((PetscObject)A)->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3473 if (use_natural) { 3474 #if defined(PETSC_USE_MAT_SINGLE) 3475 if (sse_enabled_local) { /* Natural + Single + SSE */ 3476 # if defined(PETSC_HAVE_SSE) 3477 PetscInt n=a->mbs; 3478 if (n==(unsigned short)n) { 3479 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3480 ierr = PetscInfo(A,"Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");CHKERRQ(ierr); 3481 } else { 3482 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3483 ierr = PetscInfo(A,"Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");CHKERRQ(ierr); 3484 } 3485 # else 3486 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3487 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3488 # endif 3489 } else { /* Natural + Single */ 3490 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3491 ierr = PetscInfo(A,"Using single precision, in-place, natural ordering solve BS=4\n");CHKERRQ(ierr); 3492 } 3493 #else 3494 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3495 ierr = PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");CHKERRQ(ierr); 3496 #endif 3497 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3498 ierr = PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");CHKERRQ(ierr); 3499 } else { /* Arbitrary ordering */ 3500 #if defined(PETSC_USE_MAT_SINGLE) 3501 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3502 # if defined(PETSC_HAVE_SSE) 3503 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3504 ierr = PetscInfo(A,"Using single precision, SSE solve BS=4\n");CHKERRQ(ierr); 3505 # else 3506 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3507 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3508 # endif 3509 } else { /* Arbitrary + Single */ 3510 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3511 ierr = PetscInfo(A,"Using single precision solve BS=4\n");CHKERRQ(ierr); 3512 } 3513 #else 3514 A->ops->solve = MatSolve_SeqBAIJ_4; 3515 #endif 3516 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3517 } 3518 } 3519 break; 3520 case 5: 3521 if (use_natural) { 3522 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3523 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3524 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");CHKERRQ(ierr); 3525 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");CHKERRQ(ierr); 3526 } else { 3527 A->ops->solve = MatSolve_SeqBAIJ_5; 3528 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3529 } 3530 break; 3531 case 6: 3532 if (use_natural) { 3533 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3534 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3535 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");CHKERRQ(ierr); 3536 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");CHKERRQ(ierr); 3537 } else { 3538 A->ops->solve = MatSolve_SeqBAIJ_6; 3539 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3540 } 3541 break; 3542 case 7: 3543 if (use_natural) { 3544 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3545 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3546 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");CHKERRQ(ierr); 3547 ierr = PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");CHKERRQ(ierr); 3548 } else { 3549 A->ops->solve = MatSolve_SeqBAIJ_7; 3550 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3551 } 3552 break; 3553 default: 3554 A->ops->solve = MatSolve_SeqBAIJ_N; 3555 break; 3556 } 3557 PetscFunctionReturn(0); 3558 } 3559 3560 #undef __FUNCT__ 3561 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3562 PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) 3563 { 3564 PetscErrorCode ierr; 3565 3566 PetscFunctionBegin; 3567 ierr = MatSeqBAIJ_UpdateSolvers(A); 3568 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3569 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3570 } else { 3571 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3572 } 3573 PetscFunctionReturn(0); 3574 } 3575 3576 #undef __FUNCT__ 3577 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3578 PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) 3579 { 3580 PetscErrorCode ierr; 3581 3582 PetscFunctionBegin; 3583 ierr = MatSeqBAIJ_UpdateSolvers(A); 3584 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3585 PetscFunctionReturn(0); 3586 } 3587 3588 3589